In [1]:
# Test evalutions

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [118]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from tqdm import tqdm

In [161]:
# Load the model and tokenizer, and move the model to the GPU
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to('cuda')

# Define cosine similarity function
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Function to get embeddings on GPU
def get_embeddings(texts, batch_size=32):
    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to('cuda')
            outputs = model(**tokens)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return embeddings

# Function to calculate embeddings for all texts
def calculate_embeddings(df):
    df['untrained_embedding'] = get_embeddings(df['untrained_prediction'].tolist())
    df['trained_embedding'] = get_embeddings(df['trained_prediction'].tolist())
    df['answer_embedding'] = get_embeddings(df['answer'].tolist())
    df['google_gemma_prediction'] = get_embeddings(df['google_gemma_prediction'].tolist())
    return df



In [9]:
# Load the dataset
df = pd.read_csv('predictions.csv')
df.head()

Unnamed: 0,question,context,prompt,answer,untrained_prediction,trained_prediction
0,How does sector rotation work?,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Sector rotation involves shifting investments ...,The text does not provide information on secto...,Sector rotation involves shifting investments ...
1,What are the benefits of investing in the luxu...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,The luxury goods market caters to affluent con...,The provided information does not mention the ...,"The luxury goods market, which includes high-e..."
2,What were the book value and fair value of Gen...,We own certain corporate debt securities of Ge...,<|begin_of_text|><|start_header_id|>system<|en...,The book value was $379 million and the fair v...,The book value of Gentiva Hospice's corporate ...,The book value of Gentiva Hospice’s corporate ...
3,What was the total value of the aggregate tran...,"On September 5, 2023, we acquired Black Knight...",<|begin_of_text|><|start_header_id|>system<|en...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...
4,Should I consider investing in companies with ...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Companies with high market share often enjoy c...,The provided information does not directly ans...,High market share companies tend to be more st...


In [163]:
# Function to calculate precision, recall, F1 score, and accuracy
df = calculate_embeddings(df)
def calculate_metrics(predictions, answers, threshold=0.8):
    similarity_scores = [
        cosine_similarity(pred_embedding, ans_embedding)
        for pred_embedding, ans_embedding in zip(predictions, answers)
    ]
    
    y_true = np.ones(len(answers))  # Ground truth: all answers are positive
    y_pred = np.array([1 if score >= threshold else 0 for score in similarity_scores])

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    
    return precision, recall, f1, accuracy

# Calculate metrics for untrained predictions
untrained_metrics = calculate_metrics(df['untrained_embedding'], df['answer_embedding'], threshold=0.8)
print(f'Untrained Model - Precision: {untrained_metrics[0]:.2%}, Recall: {untrained_metrics[1]:.2%}, F1 Score: {untrained_metrics[2]:.2%}, Accuracy: {untrained_metrics[3]:.2%}')

# Calculate metrics for trained predictions
trained_metrics = calculate_metrics(df['trained_embedding'], df['answer_embedding'], threshold=0.8)
print(f'Trained Model - Precision: {trained_metrics[0]:.2%}, Recall: {trained_metrics[1]:.2%}, F1 Score: {trained_metrics[2]:.2%}, Accuracy: {trained_metrics[3]:.2%}')

trained_metrics = calculate_metrics(df['google_gemma_prediction'], df['answer_embedding'], threshold=0.8)
print(f'Trained Model - Precision: {trained_metrics[0]:.2%}, Recall: {trained_metrics[1]:.2%}, F1 Score: {trained_metrics[2]:.2%}, Accuracy: {trained_metrics[3]:.2%}')


Untrained Model - Precision: 100.00%, Recall: 53.00%, F1 Score: 69.28%, Accuracy: 53.00%
Trained Model - Precision: 100.00%, Recall: 71.00%, F1 Score: 83.04%, Accuracy: 71.00%
Trained Model - Precision: 0.00%, Recall: 0.00%, F1 Score: 0.00%, Accuracy: 0.00%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
def calculate_text_metrics(predictions, answers):
    # Setting up the Rouge and Bleu configuration
    rouge = Rouge()
    smoothie = SmoothingFunction().method1  # Choose method1 for smoothing as an example

    # Calculating BLEU scores with smoothing
    bleu_scores = [
        sentence_bleu([ans.split()], pred.split(), smoothing_function=smoothie)
        for pred, ans in zip(predictions, answers)
    ]

    # Calculating ROUGE scores, specifically focusing on ROUGE-L
    rouge_scores = [rouge.get_scores(pred, ans, avg=True)['rouge-l'] for pred, ans in zip(predictions, answers)]

    # Average scores for metrics
    avg_bleu = np.mean(bleu_scores)
    avg_rouge_l_f1 = np.mean([score['f'] for score in rouge_scores])  # Focusing on the F1 score of ROUGE-L

    return avg_bleu, avg_rouge_l_f1

In [196]:
# Load the dataset
df = pd.read_csv('predictions.csv')
df.head()

Unnamed: 0,question,context,prompt,answer,untrained_prediction,trained_prediction
0,How does sector rotation work?,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Sector rotation involves shifting investments ...,The text does not provide information on secto...,Sector rotation involves shifting investments ...
1,What are the benefits of investing in the luxu...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,The luxury goods market caters to affluent con...,The provided information does not mention the ...,"The luxury goods market, which includes high-e..."
2,What were the book value and fair value of Gen...,We own certain corporate debt securities of Ge...,<|begin_of_text|><|start_header_id|>system<|en...,The book value was $379 million and the fair v...,The book value of Gentiva Hospice's corporate ...,The book value of Gentiva Hospice’s corporate ...
3,What was the total value of the aggregate tran...,"On September 5, 2023, we acquired Black Knight...",<|begin_of_text|><|start_header_id|>system<|en...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...
4,Should I consider investing in companies with ...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Companies with high market share often enjoy c...,The provided information does not directly ans...,High market share companies tend to be more st...


In [33]:
bleu_score, rouge_l_f1 = calculate_text_metrics(df['trained_prediction'], df['answer'])
bleu_score1, rouge_l_f1_1 = calculate_text_metrics(df['untrained_prediction'], df['answer'])
print(f'Trained Result - BLEU Score: {bleu_score:.2%}, ROUGE-L F1 Score: {rouge_l_f1:.2%}')
print(f'Untrained Result - BLEU Score: {bleu_score1:.2%}, ROUGE-L F1 Score: {rouge_l_f1_1:.2%}')

Trained Result - BLEU Score: 29.90%, ROUGE-L F1 Score: 59.00%
Untrained Result - BLEU Score: 17.94%, ROUGE-L F1 Score: 44.57%


In [95]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
import torch

In [96]:
from datasets import Dataset, load_dataset
from textwrap import dedent

In [98]:
from huggingface_hub import login
login(token="hf-xxx")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/jovyan/.cache/huggingface/token
Login successful


In [99]:
# Model name for the tokenizer and model
MODEL_NAME = "google/gemma-2-2b-it"

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the model with quantization config, ensure to specify the correct class
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME
)

Downloading shards: 100%|██████████| 2/2 [00:27<00:00, 13.54s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.26s/it]


In [100]:

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    return_full_text=False,
    device = "cuda"
)

In [199]:

dataset = load_dataset(
    "json",
    data_files={"train": "train.json", "validation": "val.json", "test": "test.json"},
)
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answer', 'text', 'token_count'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['question', 'context', 'answer', 'text', 'token_count'],
        num_rows: 500
    })
    test: Dataset({
        features: ['question', 'context', 'answer', 'text', 'token_count'],
        num_rows: 100
    })
})

In [114]:
def format_example(row: dict):
    prompt = dedent(
        f"""
    {row["question"]}

    Information:

    ```
    {row["context"]}
    ```
    """
    )
    messages = [
        {
            "role": "system",
            "content": "Use only the information to answer the question",
        },
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": row["answer"]},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)

In [115]:

row = dataset["test"][0]
prompt = create_test_prompt(row)
print(prompt)

{'question': 'How does sector rotation work?', 'context': 'This system provides tailored financial advice by analyzing individual risk tolerance and investment goals, offering optimized asset allocation strategies to balance risk and return. It helps users make informed decisions by considering current market conditions and their personal financial objectives'}


In [116]:

%%time
outputs = pipe(str(prompt))
response = f"""
answer:     {row["answer"]}
prediction: {outputs[0]["generated_text"]}
"""
print(response)


answer:     Sector rotation involves shifting investments into sectors expected to perform well during specific economic cycles. For example, moving into defensive sectors during a downturn or into growth sectors during recovery. Would you like to incorporate sector rotation into your strategy?
prediction: 

The provided context describes a financial advice system that uses a combination of risk tolerance, investment goals, and market conditions to create personalized asset allocation strategies. 

**Sector rotation** is a different concept in investing. It's not directly related to the advice system described. 

**Here's how sector rotation works:**

* **Identifying Overvalued and Undervalued Sectors:**  Investors analyze the performance of different sectors (e.g., technology, healthcare, energy, financials) and identify those that are currently overvalued or undervalued. 
* **Shifting Investment Focus:**  Based on their analysis, investors shift their portfolio allocation from overv

In [214]:
# Assume dataset is loaded in the following format
# dataset = {"test": [{"question": "What is the capital of France?", "answer": "The capital of France is Paris.", "context": "Paris is the largest city in France."}]}

# Convert your dataset to a Hugging Face Dataset
hf_dataset = Dataset.from_dict({"test": dataset["test"]})

# Function to create prompts for the entire dataset
def create_test_prompts(batch):
    return [f"{item['question']}\n\nContext:\n{item['context']}\n" for item in batch]

# Generate predictions in batches
def generate_predictions(batch):
    prompts = create_test_prompts(batch)
    outputs = pipe(str(prompts), max_new_tokens=128, return_full_text=False)
    return [output['generated_text'] for output in outputs]

# Collecting results into a DataFrame
rows = []
predictions = []

# Loop through the dataset in batches
batch_size = 3
for i in tqdm(range(0, len(hf_dataset), batch_size)):  
    batch = hf_dataset["test"][i:i + batch_size]  # Get the current batch
    preds = generate_predictions(batch)  # Generate predictions for the current batch
    predictions.extend(preds)  # Collect predictions

# Now append the predictions to the rows
for i, row in enumerate(hf_dataset["test"]):
    # Use the prediction if available, else provide a fallback or default value
    prediction = predictions[i] if i < len(predictions) else "No prediction generated"
    
    rows.append(
        {
            "question": row["question"],
            "context": row["context"],
            "prompt": create_test_prompts([row])[0],
            "answer": row["answer"],
            "untrained_prediction": prediction,  # Use the prediction directly
        }
    )

In [154]:
predictions_df = pd.DataFrame(rows)

In [155]:
# Display or save the predictions DataFrame as needed
print(predictions_df[:1])

                         question  \
0  How does sector rotation work?   

                                             context  \
0  This system provides tailored financial advice...   

                                              prompt  \
0  How does sector rotation work?\n\nContext:\nTh...   

                                              answer  \
0  Sector rotation involves shifting investments ...   

                                untrained_prediction  
0  \n\nThese are a mix of questions about finance...  


In [158]:
df['google_gemma_prediction'] = predictions_df['untrained_prediction']

# Display the updated DataFrame
print(df.head())

                                            question  \
0                     How does sector rotation work?   
1  What are the benefits of investing in the luxu...   
2  What were the book value and fair value of Gen...   
3  What was the total value of the aggregate tran...   
4  Should I consider investing in companies with ...   

                                             context  \
0  This system provides tailored financial advice...   
1  This system provides tailored financial advice...   
2  We own certain corporate debt securities of Ge...   
3  On September 5, 2023, we acquired Black Knight...   
4  This system provides tailored financial advice...   

                                              prompt  \
0  <|begin_of_text|><|start_header_id|>system<|en...   
1  <|begin_of_text|><|start_header_id|>system<|en...   
2  <|begin_of_text|><|start_header_id|>system<|en...   
3  <|begin_of_text|><|start_header_id|>system<|en...   
4  <|begin_of_text|><|start_header_id|>system<

In [157]:
bleu_score11, rouge_l_f1_11 = calculate_text_metrics(df['google_gemma_prediction'], df['answer'])
print(f'Trained Result - BLEU Score: {bleu_score11:.2%}, ROUGE-L F1 Score: {rouge_l_f1_11:.2%}')

Trained Result - BLEU Score: 0.05%, ROUGE-L F1 Score: 1.08%


In [206]:
import openai
import pandas as pd
import nest_asyncio
import asyncio
import aiohttp
from tqdm import tqdm

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()
hf_dataset = Dataset.from_dict({"test": dataset["test"]})
df = pd.DataFrame(hf_dataset)
# Set your OpenAI API key
openai.api_key = 'sk-proj-xx'

In [207]:
df.head()

Unnamed: 0,test
0,{'answer': 'Sector rotation involves shifting ...
1,{'answer': 'The luxury goods market caters to ...
2,{'answer': 'The book value was $379 million an...
3,{'answer': 'The total value of the aggregate t...
4,{'answer': 'Companies with high market share o...


In [209]:
async def fetch_prediction(session, prompt):
    """Fetch prediction from OpenAI API for a single prompt."""
    response = await session.post(
        "https://api.openai.com/v1/chat/completions",
        json={
            "model": "gpt-4o-mini",  # or "gpt-3.5-turbo"
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 128,
            "temperature": 0,
        },
        headers={"Authorization": f"Bearer {openai.api_key}"},
    )
    response_json = await response.json()
    return response_json['choices'][0]['message']['content']

async def get_predictions(prompts):
    """Fetch predictions for a list of prompts."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_prediction(session, prompt) for prompt in prompts]
        predictions = await asyncio.gather(*tasks)
    return predictions

def create_prompts(df):
    """Create prompts from the DataFrame."""
    prompts = []
    for index in range(len(df)):
        row = df[index]  # Access the Series for the specific row
        prompt = f"{row['question']}\n\nInformation:\n```\n{row['context']}\n```"
        prompts.append(prompt)
    return prompts

# Prepare prompts
prompts = create_prompts(df['test'])

# Run the async function to get predictions
predictions = asyncio.run(get_predictions(prompts))

# Add predictions to the DataFrame
df['predictions'] = predictions

# Display or save the DataFrame
print(df.head())

# Optionally, save to CSV
df.to_csv('gpt4_predictions1.csv', index=False)

['How does sector rotation work?\n\nInformation:\n```\nThis system provides tailored financial advice by analyzing individual risk tolerance and investment goals, offering optimized asset allocation strategies to balance risk and return. It helps users make informed decisions by considering current market conditions and their personal financial objectives\n```', 'What are the benefits of investing in the luxury goods market?\n\nInformation:\n```\nThis system provides tailored financial advice by analyzing individual risk tolerance and investment goals, offering optimized asset allocation strategies to balance risk and return. It helps users make informed decisions by considering current market conditions and their personal financial objectives\n```', 'What were the book value and fair value of Gentiva Hospice’s corporate debt securities at December 31, 2023?\n\nInformation:\n```\nWe own certain corporate debt securities of Gentiva Hospice. The book value and fair value are $379 million

In [228]:
df = pd.read_csv('gpt4_predictions1.csv')
df.head()

Unnamed: 0,test,predictions
0,{'answer': 'Sector rotation involves shifting ...,Sector rotation is an investment strategy that...
1,{'answer': 'The luxury goods market caters to ...,Investing in the luxury goods market can offer...
2,{'answer': 'The book value was $379 million an...,"At December 31, 2023, the book value of Gentiv..."
3,{'answer': 'The total value of the aggregate t...,The total value of the aggregate transaction f...
4,{'answer': 'Companies with high market share o...,Investing in companies with high market share ...


In [229]:
df["predictions"][0]

'Sector rotation is an investment strategy that involves shifting investments among different sectors of the economy based on their performance and the economic cycle. The idea is to capitalize on the cyclical nature of various sectors, which tend to perform differently at different stages of the economic cycle. Here’s how it works:\n\n### Understanding Economic Cycles\n1. **Economic Expansion**: During periods of economic growth, sectors like technology, consumer discretionary, and industrials often perform well as consumer spending increases and businesses invest in growth.\n2. **Peak**: As the economy reaches its peak, inflation may rise, and interest rates might increase, leading to a shift in'

In [230]:
# Load the dataset
df1 = pd.read_csv('predictions.csv')
df1.head()

Unnamed: 0,question,context,prompt,answer,untrained_prediction,trained_prediction
0,How does sector rotation work?,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Sector rotation involves shifting investments ...,The text does not provide information on secto...,Sector rotation involves shifting investments ...
1,What are the benefits of investing in the luxu...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,The luxury goods market caters to affluent con...,The provided information does not mention the ...,"The luxury goods market, which includes high-e..."
2,What were the book value and fair value of Gen...,We own certain corporate debt securities of Ge...,<|begin_of_text|><|start_header_id|>system<|en...,The book value was $379 million and the fair v...,The book value of Gentiva Hospice's corporate ...,The book value of Gentiva Hospice’s corporate ...
3,What was the total value of the aggregate tran...,"On September 5, 2023, we acquired Black Knight...",<|begin_of_text|><|start_header_id|>system<|en...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...
4,Should I consider investing in companies with ...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Companies with high market share often enjoy c...,The provided information does not directly ans...,High market share companies tend to be more st...


In [231]:
df1["gpt-4o"]=df["predictions"]
df1["google-gemma"]=predictions_df['untrained_prediction']
df1.head()

Unnamed: 0,question,context,prompt,answer,untrained_prediction,trained_prediction,gpt-4o,google-gemma
0,How does sector rotation work?,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Sector rotation involves shifting investments ...,The text does not provide information on secto...,Sector rotation involves shifting investments ...,Sector rotation is an investment strategy that...,\n\nThese are a mix of questions about finance...
1,What are the benefits of investing in the luxu...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,The luxury goods market caters to affluent con...,The provided information does not mention the ...,"The luxury goods market, which includes high-e...",Investing in the luxury goods market can offer...,\n\nThese are all questions about financial pe...
2,What were the book value and fair value of Gen...,We own certain corporate debt securities of Ge...,<|begin_of_text|><|start_header_id|>system<|en...,The book value was $379 million and the fair v...,The book value of Gentiva Hospice's corporate ...,The book value of Gentiva Hospice’s corporate ...,"At December 31, 2023, the book value of Gentiv...",\n\nThese are some examples of questions that ...
3,What was the total value of the aggregate tran...,"On September 5, 2023, we acquired Black Knight...",<|begin_of_text|><|start_header_id|>system<|en...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,\n\nThe provided context is a mix of questions...
4,Should I consider investing in companies with ...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Companies with high market share often enjoy c...,The provided information does not directly ans...,High market share companies tend to be more st...,Investing in companies with high market share ...,\n\nThese are some examples of questions that ...


In [227]:
df1.head()

Unnamed: 0,question,context,prompt,answer,untrained_prediction,trained_prediction,gpt-4o,google-gemma,untrained_embedding,trained_embedding,answer_embedding
0,How does sector rotation work?,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Sector rotation involves shifting investments ...,The text does not provide information on secto...,Sector rotation involves shifting investments ...,"[-0.064, -0.0755, -0.09753, -0.06866, 0.0925, ...",\n\nThese are a mix of questions about finance...,"[-0.0164, -0.0681, -0.138, -0.0717, 0.0292, -0...","[-0.03275, -0.2083, -0.1153, -0.08466, -0.0345...","[-0.234, -0.10455, -0.0928, -0.2202, -0.0819, ..."
1,What are the benefits of investing in the luxu...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,The luxury goods market caters to affluent con...,The provided information does not mention the ...,"The luxury goods market, which includes high-e...","[0.145, 0.1299, 0.04214, 0.002623, 0.11957, 0....",\n\nThese are all questions about financial pe...,"[0.2258, -0.06506, 0.1164, 0.2308, 0.1228, 0.1...","[0.07886, -0.0835, -0.04477, 0.0831, 0.0429, 0...","[0.1447, -0.1837, 0.0639, 0.1255, 0.02403, 0.0..."
2,What were the book value and fair value of Gen...,We own certain corporate debt securities of Ge...,<|begin_of_text|><|start_header_id|>system<|en...,The book value was $379 million and the fair v...,The book value of Gentiva Hospice's corporate ...,The book value of Gentiva Hospice’s corporate ...,"[-0.0333, 0.06683, -0.11664, -0.2288, -0.2192,...",\n\nThese are some examples of questions that ...,"[0.05026, 0.1636, -0.1477, -0.2177, -0.2258, -...","[-0.01825, 0.05038, -0.1407, -0.1808, -0.2812,...","[-0.1414, -0.1853, -0.09186, 0.005196, -0.2354..."
3,What was the total value of the aggregate tran...,"On September 5, 2023, we acquired Black Knight...",<|begin_of_text|><|start_header_id|>system<|en...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,The total value of the aggregate transaction f...,"[-0.181, -0.097, -0.3398, 0.2184, -0.0729, 0.0...",\n\nThe provided context is a mix of questions...,"[-0.1783, -0.09607, -0.3352, 0.2179, -0.07263,...","[-0.1699, -0.10535, -0.3662, 0.2168, -0.08966,...","[-0.1705, -0.1019, -0.36, 0.2183, -0.08527, 0...."
4,Should I consider investing in companies with ...,This system provides tailored financial advice...,<|begin_of_text|><|start_header_id|>system<|en...,Companies with high market share often enjoy c...,The provided information does not directly ans...,High market share companies tend to be more st...,"[0.1523, -0.1338, -0.1187, -0.1749, -0.01111, ...",\n\nThese are some examples of questions that ...,"[0.1135, -0.3542, 0.02855, -0.1225, -0.1396, 0...","[0.1769, -0.405, -0.1086, -0.0555, 0.0377, 0.1...","[0.03387, -0.585, 0.05948, 0.0505, -0.03882, 0..."


In [232]:
# Load the model and tokenizer, and move the model to the GPU
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to('cuda')

# Define cosine similarity function
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Function to get embeddings on GPU
def get_embeddings(texts, batch_size=32):
    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to('cuda')
            outputs = model(**tokens)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return embeddings

# Function to calculate embeddings for all texts
def calculate_embeddings(df):
    df['untrained_embedding'] = get_embeddings(df['untrained_prediction'].tolist())
    df['trained_embedding'] = get_embeddings(df['trained_prediction'].tolist())
    df['answer_embedding'] = get_embeddings(df['answer'].tolist())
    df['gpt-4o_embeddings'] = get_embeddings(df['gpt-4o'].tolist())
    return df



In [233]:
# Function to calculate precision, recall, F1 score, and accuracy
df = calculate_embeddings(df1)

In [236]:
def calculate_metrics(predictions, answers, threshold=0.8):
    similarity_scores = [
        cosine_similarity(pred_embedding, ans_embedding)
        for pred_embedding, ans_embedding in zip(predictions, answers)
    ]
    
    y_true = np.ones(len(answers))  # Ground truth: all answers are positive
    y_pred = np.array([1 if score >= threshold else 0 for score in similarity_scores])

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    
    return precision, recall, f1, accuracy

# Calculate metrics for untrained predictions
untrained_metrics = calculate_metrics(df['untrained_embedding'], df['answer_embedding'], threshold=0.8)
print(f'Llama 3 Model - Precision: {untrained_metrics[0]:.2%}, Recall: {untrained_metrics[1]:.2%}, F1 Score: {untrained_metrics[2]:.2%}, Accuracy: {untrained_metrics[3]:.2%}')

# Calculate metrics for trained predictions
trained_metrics = calculate_metrics(df['trained_embedding'], df['answer_embedding'], threshold=0.8)
print(f'Finetuned Model - Precision: {trained_metrics[0]:.2%}, Recall: {trained_metrics[1]:.2%}, F1 Score: {trained_metrics[2]:.2%}, Accuracy: {trained_metrics[3]:.2%}')

trained_metrics = calculate_metrics(df['gpt-4o_embeddings'], df['answer_embedding'], threshold=0.8)
print(f'GPT-4 Model - Precision: {trained_metrics[0]:.2%}, Recall: {trained_metrics[1]:.2%}, F1 Score: {trained_metrics[2]:.2%}, Accuracy: {trained_metrics[3]:.2%}')



Llama 3 Model - Precision: 100.00%, Recall: 53.00%, F1 Score: 69.28%, Accuracy: 53.00%
Finetuned Model - Precision: 100.00%, Recall: 71.00%, F1 Score: 83.04%, Accuracy: 71.00%
GPT-4 Model - Precision: 100.00%, Recall: 32.00%, F1 Score: 48.48%, Accuracy: 32.00%


In [226]:
def calculate_text_metrics(predictions, answers):
    # Setting up the Rouge and Bleu configuration
    rouge = Rouge()
    smoothie = SmoothingFunction().method1  # Choose method1 for smoothing as an example

    # Calculating BLEU scores with smoothing
    bleu_scores = [
        sentence_bleu([ans.split()], pred.split(), smoothing_function=smoothie)
        for pred, ans in zip(predictions, answers)
    ]

    # Calculating ROUGE scores, specifically focusing on ROUGE-L
    rouge_scores = [rouge.get_scores(pred, ans, avg=True)['rouge-l'] for pred, ans in zip(predictions, answers)]

    # Average scores for metrics
    avg_bleu = np.mean(bleu_scores)
    avg_rouge_l_f1 = np.mean([score['f'] for score in rouge_scores])  # Focusing on the F1 score of ROUGE-L

    return avg_bleu, avg_rouge_l_f1

In [237]:
bleu_score, rouge_l_f1 = calculate_text_metrics(df1['trained_prediction'], df1['answer'])
bleu_score1, rouge_l_f1_1 = calculate_text_metrics(df1['untrained_prediction'], df1['answer'])
bleu_score11, rouge_l_f1_11 = calculate_text_metrics(df1['gpt-4o'], df1['answer'])
print(f'Finetuned Result - BLEU Score: {bleu_score:.2%}, ROUGE-L F1 Score: {rouge_l_f1:.2%}')
print(f'Llama 3 Result - BLEU Score: {bleu_score1:.2%}, ROUGE-L F1 Score: {rouge_l_f1_1:.2%}')
print(f'GPT-4 Result - BLEU Score: {bleu_score11:.2%}, ROUGE-L F1 Score: {rouge_l_f1_11:.2%}')

Finetuned Result - BLEU Score: 29.90%, ROUGE-L F1 Score: 59.00%
Llama 3 Result - BLEU Score: 17.94%, ROUGE-L F1 Score: 44.57%
GPT-4 Result - BLEU Score: 12.85%, ROUGE-L F1 Score: 33.23%
