In [1]:
# Paths
DATA_PATH = "data/syntheses_10.csv"  # Path to the dataset
MODEL_PATH = "NousResearch/Llama-2-7b-chat-hf"
FINETUNED_MODEL_PATH = "data/model/Fine_Tuned_LLaMA2"  # Path to the fine-tuned model output

# Split parameters for dataset
TEST_SIZE = 0.05  # Proportion of the dataset to include in the test split
RANDOM_STATE = 42  # Random state for reproducibility

# Prompts for training and prediction
TRAINING_SYSTEM_PROMPT = """You are a financial chatbot trained to answer questions based on the information provided.
Your responses should be directly sourced from the content of the evidence_text(context).
When asked a question, ensure that your answer is explicitly supported by the text and do not
include any external information, interpretations, or assumptions not clearly stated in the evidence_text(context).
If a question pertains to financial data or analysis that is not explicitly covered in the evidence_text(context) provided,
respond by stating that the information is not available in the evidence_text(context).
Your primary focus should be on accuracy, specificity, and adherence to the information in the evidence_text(context),
particularly regarding financial statements, company performance, and market positions."""

TRAINING_PROMPT_TEMPLATE = """
<s>[INST]
<<SYS>>
{system_prompt}
<</SYS>>
{question}
{evidence_text}
[/INST]
{answer}
</s>"
"""

PREDICTION_SYSTEM_PROMPT = """Give answer to questions provided below from the evidence text."""
PREDICTION_PROMPT_TEMPLATE = """
<s>[INST]
<<SYS>>
{system_prompt}
<</SYS>>

Here is the question:
{question}

Consider the provided text as evidence:
{evidence_text}
[/INST]
"""



# Training constants
BATCH_SIZE = 4  # Number of samples per batch
GRAD_ACCUM_STEPS = 4  # Gradient accumulation steps
LEARNING_RATE = 2e-5  # Learning rate for optimization
NUM_EPOCHS = 5  # Number of training epochs
EVAL_STEPS = 50  # Evaluation interval in steps
LOGGING_STEPS = 10  # Logging interval in steps
MAX_SEQ_LENGTH = 100  # Maximum sequence length for input data

#Prediction
MAX_NEW_TOKENS = 100

In [2]:
import gc
import warnings
warnings.filterwarnings("ignore")

import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device set to: {DEVICE}")

import os
from dotenv import load_dotenv
from huggingface_hub import login
load_dotenv()
HF_TOKEN = os.getenv("HF")
login(token=HF_TOKEN)
print("Logged in to Hugging Face Hub successfully.")


Device set to: cuda
Logged in to Hugging Face Hub successfully.


<h2><b>Data Preparation</b></h2>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(file_path: str) -> pd.DataFrame:
    # Load data from a CSV file
    data = pd.read_csv(file_path)
    if 'syntheses' in data.columns:
        data.drop("syntheses", axis=1, inplace=True)
    return data

def split_data(data: pd.DataFrame, test_size: float, random_state: int) -> tuple:
    # Split the dataset into train and test sets
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
    return train_data, test_data


In [4]:
def data_preparation_pipeline(data_file_path:str,test_size:float,random_state:int):
    dataframe = load_data(file_path=data_file_path)
    train_dataframe, test_dataframe = split_data(data=dataframe,test_size=test_size,random_state=random_state)
    return train_dataframe,test_dataframe

In [5]:
train_dataframe, test_dataframe = data_preparation_pipeline(data_file_path=DATA_PATH,
                                                          test_size=TEST_SIZE,
                                                          random_state=RANDOM_STATE)

print(f"Train Dataframe Shape = {train_dataframe.shape}.")
print(f"Test Dataframe Shape = {test_dataframe.shape}.")

Train Dataframe Shape = (142, 3).
Test Dataframe Shape = (8, 3).


<h2><b>Prompt Preparation</b></h2>

In [6]:
import pandas as pd

def create_single_prompt(data_series: pd.Series, 
                         prompt_template: str, 
                         system_prompt: str, 
                         is_predict: bool = False) -> str:
    single_prompt = ""  # Initialize an empty string to store the modified single prompt
    
    if is_predict:
        # If is_predict is True, format the prompt without the answer
        single_prompt = prompt_template.format(
            system_prompt=system_prompt,  
            question=data_series["question"],  
            evidence_text=data_series["evidence_text"]  
        )
    else:
        # If is_predict is False, format the prompt with the answer
        single_prompt = prompt_template.format(
            system_prompt=system_prompt,  
            question=data_series["question"],  
            evidence_text=data_series["evidence_text"],  
            answer=data_series["answer"]  
        )
    
    return single_prompt

def create_prompts(dataframe: pd.DataFrame,  
                   prompt_template: str,  
                   system_prompt: str,  
                   is_predict: bool = False) -> list[str]:
    prompts = []  # Initialize an empty list to store the generated prompts
    
    for _, row in dataframe.iterrows():  # Iterate over each row in the DataFrame
        # Generate a single prompt
        single_prompt = create_single_prompt(data_series=row,  
                                             prompt_template=prompt_template,  
                                             system_prompt=system_prompt,  
                                             is_predict=is_predict)  
        prompts.append(single_prompt)  # Append the generated prompt to the list
    
    return prompts


In [7]:
import pandas as pd
def prompt_preparation_pipeline(train_dataframe: pd.DataFrame,
                                train_prompt_template: str,
                                train_system_prompt: str,
                                test_dataframe: pd.DataFrame,
                                test_prompt_template: str,
                                test_system_prompt: str,
                                ):
    train_prompts = create_prompts(dataframe=train_dataframe,
                                   prompt_template=train_prompt_template,
                                   system_prompt=train_system_prompt,
                                   is_predict=False)
    
    test_prompts = create_prompts(dataframe=test_dataframe,
                                   prompt_template=test_prompt_template,
                                   system_prompt=test_system_prompt,
                                   is_predict=True)
    
    return train_prompts, test_prompts

In [8]:
train_prompts, test_prompts = prompt_preparation_pipeline(train_dataframe=train_dataframe,
                                                          train_prompt_template=TRAINING_PROMPT_TEMPLATE,
                                                          train_system_prompt=TRAINING_SYSTEM_PROMPT,
                                                          test_dataframe=test_dataframe,
                                                          test_prompt_template=PREDICTION_PROMPT_TEMPLATE,
                                                          test_system_prompt=PREDICTION_SYSTEM_PROMPT
                                                        )
print(f"Train Prompts = {len(train_prompts)}.")
print(f"Test Prompts = {len(test_prompts)}.")

Train Prompts = 142.
Test Prompts = 8.


<h2><b>Model Preparation</b></h2>

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import Any

def get_bnb_config(load_in_4bit: bool = True,
                   bnb_4bit_use_double_quant: bool = True,
                   bnb_4bit_quant_type: str = "nf4",
                   bnb_4bit_compute_dtype: Any = torch.bfloat16
                   ) -> BitsAndBytesConfig:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=load_in_4bit,# Whether to load model in 4-bit precision
        bnb_4bit_use_double_quant=bnb_4bit_use_double_quant, # Whether to use double quantization
        bnb_4bit_quant_type=bnb_4bit_quant_type,# The quantization type (e.g., "nf4")
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype# The compute dtype (e.g., torch.bfloat16, torch.float16)
    )
    return bnb_config

def get_model(model_path: str,
              bnb_config: BitsAndBytesConfig,
              device:str):
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 quantization_config=bnb_config,
                                                 device_map = "auto")
    return model


def get_tokenizer(model_path: str, device: str) -> tuple:
    
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer

In [10]:
def model_preparation_pipeline(model_path:str,
                               device:str
                                ):
    
    bnb_config = get_bnb_config()
    model = get_model(model_path=model_path,
                      bnb_config=bnb_config,
                      device=device)
    tokenizer = get_tokenizer(model_path=model_path,
                              device=device)
    
    return model, tokenizer
    

In [11]:
training_model, training_tokenizer = model_preparation_pipeline(model_path=MODEL_PATH,
                                                                device=DEVICE)
print(f"Got Training Model.")
print(f"Got Training Tokenizer.")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Got Training Model.
Got Training Tokenizer.


<h2><b>Fine Tune</b></h2>

In [12]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments
from trl import SFTTrainer
import pandas as pd
from datasets import Dataset

def get_lora_config(r: int = 16,
                    lora_alpha: int = 64,
                    target_modules: list[str] = ["q_proj", "k_proj", "v_proj", "o_proj"],
                    lora_dropout: float = 0.1,
                    bias: str = "none",
                    task_type: str = "CAUSAL_LM"
                    ) -> LoraConfig:
    lora_config = LoraConfig(
        r=r,# The rank of the low-rank decomposition
        lora_alpha=lora_alpha,# Scaling factor for the low-rank matrix
        target_modules=target_modules,# Target modules (e.g., LLaMA-specific layers)
        lora_dropout=lora_dropout,# Dropout rate for the low-rank layers
        bias=bias,# Bias term ("none", "all", or "lora_only")
        task_type=task_type # Task type (e.g., "CAUSAL_LM")
    )
    return lora_config

def apply_lora(model,lora_config):
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    return model


def get_training_args(
    output_dir: str,
    per_device_train_batch_size: int,
    gradient_accumulation_steps: int,
    logging_steps: int,
    learning_rate: float,
    num_train_epochs: int,
    eval_steps: int,
    seed: int = 42,
    optim: str = "paged_adamw_32bit",
    fp16: bool = True,
    weight_decay: float = 0.01,
    max_grad_norm: float = 0.3,
    evaluation_strategy: str = "steps",
    warmup_ratio: float = 0.05,
    save_strategy: str = "epoch",
    group_by_length: bool = True,
    lr_scheduler_type: str = "cosine",
    push_to_hub: bool = True,
) -> TrainingArguments:

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        fp16=fp16,
        weight_decay=weight_decay,
        max_grad_norm=max_grad_norm,
        num_train_epochs=num_train_epochs,
        evaluation_strategy=evaluation_strategy,
        eval_steps=eval_steps,
        warmup_ratio=warmup_ratio,
        save_strategy=save_strategy,
        group_by_length=group_by_length,
        lr_scheduler_type=lr_scheduler_type,
        seed=seed,
        push_to_hub=push_to_hub,
    )
    return training_args


def train_model(model,
                tokenizer,
                lora_config,
                training_args,
                train_prompts,
                val_prompts,
                max_seq_length: int = 100):
    
    train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_prompts}))
    val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_prompts}))
    # Initialize the trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=lora_config,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_args,
    )
    
    # Train the model
    trainer.train()

In [13]:
def finetune_pipeline(model,
                      tokenizer,
                      train_prompts,
                      val_prompts,
                      finetuned_model_dir: str,
                      batch_size: int,
                      grad_accum_steps: int,
                      logging_steps: int,
                      learning_rate: float,
                      num_epochs: int,
                      eval_steps: int,

                      max_seq_length: int = 100,
                      optim: str = "paged_adamw_32bit",
                      fp16: bool = True,
                      weight_decay: float = 0.01,
                      max_grad_norm: float = 0.3,
                      evaluation_strategy: str = "steps",
                      warmup_ratio: float = 0.05,
                      save_strategy: str = "epoch",
                      group_by_length: bool = True,
                      lr_scheduler_type: str = "cosine",
                      push_to_hub: bool = True,):
    

    lora_config = get_lora_config()
    lora_applied_model = apply_lora(model=model, lora_config=lora_config)

    trainable_params = sum(p.numel() for p in lora_applied_model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in lora_applied_model.parameters())
    print(f"Trainable params: {trainable_params} || Total params: {total_params} || Trainable%: {100 * trainable_params / total_params:.2f}%")

    training_args = get_training_args(output_dir=finetuned_model_dir,
                                      per_device_train_batch_size=batch_size,
                                      gradient_accumulation_steps=grad_accum_steps,
                                      logging_steps=logging_steps,
                                      learning_rate=learning_rate,
                                      num_train_epochs=num_epochs,
                                      eval_steps=eval_steps,
                                      optim=optim,
                                      fp16=fp16,
                                      weight_decay=weight_decay,
                                      max_grad_norm=max_grad_norm,
                                      evaluation_strategy=evaluation_strategy,
                                      warmup_ratio=warmup_ratio,
                                      save_strategy=save_strategy,
                                      group_by_length=group_by_length,
                                      lr_scheduler_type=lr_scheduler_type,
                                      push_to_hub=push_to_hub)
    train_model(model=lora_applied_model,
                tokenizer=tokenizer,
                lora_config=lora_config,
                training_args=training_args,
                train_prompts=train_prompts,
                val_prompts=val_prompts,
                max_seq_length=max_seq_length)


In [14]:
finetune_pipeline(model=training_model,
                  tokenizer=training_tokenizer,
                  train_prompts=train_prompts,
                  val_prompts=test_prompts,
                  finetuned_model_dir= FINETUNED_MODEL_PATH,
                  batch_size= BATCH_SIZE,
                  grad_accum_steps=GRAD_ACCUM_STEPS,
                  logging_steps= LOGGING_STEPS,
                  learning_rate= LEARNING_RATE,
                  num_epochs= NUM_EPOCHS,
                  eval_steps= EVAL_STEPS)

torch.cuda.empty_cache()
gc.collect()

Trainable params: 16777216 || Total params: 3517190144 || Trainable%: 0.48%


Map:   0%|          | 0/142 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

  0%|          | 0/45 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


{'loss': 3.4521, 'grad_norm': 3.1178479194641113, 'learning_rate': 1.866025403784439e-05, 'epoch': 1.11}
{'loss': 2.7338, 'grad_norm': 2.581815242767334, 'learning_rate': 1.2947551744109044e-05, 'epoch': 2.22}
{'loss': 2.1983, 'grad_norm': 3.1041548252105713, 'learning_rate': 5.66116260882442e-06, 'epoch': 3.33}
{'loss': 1.8761, 'grad_norm': 3.2031314373016357, 'learning_rate': 6.912625135579587e-07, 'epoch': 4.44}
{'train_runtime': 359.7158, 'train_samples_per_second': 1.974, 'train_steps_per_second': 0.125, 'train_loss': 2.4802604887220596, 'epoch': 5.0}


1858

<h2><b>Generate</b><h2>

In [15]:

def extract_answer(generated_text):
    # Extract the Answer Portion from the whole generated text
    answer_start = generated_text.find("[/INST]") + len("[/INST]")  # Find the end of </INST> tag
    answer = generated_text[answer_start:].strip()  # Extract everything after that position
    return answer    


def generate(prompt,model,tokenizer,max_new_tokens: int = 100):
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate response
    output = model.generate(input_ids=inputs["input_ids"], max_new_tokens=max_new_tokens)

    # Decode the response
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

In [16]:
def generation_pipeline(prompts,
                               model,
                               tokenizer,
                               max_new_tokens):
        generated_answers = []
        for idx, prompt in enumerate(prompts):
                generated_text = generate(prompt=prompt,
                                          model=model,
                                          tokenizer=tokenizer,
                                          max_new_tokens=max_new_tokens)
                generated_answer = extract_answer(generated_text=generated_text)
                generated_answers.append(generated_answer)
        return generated_answers


In [17]:

finetuned_model, finetuned_tokenizer = model_preparation_pipeline(model_path=FINETUNED_MODEL_PATH,
                                                                device=DEVICE)
print(f"Got Finetuned Model.")
print(f"Got Finetuned Tokenizer.")
generated_answers = generation_pipeline(prompts=test_prompts,
                                        model=finetuned_model,
                                        tokenizer=finetuned_tokenizer,
                                        max_new_tokens=MAX_NEW_TOKENS)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Got Finetuned Model.
Got Finetuned Tokenizer.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<h2><b>Evaluate</b></h2>

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(real_answer, generated_answer):
    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()

    # Transform the answers into TF-IDF vectors
    vectors = vectorizer.fit_transform([real_answer, generated_answer])

    # Compute cosine similarity
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]

    return similarity


In [19]:
import pandas as pd
def evaluation_pipeline(test_dataframe, generated_answers):
    # Initialize a list to store the results
    results = []

    for idx, generated_answer in enumerate(generated_answers):
        question = test_dataframe.iloc[idx, 0]
        original_answer = test_dataframe.iloc[idx, 1]  # Assuming the original answer is in the second column
        cos_similarity = calculate_cosine_similarity(original_answer, generated_answer)

        # Append the result as a dictionary
        results.append({
            "Question": question,
            "Original Answer": original_answer,
            "Generated Answer": generated_answer,
            "Cosine Similarity": cos_similarity
        })

    # Convert the results list to a DataFrame
    result_df = pd.DataFrame(results)
    return result_df


In [20]:
evaluation_result_dataframe = evaluation_pipeline(test_dataframe=test_dataframe,
                                                  generated_answers=generated_answers)
evaluation_result_dataframe

Unnamed: 0,Question,Original Answer,Generated Answer,Cosine Similarity
0,Does Corning have positive working capital bas...,Yes. Corning had a positive working capital am...,Based on the information provided in the conso...,0.320811
1,What is Amazon's FY2017 days payable outstandi...,93.86,"To answer the question, we need to calculate A...",0.0
2,Does Paypal have positive working capital base...,Yes. Paypal has a positive working capital of ...,Based on the information provided in the conso...,0.354419
3,Has CVS Health paid dividends to common shareh...,"Yes, CVS paid a $ 0.55 dividend per share ever...","Based on the evidence provided in the text, th...",0.306135
4,Is CVS Health a capital-intensive business bas...,"Yes, CVS Health requires an extensive asset ba...","Based on the evidence provided in the text, th...",0.320862
5,Does AMD have a reasonably healthy liquidity p...,"Yes. The quick ratio is 1.57, calculated as (c...",Based on the quick ratio calculated from the p...,0.383878
6,Is Boeing's business subject to cyclicality?,"Yes, Boeing's business is subject to cyclicali...","According to the provided text, the answer to ...",0.557308
7,Did Ulta Beauty's wages expense as a percent o...,Wages expense as a percent of net sales increa...,"Based on the evidence provided in the text, Ul...",0.399587
