<h2><b>Login Hugging Face</b></h2>

In [1]:
import gc
import warnings
warnings.filterwarnings("ignore")

import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device set to: {DEVICE}")

import os
from dotenv import load_dotenv
from huggingface_hub import login
load_dotenv()
HF_TOKEN = os.getenv("HF")
login(token=HF_TOKEN)
print("Logged in to Hugging Face Hub successfully.")


Device set to: cuda
Logged in to Hugging Face Hub successfully.


<h3><b>Settings</b></h3>

In [2]:
FINETUNED_MODEL_PATH="data/model/Fine_Tuned_LLaMA2"
DATA_FILE_PATH = "data/syntheses_10.csv"

<h2><b>Store Finatial Data (Evidence) To Vector Database</b></h2>

<h4><b>Prepare Functions</b></h4>

In [3]:
import pandas as pd
import chromadb
from uuid import uuid4

def get_data_to_store_in_chromaDB(file_path: str) -> pd.DataFrame:
    # Load data from a CSV file
    df = pd.read_csv(file_path)
    df = df[['evidence_text']]
    return df

def store_to_chromaDB(df:pd.DataFrame, 
                      collection_name: str, 
                      database_path:str):
    # Initialize the client
    client = chromadb.PersistentClient(database_path)
    
    # Get or create the collection
    collection = client.get_or_create_collection(name=collection_name)

    # If the collection is empty, add the documents
    if not collection.count():
        for _, row in df.iterrows():
            # Add document to collection with a unique ID
            collection.add(
                documents=row['evidence_text'],  # Ensure the column name is correct
                ids=[str(uuid4())]  # Generate a unique ID using uuid4
            )
    print(f"Financial Data added to collection '{collection_name}'.")


<h4><b>Prepare Pipeline</b></h4>

In [4]:
def data_store_pipeline(source_data_path:str='data/syntheses_10.csv',
                        collection_name: str="FinancialData", 
                        database_path:str="data/vetorstore"):
    
    data_to_store_in_chromaDB = get_data_to_store_in_chromaDB(file_path=source_data_path)
    store_to_chromaDB(df=data_to_store_in_chromaDB,
                      collection_name=collection_name,
                      database_path=database_path)
    


<h5><b>Execute Pipeline</b></h5>

In [5]:
data_store_pipeline(source_data_path='data/syntheses_10.csv',
                                 collection_name="FinancialData",
                                 database_path="data/DB/vetorstore")

Financial Data added to collection 'FinancialData'.


<h2><b>Load the Finetuned Model</b></h2>

<h4><b>Prepare Functions</b></h4>

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import Any

def get_bnb_config(load_in_4bit: bool = True,
                   bnb_4bit_use_double_quant: bool = True,
                   bnb_4bit_quant_type: str = "nf4",
                   bnb_4bit_compute_dtype: Any = torch.bfloat16
                   ) -> BitsAndBytesConfig:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=load_in_4bit,# Whether to load model in 4-bit precision
        bnb_4bit_use_double_quant=bnb_4bit_use_double_quant, # Whether to use double quantization
        bnb_4bit_quant_type=bnb_4bit_quant_type,# The quantization type (e.g., "nf4")
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype# The compute dtype (e.g., torch.bfloat16, torch.float16)
    )
    return bnb_config

def get_model(model_path: str,
              bnb_config: BitsAndBytesConfig,
              device:str):
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 quantization_config=bnb_config,
                                                 device_map = "cuda")
    return model


def get_tokenizer(model_path: str, device: str) -> tuple:
    
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer

<h4><b>Prepare Pipeline</b></h4>

In [7]:
def model_preparation_pipeline(model_path:str,
                               device:str
                                ):
    
    bnb_config = get_bnb_config()
    model = get_model(model_path=model_path,
                      bnb_config=bnb_config,
                      device=device)
    tokenizer = get_tokenizer(model_path=model_path,
                              device=device)
    
    return model, tokenizer

<h5><b>Execute Pipeline</b></h5>

In [8]:
finetuned_model, finetuned_tokenizer = model_preparation_pipeline(model_path=FINETUNED_MODEL_PATH,
                                                                device=DEVICE)
print(f"Got Finetuned Model.")
print(f"Got Finetuned Tokenizer.")

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]


Got Finetuned Model.
Got Finetuned Tokenizer.


<h2><b>Generate Answer From LLM</b></h2>

<h4><b>Prepare Functions</b></h4>

In [9]:
def load_data(file_path: str) -> pd.DataFrame:
    # Load data from a CSV file
    data = pd.read_csv(file_path)
    if 'syntheses' in data.columns:
        data.drop("syntheses", axis=1, inplace=True)
    return data

def create_single_prompt(data_series: pd.Series) -> str:
        
        system_prompt = """Give answer to questions provided below from the evidence text."""
        prompt_template = """
        <s>[INST]
        <<SYS>>
        {system_prompt}
        <</SYS>>

        Here is the question:
        {question}

        Consider the provided text as evidence:
        {evidence_text}
        [/INST]
        """
        single_prompt = prompt_template.format(
            system_prompt=system_prompt,  
            question=data_series["question"],  
            evidence_text=data_series["evidence_text"]  
        )
    
        return single_prompt

def generate(prompt, model, tokenizer, max_new_tokens: int = 100, device="cuda"):
    # Move model to the correct device
    model = model.to(device)
    
    # Tokenize the input and move to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate response
    output = model.generate(input_ids=inputs["input_ids"], max_new_tokens=max_new_tokens)

    # Decode the response
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

def extract_answer(generated_text):
    # Extract the Answer Portion from the whole generated text
    answer_start = generated_text.find("[/INST]") + len("[/INST]")  # Find the end of </INST> tag
    answer = generated_text[answer_start:].strip()  # Extract everything after that position
    return answer    




<h4><b>Prepare Pipeline</b></b>

In [10]:
def generate_answer_pipeline(singe_data_series,
                             model,
                             tokenizer):
    single_prompt = create_single_prompt(singe_data_series)
    generated_text = generate(single_prompt,model,tokenizer)
    answer = extract_answer(generated_text)
    return answer

<h5><b>Execute Pipeline</b></h5>

In [11]:
dataframe = load_data(file_path=DATA_FILE_PATH)
single_dataseries = dataframe.iloc[0]
generated_answer = generate_answer_pipeline(single_dataseries,
                                            finetuned_model,
                                            finetuned_tokenizer)

<h2><b>Query with the Generated Answer in ChromaDB</b></h2>

<h4><b>Prepare Function</b></h4>

In [12]:
import chromadb

def query_collection(generated_answer, 
                     collection_name="FinancialData", 
                     database_path="data/DB/vetorstore", 
                     n_results=5):
    # Initialize the client
    client = chromadb.PersistentClient(database_path)
    
    # Get or create the collection
    collection = client.get_or_create_collection(name=collection_name)
    
    # Query the collection with the provided answer
    query_results = collection.query(
        query_texts=[generated_answer],  # Text to query
        n_results=n_results  # Number of results to retrieve
    )
    
    # Display the query results
    documents = query_results.get('documents')
    return documents

<h5><b>Execute</b></h5>

In [13]:
query_results = query_collection(generated_answer)
for query_result in query_results:
    print("#################### Result ####################")
    print(f"\n{query_result}\n")
    print("*************************************************")

#################### Result ####################

['3M Company and Subsidiaries\n Consolidated Statement of Income\n Years ended December 31\n (Millions, except per share amounts) 2022 2021 2020\n Net sales $ 34,229 $ 35,355 $ 32,184___FINANCEBENCH_DELIMITER___3M Company and Subsidiaries\n Consolidated Balance Sheet\n At December 31\n (Dollars in millions, except per share amount) 2022 2021 Property, plant and equipment — net 9,178 9,429 Total assets $ 46,455 $ 47,072___FINANCEBENCH_DELIMITER___3M Company and Subsidiaries\n Consolidated Statement of Cash Flows\n Years ended December 31\n (Millions) 2022 2021 2020 Cash Flows from Investing Activities\n Purchases of property, plant and equipment (PP&E) (1,749) (1,603) (1,501)', 'This marked the 65th consecutive year of dividend increases for 3M.', '3M Company and Subsidiaries\nConsolidated Balance Sheet\n(Unaudited)\n(Dollars in millions, except per share amount) June 30, 2023 December 31, 2022\nAssets\nCurrent assets\nCash and cash equi