In [None]:
## List of experiments to run
## 1] Default scenario
## 2] flashattention
## 3] expandable_segments: Set expandable segments -- check if it works in notebook
## 4] flash_expandable: flashattention + expandable segments
## 5] torch.compile
## 6] sdpa
## 7] sdpa_torch.compile: sdpa + torch.compile
## 8] 8_bit: 8-bit quantization

In [None]:
# Questions put to the model in every experiment, one generation per prompt.
prompts_for_experiment = [
    "What is the firm period for Ognibene?",
    "What is the firm period for Hengli?",
    "What are the material planning requirements for Ognibene?",
    "What are the material planning requirements for Hengli?",
    "What are the warranty requirements for Hengli?",
    "Can you create a table showing the warranty requirements for Ognibene and Hengli?",
]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig  
import torch
from transformers.agents import ReactCodeAgent
import pandas as pd
import HF_TOKEN
import time
import os
import gc
from torch.nn.attention import SDPBackend, sdpa_kernel

In [None]:
# Path of the consolidated contracts markdown file that is embedded in the system prompt.
contract_file_full = (
    "/home/vp899/projects/Agent_System/Input/Contracts/"
    "Full_Contracts_Consol.md"
)

In [None]:
# Read the whole contracts file into a single string; the handle is closed on exit.
with open(contract_file_full, "r") as contract_handle:
    input_text_contract_full = contract_handle.read()

In [None]:
# System prompt: assistant instructions followed by the full contract text wrapped in
# <Input> ... </Input>. The worked Wipro example shows an opening tag followed by a
# closing tag, matching the tag format described in the sentence before it (the
# original example showed the closing tag twice).
system_prompt_contract_asst = (
    "You are a helpful digital assistant. "
    "You will provide clear and concise answers on the input text you have been provided. "
    "You must answer in complete sentences. "
    "The input text is enclosed within <Input> and </Input>. "
    "The input text contains information on contracts with suppliers. "
    "Each individual supplier contract is enclosed within tags "
    "<Contract Between {Vendor Name} and CNH Industrial Italia SpA> and "
    "</Contract Between {Vendor Name} and CNH Industrial Italia SpA>. "
    "For example the contract information with Wipro Enterprises (P) Limited "
    "would be enclosed between the tags "
    "<Contract Between Wipro Enterprises (P) Limited and CNH Industrial Italia SpA> and "
    "</Contract Between Wipro Enterprises (P) Limited and CNH Industrial Italia SpA>. "
    "At the beginning of the contract text, there are also tags specifying the supplier name. "
    "\n <Input> \n"
    + input_text_contract_full
    + "\n </Input>"
)

In [None]:
# Base conversation: a single system message carrying the instructions and contract text.
prompt_for_chat = [
    dict(role="system", content=system_prompt_contract_asst),
]

In [None]:
# Hugging Face Hub repository id of the model that is tokenized and loaded in later cells.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Access token taken from the local HF_TOKEN module; passed as `token=` to from_pretrained.
llama31_hf_token = HF_TOKEN.HF_TOKEN

In [None]:
# Tokenizer for the model under test.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=llama31_hf_token)

# Token ids passed as `eos_token_id` to generate(): the tokenizer's EOS token and "<|eot_id|>".
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

In [None]:
# Empty results table; the benchmark loop appends one row per (experiment, prompt) run.
# Columns: experiment name, allocator setting, prompt, context length, peak allocated and
# peak reserved memory per GPU before and after generation, latency, answer, and status.
result_columns = [
    "Experiment_Name",
    "PYTORCH_CUDA_ALLOC_CONF",
    "Prompt",
    "Context_Length",
    "pre_gen_max_mem_allocated_gpu0",
    "pre_gen_max_mem_allocated_gpu1",
    "pre_gen_reserved_mem_gpu0",
    "pre_gen_reserved_mem_gpu1",
    "post_gen_max_mem_allocated_gpu0",
    "post_gen_max_mem_allocated_gpu1",
    "post_gen_reserved_mem_gpu0",
    "post_gen_reserved_mem_gpu1",
    "latency",
    "llm_answer",
    "status",
]
experiment_results_df = pd.DataFrame(columns=result_columns)

In [None]:
# Experiments to run. Each name must match a branch of the model-loading code in the
# benchmark loop cell ("default", "flashattention", "8_bit", "8_bit_flash"); the previous
# spellings "8 bit" and "flashattention+8" matched no branch, so no model was loaded.
experiment_list = ["default", "flashattention", "8_bit", "8_bit_flash"]

# Allocator setting recorded with every result row. It is read from the environment of the
# kernel process so the recorded value cannot drift from what was actually set in the
# terminal; when the variable is absent the original "not set" label is used.
PYTORCH_CUDA_ALLOC_CONF_value = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:not set")

In [None]:
# Benchmark loop.
#
# For every name in `experiment_list` the model is loaded with that experiment's settings,
# each prompt in `prompts_for_experiment` is generated once, and one row per prompt is
# appended to `experiment_results_df` (peak GPU memory before/after generation, latency,
# answer, status). If a generation fails, a "Failed" row is recorded and the remaining
# prompts of that experiment are skipped. The model is always released before the next
# experiment starts.

# Free-text note describing how this run was launched (not used by the code below).
special_notes = "Set pytorch_cuda_alloc_conf to expandable_segments:True in terminal before running the experiment && multiple prompts"


def _experiment_load_kwargs(experiment_name):
    """Return the extra `from_pretrained` keyword arguments for an experiment.

    Returns None when `experiment_name` is not one of the known experiments, so the
    caller can skip it instead of running with a stale or undefined model.
    """
    if experiment_name == "default":
        return {}
    if experiment_name == "flashattention":
        return {"attn_implementation": "flash_attention_2"}
    if experiment_name == "8_bit":
        return {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}
    if experiment_name == "8_bit_flash":
        return {
            "attn_implementation": "flash_attention_2",
            "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
        }
    return None


def _peak_memory_gib(gpu_index):
    """Return (peak allocated, peak reserved) for one GPU as 2-decimal GiB strings."""
    peak_allocated = torch.cuda.max_memory_allocated(gpu_index)
    peak_reserved = torch.cuda.memory_stats(gpu_index)["reserved_bytes.all.peak"]
    return "{:.2f}".format(peak_allocated / 1024**3), "{:.2f}".format(peak_reserved / 1024**3)


for experiment_name in experiment_list:
    print("Running experiment: ", experiment_name)
    load_kwargs = _experiment_load_kwargs(experiment_name)
    if load_kwargs is None:
        # Previously an unknown name fell through every branch and the loop went on to
        # use a deleted/undefined `model`, raising NameError.
        print("     Skipping unknown experiment name: ", experiment_name)
        continue

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=llama31_hf_token,
        **load_kwargs,
    )
    print("     Model loaded for experiment: ", experiment_name)

    try:
        for prompt in prompts_for_experiment:
            print("             Running experiment: ", experiment_name, " with prompt: ", prompt)

            # Peak statistics accumulated since the last reset, sampled before generation.
            pre_gen_max_mem_allocated_gpu0, pre_gen_reserved_mem_gpu0 = _peak_memory_gib(0)
            pre_gen_max_mem_allocated_gpu1, pre_gen_reserved_mem_gpu1 = _peak_memory_gib(1)

            # Build a fresh conversation for each prompt. Appending to `prompt_for_chat`
            # made every later prompt (and every later experiment) carry all earlier
            # questions, so context length grew and runs were not comparable.
            messages = prompt_for_chat + [{"role": "user", "content": prompt}]
            input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

            start_time = time.time()
            try:
                outputs = model.generate(input_ids, max_new_tokens=256, eos_token_id=terminators, do_sample=True, temperature=0.6, top_p=0.9,)
                # Keep only the newly generated tokens (everything after the prompt).
                response = outputs[0][input_ids.shape[-1]:]
                llm_answer = tokenizer.decode(response, skip_special_tokens=True)
                latency = time.time() - start_time
            except Exception as error:
                # Record the failure and stop this experiment; KeyboardInterrupt is no
                # longer swallowed, and cleanup happens once in the `finally` below.
                print("             Generation failed: ", repr(error))
                status = "Failed"
                experiment_results_df.loc[len(experiment_results_df)] = [experiment_name, PYTORCH_CUDA_ALLOC_CONF_value, prompt, input_ids.shape[-1], pre_gen_max_mem_allocated_gpu0, pre_gen_max_mem_allocated_gpu1, pre_gen_reserved_mem_gpu0, pre_gen_reserved_mem_gpu1, 0, 0, 0, 0, 0, "", status]
                break

            post_gen_max_mem_allocated_gpu0, post_gen_reserved_mem_gpu0 = _peak_memory_gib(0)
            post_gen_max_mem_allocated_gpu1, post_gen_reserved_mem_gpu1 = _peak_memory_gib(1)
            status = "Success"
            experiment_results_df.loc[len(experiment_results_df)] = [experiment_name, PYTORCH_CUDA_ALLOC_CONF_value, prompt, input_ids.shape[-1], pre_gen_max_mem_allocated_gpu0, pre_gen_max_mem_allocated_gpu1, pre_gen_reserved_mem_gpu0, pre_gen_reserved_mem_gpu1, post_gen_max_mem_allocated_gpu0, post_gen_max_mem_allocated_gpu1, post_gen_reserved_mem_gpu0, post_gen_reserved_mem_gpu1, latency, llm_answer, status]
    finally:
        # Single cleanup path: the original deleted `model` in the except branch and
        # again after the loop, which raised NameError after any failed generation.
        del model
        gc.collect()
        torch.cuda.empty_cache()
        # reset_peak_memory_stats() without a device only resets the current device,
        # so both measured GPUs are reset explicitly.
        for gpu_index in (0, 1):
            torch.cuda.reset_peak_memory_stats(gpu_index)
        print("     Model deleted for experiment: ", experiment_name)


In [None]:
# Persist the collected results as CSV, without the row index column.
results_csv_path = "/home/vp899/projects/Long_Context_Chat/Long_Context_Chat/Output/experiment_results_batch_22Nov2024.csv"
experiment_results_df.to_csv(results_csv_path, index=False)

In [None]:
# Display the results table (one row per experiment/prompt run).
experiment_results_df

In [None]:
# (rows, columns) of the results table; rows = number of recorded runs.
experiment_results_df.shape

In [None]:
# Show the rows at positions 5 through 9 (end of the slice is exclusive).
experiment_results_df.iloc[5:10]