In [1]:
## List of experiments to run
## 1] Default scenario
## 2] flashattention
## 3] expandable_segments: Set expandable segments -- check if it works in notebook
## 4] flash_expandable: flashattention + expandable segments
## 5] torch.compile
## 6] sdpa
## 7] sdpa_torch.compile: sdpa + torch.compile
## 8] 8_bit: 8-bit quantization

In [20]:
prompts_for_experiment = ["What is the firm period for Ognibene?", "What is the firm period for Hengli?", "What are the material planning requirements for Ognibene?", "What are the material planning requirements for Hengli?", "What are the warranty requirements for Hengli?", "Can you create a table showing the warranty requirements for Ognibene and Hengli?"]

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig  
import torch
from transformers.agents import ReactCodeAgent
import pandas as pd
import HF_TOKEN
import time
import os
import gc
from torch.nn.attention import SDPBackend, sdpa_kernel

In [4]:
contract_file_full = "/home/vp899/projects/Agent_System/Input/Contracts/Full_Contracts_Consol.md"

In [5]:
with open(contract_file_full, 'r') as file_contract_full:
    input_text_contract_full = file_contract_full.read()

In [6]:
system_prompt_contract_asst = "You are a helpful digital assistant. You will provide clear and concise answers on the input text you have been provided. You must answer in complete sentences. The input text is enclosed within <Input> and </Input>. The input text contains information on contracts with suppliers. Each individual supplier contract is enclosed within tags <Contract Between {Vendor Name} and CNH Industrial Italia SpA> and </Contract Between {Vendor Name} and CNH Industrial Italia SpA>. For example the contract information with Wipro Enterprises (P) Limited would be enclosed between the tags </Contract Between Wipro Enterprises (P) Limited and CNH Industrial Italia SpA> and </Contract Between Wipro Enterprises (P) Limited and CNH Industrial Italia SpA>. At the beginning of the contract text, there are also tags specifying the supplier name. \n <Input> \n" +  input_text_contract_full + "\n </Input>"

In [7]:
prompt_for_chat = [{"role": "system", "content": system_prompt_contract_asst}]

In [8]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
llama31_hf_token = HF_TOKEN.HF_TOKEN

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token = llama31_hf_token)
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

In [10]:
# create a dataframe with columns: Experiment_Name, Prompt, Context_Length, pre_gen_max_mem_allocated_gpu0, pre_gen_max_mem_allocated_gpu1, pre_gen_reserved_mem_gpu0, pre_gen_reserved_mem_gpu1, post_gen_max_mem_allocated_gpu0, post_gen_max_mem_allocated_gpu1, post_gen_reserved_mem_gpu0, post_gen_reserved_mem_gpu1, latency, llm_answer
experiment_results_df = pd.DataFrame(columns = ["Experiment_Name", "Prompt", "Context_Length", "pre_gen_max_mem_allocated_gpu0", "pre_gen_max_mem_allocated_gpu1", "pre_gen_reserved_mem_gpu0", "pre_gen_reserved_mem_gpu1", "post_gen_max_mem_allocated_gpu0", "post_gen_max_mem_allocated_gpu1", "post_gen_reserved_mem_gpu0", "post_gen_reserved_mem_gpu1", "latency", "llm_answer", "special_notes"])

In [11]:
# experiment_list = ["flashattention_expandable", "torch.compile", "sdpa", "torch.compile_dynamic_mode", "sdpa_torch.compile", "8_bit", "8_bit_flash"]
experiment_list = ["flashattention_expandable", "torch.compile", "sdpa", "sdpa_torch.compile", "8_bit", "8_bit_flash"]
experiment_list = ["default", "expandable_segments", "flashattention_expandable", "8_bit", "8_bit_flash"]

In [12]:
# experiment_results_df["special_notes"] = "None"
# set the experiment name in experiment_results_df to values in the list in the same order ["default", "expandable_segments", "flashattention_expandable", "8_bit", "8_bit_flash"]
# experiment_results_df["Experiment_Name"] = experiment_list


In [13]:
len(experiment_results_df)

0

In [21]:
prompts_for_experiment

['What is the firm period for Ognibene?',
 'What is the firm period for Hengli?',
 'What are the material planning requirements for Ognibene?',
 'What are the material planning requirements for Hengli?',
 'What are the warranty requirements for Hengli?',
 'Can you create a table showing the warranty requirements for Ognibene and Hengli?']

In [None]:
# prompts_for_experiment = ["What is the firm period for Ognibene?"]
# special_notes = "Set pytorch_cuda_alloc_conf to expandable_segments:True in terminal before running the experiment"
# special_notes = "NOT Set pytorch_cuda_alloc_conf to expandable_segments:True in terminal before running the experiment && multiple prompts"
# special_notes = "Set pytorch_cuda_alloc_conf to expandable_segments:True in terminal before running the experiment && single prompt"
special_notes = "Set pytorch_cuda_alloc_conf to expandable_segments:True in terminal before running the experiment && multiple prompts"
for experiment_name in experiment_list:
    print("Running experiment: ", experiment_name)
    if experiment_name != "default":
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
        print(os.environ["PYTORCH_CUDA_ALLOC_CONF"])
    if experiment_name == "flashattention":
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token, attn_implementation="flash_attention_2",)
    elif experiment_name == "flashattention_expandable":
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token, attn_implementation="flash_attention_2",)
    elif experiment_name == "expandable_segments":
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token,)
    elif experiment_name == "torch.compile":
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token,)
        model.generation_config.cache_implementation = "static"
        model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
    elif experiment_name == "default":
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token,)
    elif experiment_name == "sdpa_torch.compile":
        try:
            model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token,)
            model.generation_config.cache_implementation = "static"
            model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
        except:
            print("Error in running sdpa_torch.compile")            
    elif experiment_name == "torch.compile_dynamic_mode":
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token,)
        model.generation_config.cache_implementation = "static"
        model.forward = torch.compile(model.forward, mode="reduce-overhead", dynamic=True)
    elif experiment_name == "8_bit":
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token,  quantization_config=quantization_config)
    elif experiment_name == "8_bit_flash":
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", token = llama31_hf_token, attn_implementation="flash_attention_2", quantization_config=quantization_config)
    print("     Model loaded for experiment: ", experiment_name)
    for prompt in prompts_for_experiment:
        # print experiment_name and prompt with prefix "Running experiment: " in a single line
        print("             Running experiment: ", experiment_name, " with prompt: ", prompt)

        pre_gen_max_mem_allocated_gpu0 = "{:.2f}".format(torch.cuda.max_memory_allocated(0)/1024**3) 
        pre_gen_max_mem_allocated_gpu1 = "{:.2f}".format(torch.cuda.max_memory_allocated(1)/1024**3)
        mem_stats_gpu0 = torch.cuda.memory_stats(0)
        mem_stats_gpu1 = torch.cuda.memory_stats(1)
        pre_gen_reserved_mem_gpu0 = "{:.2f}".format(mem_stats_gpu0["reserved_bytes.all.peak"]/1024**3)
        # pre_gen_active_mem_gpu0 = "{:.2f}".format(mem_stats_gpu0["active_bytes.all.peak"]/1024**3)
        pre_gen_reserved_mem_gpu1 = "{:.2f}".format(mem_stats_gpu1["reserved_bytes.all.peak"]/1024**3)
        prompt_for_chat.append({"role": "user", "content": prompt})    
        input_ids = tokenizer.apply_chat_template(prompt_for_chat, add_generation_prompt=True, return_tensors="pt").to(model.device)
        start_time = time.time() 
        # if experiment name is sdpa or sdpa_torch.compile, use the sdpa_kernel context manager
        if experiment_name == "sdpa" or experiment_name == "sdpa_torch.compile":
            with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
                outputs = model.generate(input_ids, max_new_tokens=256, eos_token_id=terminators, do_sample=True, temperature=0.6, top_p=0.9,)
        else:
            outputs = model.generate(input_ids, max_new_tokens=256, eos_token_id=terminators, do_sample=True, temperature=0.6, top_p=0.9,)
        end_time = time.time()
        latency = end_time - start_time
        response =  outputs[0][input_ids.shape[-1]:]
        llm_answer = tokenizer.decode(response, skip_special_tokens=True)
        prompt_for_chat.append({"role": "assistant", "content": llm_answer})
        post_gen_max_mem_allocated_gpu0 = "{:.2f}".format(torch.cuda.max_memory_allocated(0)/1024**3)
        post_gen_max_mem_allocated_gpu1 = "{:.2f}".format(torch.cuda.max_memory_allocated(1)/1024**3)
        mem_stats_gpu0 = torch.cuda.memory_stats(0)
        mem_stats_gpu1 = torch.cuda.memory_stats(1)
        post_gen_reserved_mem_gpu0 = "{:.2f}".format(mem_stats_gpu0["reserved_bytes.all.peak"]/1024**3)
        post_gen_reserved_mem_gpu1 = "{:.2f}".format(mem_stats_gpu1["reserved_bytes.all.peak"]/1024**3)
        # add the information to the dataframe using loc
        experiment_results_df.loc[len(experiment_results_df)] = [experiment_name, prompt, input_ids.shape[-1], pre_gen_max_mem_allocated_gpu0, pre_gen_max_mem_allocated_gpu1, pre_gen_reserved_mem_gpu0, pre_gen_reserved_mem_gpu1, post_gen_max_mem_allocated_gpu0, post_gen_max_mem_allocated_gpu1, post_gen_reserved_mem_gpu0, post_gen_reserved_mem_gpu1, latency, llm_answer, special_notes]
    del model
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print("     Model deleted for experiment: ", experiment_name)


Running experiment:  default


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


     Model loaded for experiment:  default
             Running experiment:  default  with prompt:  What is the firm period for Ognibene?
     Model deleted for experiment:  default
Running experiment:  expandable_segments
expandable_segments:True


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


     Model loaded for experiment:  expandable_segments
             Running experiment:  expandable_segments  with prompt:  What is the firm period for Ognibene?
     Model deleted for experiment:  expandable_segments
Running experiment:  flashattention_expandable
expandable_segments:True


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


     Model loaded for experiment:  flashattention_expandable
             Running experiment:  flashattention_expandable  with prompt:  What is the firm period for Ognibene?
     Model deleted for experiment:  flashattention_expandable
Running experiment:  8_bit
expandable_segments:True


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


     Model loaded for experiment:  8_bit
             Running experiment:  8_bit  with prompt:  What is the firm period for Ognibene?




     Model deleted for experiment:  8_bit
Running experiment:  8_bit_flash
expandable_segments:True


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


     Model loaded for experiment:  8_bit_flash
             Running experiment:  8_bit_flash  with prompt:  What is the firm period for Ognibene?




In [None]:
# save the dataframe to a csv file
# experiment_results_df.to_csv("/home/vp899/projects/Long_Context_Chat/Long_Context_Chat/Output/experiment_results_expandable.csv", index=False)

In [24]:
experiment_results_df

Unnamed: 0,Experiment_Name,Prompt,Context_Length,pre_gen_max_mem_allocated_gpu0,pre_gen_max_mem_allocated_gpu1,pre_gen_reserved_mem_gpu0,pre_gen_reserved_mem_gpu1,post_gen_max_mem_allocated_gpu0,post_gen_max_mem_allocated_gpu1,post_gen_reserved_mem_gpu0,post_gen_reserved_mem_gpu1,latency,llm_answer,special_notes
0,default,What is the firm period for Ognibene?,70483,6.67,8.29,6.72,8.36,18.28,20.44,22.37,23.88,19.561571,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
1,expandable_segments,What is the firm period for Ognibene?,70546,6.68,20.44,6.74,23.88,18.29,20.45,22.14,23.9,18.410501,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
2,flashattention_expandable,What is the firm period for Ognibene?,70605,6.68,20.45,6.74,23.9,18.3,20.46,22.96,24.71,14.084867,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
3,8_bit,What is the firm period for Ognibene?,70664,3.35,20.46,3.41,24.71,18.69,23.02,21.67,25.74,25.554915,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
4,8_bit_flash,What is the firm period for Ognibene?,70723,18.69,5.42,21.67,5.46,18.71,23.03,21.67,24.66,25.205945,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...


In [17]:
experiment_results_df.shape

(10, 14)

In [22]:
# print rows 5-9 of the dataframe
experiment_results_df[5:10]

Unnamed: 0,Experiment_Name,Prompt,Context_Length,pre_gen_max_mem_allocated_gpu0,pre_gen_max_mem_allocated_gpu1,pre_gen_reserved_mem_gpu0,pre_gen_reserved_mem_gpu1,post_gen_max_mem_allocated_gpu0,post_gen_max_mem_allocated_gpu1,post_gen_reserved_mem_gpu0,post_gen_reserved_mem_gpu1,latency,llm_answer,special_notes
5,default,What is the firm period for Ognibene?,70806,18.71,8.3,21.67,8.38,18.71,20.49,22.2,23.96,18.530892,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
6,expandable_segments,What is the firm period for Ognibene?,70865,18.71,8.3,22.2,8.38,18.71,20.5,22.2,23.96,21.858373,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
7,flashattention_expandable,What is the firm period for Ognibene?,70946,18.71,8.3,22.2,8.38,18.71,20.51,22.2,22.9,16.705445,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
8,8_bit,What is the firm period for Ognibene?,71051,18.71,5.42,22.2,5.46,18.78,23.11,22.2,23.94,42.264166,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...
9,8_bit_flash,What is the firm period for Ognibene?,71172,18.78,5.42,22.2,5.46,18.8,23.14,22.2,24.8,31.015537,According to the Master Supply Agreement betwe...,NOT Set pytorch_cuda_alloc_conf to expandable_...


In [23]:
# delete rows 5 to 9 from the dataframe
experiment_results_df.drop(index=range(5,10), inplace=True)