<a href="https://colab.research.google.com/github/shubha07m/LLM_Dialogue_Generation/blob/main/tuned_llm_benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing and loading library

In [1]:
!pip install -U -q PyDrive
!pip install datasets
! pip install nltk

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00

In [2]:
# Importing library and drive
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd


from google.colab import drive
from transformers import BlenderbotForConditionalGeneration, BlenderbotTokenizer
import torch
import numpy as np
from torch.nn.functional import cross_entropy
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

drive.mount('/content/drive')

Mounted at /content/drive


## Testing the performance of new tuned LLM

In [None]:
# Define the path to the model and tokenizer in Google Drive
model_path = '/content/drive/MyDrive/blenderbot_llm'

# Load the fine-tuned model and tokenizer
model = BlenderbotForConditionalGeneration.from_pretrained(model_path)
tokenizer = BlenderbotTokenizer.from_pretrained(model_path)

# Ensure decoder_start_token_id is set in the model
model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids('<s>')


In [None]:
# Encode input
inputs = tokenizer("Lex Fridman: What is the future of AI?\nLee Cronin:", return_tensors='pt')

# Generate output with proper handling of special tokens
outputs = model.generate(
    inputs['input_ids'],
    max_length=60,
    num_beams=10,  # Increase number of beams for diversity
    length_penalty=1.0,
    do_sample=True,
    top_p=0.9,  # Increase top-p for diversity
    top_k=50,  # Use top-k sampling
    temperature=1.5,  # Increase temperature for more randomness
    early_stopping=True,
    decoder_start_token_id=model.config.decoder_start_token_id,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=model.config.eos_token_id
)

# Decode the output
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", decoded_output)

Generated Output:  lee cronin:  what do you do for a living?  i work as a graphic designer


## Using advanced prompt engineering for conversation generation

In [None]:
def generate_conversation(prompt, model, tokenizer, max_turns=12, max_length=60, max_input_length=128):
    conversation = prompt
    current_speaker = "Lee Cronin"
    for _ in range(max_turns):
        # Encode input with truncation
        inputs = tokenizer(conversation, return_tensors='pt', truncation=True, max_length=max_input_length)

        # Generate output
        outputs = model.generate(
            inputs['input_ids'],
            max_length=max_length,
            num_beams=10,
            length_penalty=1.0,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            temperature=1.5,
            early_stopping=True,
            decoder_start_token_id=model.config.decoder_start_token_id,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=model.config.eos_token_id
        )

        # Decode the output and clean up the response
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_text = generated_text.replace(current_speaker.lower(), "").replace(current_speaker, "").strip().split('. ')[0] + '.'

        # Update conversation with new speaker
        conversation += f"\n{current_speaker}: {generated_text}"

        # Alternate speaker
        current_speaker = "Lex Fridman" if current_speaker == "Lee Cronin" else "Lee Cronin"

        # Ensure conversation does not exceed max length
        if tokenizer(conversation, return_tensors='pt', truncation=True, max_length=max_input_length)['input_ids'].shape[1] > max_input_length:
            conversation = tokenizer.decode(tokenizer(conversation, return_tensors='pt', truncation=True, max_length=max_input_length)['input_ids'][0, -max_input_length:])

    return conversation

In [None]:
# Load model and tokenizer
tokenizer = BlenderbotTokenizer.from_pretrained('/content/drive/MyDrive/blenderbot_llm')
model = BlenderbotForConditionalGeneration.from_pretrained('/content/drive/MyDrive/blenderbot_llm')

# Define initial prompt with one sentence from each speaker
initial_prompt = """Lex Fridman: What is the future of AI?
Lee Cronin: It’s a fascinating question, and it touches on many aspects of science and technology."""

# Generate conversation
conversation = generate_conversation(initial_prompt, model, tokenizer)
print("Generated Conversation:\n", conversation)

Generated Conversation:
 Lex Fridman: What is the future of AI?
Lee Cronin: It’s a fascinating question, and it touches on many aspects of science and technology.
Lee Cronin: :  i think it is fascinating that we have the ability to look at the past and think about the future.
Lex Fridman: I think the future is going to look very different than the past because in the future we will have to look for the future in the present and then we will look back in the past to see what it will look like.
Lee Cronin: That is a very interesting way of looking at it.
Lex Fridman: Listening to the soundtrack on my smartphone right now.
Lee Cronin: Listening to some of my favorite musicians right now, one of my favorites is Led Zeppelin, what about you?.
Lex Fridman: Listening to some of my favorite music from the past right now is the Beatles.
Lee Cronin: Lets see what the future has in store for us.
Lex Fridman: Lets look forward to the future and look at how the future will look and what will it loo

## Testing the performance of EWC tuned LLM on new data

In [None]:
# Define the path to the model and tokenizer in Google Drive
model_path = '/content/drive/MyDrive/ewc_trained_llm'

# Load the fine-tuned model and tokenizer
model = BlenderbotForConditionalGeneration.from_pretrained(model_path)
tokenizer = BlenderbotTokenizer.from_pretrained(model_path)

# Ensure decoder_start_token_id is set in the model
model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids('<s>')


In [None]:
def generate_conversation(prompt, model, tokenizer, max_turns=12, max_length=60, max_input_length=128):
    conversation = prompt
    current_speaker = "Lisa Randall"

    for _ in range(max_turns):
        # Encode input with truncation
        inputs = tokenizer(conversation, return_tensors='pt', truncation=True, max_length=max_input_length)

        # Generate output
        outputs = model.generate(
            inputs['input_ids'].to(model.device),
            max_length=max_length,
            num_beams=10,
            length_penalty=1.0,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            temperature=1.5,
            early_stopping=True,
            decoder_start_token_id=model.config.decoder_start_token_id,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=model.config.eos_token_id
        )

        # Decode the output and clean up the response
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_text = generated_text.replace(current_speaker.lower(), "").replace(current_speaker, "").strip().split('. ')[0] + '.'

        # Update conversation with new speaker
        conversation += f"\n{current_speaker}: {generated_text}"

        # Alternate speaker
        current_speaker = "Lex Fridman" if current_speaker == "Lisa Randall" else "Lisa Randall"

        # Ensure conversation does not exceed max length
        if tokenizer(conversation, return_tensors='pt', truncation=True, max_length=max_input_length)['input_ids'].shape[1] > max_input_length:
            conversation = tokenizer.decode(tokenizer(conversation, return_tensors='pt', truncation=True, max_length=max_input_length)['input_ids'][0, -max_input_length:])

    return conversation

In [None]:
# Define the initial prompt
initial_prompt = """Lex Fridman: How do you envision the future of our understanding of the universe?
Lisa Randall: The future of our understanding lies in the mysteries we have yet to uncover, particularly with dark matter and other cosmic phenomena."""

# Generate conversation
generated_conversation = generate_conversation(initial_prompt, model, tokenizer)
print(generated_conversation)

Lex Fridman: How do you envision the future of our understanding of the universe?
Lisa Randall: The future of our understanding lies in the mysteries we have yet to uncover, particularly with dark matter and other cosmic phenomena.
Lisa Randall: the to is you and that.
Lex Fridman: the is that of a it to.
Lisa Randall: is the to i you so that.
Lex Fridman: the are you is i of.
Lisa Randall: is that the to and it.
Lex Fridman: the of and a have is.
Lisa Randall: that the is in and life i.
Lex Fridman: and is the that i it of.
Lisa Randall: the a and that is in.
Lex Fridman: the you i to is that and.
Lisa Randall: it is the to i and.
Lex Fridman: that is you the in so.


## Loading both the old and new model and tokenizer for testing

In [3]:
# Define paths
old_model_path = '/content/drive/MyDrive/blenderbot_llm'
new_model_path = '/content/drive/MyDrive/ewc_trained_llm'

# Load the tokenizers
tokenizer_old = BlenderbotTokenizer.from_pretrained(old_model_path)
tokenizer_new = BlenderbotTokenizer.from_pretrained(new_model_path)

# Load the models
model_old = BlenderbotForConditionalGeneration.from_pretrained(old_model_path)
model_new = BlenderbotForConditionalGeneration.from_pretrained(new_model_path)

# Ensure the models are on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_old.to(device)
model_new.to(device)

BlenderbotForConditionalGeneration(
  (model): BlenderbotModel(
    (shared): BlenderbotScaledWordEmbedding(8008, 1280, padding_idx=0)
    (encoder): BlenderbotEncoder(
      (embed_tokens): BlenderbotScaledWordEmbedding(8008, 1280, padding_idx=0)
      (embed_positions): BlenderbotLearnedPositionalEmbedding(128, 1280)
      (layers): ModuleList(
        (0-1): 2 x BlenderbotEncoderLayer(
          (self_attn): BlenderbotAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5

## Creating test data from the available conversation

In [4]:
# Function to read files directly from Google Drive

def download_and_read_file_from_drive(file_id, file_name):
    """
    Downloads a file from Google Drive using the file ID and reads it into a Pandas DataFrame if it's a CSV file.

    Args:
    file_id (str): The ID of the file in Google Drive.
    file_name (str): The name to save the file as (including extension).

    Returns:
    DataFrame: A Pandas DataFrame if the file is a CSV file, otherwise None.
    """
    try:

        # Authenticate and create the PyDrive client
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)

        # Create a GoogleDriveFile instance with the file ID
        downloaded = drive.CreateFile({'id': file_id})
        downloaded.GetContentFile(file_name)

        print(f'File {file_name} downloaded successfully.')

        # Check if the file is a CSV file and read it into a DataFrame
        if file_name.endswith('.csv'):
            df = pd.read_csv(file_name)
            print('CSV file read into DataFrame.')
            return df
        else:
            print('File is not a CSV. No DataFrame created.')
            return None

    except Exception as e:
        print(f'An error occurred: {e}')
        return None

In [5]:
# Reading first file

file_id = '18g5y5GmBQNgU8z2fPdushrdu0XfmQjph'
file_name = 'lee_cronin3.csv'

# file_id = '15EGbylkuobQtA0zXkeHmmhhNaIxoz50D'
# file_name = 'lee_cronin3.csv'
df1 = download_and_read_file_from_drive(file_id, file_name)

# Load Data
initial_data = df1

File lee_cronin3.csv downloaded successfully.
CSV file read into DataFrame.


In [6]:
# Reading the second file

file_id = '1Rm-ItCDv44iDqLaaEZTz-Cqu_xQPM6s5'
file_name = 'lisa_randall.csv'

# file_id = '1x3prg2ZD8h4PfOkd3Ftohyy8gtDPR3-v'
# file_name = 'lisa_randall.csv'
df2 = download_and_read_file_from_drive(file_id, file_name)
new_data =df2

File lisa_randall.csv downloaded successfully.
CSV file read into DataFrame.


In [7]:
# Function for creating test data
def create_test_data(initial_data, new_data):
    def format_data(data):
        formatted_data = []
        for i in range(0, len(data) - 1, 2):
            # Ensure the data is valid
            if data.loc[i, 'speaker'] != data.loc[i + 1, 'speaker']:
                formatted_data.append({
                    'input': f"{data.loc[i, 'speaker']}: {data.loc[i, 'text']}",
                    'output': f"{data.loc[i + 1, 'speaker']}: {data.loc[i + 1, 'text']}"
                })
        return formatted_data

    # Create test data
    initial_test_data = format_data(initial_data)
    new_test_data = format_data(new_data)

    # Convert to DataFrame
    df_initial_test = pd.DataFrame(initial_test_data)
    df_new_test = pd.DataFrame(new_test_data)

    # Validate the format of the DataFrames
    def validate_dataframe(df):
        if 'input' not in df.columns or 'output' not in df.columns:
            raise ValueError("DataFrame must contain 'input' and 'output' columns.")
        if df.empty:
            raise ValueError("DataFrame is empty. Ensure data is properly formatted.")
        for index, row in df.iterrows():
            if not isinstance(row['input'], str) or not isinstance(row['output'], str):
                raise ValueError(f"Invalid data at index {index}: {row}")

    # Validate DataFrames
    validate_dataframe(df_initial_test)
    validate_dataframe(df_new_test)

    return df_initial_test, df_new_test

In [8]:
# Example usage
df_initial_test, df_new_test = create_test_data(initial_data, new_data)

In [9]:
(df_initial_test.shape, len(df_initial_test))

((335, 2), 335)

In [10]:
print("Initial Test Data:")
print(df_initial_test.head())

print("New Test Data:")
print(df_new_test.head())

Initial Test Data:
                                               input  \
0  lee cronin:  every star in the sky probably ha...   
1   lee cronin:  time and the ability to communicate   
2  lee cronin:  yeah my biggest fear in a way is ...   
3                                lee cronin:  thanks   
4                             lee cronin:  go for it   

                                              output  
0  lex fridman:  intersect you dont mean in time ...  
1           lex fridman:  the ability to communicate  
2  lex fridman:  the following is a conversation ...  
3  lex fridman:  it created i think its fair to s...  
4  lex fridman:  so assembly theory says that if ...  
New Test Data:
                                               input  \
0  lex fridman:  in theory it behaves just like a...   
1  lisa randall:  theres also just more of it and...   
2  lisa randall:  exactly in my book i make jokes...   
3  lisa randall:  exactly no but it is a metaphor...   
4  lex fridman:  ye

In [11]:
# Convert data to list of tuples
def convert_df_to_tuples(df):
    return list(zip(df['input'], df['output']))

test_data_initial_tuples = convert_df_to_tuples(df_initial_test)
test_data_new_tuples = convert_df_to_tuples(df_new_test)

## Compare using test data performance of both model

In [17]:
# Function to generate a response

def generate_response(model, tokenizer, prompt, device='cuda'):
    model.to(device)
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)

    # Truncate input sequence if it's too long
    max_length = model.config.max_position_embeddings
    if inputs.size(1) > max_length:
        inputs = inputs[:, :max_length]

    # Generate response
    outputs = model.generate(
        inputs,
        max_length=50,
        num_beams=5,
        early_stopping=True,
        decoder_start_token_id=model.config.pad_token_id  # Ensure correct starting token
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [13]:
# Function to compute BLEU score
def compute_bleu(reference, candidate):
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)

In [14]:
def compute_perplexity(model, tokenizer, text):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    device = next(model.parameters()).device
    input_ids = input_ids.to(device)

    # Truncate input sequence if it's too long
    max_length = model.config.max_position_embeddings
    if input_ids.size(1) > max_length:
        input_ids = input_ids[:, :max_length]

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        perplexity = np.exp(loss.item())

    return perplexity

In [15]:
def evaluate_models(test_data, tokenizer_old, model_old, tokenizer_new, model_new):
    # Ensure models are on the correct device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_old.to(device)
    model_new.to(device)

    bleu_scores_old = []
    perplexity_scores_old = []
    bleu_scores_new = []
    perplexity_scores_new = []

    for prompt, reference in test_data:
        response_old = generate_response(model_old, tokenizer_old, prompt, device=device)
        response_new = generate_response(model_new, tokenizer_new, prompt, device=device)

        bleu_old = compute_bleu(reference, response_old)
        bleu_new = compute_bleu(reference, response_new)
        perplexity_old = compute_perplexity(model_old, tokenizer_old, response_old)
        perplexity_new = compute_perplexity(model_new, tokenizer_new, response_new)

        bleu_scores_old.append(bleu_old)
        bleu_scores_new.append(bleu_new)
        perplexity_scores_old.append(perplexity_old)
        perplexity_scores_new.append(perplexity_new)

    # Average BLEU and perplexity scores
    avg_bleu_old = np.mean(bleu_scores_old)
    avg_bleu_new = np.mean(bleu_scores_new)
    avg_perplexity_old = np.mean(perplexity_scores_old)
    avg_perplexity_new = np.mean(perplexity_scores_new)

    return avg_bleu_old, avg_perplexity_old, avg_bleu_new, avg_perplexity_new

In [18]:
# Evaluate models on initial test data
avg_bleu_old_initial, avg_perplexity_old_initial, avg_bleu_new_initial, avg_perplexity_new_initial = evaluate_models(
    test_data_initial_tuples, tokenizer_old, model_old, tokenizer_new, model_new
)

print("Initial Test Data Evaluation:")
print(f"Old Model BLEU Score: {avg_bleu_old_initial}")
print(f"Old Model Perplexity Score: {avg_perplexity_old_initial}")
print(f"New Model BLEU Score: {avg_bleu_new_initial}")
print(f"New Model Perplexity Score: {avg_perplexity_new_initial}")

Token indices sequence length is longer than the specified maximum sequence length for this model (231 > 128). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (231 > 128). Running this sequence through the model will result in indexing errors


Initial Test Data Evaluation:
Old Model BLEU Score: 0.004999871618429517
Old Model Perplexity Score: 4.237836855932256
New Model BLEU Score: 0.007313616651345576
New Model Perplexity Score: 71.33681441645774
