### Loading Corpus

In [7]:
pip install convokit

Collecting convokit
  Downloading convokit-3.0.1.tar.gz (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.0/188.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
INFO: pip is looking at multiple versions of convokit to determine which version is compatible with other requirements. This could take a while.
  Using cached convokit-3.0.0.tar.gz (183 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting msgpack-numpy>=0.4.3.2 (from convokit)
  Using cached msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting spacy>=2.3.5 (from convokit)
  Using cached spacy-3.8.2.tar.gz (1.3 MB)
  Installing build dependencies ... [?25lerror
  [1;31merror[0m: [1msu

In [2]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to /root/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem


In [3]:
print("Number of conversations: ", len(corpus.conversations))
print("Number of utterances: ", len(corpus.utterances))
print("Number of  speakers: ", len(corpus.speakers))
ids = [conversation.id for conversation in corpus.iter_conversations()]
print(ids[:5])

Number of conversations:  83097
Number of utterances:  304713
Number of  speakers:  9035
['L1044', 'L984', 'L924', 'L870', 'L866']


In [4]:
for i, conv in enumerate(corpus.iter_conversations()):
    if i >= 5:  # Stop after 5 conversations
        break
    print("Conversation ID:", conv.id)
    for utt in conv.iter_utterances():
        print(f"{utt.speaker.id}: {utt.text}")
    print("\n" + "-"*20 + "\n")  # Separator between conversations


Conversation ID: L1044
u0: They do not!
u2: They do to!

--------------------

Conversation ID: L984
u0: I hope so.
u2: She okay?

--------------------

Conversation ID: L924
u0: Let's go.
u2: Wow

--------------------

Conversation ID: L870
u0: Okay -- you're gonna need to learn how to lie.
u2: No
u0: I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?

--------------------

Conversation ID: L866
u0: Like my fear of wearing pastels?
u2: The "real you".
u0: What good stuff?
u2: I figured you'd get to the good stuff eventually.

--------------------



### Text preprocessing

In [5]:
import pandas as pd

# Initialize an empty list to store the data
data = []

# Iterate through each conversation in the corpus
for conv in corpus.iter_conversations():
    conversation_id = conv.id
    utterances = list(conv.iter_utterances())

    # Iterate in pairs: user utterance -> bot response
    for i in range(len(utterances) - 1):
        input_utt = utterances[i]
        response_utt = utterances[i + 1]

        # Ensure the pair is from alternating speakers
        if input_utt.speaker.id != response_utt.speaker.id:
            # Extract details for input and response
            input_details = {
                'input': input_utt.text,
                'response': response_utt.text
            }
            data.append(input_details)

# Convert the list to a DataFrame
df = pd.DataFrame(data)


In [6]:
import re

# Define the data cleaning function
def data_cleaning(data):
    data = data.lower()                         # lowercase
    data = re.sub(r'[^\x00-\x7F]+', ' ', data)  # remove non-ASCII characters
    data = re.sub(r'\s+', ' ', data).strip()    # replace multiple spaces with a single space
    data = re.sub(r"\'m", " am", data)          # Handle contractions
    data = re.sub(r"\'t", " not", data)
    return data

# Apply the data cleaning function to 'input' and 'response' columns
df['input'] = df['input'].apply(data_cleaning)
df['response'] = df['response'].apply(data_cleaning)


In [7]:
df.head()

Unnamed: 0,input,response
0,they do not!,they do to!
1,i hope so.,she okay?
2,let's go.,wow
3,okay -- you're gonna need to learn how to lie.,no
4,no,i am kidding. you know how sometimes you just ...


In [8]:
df=df.head(150000)

In [9]:
df.shape

(150000, 2)

### Tokenize using BlenderBot

In [10]:
import pandas as pd
from transformers import BlenderbotTokenizer

# Initialize the tokenizer
tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

# Tokenization example
tokenized_inputs = tokenizer(
    df['input'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

tokenized_responses = tokenizer(
    df['response'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

# Create a new DataFrame to store tokenized data along with metadata
tokenized_df = pd.DataFrame({
    'input_ids': list(tokenized_inputs['input_ids']),
    'attention_mask': list(tokenized_inputs['attention_mask']),
    'response_ids': list(tokenized_responses['input_ids']),  # response_ids as input_ids for model training
    #'response_attention_mask': list(tokenized_responses['attention_mask'])
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/310k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]



In [11]:
tokenized_df.shape

(150000, 3)

In [12]:
tokenized_df.head()

Unnamed: 0,input_ids,attention_mask,response_ids
0,"[tensor(382), tensor(361), tensor(368), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[tensor(382), tensor(361), tensor(287), tensor..."
1,"[tensor(607), tensor(1214), tensor(394), tenso...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[tensor(617), tensor(2488), tensor(38), tensor..."
2,"[tensor(939), tensor(341), tensor(425), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[tensor(5133), tensor(2), tensor(0), tensor(0)..."
3,"[tensor(2488), tensor(3101), tensor(304), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[tensor(528), tensor(2), tensor(0), tensor(0),..."
4,"[tensor(528), tensor(2), tensor(0), tensor(0),...","[tensor(1), tensor(1), tensor(0), tensor(0), t...","[tensor(607), tensor(632), tensor(7224), tenso..."


In [13]:
tokenizer.save_pretrained('tokenizer_blender')

('tokenizer_blender/tokenizer_config.json',
 'tokenizer_blender/special_tokens_map.json',
 'tokenizer_blender/vocab.json',
 'tokenizer_blender/merges.txt',
 'tokenizer_blender/added_tokens.json')

### Create train and validation sets

In [14]:
from sklearn.model_selection import train_test_split
import pandas as pd


X = tokenized_df[['input_ids', 'attention_mask']]
y = tokenized_df['response_ids']  # Target labels

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create new DataFrames for the training and validation sets
train_df = X_train.copy()
train_df['response_ids'] = y_train

val_df = X_val.copy()
val_df['response_ids'] = y_val

In [15]:
# Define the ChatbotDataset class 
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
class ChatbotDataset(Dataset):
    def __init__(self, df):
        self.input_ids = df['input_ids'].tolist()
        self.attention_masks = df['attention_mask'].tolist()
        self.response_ids = df['response_ids'].tolist()
        #self.response_attention_masks = df['response_attention_mask'].tolist()  

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'response_ids': self.response_ids[idx],
           # 'response_attention_mask': self.response_attention_masks[idx],  
        }

# Create training and validation datasets and DataLoaders
train_dataset = ChatbotDataset(train_df)
val_dataset = ChatbotDataset(val_df)

In [16]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)  # Keep shuffle=False for context
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

### Load the Pretrained Blenderbot model

In [17]:
import torch
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, AdamW


# Load the BlenderBot model and tokenizer
model_name = "facebook/blenderbot-400M-distill"
tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
optimizer = AdamW(model.parameters(), lr=5e-5)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



pytorch_model.bin:   0%|          | 0.00/730M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]



BlenderbotForConditionalGeneration(
  (model): BlenderbotModel(
    (shared): BlenderbotScaledWordEmbedding(8008, 1280, padding_idx=0)
    (encoder): BlenderbotEncoder(
      (embed_tokens): BlenderbotScaledWordEmbedding(8008, 1280, padding_idx=0)
      (embed_positions): BlenderbotLearnedPositionalEmbedding(128, 1280)
      (layers): ModuleList(
        (0-1): 2 x BlenderbotEncoderLayer(
          (self_attn): BlenderbotAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5

In [None]:
del tokenized_df

In [None]:
import gc
gc.collect()

328

In [18]:
len(train_loader)

7500

### Finetuning train set on our corpus

In [19]:
# Training function
def train_model(model, train_loader, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct_predictions = 0
        total_predictions = 0
        i = 0
        for batch in train_loader:
            i += 1
            print(i) # Included to know the training status since training can take hours
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            response_ids = batch['response_ids']

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=response_ids)
            loss = outputs.loss
            total_loss += loss.item()

            # Get the predicted token IDs
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)

            # Calculate correct predictions
            # Only consider valid token predictions (ignore padding)
            mask = response_ids != 0  
            correct_predictions += (predicted_ids[mask] == response_ids[mask]).sum().item()
            total_predictions += mask.sum().item()

            # Backward pass
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Save the model after training completes
    save_path = "model.pth"
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")


num_epochs = 3  # Set your desired number of epochs
train_model(model, train_loader, optimizer, num_epochs)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689


### Model Evaluation

### Bleu Score

In [20]:
import torch
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu_n_grams(model, val_loader, tokenizer, max_batches=100):
    model.eval()  # Set the model to evaluation mode
    bleu_1_scores, bleu_2_scores, bleu_3_scores, bleu_4_scores = [], [], [], []
    smoothing = SmoothingFunction().method1

    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if i >= max_batches:  # Limit to 100 batches
                break
            #print(i)
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references = batch['response_ids']  # Ground truth sentences

            # Decode the reference tensors to strings
            if isinstance(references[0], torch.Tensor):
                references = [tokenizer.decode(ref, skip_special_tokens=True) for ref in references]

            # Generate predictions
            outputs = model.generate(input_ids=inputs, attention_mask=attention_mask)
            predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            for pred, ref in zip(predictions, references):
                ref_tokens = [ref.split()]
                pred_tokens = pred.split()

                # Calculate BLEU scores for different n-grams
                bleu_1 = sentence_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing)
                bleu_2 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
                bleu_3 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing)
                bleu_4 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)

                # Append scores to the respective lists
                bleu_1_scores.append(bleu_1)
                bleu_2_scores.append(bleu_2)
                bleu_3_scores.append(bleu_3)
                bleu_4_scores.append(bleu_4)

    # Calculate average BLEU scores for each n-gram level
    avg_bleu_1 = sum(bleu_1_scores) / len(bleu_1_scores)
    avg_bleu_2 = sum(bleu_2_scores) / len(bleu_2_scores)
    avg_bleu_3 = sum(bleu_3_scores) / len(bleu_3_scores)
    avg_bleu_4 = sum(bleu_4_scores) / len(bleu_4_scores)

    return avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4

bleu_score1, bleu_score2, bleu_score3, bleu_score4 = calculate_bleu_n_grams(model, val_loader, tokenizer)
print(f"BLEU1 Score: {bleu_score1}")
print(f"BLEU2 Score: {bleu_score2}")
print(f"BLEU3 Score: {bleu_score3}")
print(f"BLEU4 Score: {bleu_score4}")


BLEU1 Score: 0.06486433459781382
BLEU2 Score: 0.023337093367101023
BLEU3 Score: 0.014821804805393482
BLEU4 Score: 0.010232238484076452


In [21]:
Average_bleu_score = (bleu_score1, bleu_score2, bleu_score3, bleu_score4)/4
print(f"Average BLEU Score: {Average_bleu_score}")

Average BLEU Score: 0.03159008153435234


### Rouge Score

In [22]:
def calculate_rouge_scores(model, val_loader, tokenizer, max_batches=100):
    model.eval()  # Set the model to evaluation mode
    
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if i >= max_batches:  # Limit to 100 batches
                break
            #print(i)
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references = batch['response_ids']  # Ground truth sentences

            # Decode the reference tensors to strings
            if isinstance(references[0], torch.Tensor):
                references = [tokenizer.decode(ref, skip_special_tokens=True) for ref in references]

            # Generate predictions
            outputs = model.generate(input_ids=inputs, attention_mask=attention_mask)
            predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
           

            for pred, ref in zip(predictions, references):
                scores = scorer.score(ref, pred)
                rouge1_scores.append(scores["rouge1"].fmeasure)
                rouge2_scores.append(scores["rouge2"].fmeasure)
                rougeL_scores.append(scores["rougeL"].fmeasure)

            # Compute average ROUGE scores
            avg_rouge1 = np.mean(rouge1_scores)
            avg_rouge2 = np.mean(rouge2_scores)
            avg_rougeL = np.mean(rougeL_scores)
            
    return avg_rouge1, avg_rouge2, avg_rougeL

            

In [23]:
avg_rouge1, avg_rouge2, avg_rougeL = calculate_rouge_scores(model, val_loader, tokenizer)
print(f"Average ROUGE-1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2: {avg_rouge2:.4f}")
print(f"Average ROUGE-L: {avg_rougeL:.4f}")

Average ROUGE-1: 0.5021
Average ROUGE-2: 0.2357
Average ROUGE-L: 0.4735
