## **Library Imports**

In [11]:
#import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
import torch
import torch.nn as nn
import math
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import evaluate
from tqdm import tqdm
import torch.nn.functional as F

ImportError: cannot import name 'using_pyarrow_string_dtype' from 'pandas._config' (d:\Anaconda\envs\genai\Lib\site-packages\pandas\_config\__init__.py)

## **Task 1**

In [7]:
# `kagglehub` is imported in this cell because the import in the first cell is
# commented out; without it this cell raises NameError on a fresh kernel.
import kagglehub

# Download the latest version of the dataset; `path` is whatever location
# kagglehub reports for the dataset files.
path = kagglehub.dataset_download("atharvjairath/empathetic-dialogues-facebook-ai")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/empathetic-dialogues-facebook-ai


In [8]:
# CSV file inside the dataset directory reported by kagglehub in the previous cell
dataset_path = "/kaggle/input/empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv"

# Read the full CSV into a single DataFrame
df = pd.read_csv(dataset_path)
print(f"Dataset loaded successfully. Total rows: {len(df)}")

# Two-stage split with a fixed seed: hold out 20% of the rows, then cut that
# hold-out in half, which yields 80% train / 10% validation / 10% test.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

for split_name, split_df in (("Training", train_df),
                             ("Validation", val_df),
                             ("Test", test_df)):
    print(f"{split_name} set size: {len(split_df)}")

Dataset loaded successfully. Total rows: 64636
Training set size: 51708
Validation set size: 6464
Test set size: 6464


In [9]:
def normalize_text(text):
    """Lower-case a dialogue string, strip speaker tags and space out punctuation.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        str: The cleaned text with single spaces between tokens and no
        leading or trailing whitespace.
    """
    cleaned = str(text).lower()

    # First pattern: a leading "customer :" / "agent :" tag.
    # Second pattern: a trailing "agent :" tag.
    for speaker_tag in (r'^(customer|agent)\s*:\s*', r'agent\s*:\s*$'):
        cleaned = re.sub(speaker_tag, '', cleaned, flags=re.I).strip()

    # Put spaces around ? . ! , so each mark is separated from the words next to it.
    cleaned = re.sub(r"([?.!,])", r" \1 ", cleaned)

    # Squeeze every run of whitespace down to a single space.
    return re.sub(r'\s+', ' ', cleaned).strip()

In [10]:
# Add a "<column>_normalized" version of each text column to all three splits
for df_split in [train_df, val_df, test_df]:
    for raw_column in ('Situation', 'empathetic_dialogues', 'labels'):
        df_split[f'{raw_column}_normalized'] = df_split[raw_column].apply(normalize_text)

In [11]:
# Vocabulary text: the three normalized columns of the training split only,
# concatenated in the order dialogues -> situations -> labels.
corpus = [
    text
    for column in ('empathetic_dialogues_normalized',
                   'Situation_normalized',
                   'labels_normalized')
    for text in train_df[column]
]

# The tokenizer trainer consumes an iterator rather than a list
def corpus_iterator():
    """Yield every string of the module-level ``corpus`` list, in order."""
    yield from corpus

In [12]:
# Word-level tokenizer with "<unk>" as the unknown token and a Whitespace pre-tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

# Special tokens registered with the trainer
special_tokens = ["<pad>", "<bos>", "<eos>", "<unk>"]
trainer = WordLevelTrainer(special_tokens=special_tokens)

# Fit the vocabulary on the training-split corpus
tokenizer.train_from_iterator(corpus_iterator(), trainer=trainer)

vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

# Write the trained tokenizer to disk so it can be reloaded later
save_path = "/kaggle/working/my_tokenizer.json"
tokenizer.save(save_path)
print(f"Tokenizer saved to {save_path}")

Vocabulary size: 19342
Tokenizer saved to /kaggle/working/my_tokenizer.json


In [13]:
# tokenizer = Tokenizer.from_file("my_tokenizer.json")

# Report the integer ID the tokenizer assigned to each special token
print("Special token IDs:")
for special_token in ("<unk>", "<pad>", "<bos>", "<eos>"):
    print(f"{special_token} ID: {tokenizer.token_to_id(special_token)}")

Special token IDs:
<unk> ID: 3
<pad> ID: 0
<bos> ID: 1
<eos> ID: 2


## **Task 2**

In [14]:
def create_input_string(row):
    """Build the model input prompt for one row.

    Args:
        row: Mapping-like row providing the keys 'emotion',
            'Situation_normalized' and 'empathetic_dialogues_normalized'.

    Returns:
        str: "Emotion: <e> | Situation: <s> | Customer: <c> Agent:".
    """
    # NOTE(review): the tokenizer corpus contains only lower-cased normalized
    # text, so the capitalised markers and "|" may map to <unk> — confirm.
    return (
        f"Emotion: {row['emotion']} | "
        f"Situation: {row['Situation_normalized']} | "
        f"Customer: {row['empathetic_dialogues_normalized']} Agent:"
    )

In [15]:
# Add the model input ('X') and target ('Y') columns to every split
for df_split in (train_df, val_df, test_df):
    # 'X' is the formatted prompt built row by row
    df_split['X'] = df_split.apply(create_input_string, axis=1)
    # 'Y' is the normalized text of the 'labels' column
    df_split['Y'] = df_split['labels_normalized']

print("X and Y columns created successfully.")

X and Y columns created successfully.


In [16]:
# Print the first training pair as a sanity check of the X / Y formatting
first_example = train_df.iloc[0]
print("\n--- Example ---")
print("INPUT (X):")
print(first_example['X'])
print("\nTARGET (Y):")
print(first_example['Y'])


--- Example ---
INPUT (X):
Emotion: nostalgic | Situation: i had to go buy legos for my nephew the other day . makes me miss the days when my girls were young enough to play with them . | Customer: were you embarrassed or what happend ? Agent:

TARGET (Y):
no just this feeling overcame me that my kids just have outgrown this time .


## **Task 3**

In [17]:
import torch
import torch.nn as nn
import math

# --- MODEL ARCHITECTURE ---

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The table is precomputed once with shape ``(max_len, 1, d_model)`` and
    registered as a buffer, so it moves with the module across devices and is
    not a trainable parameter.

    Args:
        d_model: Size of the embedding dimension. Odd values are supported.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so drop the surplus frequency for the cosine half.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``dropout(x + pe[:seq_len])``.

        Args:
            x: Tensor whose first dimension is the sequence position and whose
                last dimension is ``d_model``; the middle dimension is
                broadcast against the table's size-1 axis.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Sequence-to-sequence model built on ``nn.Transformer`` (``batch_first=False``).

    One embedding table and one positional encoder are shared between the
    source and target sides; a final linear layer projects the decoder output
    to vocabulary logits.

    Args:
        vocab_size: Number of entries in the embedding table / output logits.
        d_model: Embedding and model width.
        nhead: Number of attention heads.
        d_hid: Width of the feed-forward sub-layers.
        nlayers_encoder: Number of encoder layers.
        nlayers_decoder: Number of decoder layers.
        dropout: Dropout probability used by the positional encoder and transformer.
    """

    def __init__(self, vocab_size: int, d_model: int, nhead: int, d_hid: int,
                 nlayers_encoder: int, nlayers_decoder: int, dropout: float = 0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        transformer_kwargs = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=nlayers_encoder,
            num_decoder_layers=nlayers_decoder,
            dim_feedforward=d_hid,
            dropout=dropout,
            batch_first=False,
        )
        self.transformer = nn.Transformer(**transformer_kwargs)
        self.generator = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def _embed(self, token_ids: torch.Tensor) -> torch.Tensor:
        """Scale token embeddings by sqrt(d_model) and add positional encodings."""
        return self.pos_encoder(self.embedding(token_ids) * math.sqrt(self.d_model))

    def forward(self, src: torch.Tensor, tgt: torch.Tensor,
                src_padding_mask: torch.Tensor,
                tgt_padding_mask: torch.Tensor,
                tgt_mask: torch.Tensor) -> torch.Tensor:
        """Run the encoder-decoder and return vocabulary logits.

        Args:
            src: Source token IDs.
            tgt: Target (decoder input) token IDs.
            src_padding_mask: Key-padding mask for the source; it is also
                reused as the memory key-padding mask.
            tgt_padding_mask: Key-padding mask for the target.
            tgt_mask: Attention mask applied to decoder self-attention.

        Returns:
            torch.Tensor: The decoder output projected by ``self.generator``.
        """
        source_states = self._embed(src)
        target_states = self._embed(tgt)

        decoded = self.transformer(
            source_states,
            target_states,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
        )
        return self.generator(decoded)

In [18]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention for batch-first inputs.

    ``forward`` treats dimension 0 of ``q``/``k``/``v`` as the batch and
    dimension 1 as the sequence, i.e. inputs are ``(batch, seq_len, d_model)``.

    Args:
        d_model: Model width; must be divisible by ``h``.
        h: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model: int, h: int, dropout: float = 0.1):
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model must be divisible by h"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask=None, dropout=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            query, key, value: Tensors whose last two dimensions are
                ``(seq_len, d_k)``.
            mask: Optional boolean tensor; ``True`` marks positions that must
                not be attended to. Accepted layouts for 4-D scores:
                ``(q_len, k_len)`` (shared by every batch element and head),
                ``(batch, q_len, k_len)`` (shared by every head), or any
                tensor already of the scores' rank that broadcasts to it.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            tuple: ``(attended_values, attention_weights)``.

        Raises:
            ValueError: If ``mask`` has more dimensions than the score tensor.
        """
        d_k = query.shape[-1]

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            if mask.dim() > scores.dim():
                raise ValueError(
                    f"mask has {mask.dim()} dims but attention scores have {scores.dim()}"
                )
            # A per-batch mask gets its head axis inserted at position 1 ...
            if mask.dim() == 3 and scores.dim() == 4:
                mask = mask.unsqueeze(1)
            # ... and anything of lower rank is left-padded with size-1 axes so
            # it broadcasts over batch and heads (the original 2-D behaviour).
            while mask.dim() < scores.dim():
                mask = mask.unsqueeze(0)
            # Use the dtype's own minimum instead of a literal -1e9, which
            # cannot be represented in float16.
            scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)

        p_attn = scores.softmax(dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn

    def forward(self, q, k, v, mask=None):
        """Project the inputs, attend per head, and merge the heads.

        The attention weights of the last call are kept on
        ``self.attention_scores``.

        Returns:
            torch.Tensor: ``(batch, q_len, d_model)``.
        """
        batch_size = q.size(0)

        # (batch, seq, d_model) -> (batch, h, seq, d_k)
        query = self.w_q(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        key = self.w_k(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        value = self.w_v(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = self.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, d_model)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        return self.w_o(x)

In [19]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual add and layer normalization."""

    def __init__(self, d_model: int, h: int, d_ff: int, dropout: float = 0.1):
        """
        Args:
            d_model (int): Width of the embeddings flowing through the layer.
            h (int): Number of attention heads.
            d_ff (int): Hidden width of the feed-forward network.
            dropout (float): Dropout probability.
        """
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, h, dropout)
        ff_stages = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.feed_forward = nn.Sequential(*ff_stages)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): Output of the previous layer.
            mask (torch.Tensor): Mask handed to the self-attention call.

        Returns:
            torch.Tensor: The transformed tensor, same shape as ``x``.
        """
        # Sub-layer 1: self-attention (query, key and value are all `x`),
        # then residual add + LayerNorm.
        attended = self.self_attn(q=x, k=x, v=x, mask=mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward, then residual add + LayerNorm.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

In [20]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder cross-attention
    and a feed-forward network, each followed by dropout, a residual add and
    layer normalization."""

    def __init__(self, d_model: int, h: int, d_ff: int, dropout: float = 0.1):
        """
        Args:
            d_model (int): Width of the embeddings flowing through the layer.
            h (int): Number of attention heads.
            d_ff (int): Hidden width of the feed-forward network.
            dropout (float): Dropout probability.
        """
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, h, dropout)
        self.cross_attn = MultiHeadAttention(d_model, h, dropout)
        ff_stages = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.feed_forward = nn.Sequential(*ff_stages)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, encoder_output: torch.Tensor,
                source_mask: torch.Tensor, target_mask: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): Output of the previous decoder layer.
            encoder_output (torch.Tensor): Final output of the encoder stack.
            source_mask (torch.Tensor): Mask used in cross-attention.
            target_mask (torch.Tensor): Mask used in decoder self-attention.

        Returns:
            torch.Tensor: The transformed tensor, same shape as ``x``.
        """
        # Sub-layer 1: masked self-attention over the decoder states.
        self_attended = self.self_attn(q=x, k=x, v=x, mask=target_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: cross-attention — queries from the decoder, keys and
        # values from the encoder output.
        cross_attended = self.cross_attn(
            q=x, k=encoder_output, v=encoder_output, mask=source_mask
        )
        x = self.norm2(x + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

In [21]:
# Replace your old Transformer class with this one

class Transformer(nn.Module):
    """Hand-written encoder-decoder Transformer built from ``EncoderLayer`` /
    ``DecoderLayer`` stacks with a shared embedding table.

    Args:
        vocab_size: Number of entries in the embedding table / output logits.
        d_model: Embedding and model width.
        num_encoder_layers: Number of ``EncoderLayer`` blocks.
        num_decoder_layers: Number of ``DecoderLayer`` blocks.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of the feed-forward sub-layers.
        dropout: Dropout probability.
        max_len: Number of positions in the positional-encoding table.
    """

    def __init__(self, vocab_size: int, d_model: int, num_encoder_layers: int,
                 num_decoder_layers: int, num_heads: int, d_ff: int,
                 dropout: float = 0.1, max_len: int = 5000):
        super().__init__()

        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_len)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_encoder_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_decoder_layers)])
        self.generator = nn.Linear(d_model, vocab_size)

    def forward(self, src: torch.Tensor, tgt: torch.Tensor, src_mask: torch.Tensor) -> torch.Tensor:
        """Encode ``src``, decode ``tgt`` under a causal mask, return logits.

        Args:
            src: Source token IDs, sequence-first ``(src_len, batch)``.
            tgt: Target token IDs, sequence-first ``(tgt_len, batch)``.
            src_mask: Forwarded unchanged to every encoder self-attention and
                decoder cross-attention call; may be ``None``, in which case
                the attention layers skip masking.

        Returns:
            torch.Tensor: Logits, sequence-first ``(tgt_len, batch, vocab_size)``.
        """
        # Causal mask: True above the diagonal = "may not look at later positions".
        tgt_seq_len = tgt.shape[0]
        tgt_device = tgt.device
        tgt_causal_mask = torch.triu(torch.ones((tgt_seq_len, tgt_seq_len), device=tgt_device), diagonal=1).bool()

        # Embed and add positional encoding (the encoder indexes positions on
        # dim 0, so this step must stay sequence-first).
        src_embedded = self.pos_encoder(self.embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.pos_encoder(self.embedding(tgt) * math.sqrt(self.d_model))

        # The attention layers read dim 0 as the batch and dim 1 as the
        # sequence. Feeding them sequence-first tensors would make attention
        # run across the batch axis, so switch to batch-first for the stacks.
        encoder_output = src_embedded.transpose(0, 1).contiguous()
        for layer in self.encoder:
            encoder_output = layer(encoder_output, src_mask)

        decoder_output = tgt_embedded.transpose(0, 1).contiguous()
        for layer in self.decoder:
            decoder_output = layer(decoder_output, encoder_output, src_mask, tgt_causal_mask)

        # Back to sequence-first so the output layout matches the inputs.
        output = self.generator(decoder_output.transpose(0, 1))
        return output

## **Task 4**

In [22]:
# --- 1. Create the Custom Dataset Class ---

class ChatbotDataset(Dataset):
    """Dataset of raw ``(X, Y)`` string pairs taken from a dataframe.

    Tokenization is not done here; items are returned as stored in the
    'X' and 'Y' columns.
    """

    def __init__(self, df, tokenizer):
        """Keep references to the dataframe, the tokenizer and the two columns."""
        self.df = df
        self.tokenizer = tokenizer
        self.X, self.Y = self.df['X'], self.df['Y']

    def __len__(self):
        """Number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair at positional index ``idx``."""
        source_text = self.X.iloc[idx]
        target_text = self.Y.iloc[idx]
        return source_text, target_text

# --- 2. Define the Collate Function for Padding ---

def collate_fn(batch, tokenizer, pad_token_id):
    """Tokenize a batch of ``(source, target)`` strings and pad it.

    Each string is encoded with ``tokenizer``, wrapped in ``<bos>`` / ``<eos>``
    IDs, and the sequences are padded to the longest one in the batch.

    Args:
        batch: Iterable of ``(source_text, target_text)`` pairs.
        tokenizer: Object providing ``token_to_id`` and ``encode(...).ids``.
        pad_token_id: ID used to fill the padded positions.

    Returns:
        tuple: ``(src_padded, tgt_padded)``, each shaped ``(seq_len, batch_size)``.
    """
    bos_token_id = tokenizer.token_to_id('<bos>')
    eos_token_id = tokenizer.token_to_id('<eos>')

    def to_tensor(text):
        # <bos> + token IDs + <eos>
        return torch.tensor([bos_token_id] + tokenizer.encode(text).ids + [eos_token_id])

    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(to_tensor(src_sample))
        tgt_batch.append(to_tensor(tgt_sample))

    # batch_first=False keeps the (seq_len, batch_size) layout the model expects
    padded = [
        pad_sequence(sequences, batch_first=False, padding_value=pad_token_id)
        for sequences in (src_batch, tgt_batch)
    ]
    return padded[0], padded[1]

# --- 3. Instantiate the DataLoaders ---

# Requires 'train_df', 'val_df' and 'tokenizer' from the cells above
pad_id = tokenizer.token_to_id('<pad>')
batch_size = 32 # Or 64 as per the spec

train_dataset = ChatbotDataset(train_df, tokenizer)
val_dataset = ChatbotDataset(val_df, tokenizer)


def _collate_with_padding(batch):
    """Bind the module-level tokenizer and pad ID to ``collate_fn``."""
    return collate_fn(batch, tokenizer, pad_id)


# Only the training loader shuffles; validation keeps dataframe order
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, collate_fn=_collate_with_padding)
val_loader = DataLoader(val_dataset, batch_size=batch_size,
                        shuffle=False, collate_fn=_collate_with_padding)

print("DataLoaders created successfully!")
# You can check a batch like this:
# src_batch, tgt_batch = next(iter(train_loader))
# print("Source batch shape:", src_batch.shape)
# print("Target batch shape:", tgt_batch.shape)

DataLoaders created successfully!


In [23]:
# --- 1. LOAD THE MODEL ---
# --- SETUP AND INITIALIZATION FOR TRAINING ---

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pad_id = tokenizer.token_to_id('<pad>')

# Fresh, untrained model
print("Initializing a new model for training...")
vocab_size = tokenizer.get_vocab_size()
model_hparams = dict(
    d_model=256,
    nhead=2,
    d_hid=1024,
    nlayers_encoder=2,
    nlayers_decoder=2,
)
model = TransformerModel(vocab_size=vocab_size, **model_hparams).to(device)

# Padding positions are excluded from the loss via ignore_index
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.98))

print("Setup complete. Ready for training.")

# --- 2. GREEDY DECODING FUNCTION ---

def generate_response(model, tokenizer, input_text, emotion, situation,
                        max_len=50, device="cuda"):
    """Generate a reply with greedy decoding.

    The situation and utterance are normalized, formatted into the training
    prompt layout, and the model is re-run on the growing target sequence,
    always appending the highest-scoring next token.

    Args:
        model: Model called as ``model(src=..., tgt=..., src_padding_mask=...,
            tgt_padding_mask=..., tgt_mask=...)``; it is switched to eval mode.
        tokenizer: Tokenizer providing ``token_to_id``, ``encode`` and ``decode``.
        input_text: Raw customer utterance.
        emotion: Emotion label inserted into the prompt as-is.
        situation: Raw situation description.
        max_len: Maximum target length, counting the initial ``<bos>``.
        device: Device on which the tensors are created.

    Returns:
        str: The decoded reply with special tokens removed.
    """
    prompt = (f"Emotion: {emotion} | Situation: {normalize_text(situation)} | "
              f"Customer: {normalize_text(input_text)} Agent:")

    bos_token_id = tokenizer.token_to_id('<bos>')
    eos_token_id = tokenizer.token_to_id('<eos>')

    # Source column vector: (seq_len, 1)
    prompt_ids = [bos_token_id] + tokenizer.encode(prompt).ids + [eos_token_id]
    src = torch.tensor(prompt_ids).unsqueeze(1).to(device)

    # The decoder input starts as just <bos> and grows by one token per step
    generated = [bos_token_id]

    model.eval()
    with torch.no_grad():
        for _ in range(max_len - 1):
            tgt = torch.tensor(generated).unsqueeze(1).to(device)  # (tgt_len, 1)
            causal_mask = nn.Transformer.generate_square_subsequent_mask(tgt.shape[0]).to(device)

            logits = model(src=src, tgt=tgt, src_padding_mask=None,
                           tgt_padding_mask=None, tgt_mask=causal_mask)

            # Greedy choice: arg-max over the logits of the last position
            next_token_id = logits[-1, 0, :].argmax().item()
            generated.append(next_token_id)

            # Stop as soon as the model emits <eos>
            if next_token_id == eos_token_id:
                break

    return tokenizer.decode(generated, skip_special_tokens=True)

# --- 3. EXAMPLE USAGE ---

# Hand-written example to smoke-test greedy decoding
test_situation = "I remember going to the fireworks with my best friend..."
test_emotion = "sentimental"
test_utterance = "This was a best friend. I miss her."

response = generate_response(
    model,
    tokenizer,
    input_text=test_utterance,
    emotion=test_emotion,
    situation=test_situation,
    device=device,
)

print(f"\nINPUT: {test_utterance}")
print(f"MODEL RESPONSE: {response}")

Initializing a new model for training...




Setup complete. Ready for training.

INPUT: This was a best friend. I miss her.
MODEL RESPONSE: budge jumping jumping jumping jumping jumping jumping jumping foot aint busiest :))) values render salespeople gates flash values render tales aint busiest youtube neckbeards spur narcissistic offguard huskies destroy rinsing talents growls visits rely seagull steroids unbearable winkler improtant taking persisted jumping etc :))) tottenham offguard huskies destroy bacteria


In [24]:

# NOTE: Ensure your trained model is already loaded and in eval mode.
# model.load_state_dict(...)
# model.eval()

def beam_search_decode(model, tokenizer, input_text, emotion, situation,
                         beam_width=5, max_len=50, device="cuda"):
    """Generate a reply with beam search.

    The source prompt is encoded once. At every step all unfinished beams are
    decoded together in a single batched decoder call (they always share the
    same length, because each live beam grows by exactly one token per step),
    and the ``beam_width`` best continuations by cumulative log-probability
    are kept. Beams ending in ``<eos>`` are carried over unchanged.

    Args:
        model: A model exposing ``embedding``, ``pos_encoder``,
            ``transformer.encoder`` / ``transformer.decoder``, ``generator``
            and ``d_model``; it is switched to eval mode.
        tokenizer: Tokenizer providing ``token_to_id``, ``encode`` and ``decode``.
        input_text: Raw customer utterance.
        emotion: Emotion label inserted into the prompt as-is.
        situation: Raw situation description.
        beam_width: Number of beams kept per step (capped at the vocabulary size).
        max_len: Maximum target length, counting the initial ``<bos>``.
        device: Device on which the tensors are created.

    Returns:
        str: The highest-scoring sequence, decoded with special tokens removed.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    # Preprocess and format the input string
    normalized_situation = normalize_text(situation)
    normalized_input = normalize_text(input_text)
    prompt = (f"Emotion: {emotion} | Situation: {normalized_situation} | "
              f"Customer: {normalized_input} Agent:")

    # Tokenize the source prompt
    bos_token_id = tokenizer.token_to_id('<bos>')
    eos_token_id = tokenizer.token_to_id('<eos>')
    src_tokens = [bos_token_id] + tokenizer.encode(prompt).ids + [eos_token_id]
    src = torch.tensor(src_tokens).unsqueeze(1).to(device)

    model.eval()
    with torch.no_grad():
        # Encode the source once: (src_len, 1, d_model)
        src_emb = model.embedding(src) * math.sqrt(model.d_model)
        src_emb = model.pos_encoder(src_emb)
        encoder_output = model.transformer.encoder(src_emb)

        # Beams are (sequence tensor, cumulative log-probability) tuples
        beams = [(torch.tensor([bos_token_id], device=device), 0.0)]

        for _ in range(max_len - 1):
            # Map beam index -> row in the batched decoder input for live beams
            live_rows = {}
            for beam_idx, (seq, _) in enumerate(beams):
                if seq[-1].item() != eos_token_id:
                    live_rows[beam_idx] = len(live_rows)
            if not live_rows:
                break

            # One decoder pass for all live beams: (cur_len, n_live)
            tgt_input = torch.stack([beams[idx][0] for idx in live_rows], dim=1)
            tgt_emb = model.embedding(tgt_input) * math.sqrt(model.d_model)
            tgt_emb = model.pos_encoder(tgt_emb)
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_input.size(0)).to(device)

            # Every beam attends to the same encoded source
            memory = encoder_output.expand(-1, tgt_input.size(1), -1)
            decoder_output = model.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

            # Log-probabilities of the next token for each live beam: (n_live, vocab)
            log_probs = F.log_softmax(model.generator(decoder_output[-1]), dim=-1)

            # topk raises if k exceeds the vocabulary size, so cap it
            k = min(beam_width, log_probs.size(-1))
            top_log_probs, top_indices = torch.topk(log_probs, k, dim=-1)
            top_log_probs = top_log_probs.tolist()

            # Rebuild candidates in the original beam order so ties resolve
            # the same way as when beams are expanded one at a time
            candidates = []
            for beam_idx, (seq, score) in enumerate(beams):
                row = live_rows.get(beam_idx)
                if row is None:
                    # Finished beam: carry over unchanged
                    candidates.append((seq, score))
                    continue
                for j in range(k):
                    new_seq = torch.cat([seq, top_indices[row, j:j + 1]], dim=0)
                    candidates.append((new_seq, score + top_log_probs[row][j]))

            # Keep the best `beam_width` candidates
            beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_width]

            # Stop early once every surviving beam has ended
            if all(b[0][-1].item() == eos_token_id for b in beams):
                break

    # The first beam has the highest cumulative log-probability
    best_seq, _ = beams[0]
    generated_text = tokenizer.decode(best_seq.tolist(), skip_special_tokens=True)
    return generated_text

# --- EXAMPLE USAGE ---

# Same example as the greedy demo, so the two decoders can be compared side by side
test_situation = "I remember going to the fireworks with my best friend..."
test_emotion = "sentimental"
test_utterance = "This was a best friend. I miss her."

# Greedy decoding first ...
greedy_response = generate_response(
    model, tokenizer, test_utterance, test_emotion, test_situation,
    device=device,
)

# ... then beam search with five beams
beam_response = beam_search_decode(
    model, tokenizer, test_utterance, test_emotion, test_situation,
    beam_width=5, device=device,
)

print(f"INPUT: {test_utterance}")
print("-" * 20)
print(f"GREEDY RESPONSE: {greedy_response}")
print(f"BEAM SEARCH RESPONSE (k=5): {beam_response}")

INPUT: This was a best friend. I miss her.
--------------------
GREEDY RESPONSE: budge jumping jumping jumping jumping jumping jumping jumping foot aint busiest :))) values render salespeople gates flash values render tales aint busiest youtube neckbeards spur narcissistic offguard huskies destroy rinsing talents growls visits rely seagull steroids unbearable winkler improtant taking persisted jumping etc :))) tottenham offguard huskies destroy bacteria
BEAM SEARCH RESPONSE (k=5): budge jumping jumping jumping jumping jumping jumping jumping jumping jumping jumping jumping jumping foot button onesie attends seen lotions laugh np buddies lotions laugh steaming jumbled truly jumping successfull bead brazil values joyed month aunt frosting solitude boil jumping overfull sneaking offguard aint busiest colored acclaim confirmed jumping overfull


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pad_id = tokenizer.token_to_id('<pad>')
batch_size = 32


def _collate_batch(b):
    """Bind the module-level tokenizer and pad ID to ``collate_fn``."""
    return collate_fn(b, tokenizer, pad_id)


# Datasets and loaders (only the training loader shuffles)
train_dataset = ChatbotDataset(train_df, tokenizer)
val_dataset = ChatbotDataset(val_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, collate_fn=_collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size,
                        shuffle=False, collate_fn=_collate_batch)

# Fresh model placed on the selected device
print(f"Using device: {device}")
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(
    vocab_size=vocab_size,
    d_model=256,
    nhead=2,
    d_hid=1024,
    nlayers_encoder=2,
    nlayers_decoder=2,
).to(device)

# Padding positions are excluded from the loss via ignore_index
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.98))

print("Setup complete. Ready for training.")

# --- 4. TRAINING & VALIDATION LOOPS ---

def train_one_epoch(model, loader, optimizer, criterion, device, pad_token_id):
    """Run one teacher-forced training pass over ``loader``.

    Args:
        model: Model called as ``model(src, tgt_input, src_padding_mask,
            tgt_padding_mask, tgt_mask)``.
        loader: Iterable of ``(src, tgt)`` batches in ``(seq_len, batch)`` layout.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to flattened logits and flattened targets.
        device: Device the batches are moved to.
        pad_token_id: Token ID treated as padding when building the masks.

    Returns:
        float: Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for src, tgt in tqdm(loader, desc="Training"):
        src = src.to(device)
        tgt = tgt.to(device)

        # Teacher forcing: feed tokens [0, n-1) and predict tokens [1, n)
        decoder_input, decoder_target = tgt[:-1, :], tgt[1:, :]

        # Causal mask plus (batch, seq_len) padding masks for nn.Transformer
        causal_mask = nn.Transformer.generate_square_subsequent_mask(decoder_input.shape[0]).to(device)
        src_padding_mask = (src == pad_token_id).transpose(0, 1)
        tgt_padding_mask = (decoder_input == pad_token_id).transpose(0, 1)

        logits = model(src, decoder_input, src_padding_mask, tgt_padding_mask, causal_mask)

        flat_logits = logits.reshape(-1, logits.shape[-1])
        flat_targets = decoder_target.reshape(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        # Gradients are cleared after the step, ready for the next batch
        optimizer.zero_grad()

        running_loss += loss.item()
    return running_loss / len(loader)


def validate_and_get_bleu(model, device, tokenizer):
    """Decode every row of the module-level ``val_df`` and return corpus BLEU.

    Args:
        model: Model handed to ``beam_search_decode``.
        device: Device forwarded to ``beam_search_decode`` so decoding runs
            where the model lives (previously the decoder's ``"cuda"`` default
            was always used, which fails on CPU-only machines).
        tokenizer: Tokenizer handed to ``beam_search_decode``.

    Returns:
        float: The ``score`` field of the sacreBLEU result.
    """
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        # Loop through the validation dataframe to generate predictions
        for index, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Calculating Val BLEU"):
            # Raw columns are passed on purpose: beam_search_decode normalizes
            # the utterance and situation itself.
            input_text = row['empathetic_dialogues']
            emotion = row['emotion']
            situation = row['Situation']
            # NOTE(review): predictions are decoded from normalized (lower-cased)
            # tokens while this reference is the raw 'labels' text; scoring
            # against 'labels_normalized' may be more consistent — confirm.
            ground_truth = row['labels']

            model_output = beam_search_decode(model, tokenizer, input_text, emotion,
                                              situation, device=device)

            predictions.append(model_output)
            references.append(ground_truth)

    # Calculate BLEU score
    sacrebleu_metric = evaluate.load("sacrebleu")

    # sacreBLEU takes a list of reference lists: one inner list per prediction,
    # here holding a single reference sentence.
    bleu_results = sacrebleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])

    return bleu_results['score']


# --- UPDATED MAIN TRAINING DRIVER ---
NUM_EPOCHS = 10
best_val_bleu = 0.0  # Best validation BLEU seen so far
model_save_path = '/kaggle/working/best_chatbot_model.pth'

epoch = 1
while epoch <= NUM_EPOCHS:
    print(f"\n--- Epoch {epoch}/{NUM_EPOCHS} ---")

    # One full pass over the training data
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device, pad_id)

    # Beam-search decode of the validation split, scored with BLEU
    val_bleu = validate_and_get_bleu(model, device, tokenizer)

    # Perplexity is the exponential of the mean training loss
    train_perplexity = math.exp(train_loss)

    print(f"Epoch {epoch}: Train Loss = {train_loss:.4f} | Train Perplexity = {train_perplexity:.2f} | Val BLEU = {val_bleu:.2f}")

    # Checkpoint only when validation BLEU strictly improves
    improved = val_bleu > best_val_bleu
    if improved:
        best_val_bleu = val_bleu
        torch.save(model.state_dict(), model_save_path)
        print(f"New best model saved with Val BLEU: {val_bleu:.2f}")

    epoch += 1

print("\nTraining complete.")

Using device: cuda
Setup complete. Ready for training.

--- Epoch 1/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.60it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [30:09<00:00,  3.57it/s]


Downloading builder script: 0.00B [00:00, ?B/s]

Epoch 1: Train Loss = 4.6927 | Train Perplexity = 109.14 | Val BLEU = 0.57
New best model saved with Val BLEU: 0.57

--- Epoch 2/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.64it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [23:13<00:00,  4.64it/s]


Epoch 2: Train Loss = 4.2220 | Train Perplexity = 68.17 | Val BLEU = 0.51

--- Epoch 3/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.66it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [24:37<00:00,  4.38it/s]


Epoch 3: Train Loss = 4.0918 | Train Perplexity = 59.85 | Val BLEU = 0.64
New best model saved with Val BLEU: 0.64

--- Epoch 4/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.55it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [30:18<00:00,  3.55it/s]


Epoch 4: Train Loss = 4.0051 | Train Perplexity = 54.87 | Val BLEU = 0.93
New best model saved with Val BLEU: 0.93

--- Epoch 5/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.84it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [22:29<00:00,  4.79it/s]


Epoch 5: Train Loss = 3.9417 | Train Perplexity = 51.51 | Val BLEU = 0.65

--- Epoch 6/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.73it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [20:39<00:00,  5.22it/s]


Epoch 6: Train Loss = 3.8909 | Train Perplexity = 48.95 | Val BLEU = 0.85

--- Epoch 7/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.39it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [23:59<00:00,  4.49it/s]


Epoch 7: Train Loss = 3.8492 | Train Perplexity = 46.96 | Val BLEU = 0.82

--- Epoch 8/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.82it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [20:18<00:00,  5.31it/s]


Epoch 8: Train Loss = 3.8148 | Train Perplexity = 45.37 | Val BLEU = 0.91

--- Epoch 9/10 ---


Training: 100%|██████████| 1616/1616 [00:57<00:00, 27.92it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [21:58<00:00,  4.90it/s]


Epoch 9: Train Loss = 3.7851 | Train Perplexity = 44.04 | Val BLEU = 0.90

--- Epoch 10/10 ---


Training: 100%|██████████| 1616/1616 [00:58<00:00, 27.79it/s]
Calculating Val BLEU: 100%|██████████| 6464/6464 [22:04<00:00,  4.88it/s]


Epoch 10: Train Loss = 3.7556 | Train Perplexity = 42.76 | Val BLEU = 0.95
New best model saved with Val BLEU: 0.95

Training complete.


## **Task 5**

In [26]:
predictions = []
references = []

# NOTE(review): `model` holds the weights from the last training epoch, which
# is not necessarily the checkpoint saved as best — confirm which one should
# be evaluated.

# Decode every row of the test split with beam search
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # Raw columns are passed on purpose: beam_search_decode normalizes them itself
    input_text = row['empathetic_dialogues']
    emotion = row['emotion']
    situation = row['Situation']
    ground_truth = row['labels']

    # Forward `device` explicitly; otherwise the decoder falls back to its
    # "cuda" default and fails on CPU-only machines.
    model_output = beam_search_decode(model, tokenizer, input_text, emotion,
                                      situation, device=device)

    predictions.append(model_output)
    references.append(ground_truth)


100%|██████████| 6464/6464 [22:02<00:00,  4.89it/s]


In [27]:
# Load all the required metric modules
sacrebleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
chrf_metric = evaluate.load("chrf")

# sacreBLEU and chrF take one list of references per prediction, so wrap each
# single reference in its own list (the same format the validation BLEU
# computation uses). ROUGE is given the flat list of reference strings.
nested_references = [[ref] for ref in references]

# Calculate the scores
bleu_results = sacrebleu_metric.compute(predictions=predictions, references=nested_references)
rouge_results = rouge_metric.compute(predictions=predictions, references=references)
chrf_results = chrf_metric.compute(predictions=predictions, references=nested_references)

print("\n--- Automatic Metric Results ---")
print(f"BLEU: {bleu_results['score']:.2f}")
print(f"ROUGE-L: {rouge_results['rougeL'] * 100:.2f}")
print(f"chrF: {chrf_results['score']:.2f}")

Downloading builder script: 0.00B [00:00, ?B/s]

TypeError: validate_params() got an unexpected keyword argument 'prefer_skip_nested_validation'

### **Human Evaluation**

In [None]:
# Number of test examples placed on the human-evaluation sheet
NUM_SAMPLES_TO_EVALUATE = 25

# Prompts come from the first rows of the test split; the references and
# predictions were collected in the same row order by the test loop above.
inputs = test_df['X'].head(NUM_SAMPLES_TO_EVALUATE).tolist()
ground_truth_replies = references[:NUM_SAMPLES_TO_EVALUATE]
model_generated_replies = predictions[:NUM_SAMPLES_TO_EVALUATE]

# Three text columns followed by three empty rating columns to fill in by hand
sheet_columns = {
    'Input_Prompt': inputs,
    'Ground_Truth_Reply': ground_truth_replies,
    'Model_Generated_Reply': model_generated_replies,
}
for rating_column in ('Fluency (1-5)', 'Relevance (1-5)', 'Adequacy (1-5)'):
    sheet_columns[rating_column] = ''
evaluation_df = pd.DataFrame(sheet_columns)

# Write the sheet to a CSV file, without the index column
evaluation_filename = '/kaggle/working/human_evaluation_sheet.csv'
evaluation_df.to_csv(evaluation_filename, index=False)

print(f"Human evaluation sheet saved to '{evaluation_filename}'")
print("Here are the first 5 rows to review:")

# Rich display of the first five rows
display(evaluation_df.head())