## **Library Imports**

In [104]:
import re
from pathlib import Path

import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

## **Task 1: Data Loading, Normalization, and Tokenizer Training**

In [105]:
# Fetch the latest version of the dataset; the call returns the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download(
    "atharvjairath/empathetic-dialogues-facebook-ai"
)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/empathetic-dialogues-facebook-ai


In [106]:
# Name of the CSV file inside the downloaded dataset directory.
DATASET_FILENAME = "emotion-emotion_69k.csv"

# Seed shared by both splits so the partition is reproducible.
RANDOM_STATE = 42

# Build the file location from the directory kagglehub actually returned
# instead of hardcoding the Kaggle mount point, so the notebook also runs
# where the download lands somewhere else. On Kaggle this resolves to the
# same path as before.
dataset_path = str(Path(path) / DATASET_FILENAME)

# Load the dataset into a pandas DataFrame
df = pd.read_csv(dataset_path)
print(f"Dataset loaded successfully. Total rows: {len(df)}")

# First, split into 80% train and 20% temporary (for val + test)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

# Next, split the 20% temporary set in half to get 10% validation and 10% test
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=RANDOM_STATE)

# Sanity check: the three splits must partition the full dataset.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "splits do not cover the dataset"

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Dataset loaded successfully. Total rows: 64636
Training set size: 51708
Validation set size: 6464
Test set size: 6464


In [107]:
def normalize_text(text):
    """Lower-case and clean one dialogue/situation string for tokenization.

    Steps, in order:
      1. Missing values (None, NaN, pd.NA) become the empty string.
      2. The value is converted to ``str``, lower-cased and stripped.
      3. A leading ``customer :`` / ``agent :`` speaker prefix is removed.
      4. A trailing ``agent :`` marker is removed.
      5. Each of ``? . ! ,`` is surrounded by spaces, then runs of
         whitespace are collapsed to a single space.

    Args:
        text: The raw cell value; any type is accepted.

    Returns:
        The normalized string (possibly empty).
    """
    if not isinstance(text, str):
        # str(None) / str(nan) would otherwise leak the literal words
        # "none" / "nan" into the vocabulary and the targets.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)

    # Strip first so the '^' anchor below is not defeated by leading whitespace.
    text = text.lower().strip()

    # Rule 1: Remove "customer :" or "agent :" from the BEGINNING of the string
    text = re.sub(r'^(customer|agent)\s*:\s*', '', text)

    # Rule 2: Remove "agent :" from the END of the string. The word boundary
    # keeps words that merely end in "agent" (e.g. "reagent:") intact.
    text = re.sub(r'\bagent\s*:\s*$', '', text).strip()

    # Rule 3: Handle punctuation and whitespace
    text = re.sub(r"([?.!,])", r" \1 ", text)
    return re.sub(r'\s+', ' ', text).strip()

In [108]:
# Apply the function to the correct columns in all three data splits
for df_split in [train_df, val_df, test_df]:
    df_split['Situation_normalized'] = df_split['Situation'].apply(normalize_text)
    df_split['empathetic_dialogues_normalized'] = df_split['empathetic_dialogues'].apply(normalize_text)
    df_split['labels_normalized'] = df_split['labels'].apply(normalize_text)

In [109]:
# Create a list of all text from the training set to build the vocabulary
corpus = list(train_df['empathetic_dialogues_normalized']) + \
         list(train_df['Situation_normalized']) + \
         list(train_df['labels_normalized'])

# We need an iterator for the tokenizer training
def corpus_iterator():
    for text in corpus:
        yield text

In [110]:
# Initialize a tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

# Define the trainer
trainer = WordLevelTrainer(special_tokens=["<pad>", "<bos>", "<eos>", "<unk>"])

# Train the tokenizer on your training data
tokenizer.train_from_iterator(corpus_iterator(), trainer=trainer)

vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

# Save the tokenizer for later use 
save_path = "/kaggle/working/my_tokenizer.json"
tokenizer.save(save_path)
print(f"Tokenizer saved to {save_path}")

Vocabulary size: 19342
Tokenizer saved to /kaggle/working/my_tokenizer.json


In [111]:
# Uses the in-memory `tokenizer`; it could instead be reloaded from disk with
# Tokenizer.from_file("my_tokenizer.json").

print("Special token IDs:")
for special_token in ("<unk>", "<pad>", "<bos>", "<eos>"):
    print(f"{special_token} ID: {tokenizer.token_to_id(special_token)}")

Special token IDs:
<unk> ID: 3
<pad> ID: 0
<bos> ID: 1
<eos> ID: 2


## **Task 2: Building Model Inputs (X) and Targets (Y)**

In [112]:
def create_input_string(row):
    """Build the model input prompt for one example.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) providing the keys
            'emotion', 'Situation_normalized' and
            'empathetic_dialogues_normalized'.

    Returns:
        A string of the form
        "Emotion: <emotion> | Situation: <situation> | Customer: <utterance> Agent:".
    """
    emotion, situation, customer_utterance = (
        row[key]
        for key in ('emotion', 'Situation_normalized', 'empathetic_dialogues_normalized')
    )
    return f"Emotion: {emotion} | Situation: {situation} | Customer: {customer_utterance} Agent:"

In [113]:
# Apply the function to create the 'X' and 'Y' columns for all data splits
for df_split in [train_df, val_df, test_df]:
    df_split['X'] = df_split.apply(create_input_string, axis=1)
    # The target 'Y' is the normalized agent's reply from the 'labels' column
    df_split['Y'] = df_split['labels_normalized']

print("X and Y columns created successfully.")

X and Y columns created successfully.


In [114]:
# Inspect a final example from the training set to verify
print("\n--- Example ---")
print("INPUT (X):")
print(train_df['X'].iloc[0])
print("\nTARGET (Y):")
print(train_df['Y'].iloc[0])


--- Example ---
INPUT (X):
Emotion: nostalgic | Situation: i had to go buy legos for my nephew the other day . makes me miss the days when my girls were young enough to play with them . | Customer: were you embarrassed or what happend ? Agent:

TARGET (Y):
no just this feeling overcame me that my kids just have outgrown this time .
