In [1]:
import pandas as pd
# Read the tab-separated dialogue pairs into a DataFrame.
file_path = './dialogues_eda.tsv'
df = pd.read_csv(
    filepath_or_buffer=file_path,
    delimiter='\t',
)
# Preview the leading rows as a sanity check (rendered as the cell output).
df.head(n=5)

Unnamed: 0,question,answer
0,"Hi, How are you doing?",I am fine. How about yourself?
1,I am fine. How about yourself?,I am pretty good. Thanks for asking.
2,I am pretty good. Thanks for asking.,No problem. So how have you been?
3,No problem. So how have you been?,I have been great. What about you?
4,I have been great. What about you?,I have been good. I am in school right now.


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class QADataset(Dataset):
    """Question/answer pairs tokenized on the fly for seq2seq fine-tuning.

    Each item is a dict with:

    * ``input_ids`` / ``attention_mask`` -- the question wrapped as
      ``"<question> {question} <answer>"`` and tokenized for the encoder;
    * ``labels`` -- the tokenized answer, with every position equal to the
      tokenizer's ``pad_token_id`` replaced by ``-100``.

    Args:
        questions: Indexable, sized collection of question strings.
        answers: Indexable, sized collection of answer strings, aligned
            position-by-position with ``questions``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
            It must return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors carrying a leading batch dimension
            of size 1, and expose ``pad_token_id``.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``questions`` and ``answers`` differ in length.
    """

    def __init__(self, questions, answers, tokenizer, max_length):
        # Fail at construction time rather than with an IndexError in the
        # middle of an epoch when the two collections are misaligned.
        if len(questions) != len(answers):
            raise ValueError(
                f"questions and answers must have the same length "
                f"(got {len(questions)} and {len(answers)})"
            )
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.questions)

    def _tokenize(self, text):
        """Tokenize ``text`` to a fixed-length, batch-of-one tensor mapping."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the encoder inputs and masked labels for pair ``idx``."""
        # Format input with special tokens
        input_text = f"<question> {self.questions[idx]} <answer>"
        target_text = self.answers[idx]

        encoder_inputs = self._tokenize(input_text)
        decoder_inputs = self._tokenize(target_text)

        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_length == 1.
        # clone() keeps the in-place masking below from writing into the
        # tokenizer's own output tensor.
        labels = decoder_inputs["input_ids"].squeeze(0).clone()
        # Replace padding token IDs with -100 for loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": encoder_inputs["input_ids"].squeeze(0),
            "attention_mask": encoder_inputs["attention_mask"].squeeze(0),
            "labels": labels
        }

In [4]:
# Load the pretrained BART tokenizer and seq2seq model from the same checkpoint.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Register the prompt markers as special tokens so the tokenizer keeps each
# of them as a single token.
marker_tokens = ["<question>", "<answer>"]
tokenizer.add_special_tokens({"additional_special_tokens": marker_tokens})

# Grow the embedding matrix to cover the enlarged vocabulary. This call is the
# last expression of the cell, so its return value is shown as the output.
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


BartScaledWordEmbedding(50267, 768, padding_idx=1)

In [5]:
def encode_data(sentence):
    """Convert ``sentence`` to token ids with the notebook-level ``tokenizer``.

    The text is first split into tokens with ``tokenizer.tokenize``; that
    token list is then handed to ``tokenizer.encode``.

    Args:
        sentence: Text to encode.

    Returns:
        Whatever ``tokenizer.encode`` returns for the token list.
    """
    return tokenizer.encode(tokenizer.tokenize(sentence))

In [6]:
# Run encode_data on every cell of the frame, then preview the encoded rows.
df_ids = df.map(func=encode_data)
df_ids.head(n=5)

Unnamed: 0,question,answer
0,"[0, 30086, 6, 1336, 32, 47, 608, 116, 2]","[0, 100, 524, 2051, 4, 1336, 59, 2512, 116, 2]"
1,"[0, 100, 524, 2051, 4, 1336, 59, 2512, 116, 2]","[0, 100, 524, 1256, 205, 4, 4557, 13, 1996, 4, 2]"
2,"[0, 100, 524, 1256, 205, 4, 4557, 13, 1996, 4, 2]","[0, 3084, 936, 4, 407, 141, 33, 47, 57, 116, 2]"
3,"[0, 3084, 936, 4, 407, 141, 33, 47, 57, 116, 2]","[0, 100, 33, 57, 372, 4, 653, 59, 47, 116, 2]"
4,"[0, 100, 33, 57, 372, 4, 653, 59, 47, 116, 2]","[0, 100, 33, 57, 205, 4, 38, 524, 11, 334, 235..."


In [7]:
from sklearn.model_selection import train_test_split

# One dict per dialogue row, built from the raw-text frame `df`
# (not the token-id frame `df_ids`).
data_list = df.to_dict('records')

# Hold out 20% of the records for validation; the fixed random_state keeps
# the split repeatable across runs.
train_data, valid_data = train_test_split(
    data_list,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Illustrative shape of the four lists built below:
# train_questions = ["What is your name?", "Where are you from?"]
# train_answers = ["My name is XYZ.", "I am from ABC."]
# val_questions = ["What's your purpose?"]
# val_answers = ["To assist you."]

from tqdm.auto import tqdm

# Walk each split once (with a progress bar) collecting (question, answer)
# pairs, then separate the pairs into two parallel lists.
train_pairs = [
    (record['question'], record['answer'])
    for record in tqdm(train_data, desc="Training Datset preparation")
]
train_questions = [question for question, _ in train_pairs]
train_answers = [answer for _, answer in train_pairs]

val_pairs = [
    (record['question'], record['answer'])
    for record in tqdm(valid_data, desc="Valid Datset preparation")
]
val_questions = [question for question, _ in val_pairs]
val_answers = [answer for _, answer in val_pairs]

Training Datset preparation: 100%|██████████| 2980/2980 [00:00<?, ?it/s]
Valid Datset preparation: 100%|██████████| 745/745 [00:00<?, ?it/s]


In [9]:
# Sequence length passed to QADataset and number of examples per batch.
max_length = 64
batch_size = 2

# Wrap each split so items are tokenized lazily when indexed.
train_dataset = QADataset(
    questions=train_questions,
    answers=train_answers,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = QADataset(
    questions=val_questions,
    answers=val_answers,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [10]:
# Fine-tune the model with early stopping on validation loss.
# Each epoch: one pass over train_dataloader, one pass over val_dataloader.
# The weights with the lowest validation loss are saved to "best_model.pth"
# and restored into `model` once training ends.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 5
best_loss = float('inf')
for epoch in range(num_epochs):
    # Training
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader,desc="training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients *before* the forward/backward pass so that stale
        # gradients left behind by an interrupted earlier run of this cell
        # cannot leak into the first update.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}")

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader,desc="Validating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    # Report the validation loss before deciding whether to stop, so the
    # epoch that triggers early stopping is logged too.
    print(f"Epoch {epoch + 1}, Val Loss: {avg_val_loss}\n")
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pth")
    else:
        # Validation loss stopped improving: stop at the first such epoch.
        print("Over fitting")
        break

# After early stopping the in-memory weights belong to the worse, final epoch;
# reload the best checkpoint so later cells use the best model. The guard
# skips this when no checkpoint was written during this run.
if best_loss < float('inf'):
    model.load_state_dict(torch.load("best_model.pth", map_location=device))

training:   0%|          | 7/1490 [00:21<1:17:10,  3.12s/it]


KeyboardInterrupt: 

In [11]:
# Generate one answer per validation question; the validation answers serve
# as the references the predictions are compared against later.
questions, actual_answers = val_questions, val_answers
predicted_answers = []

model.eval()
for question in tqdm(questions, desc="Predicting Answers"):
    # Wrap the question in the same prompt markers used during training.
    prompt_ids = tokenizer.encode(
        f"<question> {question} <answer>",
        return_tensors="pt",
    ).to(device)

    # Generate up to 64 tokens and decode the first (only) returned sequence.
    generated = model.generate(prompt_ids, max_length=64)
    predicted_answers.append(
        tokenizer.decode(generated[0], skip_special_tokens=True)
    )

# predicted_answers[i] now lines up with actual_answers[i].


Predicting Answers:   1%|▏         | 10/745 [00:13<16:13,  1.32s/it]


KeyboardInterrupt: 

## Evaluating the model

In [44]:
# !pip install rouge-score bert-score nltk
# !python -m nltk.downloader punkt

In [45]:
import numpy as np
import string
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from collections import Counter

In [46]:
def normalize_text(text):
    """Lowercase, remove punctuation, and normalize whitespace.

    Whitespace is normalized *after* punctuation is removed: leading and
    trailing whitespace is dropped and internal runs collapse to a single
    space. Stripping first would leave stray spaces behind for inputs such
    as ``"Hello !"``, making otherwise-equal answers compare unequal.

    Args:
        text: String to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    without_punct = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    # split() with no argument discards empty pieces, so joining on a single
    # space both trims the ends and collapses internal whitespace runs.
    return " ".join(without_punct.split())

def exact_match(pred, true):
    """Return 1 if ``pred`` and ``true`` are equal after ``normalize_text``, else 0."""
    normalized_pred = normalize_text(pred)
    normalized_true = normalize_text(true)
    return 1 if normalized_pred == normalized_true else 0

def compute_token_f1(pred, true):
    """Compute token-level F1 between a prediction and a reference.

    Both strings are passed through ``normalize_text`` and split on
    whitespace; overlap is counted as a multiset (bag-of-words), so repeated
    tokens only match as many times as they occur on both sides.

    Args:
        pred: Predicted answer string.
        true: Reference answer string.

    Returns:
        F1 as a float in [0.0, 1.0]. If either side has no tokens after
        normalization, the score is 1.0 when both are empty and 0.0
        otherwise, which keeps this metric consistent with ``exact_match``.
    """
    pred_tokens = normalize_text(pred).split()
    true_tokens = normalize_text(true).split()

    # Empty-side guard: precision/recall are undefined here, so fall back to
    # "do the (empty) token lists agree?".
    if not pred_tokens or not true_tokens:
        return float(pred_tokens == true_tokens)

    # Counter intersection keeps the minimum count of each shared token.
    overlap = Counter(pred_tokens) & Counter(true_tokens)
    num_common = sum(overlap.values())
    if num_common == 0:
        return 0.0

    # Both lists are non-empty and share a token, so neither division can
    # hit zero and precision + recall is strictly positive.
    precision = num_common / len(pred_tokens)
    recall = num_common / len(true_tokens)
    return 2 * (precision * recall) / (precision + recall)

def compute_rouge_l(pred, true):
    """Return the ROUGE-L F-measure of ``pred`` scored against ``true``.

    A ``RougeScorer`` limited to the 'rougeL' metric, with stemming enabled,
    is created on every call; ``true`` is passed as the first argument to
    ``score`` and ``pred`` as the second.
    """
    rouge_l_only = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    result = rouge_l_only.score(true, pred)['rougeL']
    return result.fmeasure

def compute_meteor(pred, true):
    """Return the METEOR score of ``pred`` against the single reference ``true``.

    Both strings are lowercased and tokenized with ``word_tokenize`` before
    being handed to ``meteor_score``, which receives the reference wrapped in
    a one-element list.
    """
    hypothesis = word_tokenize(pred.lower())
    reference = word_tokenize(true.lower())
    return meteor_score([reference], hypothesis)

In [47]:
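# One-time setup: uncomment and run on a fresh environment. The METEOR helper
# above tokenizes with nltk's word_tokenize and scores with meteor_score, which
# rely on the 'punkt_tab' and 'wordnet' resources downloaded here.
# import nltk
# nltk.download('punkt_tab')
# nltk.download('wordnet')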

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
# Score predicted_answers against actual_answers with five metrics and print
# the corpus-level averages.
#
# Example data (replace with your actual and predicted answers)
# actual_answers = ["The capital of France is Paris.", "Einstein developed the theory of relativity."]
# predicted_answers = ["Paris is the capital of France.", "Einstein's theory of relativity was groundbreaking."]

# zip() below would silently drop the unmatched tail if the prediction cell
# was interrupted part-way, so the averages would describe only a subset.
# Fail loudly up front instead.
if not predicted_answers:
    raise ValueError("predicted_answers is empty; run the prediction cell first.")
if len(predicted_answers) != len(actual_answers):
    raise ValueError(
        f"Got {len(predicted_answers)} predictions for "
        f"{len(actual_answers)} references; re-run the prediction cell to "
        "completion before evaluating."
    )

# Initialize metric lists
em_scores, f1_scores, rouge_scores, meteor_scores = [], [], [], []

for pred, true in zip(predicted_answers, actual_answers):
    # Exact Match
    em_scores.append(exact_match(pred, true))

    # Token F1
    f1_scores.append(compute_token_f1(pred, true))

    # ROUGE-L
    rouge_scores.append(compute_rouge_l(pred, true))

    # METEOR
    meteor_scores.append(compute_meteor(pred, true))

# BERTScore (computes for all pairs at once)
P, R, F1 = bert_score(predicted_answers, actual_answers, lang='en')
bert_score_f1 = np.mean(F1.numpy())  # Convert tensor to numpy array if needed

# Compute averages
metrics = {
    "Exact Match": np.mean(em_scores),
    "Token F1": np.mean(f1_scores),
    "ROUGE-L": np.mean(rouge_scores),
    "BERTScore F1": bert_score_f1,
    "METEOR": np.mean(meteor_scores)
}

# Print results
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Exact Match: 0.0081
Token F1: 0.1677
ROUGE-L: 0.1667
BERTScore F1: 0.8733
METEOR: 0.1615


## Infer with the Model

In [None]:
# Interactive question-answering loop: read a question from stdin, generate
# an answer with the fine-tuned model, print both, and repeat until the user
# types "exit".

# Put the model in evaluation mode once; nothing in the loop changes the mode.
model.eval()
while True:
    # Ask the user for a question. Surrounding whitespace is dropped so that
    # input such as "exit " still quits.
    question = input("Enter your question (or type 'exit' to quit): ").strip()
    if question.lower() == "exit":
        break
    if not question:
        # Blank line: there is nothing to answer, so prompt again.
        continue

    # Prepare input with the same prompt markers used during training
    input_text = f"<question> {question} <answer>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate answer
    output = model.generate(input_ids, max_length=64)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print question and answer
    print(f"Q: {question}\nA: {answer}\n")
