In [13]:
import pandas as pd
# Load the TSV dataset
file_path = './dialogues_eda.tsv'
df = pd.read_csv(file_path, sep='\t')
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,question,answer
0,"Hi, How are you doing?",I am fine. How about yourself?
1,I am fine. How about yourself?,I am pretty good. Thanks for asking.
2,I am pretty good. Thanks for asking.,No problem. So how have you been?
3,No problem. So how have you been?,I have been great. What about you?
4,I have been great. What about you?,I have been good. I am in school right now.


In [14]:
from sklearn.model_selection import train_test_split

# Assuming your data is in a DataFrame called 'df'
# Split into train and validation sets (keeping question-answer pairs intact)
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

# Use ONLY the training data
questions = train_df['question'].tolist()
answers = train_df['answer'].tolist()

# Verify
print(f"Training samples: {len(questions)}")
print(f"First training pair:")
print(f"Q: {questions[0]}")
print(f"A: {answers[0]}")

Training samples: 2980
First training pair:
Q: Because it has great teachers.
A: What else?


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm

# Special tokens
STOP_TOKEN = '<end>'
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'

class Vocabulary:
    def __init__(self):
        self.word2idx = defaultdict(lambda: len(self.word2idx))
        self.idx2word = {}
        # Add special tokens first
        self.add_word(PAD_TOKEN)  # index 0
        self.add_word(UNK_TOKEN)  # index 1
        self.add_word(STOP_TOKEN)  # index 2
        
    def add_word(self, word):
        if word not in self.word2idx:
            idx = self.word2idx[word]
            self.idx2word[idx] = word

def build_vocabulary(texts):
    vocab = Vocabulary()
    for text in texts:
        for word in text.lower().split():
            vocab.add_word(word)
    return vocab

def text_to_sequence(text, vocab):
    return [vocab.word2idx.get(word, vocab.word2idx[UNK_TOKEN]) 
            for word in text.lower().split()]

# Sample dataset
data = {
    'Question': questions,
    'Answer': answers
}

# Build vocabulary
all_texts = data['Question'] + data['Answer']
vocab = build_vocabulary(all_texts)

# Convert and pad sequences with stop token
stop_token = vocab.word2idx[STOP_TOKEN]
pad_token = vocab.word2idx[PAD_TOKEN]

questions = [text_to_sequence(q, vocab) for q in data['Question']]
answers = [text_to_sequence(a, vocab) + [stop_token] for a in data['Answer']]

# Calculate max length with stop token
max_len = max(max(len(q) for q in questions), max(len(a) for a in answers))

def pad_sequence(seq, max_len):
    return seq + [pad_token] * (max_len - len(seq))

questions = [pad_sequence(seq, max_len) for seq in questions]
answers = [pad_sequence(seq, max_len) for seq in answers]

# Dataset and DataLoader
class DialogueDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.LongTensor(X)
        self.y = torch.LongTensor(y)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = DialogueDataset(questions, answers)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# LSTM Model with stop token handling
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.stop_token = stop_token
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        logits = self.fc(lstm_out)
        return logits

model = LSTMModel(len(vocab.word2idx))
criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0
    batch_progress = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False)
    for batch_X, batch_y in batch_progress:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs.view(-1, len(vocab.word2idx)), batch_y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    if (epoch+1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')

# Enhanced response generation with stop token
def generate_response(input_text):
    model.eval()
    sequence = text_to_sequence(input_text, vocab)
    padded = pad_sequence(sequence, max_len)
    input_tensor = torch.LongTensor(padded).unsqueeze(0)
    
    with torch.no_grad():
        outputs = model(input_tensor)
    
    response = []
    stop_token = vocab.word2idx[STOP_TOKEN]
    for step in range(max_len):
        probs = torch.softmax(outputs[0, step], dim=0)
        predicted = torch.multinomial(probs, 1).item()
        
        if predicted == stop_token:
            break
        if predicted != pad_token and predicted != stop_token:
            response.append(vocab.idx2word.get(predicted, UNK_TOKEN))
    
    return ' '.join(response)

# Test the model
test_input = "hi, how are you doing?"
print(f"Input: {test_input}")
print(f"Response: {generate_response(test_input)}")

                                                              

Input: hi, how are you doing?
Response: it sunday that going to pair




In [45]:
test_input = "hi, how are you doing?"
print(f"Input: {test_input}")
print(f"Response: {generate_response(test_input)}")

Input: hi, how are you doing?
Response: how two the to to funny-looking.


In [None]:
# !pip install rouge-score bert-score nltk
# !python -m nltk.downloader punkt

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Downloading absl_py-2.2.2-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml): started
  Building wheel for rouge-score (pyproject.toml): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=25027 sha256=e8d5f50efd25c233327b8a64185d35f8d54f9a8dc6ac6f2390ca2f74800225bd
  Stored in directory: c:\users\rk225\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef54687


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rk225\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
import numpy as np
import string
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from collections import Counter

In [32]:
def normalize_text(text):
    """Lowercase, remove punctuation, and strip whitespace."""
    text = text.strip().lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def exact_match(pred, true):
    """Compute Exact Match (case and punctuation insensitive)."""
    return int(normalize_text(pred) == normalize_text(true))

def compute_token_f1(pred, true):
    """Compute token-level F1 score with word counts (bag-of-words)."""
    pred_tokens = normalize_text(pred).split()
    true_tokens = normalize_text(true).split()

    pred_counter = Counter(pred_tokens)
    true_counter = Counter(true_tokens)
    common_tokens = pred_counter & true_counter
    num_common = sum(common_tokens.values())

    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens) if pred_tokens else 0.0
    recall = num_common / len(true_tokens) if true_tokens else 0.0

    if (precision + recall) == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

def compute_rouge_l(pred, true):
    """Compute ROUGE-L score."""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(true, pred)
    return scores['rougeL'].fmeasure

def compute_meteor(pred, true):
    """Compute METEOR score with tokenization."""
    pred_tokens = word_tokenize(pred.lower())
    true_tokens = word_tokenize(true.lower())
    return meteor_score([true_tokens], pred_tokens)

In [2]:
import torch
import torch.nn as nn
from transformers import BartForConditionalGeneration, BartConfig
from transformers.modeling_outputs import Seq2SeqLMOutput
from transformers.models.bart.modeling_bart import shift_tokens_right

class ERMBART(BartForConditionalGeneration):
    def __init__(self, config: BartConfig, k=10):
        super().__init__(config)
        self.d = config.d_model  # Hidden size (768 for base, 1024 for large)
        self.k = k  # Number of memory slots
        
        # Entailment Relation Memory (ERM)
        self.memory = nn.Parameter(torch.randn(k, self.d))  # [k, d]
        self.W_pi = nn.Linear(self.d, k)  # Memory attention weights
        
        # Initialize memory and projection layer
        nn.init.xavier_normal_(self.memory)
        nn.init.xavier_normal_(self.W_pi.weight)

    def compute_z(self, inputs_embeds: torch.Tensor) -> torch.Tensor:
        """Compute latent memory vector using mean pooling"""
        # inputs_embeds: [batch_size, seq_len, d]
        pooled = inputs_embeds.mean(dim=1)  # [batch_size, d]
        pi = torch.softmax(self.W_pi(pooled), dim=-1)  # [batch_size, k]
        z = torch.matmul(pi, self.memory)  # [batch_size, d]
        return z

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: torch.LongTensor = None,
        decoder_input_ids: torch.LongTensor = None,
        decoder_attention_mask: torch.LongTensor = None,
        labels: torch.LongTensor = None,
        **kwargs
    ) -> Seq2SeqLMOutput:
        # Encode inputs
        inputs_embeds = self.model.encoder.embed_tokens(input_ids)
        z = self.compute_z(inputs_embeds)  # [batch_size, d]

        # Handle decoder inputs for training
        if labels is not None:
            # Shift labels for autoregressive training
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        # Get decoder embeddings
        decoder_inputs_embeds = self.model.decoder.embed_tokens(decoder_input_ids)
        
        # Inject latent memory into first decoder token
        decoder_inputs_embeds[:, 0] += z  # [batch_size, d]

        # Forward through original BART architecture
        return super().forward(
            input_ids=None,  # We're using embeddings directly
            attention_mask=attention_mask,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
            **kwargs
        )

    def generate(self, input_ids: torch.LongTensor, **kwargs):
        """Fixed generation method with proper input handling"""
        # Compute encoder embeddings from input_ids
        inputs_embeds = self.model.encoder.embed_tokens(input_ids)
        z = self.compute_z(inputs_embeds)
        
        # Create initial decoder input with start token
        decoder_start = torch.full(
            (input_ids.size(0), 1),
            self.config.decoder_start_token_id,
            device=input_ids.device,
            dtype=torch.long
        )
        
        # Get decoder embeddings and inject memory
        decoder_inputs_embeds = self.model.decoder.embed_tokens(decoder_start)
        decoder_inputs_embeds[:, 0] += z
        
        # Generate using both encoder and decoder embeddings
        return super().generate(
            input_ids=input_ids,  # Maintain original input_ids
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            **kwargs
        )

In [3]:
# Tokenize input with proper padding
inputs = tokenizer(
    "What's your favorite food?", 
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=128
)

# Generate response
output = model.generate(
    input_ids=inputs.input_ids.to(device),
    attention_mask=inputs.attention_mask.to(device),
    max_length=50,
    num_beams=5
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

NameError: name 'tokenizer' is not defined