In [3]:
import pandas as pd

EXCEL_PATH = "/home/pushpit-saluja/ML/salujapushpit/conversationfile.xlsx"

# Load the Excel file
xls = pd.ExcelFile(EXCEL_PATH)
print("Available sheets:", xls.sheet_names)

# Load first sheet
df = pd.read_excel(EXCEL_PATH, sheet_name=0)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())


Available sheets: ['userAuserB']
Shape: (22, 4)
Columns: ['Conversation ID', 'Timestamp', 'Sender', 'Message']
   Conversation ID           Timestamp  Sender  \
0                1 2025-10-07 10:15:12  User B   
1                1 2025-10-07 10:15:45  User A   
2                1 2025-10-07 10:16:05  User B   
3                1 2025-10-07 10:16:38  User A   
4                1 2025-10-07 10:17:01  User B   

                                             Message  
0  "Hey, did you see the client's feedback on the...  
1  "Just saw it. They want a lot of changes to th...  
2  "Yeah, that's what I was thinking. It's a big ...  
3  "I'll start on the revisions. Can you update t...  
4  "Will do. I'll block out the rest of the week ...  


In [4]:
import re

# Standardize column names
df.columns = [c.strip().lower() for c in df.columns]

# Check essential columns
assert 'message' in df.columns, "Dataset must have a 'message' column."
assert 'sender' in df.columns, "Dataset must have a 'sender' column."

# Clean sender labels (normalize)
df['sender'] = df['sender'].astype(str).str.strip().str.lower()

# Detect A/B labels dynamically
senders = df['sender'].unique()
print("Detected senders:", senders)

# Try to detect who is User A and B
user_a = next((s for s in senders if 'a' in s), senders[0])
user_b = next((s for s in senders if 'b' in s), senders[-1])

print(f"→ Interpreted User A as: {user_a}, User B as: {user_b}")

# Sort by timestamp if available
if 'timestamp' in df.columns:
    df = df.sort_values('timestamp')

# Build (B → A) message pairs
pairs = []
for i in range(len(df) - 1):
    sender_now = df.iloc[i]['sender']
    sender_next = df.iloc[i + 1]['sender']
    if re.search(user_b, sender_now) and re.search(user_a, sender_next):
        pairs.append([df.iloc[i]['message'], df.iloc[i + 1]['message']])

pairs_df = pd.DataFrame(pairs, columns=['input_text', 'target_text'])
print("✅ Total pairs created:", len(pairs_df))
print(pairs_df.head(10))


Detected senders: ['user b' 'user a']
→ Interpreted User A as: user a, User B as: user b
✅ Total pairs created: 10
                                          input_text  \
0  "Hey, did you see the client's feedback on the...   
1  "Yeah, that's what I was thinking. It's a big ...   
2                          "Any plans for Saturday?"   
3   "Oh, the one near the park? I heard it's great."   
4                          "Sounds good! What time?"   
5                "Oh no. Did you try a hard reboot?"   
6  "Okay, try connecting it to an external monito...   
7  "Let me know if that works. If not, we might h...   
8   "Nice! What did you think? I loved the visuals."   
9  "I can see that. The ending felt a bit rushed....   

                                         target_text  
0  "Just saw it. They want a lot of changes to th...  
1  "I'll start on the revisions. Can you update t...  
2  "Not yet, was thinking of heading to the new b...  
3              "Yeah, that's the one. Want to jo

In [5]:
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import torch

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

class ChatDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_enc = self.tokenizer(
            self.df.iloc[idx]['input_text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        target_enc = self.tokenizer(
            self.df.iloc[idx]['target_text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        input_ids = input_enc['input_ids'].squeeze()
        labels = target_enc['input_ids'].squeeze()
        labels[labels == tokenizer.pad_token_id] = -100  # ignore pad tokens

        return {"input_ids": input_ids, "labels": labels}

dataset = ChatDataset(pairs_df, tokenizer)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

print("DataLoader created successfully with", len(dataset), "samples.")


  from .autonotebook import tqdm as notebook_tqdm


DataLoader created successfully with 10 samples.


In [7]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import AdamW
from tqdm.auto import tqdm

# Force CPU
device = torch.device("cpu")

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training settings
epochs = 2  # CPU is slow, keep small
model.train()

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    total_loss = 0
    for batch in loop:
        inputs = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} avg loss: {total_loss/len(loader):.4f}")

# Save model
torch.save(model.state_dict(), "ChatRec_Model.pt")
print("✅ Model saved successfully.")


  0%|          | 0/3 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Epoch 1: 100%|██████████| 3/3 [00:12<00:00,  4.25s/it, loss=7.45]


Epoch 1 avg loss: 9.2154


Epoch 2: 100%|██████████| 3/3 [00:11<00:00,  3.78s/it, loss=4.03]


Epoch 2 avg loss: 5.3603
✅ Model saved successfully.


In [8]:
from nltk.translate.bleu_score import sentence_bleu
import math, numpy as np

model.eval()

def generate_reply(prompt, max_len=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_len,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
            temperature=0.8
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Quick test
print("Test Response →", generate_reply("Hi, how are you?"))

# ---- BLEU & Perplexity ----
def evaluate_bleu(df, n=50):
    sample_df = df.sample(min(n, len(df)))
    scores = []
    for _, row in sample_df.iterrows():
        pred = generate_reply(row["input_text"])
        ref = [row["target_text"].split()]
        scores.append(sentence_bleu(ref, pred.split(), weights=(0.5, 0.5)))
    return np.mean(scores)

def evaluate_perplexity(loader):
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in loader:
            inputs = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            loss = model(inputs, labels=labels).loss
            losses.append(loss.item())
    return math.exp(np.mean(losses))

print("BLEU:", evaluate_bleu(pairs_df))
print("Perplexity:", evaluate_perplexity(loader))


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Test Response → Hi, how are you?


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU: 1.97476174763398e-155
Perplexity: 46.371679435196306


In [9]:
import joblib

joblib.dump(model.state_dict(), "Model.joblib")

with open("ReadMe.txt", "w") as f:
    f.write("Offline Chat-Reply Recommendation System\n")
    f.write("Model: DistilGPT-2 fine-tuned on chat data\n")
    f.write("Metrics: BLEU, Perplexity\n")
    f.write("Files: ChatRec_Model.pt, Model.joblib\n")

print("✅ Artifacts ready for submission.")


✅ Artifacts ready for submission.
