In [1]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    AdamW,
    get_linear_schedule_with_warmup
)
import joblib # For saving/loading as requested, though not ideal for PyTorch models
import os
from tqdm import tqdm

# --- For Evaluation (Conceptual) ---
from nltk.translate.bleu_score import sentence_bleu

# --- Set a seed for reproducibility ---
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print("Libraries imported successfully.")

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'AdamW' from 'transformers' (/home/pushpit-saluja/ML/venv/lib/python3.12/site-packages/transformers/__init__.py)

In [None]:
# --- Load the dataset ---
# In a real scenario, you would use the provided path.
# For this example, we'll use the content from the prompt.
# Let's simulate the file loading.
from io import StringIO

csv_content = """Conversation ID,Timestamp,Sender,Message
1,2025-10-07 10:15:12,User B,"Hey, did you see the client's feedback on the mockups?"
1,2025-10-07 10:15:45,User A,"Just saw it. They want a lot of changes to the color scheme."
1,2025-10-07 10:16:05,User B,"Yeah, that's what I was thinking. It's a big shift from the original brief."
1,2025-10-07 10:16:38,User A,"I'll start on the revisions. Can you update the project timeline?"
1,2025-10-07 10:17:01,User B,"Will do. I'll block out the rest of the week for it."
2,2025-10-07 10:20:19,User B,"Any plans for Saturday?"
2,2025-10-07 10:20:41,User A,"Not yet, was thinking of heading to the new bookstore in Swaroop Nagar."
2,2025-10-07 10:21:03,User B,"Oh, the one near the park? I heard it's great."
2,2025-10-07 10:21:25,User A,"Yeah, that's the one. Want to join?"
2,2025-10-07 10:21:39,User B,"Sounds good! What time?"
2,2025-10-07 10:22:00,User A,"How about around 3 PM?"
3,2025-10-07 10:25:05,User A,"Are you free? My laptop just went blank."
3,2025-10-07 10:25:21,User B,"Oh no. Did you try a hard reboot?"
3,2025-10-07 10:25:48,User A,"Tried it twice. Nothing."
3,2025-10-07 10:26:10,User B,"Okay, try connecting it to an external monitor. Maybe the display is the issue."
3,2025-10-07 10:26:33,User A,"Good idea, let me find a cable."
3,2025-10-07 10:26:59,User B,"Let me know if that works. If not, we might have to call IT."
4,2025-10-07 10:28:15,User A,"Finally watched that new sci-fi movie everyone's talking about."
4,2025-10-07 10:28:44,User B,"Nice! What did you think? I loved the visuals."
4,2025-10-07 10:29:11,User A,"Visuals were amazing, but the plot was a bit predictable for me."
4,2025-10-07 10:29:50,User B,"I can see that. The ending felt a bit rushed. Still a fun watch though."
4,2025-10-07 10:30:17,User A,"Definitely. Worth it just for the big screen experience."
"""

df = pd.read_csv(StringIO(csv_content))

# --- Define special tokens ---
USER_A_TOKEN = "<|userA|>"
USER_B_TOKEN = "<|userB|>"
END_OF_TEXT_TOKEN = "<|endoftext|>"

# --- Process the data ---
# We'll create a single string for each conversation instance that leads to a reply from User A.
# The format will be: <|userB|> message <|userA|> reply <|endoftext|>
training_samples = []
grouped = df.groupby("Conversation ID")

for _, group in grouped:
    conversation_history = ""
    messages = group.to_dict('records')
    
    for i in range(len(messages)):
        sender = messages[i]['Sender']
        message = messages[i]['Message'].strip()
        
        # We are looking for sequences where User B speaks, and then User A replies.
        # This pair becomes a training instance.
        if sender == 'User B' and i + 1 < len(messages) and messages[i+1]['Sender'] == 'User A':
            # Context is the current message from User B
            context_msg = messages[i]['Message'].strip()
            # Reply is the next message from User A
            reply_msg = messages[i+1]['Message'].strip()
            
            # Format: Previous history + current turn
            # For simplicity with this small dataset, we'll just use the immediate turn as context.
            # For a larger dataset, you'd include more of `conversation_history`.
            sample = f"{USER_B_TOKEN} {context_msg} {USER_A_TOKEN} {reply_msg} {END_OF_TEXT_TOKEN}"
            training_samples.append(sample)

        # Update conversation history (not used in the simple logic above, but good practice)
        token = USER_A_TOKEN if sender == 'User A' else USER_B_TOKEN
        conversation_history += f"{token} {message} "

print(f"Created {len(training_samples)} training samples.")
print("Example sample:")
print(training_samples[0])