In [None]:
!pip install torch transformers pandas numpy scikit-learn nltk rouge-score tqdm

In [None]:
!python -c "import nltk; nltk.download('punkt'); nltk.download('wordnet')"

In [5]:
"""
Conversational Response Prediction System
==========================================
A complete pipeline for predicting User A's responses based on conversation history.

This system:
1. Preprocesses and tokenizes conversational data
2. Fine-tunes a GPT-2 model for response generation
3. Generates context-aware replies
4. Evaluates using BLEU, ROUGE, and Perplexity metrics
5. Provides deployment-ready components
"""

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2Config,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import re
from collections import defaultdict
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# For evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import math

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)


# ============================================================================
# STEP 1: DATA PREPROCESSING AND TOKENIZATION
# ============================================================================

class ConversationDataProcessor:
    """Handles data loading, preprocessing, and separation of user conversations."""

    def __init__(self, data):
        """
        Initialize with conversation data.

        Args:
            data: DataFrame or path to CSV/PDF containing conversation data
        """
        self.data = data
        self.user_a_conversations = []
        self.user_b_conversations = []

    def load_and_parse_data(self):
        """Parse the conversation data and separate by user."""
        if isinstance(self.data, str):
            # If data is a file path
            if self.data.endswith('.csv'):
                df = pd.read_csv(self.data)
            else:
                # Assuming PDF export of CSV
                df = pd.read_csv(self.data)
        else:
            df = self.data

        # Group by conversation ID
        conversations = df.groupby('Conversation ID')

        processed_conversations = []

        for conv_id, conv_data in conversations:
            conv_data = conv_data.sort_values('Timestamp')

            messages = []
            for _, row in conv_data.iterrows():
                messages.append({
                    'sender': row['Sender'],
                    'message': row['Message'],
                    'timestamp': row['Timestamp']
                })

            processed_conversations.append({
                'conversation_id': conv_id,
                'messages': messages
            })

        return processed_conversations

    def clean_message(self, message):
        """Clean and normalize message text."""
        # Remove quotes if present
        message = message.strip('"\'')
        # Normalize whitespace
        message = re.sub(r'\s+', ' ', message)
        return message.strip()

    def create_training_pairs(self, conversations):
        """
        Create (context, response) pairs for training.
        Context includes conversation history, response is User A's reply.
        """
        training_pairs = []

        for conv in conversations:
            messages = conv['messages']
            context_window = []

            for i, msg in enumerate(messages):
                sender = msg['sender']
                message = self.clean_message(msg['message'])

                if sender == 'User A' and len(context_window) > 0:
                    # This is User A's response - create training pair
                    context = ' [SEP] '.join(context_window)
                    response = message
                    training_pairs.append({
                        'context': context,
                        'response': response,
                        'full_text': f"{context} [RESPONSE] {response}"
                    })

                # Add current message to context
                context_window.append(f"{sender}: {message}")

                # Keep only last 5 messages for context (adjustable)
                if len(context_window) > 5:
                    context_window.pop(0)

        return training_pairs

    def separate_user_conversations(self, conversations):
        """Separate conversations by user for analysis."""
        user_a_msgs = []
        user_b_msgs = []

        for conv in conversations:
            for msg in conv['messages']:
                clean_msg = self.clean_message(msg['message'])
                if msg['sender'] == 'User A':
                    user_a_msgs.append(clean_msg)
                elif msg['sender'] == 'User B':
                    user_b_msgs.append(clean_msg)

        self.user_a_conversations = user_a_msgs
        self.user_b_conversations = user_b_msgs

        return user_a_msgs, user_b_msgs

    def save_separated_data(self, output_dir='./'):
        """Save separated user conversations to files."""
        pd.DataFrame({'messages': self.user_a_conversations}).to_csv(
            f'{output_dir}user_a_conversations.csv', index=False
        )
        pd.DataFrame({'messages': self.user_b_conversations}).to_csv(
            f'{output_dir}user_b_conversations.csv', index=False
        )
        print(f"Saved User A conversations: {len(self.user_a_conversations)} messages")
        print(f"Saved User B conversations: {len(self.user_b_conversations)} messages")


# ============================================================================
# STEP 2: CUSTOM DATASET FOR TRAINING
# ============================================================================

class ConversationDataset(Dataset):
    """PyTorch Dataset for conversation pairs."""

    def __init__(self, data_pairs, tokenizer, max_length=256):
        self.data_pairs = data_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data_pairs)

    def __getitem__(self, idx):
        pair = self.data_pairs[idx]

        # Encode the full text (context + response)
        encoding = self.tokenizer(
            pair['full_text'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        # For language modeling, labels are the same as input_ids
        labels = input_ids.clone()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


# ============================================================================
# STEP 3: MODEL TRAINING
# ============================================================================

class ConversationModelTrainer:
    """Handles model fine-tuning and training."""

    def __init__(self, model_name='gpt2', device=None):
        """
        Initialize trainer with model.

        Args:
            model_name: Pretrained model to use ('gpt2', 'gpt2-medium', etc.)
            device: Training device (cuda/cpu)
        """
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        print(f"Using device: {self.device}")

        # Load tokenizer and model
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Add special tokens
        special_tokens = {'additional_special_tokens': ['[SEP]', '[RESPONSE]']}
        self.tokenizer.add_special_tokens(special_tokens)

        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.model.to(self.device)

        print(f"Model loaded: {model_name}")
        print(f"Vocabulary size: {len(self.tokenizer)}")

    def train(self, train_dataset, val_dataset=None,
              epochs=5, batch_size=4, learning_rate=5e-5,
              warmup_steps=100, save_steps=500, output_dir='./model_checkpoints'):
        """
        Train the model.

        Args:
            train_dataset: Training dataset
            val_dataset: Validation dataset (optional)
            epochs: Number of training epochs
            batch_size: Training batch size
            learning_rate: Learning rate
            warmup_steps: Warmup steps for scheduler
            save_steps: Save checkpoint every N steps
            output_dir: Directory to save checkpoints
        """
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True
        )

        # Optimizer and scheduler
        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        # Training loop
        self.model.train()
        global_step = 0
        training_stats = []

        print(f"\nStarting training for {epochs} epochs...")
        print(f"Total steps: {total_steps}")

        for epoch in range(epochs):
            epoch_loss = 0
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

            for batch in progress_bar:
                # Move batch to device
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                epoch_loss += loss.item()

                # Backward pass
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                global_step += 1

                # Update progress bar
                progress_bar.set_postfix({'loss': loss.item()})

                # Save checkpoint
                if global_step % save_steps == 0:
                    checkpoint_path = f"{output_dir}/checkpoint-{global_step}"
                    self.save_model(checkpoint_path)

            avg_epoch_loss = epoch_loss / len(train_loader)
            print(f"Epoch {epoch+1} - Average Loss: {avg_epoch_loss:.4f}")

            training_stats.append({
                'epoch': epoch + 1,
                'avg_loss': avg_epoch_loss
            })

            # Validation
            if val_dataset:
                val_loss = self.evaluate(val_dataset, batch_size)
                print(f"Validation Loss: {val_loss:.4f}")
                training_stats[-1]['val_loss'] = val_loss

        return training_stats

    def evaluate(self, dataset, batch_size=4):
        """Evaluate model on dataset."""
        self.model.eval()
        eval_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        total_loss = 0

        with torch.no_grad():
            for batch in eval_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_loss += outputs.loss.item()

        self.model.train()
        return total_loss / len(eval_loader)

    def save_model(self, path):
        """Save model and tokenizer."""
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
        print(f"Model saved to {path}")

    def load_model(self, path):
        """Load model and tokenizer."""
        self.model = GPT2LMHeadModel.from_pretrained(path)
        self.tokenizer = GPT2Tokenizer.from_pretrained(path)
        self.model.to(self.device)
        print(f"Model loaded from {path}")


# ============================================================================
# STEP 4: RESPONSE GENERATION
# ============================================================================

class ResponseGenerator:
    """Generates responses using trained model."""

    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def generate_response(self, context, max_length=100,
                         num_return_sequences=1,
                         temperature=0.7,
                         top_k=50,
                         top_p=0.95,
                         do_sample=True):
        """
        Generate response given conversation context.

        Args:
            context: Conversation history
            max_length: Maximum length of generated response
            num_return_sequences: Number of responses to generate
            temperature: Sampling temperature
            top_k: Top-k sampling
            top_p: Nucleus sampling
            do_sample: Whether to use sampling
        """
        self.model.eval()

        # Format input with response prompt
        input_text = f"{context} [RESPONSE]"

        # Encode input
        input_ids = self.tokenizer.encode(
            input_text,
            return_tensors='pt'
        ).to(self.device)

        # Generate
        with torch.no_grad():
            output_sequences = self.model.generate(
                input_ids=input_ids,
                max_length=input_ids.shape[1] + max_length,
                num_return_sequences=num_return_sequences,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                do_sample=do_sample,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        # Decode responses
        responses = []
        for sequence in output_sequences:
            text = self.tokenizer.decode(sequence, skip_special_tokens=False)

            # Extract only the response part
            if '[RESPONSE]' in text:
                response = text.split('[RESPONSE]')[-1].strip()
                # Remove EOS token
                response = response.replace(self.tokenizer.eos_token, '').strip()
                responses.append(response)

        return responses


# ============================================================================
# STEP 5: EVALUATION METRICS
# ============================================================================

class ResponseEvaluator:
    """Evaluates generated responses using multiple metrics."""

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )
        self.smoothing = SmoothingFunction()

    def calculate_bleu(self, reference, hypothesis):
        """Calculate BLEU score."""
        reference_tokens = reference.split()
        hypothesis_tokens = hypothesis.split()

        # Calculate BLEU with smoothing
        bleu_score = sentence_bleu(
            [reference_tokens],
            hypothesis_tokens,
            smoothing_function=self.smoothing.method1
        )

        return bleu_score

    def calculate_rouge(self, reference, hypothesis):
        """Calculate ROUGE scores."""
        scores = self.rouge_scorer.score(reference, hypothesis)

        return {
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure
        }

    def calculate_perplexity(self, model, tokenizer, texts, device):
        """Calculate perplexity on a set of texts."""
        model.eval()
        total_loss = 0
        total_tokens = 0

        with torch.no_grad():
            for text in texts:
                encodings = tokenizer(
                    text,
                    return_tensors='pt',
                    truncation=True,
                    max_length=512
                )

                input_ids = encodings['input_ids'].to(device)

                outputs = model(input_ids, labels=input_ids)
                loss = outputs.loss

                total_loss += loss.item() * input_ids.shape[1]
                total_tokens += input_ids.shape[1]

        perplexity = math.exp(total_loss / total_tokens)
        return perplexity

    def evaluate_predictions(self, predictions, references):
        """
        Evaluate a set of predictions against references.

        Args:
            predictions: List of predicted responses
            references: List of reference responses

        Returns:
            Dictionary containing average metrics
        """
        bleu_scores = []
        rouge1_scores = []
        rouge2_scores = []
        rougeL_scores = []

        for pred, ref in zip(predictions, references):
            # BLEU
            bleu = self.calculate_bleu(ref, pred)
            bleu_scores.append(bleu)

            # ROUGE
            rouge = self.calculate_rouge(ref, pred)
            rouge1_scores.append(rouge['rouge1'])
            rouge2_scores.append(rouge['rouge2'])
            rougeL_scores.append(rouge['rougeL'])

        return {
            'bleu': np.mean(bleu_scores),
            'rouge1': np.mean(rouge1_scores),
            'rouge2': np.mean(rouge2_scores),
            'rougeL': np.mean(rougeL_scores)
        }


# ============================================================================
# MAIN PIPELINE
# ============================================================================

def main_pipeline(data_path):
    """
    Complete pipeline for training and evaluating the conversation model.

    Args:
        data_path: Path to conversation data CSV/PDF
    """

    print("=" * 80)
    print("CONVERSATIONAL RESPONSE PREDICTION SYSTEM")
    print("=" * 80)

    # Step 1: Load and preprocess data
    print("\n[STEP 1] Loading and preprocessing data...")

    # Read the uploaded data
    df = pd.read_csv(data_path)

    processor = ConversationDataProcessor(df)
    conversations = processor.load_and_parse_data()

    print(f"Loaded {len(conversations)} conversations")

    # Separate user conversations
    user_a_msgs, user_b_msgs = processor.separate_user_conversations(conversations)
    processor.save_separated_data()

    # Create training pairs
    training_pairs = processor.create_training_pairs(conversations)
    print(f"Created {len(training_pairs)} training pairs")

    # Split data
    train_pairs, test_pairs = train_test_split(
        training_pairs,
        test_size=0.2,
        random_state=42
    )
    train_pairs, val_pairs = train_test_split(
        train_pairs,
        test_size=0.1,
        random_state=42
    )

    print(f"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}")

    # Step 2: Initialize trainer and create datasets
    print("\n[STEP 2] Initializing model and creating datasets...")

    trainer = ConversationModelTrainer(model_name='gpt2')

    train_dataset = ConversationDataset(train_pairs, trainer.tokenizer)
    val_dataset = ConversationDataset(val_pairs, trainer.tokenizer)
    test_dataset = ConversationDataset(test_pairs, trainer.tokenizer)

    # Step 3: Train model
    print("\n[STEP 3] Training model...")

    training_stats = trainer.train(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        epochs=3,  # Adjust based on your needs
        batch_size=2,  # Adjust based on GPU memory
        learning_rate=5e-5
    )

    # Save final model
    trainer.save_model('./final_model')

    # Step 4: Generate responses
    print("\n[STEP 4] Generating responses on test set...")

    generator = ResponseGenerator(trainer.model, trainer.tokenizer, trainer.device)

    predictions = []
    references = []

    for pair in test_pairs[:10]:  # Test on first 10 examples
        context = pair['context']
        reference = pair['response']

        generated = generator.generate_response(
            context,
            max_length=50,
            temperature=0.7
        )

        if generated:
            predictions.append(generated[0])
            references.append(reference)

            print(f"\nContext: {context}")
            print(f"Reference: {reference}")
            print(f"Generated: {generated[0]}")
            print("-" * 80)

    # Step 5: Evaluate
    print("\n[STEP 5] Evaluating model...")

    evaluator = ResponseEvaluator()

    metrics = evaluator.evaluate_predictions(predictions, references)

    print("\n" + "=" * 80)
    print("EVALUATION RESULTS")
    print("=" * 80)
    print(f"BLEU Score: {metrics['bleu']:.4f}")
    print(f"ROUGE-1: {metrics['rouge1']:.4f}")
    print(f"ROUGE-2: {metrics['rouge2']:.4f}")
    print(f"ROUGE-L: {metrics['rougeL']:.4f}")

    # Calculate perplexity
    test_texts = [pair['full_text'] for pair in test_pairs]
    perplexity = evaluator.calculate_perplexity(
        trainer.model,
        trainer.tokenizer,
        test_texts,
        trainer.device
    )
    print(f"Perplexity: {perplexity:.4f}")

    print("\n" + "=" * 80)
    print("TRAINING COMPLETE")
    print("=" * 80)

    return trainer, generator, evaluator, metrics


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    # Run the complete pipeline
    # Replace with your actual data path
    data_path = "conversationfile.csv"  # or .pdf if exported as PDF

    trainer, generator, evaluator, metrics = main_pipeline(data_path)

    # Example: Generate response for new context
    print("\n\n" + "=" * 80)
    print("INTERACTIVE EXAMPLE")
    print("=" * 80)

    new_context = "User B: Hey, did you finish the report? [SEP] User A: Almost done, just need to add the charts."

    responses = generator.generate_response(
        new_context,
        num_return_sequences=3,
        temperature=0.8
    )

    print(f"\nContext: {new_context}")
    print("\nGenerated Responses:")
    for i, response in enumerate(responses, 1):
        print(f"{i}. {response}")

CONVERSATIONAL RESPONSE PREDICTION SYSTEM

[STEP 1] Loading and preprocessing data...
Loaded 4 conversations
Saved User A conversations: 11 messages
Saved User B conversations: 11 messages
Created 9 training pairs
Train: 6, Val: 1, Test: 2

[STEP 2] Initializing model and creating datasets...
Using device: cuda
Model loaded: gpt2
Vocabulary size: 50259

[STEP 3] Training model...

Starting training for 3 epochs...
Total steps: 9


Epoch 1/3: 100%|██████████| 3/3 [00:00<00:00,  4.88it/s, loss=7.45]


Epoch 1 - Average Loss: 8.3912
Validation Loss: 10.5800


Epoch 2/3: 100%|██████████| 3/3 [00:00<00:00,  5.55it/s, loss=7.77]


Epoch 2 - Average Loss: 8.1086
Validation Loss: 10.0677


Epoch 3/3: 100%|██████████| 3/3 [00:00<00:00,  5.39it/s, loss=7.03]


Epoch 3 - Average Loss: 7.6163
Validation Loss: 9.0668
Model saved to ./final_model

[STEP 4] Generating responses on test set...

Context: User A: Finally watched that new sci-fi movie everyone's talking about. [SEP] User B: Nice! What did you think? I loved the visuals.
Reference: Visuals were amazing, but the plot was a bit predictable for me.
Generated: User C: I thought the story was amazing. 𐼤𐽢𐽢𐽢𐽢𐽢𐽢𐽢𐽢�
--------------------------------------------------------------------------------

Context: User B: Hey, did you see the client's feedback on the mockups? [SEP] User A: Just saw it. They want a lot of changes to the color scheme. [SEP] User B: Yeah, that's what I was thinking. It's a big shift from the original brief.
Reference: I'll start on the revisions. Can you update the project timeline?
Generated: User B: Yeah. ㅋ ONSORED ㅋ ㅋㅋㅋㅋㅋㅋㅋㅋㅋ ㅋ ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ ㅋㅋㅋㅋㅋㅋㅋㅋㅋ ㅋ ㅋㅋㅋㅋㅋㅋㅋㅋㅋ ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ ㅋㅋㅋㅋ
--------------------------------------------------------------------------------

[STEP 5] E