In [1]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report, f1_score, accuracy_score
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import matplotlib.pyplot as plt
from feature_extractor import FeatureExtractor

def read_jsonl(file_path):
    """Read records from a JSON Lines file.

    Every non-blank line of ``file_path`` is parsed with ``json.loads`` and
    the parsed values are returned in file order. Blank or whitespace-only
    lines (for example a stray empty line in the middle or at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        list: One parsed JSON value per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # `line.strip()` is empty for blank lines, which json.loads rejects.
        return [json.loads(line) for line in file if line.strip()]

def extract_features_from_messages(data):
    """Build one feature mapping per entry in ``data``.

    For each entry, ``FeatureExtractor.extract_features`` is called on
    ``entry['message']`` and the result is then updated in place with a fixed
    set of metadata fields copied verbatim from the entry.

    Args:
        data: Iterable of mappings; each must contain ``'message'`` and every
            key listed in ``metadata_keys`` below (a missing key raises
            ``KeyError``).

    Returns:
        list: The per-entry feature mappings, in input order.
    """
    # Fields copied straight from the raw entry into the feature mapping.
    metadata_keys = (
        'receiver_annotation',
        'sender_annotation',
        'speaker',
        'receiver',
        'absolute_message_index',
        'relative_message_index',
        'season',
        'year',
        'game_score',
        'game_score_delta',
        'game_id',
    )
    extractor = FeatureExtractor()
    rows = []
    for record in data:
        # Text-derived features first (presumably a dict; it must support .update).
        row = extractor.extract_features(record['message'])
        # Metadata values overwrite any extracted feature with the same key.
        row.update({key: record[key] for key in metadata_keys})
        rows.append(row)
    return rows

def create_dataframe(features_list):
    """Turn a list of feature dictionaries into a pandas DataFrame.

    Args:
        features_list: List of dictionaries, one per row.

    Returns:
        pandas.DataFrame: The result of ``pd.DataFrame(features_list)``.
    """
    frame = pd.DataFrame(features_list)
    return frame

def process_jsonl_to_dataframe(file_path):
    """Load a JSONL file and return its extracted features as a DataFrame.

    Chains the three module-level helpers: ``read_jsonl`` ->
    ``extract_features_from_messages`` -> ``create_dataframe``.

    Args:
        file_path: Path to the JSONL file to process.

    Returns:
        pandas.DataFrame: One row per entry in the file.
    """
    return create_dataframe(
        extract_features_from_messages(read_jsonl(file_path))
    )

class DataProcessor:
    """Stateless helpers for loading JSONL data and preparing text inputs.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def read_jsonl(file_path):
        """Read records from a JSON Lines file.

        Blank or whitespace-only lines are skipped rather than passed to
        ``json.loads`` (which would raise ``json.JSONDecodeError``).

        Args:
            file_path: Path to a UTF-8 encoded JSONL file.

        Returns:
            list: One parsed JSON value per non-blank line, in file order.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            return [json.loads(line) for line in file if line.strip()]

    @staticmethod
    def preprocess_messages(messages, tokenizer, vocab):
        """Tokenize each message and map its tokens through ``vocab``.

        Args:
            messages: Iterable of message strings.
            tokenizer: Callable applied to each message; its result is passed
                to ``vocab``.
            vocab: Callable applied to the tokenizer output of each message.

        Returns:
            list: ``vocab(tokenizer(message))`` for every message, in order.
        """
        return [vocab(tokenizer(message)) for message in messages]

    @staticmethod
    def load_glove_embeddings(glove_path):
        """Load whitespace-separated word vectors from a text file.

        Each non-blank line is split on whitespace; the first field is the
        word and the remaining fields are parsed as a float32 vector. Blank
        lines are skipped (previously they raised ``IndexError``).

        Args:
            glove_path: Path to a UTF-8 encoded embeddings file.

        Returns:
            dict: Mapping from word to ``numpy.ndarray`` of dtype float32.
        """
        embeddings_index = {}
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Blank line: nothing to index.
                    continue
                embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
        return embeddings_index

    @staticmethod
    def generate_embeddings(vocab, embeddings_index, embedding_dim=50):
        """Build an embedding matrix aligned with the vocabulary indices.

        Rows for words missing from ``embeddings_index`` stay all-zero.

        Args:
            vocab: Object supporting ``len()`` and ``get_stoi()`` (a mapping
                from token to row index).
            embeddings_index: Mapping from word to vector, as returned by
                ``load_glove_embeddings``.
            embedding_dim: Number of columns in the matrix; must match the
                length of the vectors in ``embeddings_index``.

        Returns:
            torch.Tensor: float32 tensor of shape ``(len(vocab), embedding_dim)``.
        """
        embedding_matrix = np.zeros((len(vocab), embedding_dim))
        for word, idx in vocab.get_stoi().items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[idx] = embedding_vector
        return torch.tensor(embedding_matrix, dtype=torch.float32)

    @staticmethod
    def process_jsonl_to_dataframe(file_path):
        """Load a JSONL file into a DataFrame with one row per record.

        Unlike the module-level function of the same name, this does not run
        feature extraction; the raw records are passed to ``pd.DataFrame``.
        """
        data = DataProcessor.read_jsonl(file_path)
        return pd.DataFrame(data)
    
    
class CustomDataset(Dataset):
    """Dataset yielding ``(sequence, features, label)`` triples by index.

    The three containers are stored as given and indexed in parallel; the
    dataset length is taken from ``labels``.
    """

    def __init__(self, sequences, features, labels):
        """Store the three parallel, index-aligned containers."""
        self.sequences, self.features, self.labels = sequences, features, labels

    def __len__(self):
        """Return the number of examples (the length of ``labels``)."""
        n_examples = len(self.labels)
        return n_examples

    def __getitem__(self, idx):
        """Return the ``(sequence, features, label)`` triple at ``idx``."""
        example = (self.sequences[idx], self.features[idx], self.labels[idx])
        return example

def collate_fn(batch):
    """Collate ``(sequence, features, label)`` triples into batch tensors.

    Sequences are right-padded with 0 to the longest sequence in the batch
    (batch dimension first); features and labels are stacked along a new
    leading dimension.

    Args:
        batch: List of ``(sequence, features, label)`` tensor triples.

    Returns:
        tuple: ``(padded_sequences, stacked_features, stacked_labels)``.
    """
    seqs, feats, targets = zip(*batch)
    # NOTE(review): the vocab built later in this notebook lists its specials as
    # ['<unk>', '<pad>'], so index 0 is presumably '<unk>' rather than '<pad>' -
    # confirm that padding with 0 is intended.
    padded = pad_sequence(seqs, batch_first=True, padding_value=0)
    return padded, torch.stack(feats), torch.stack(targets)

class CombinedModel(nn.Module):
    """Mean-pooled frozen embeddings concatenated with extra features.

    The pooled embedding and the feature vector are concatenated and passed
    through two linear layers with no activation in between.
    """

    def __init__(self, embedding_matrix, num_features, hidden_dim, output_dim):
        """Create the layers.

        Args:
            embedding_matrix: 2-D tensor of pretrained embeddings; its second
                dimension is the embedding size. The weights are frozen.
            num_features: Width of the extra feature vector per example.
            hidden_dim: Output width of the first linear layer.
            output_dim: Output width of the final linear layer.
        """
        super().__init__()
        embedding_dim = embedding_matrix.size(1)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.fc1 = nn.Linear(embedding_dim + num_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, seq, features):
        """Return the raw (pre-sigmoid) outputs for a batch.

        Args:
            seq: Integer token indices, one row per example.
            features: Extra features, one row per example.
        """
        # Average over the token dimension; padded positions are included.
        pooled = torch.mean(self.embedding(seq), dim=1)
        joined = torch.cat([pooled, features], dim=1)
        return self.fc2(self.fc1(joined))

class Trainer:
    """Training and evaluation loops for the binary classifiers in this notebook.

    Model outputs are treated as logits: predictions are obtained by applying
    a sigmoid and thresholding at 0.5.
    """

    @staticmethod
    def train_model(model, train_loader, val_loader, test_loader, criterion, optimizer, num_epochs=10):
        """Train ``model`` and record per-epoch loss, accuracy and macro F1.

        After every epoch the metrics for the train and validation sets are
        printed, followed by a classification report on ``test_loader``.

        Args:
            model: Module called as ``model(seq, features)``.
            train_loader: Iterable of ``(seq, features, labels)`` training batches.
            val_loader: Iterable of ``(seq, features, labels)`` validation batches.
            test_loader: Iterable of batches used only for the per-epoch report.
            criterion: Loss called as ``criterion(logits, labels)``.
            optimizer: Optimizer stepping the model parameters.
            num_epochs: Number of passes over ``train_loader``.

        Returns:
            tuple: ``(model, train_losses, val_losses, train_accuracies,
            val_accuracies, train_f1s, val_f1s)`` where each list has one
            entry per epoch.
        """
        model.train()
        train_losses, val_losses = [], []
        train_accuracies, val_accuracies = [], []
        train_f1s, val_f1s = [], []

        for epoch in range(num_epochs):
            # Training
            model.train()
            epoch_train_loss = 0
            train_true_labels, train_predicted_labels = [], []

            for seq, features, labels in train_loader:
                optimizer.zero_grad()
                # squeeze(-1), not squeeze(): a bare squeeze() also drops the batch
                # dimension when a batch holds a single sample, producing a 0-d
                # tensor that no longer matches `labels` of shape (1,).
                logits = model(seq, features).squeeze(-1)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

                epoch_train_loss += loss.item()
                predicted = (torch.sigmoid(logits) > 0.5).float()
                train_true_labels.extend(labels.tolist())
                train_predicted_labels.extend(predicted.tolist())

            train_losses.append(epoch_train_loss / len(train_loader))
            train_accuracies.append(accuracy_score(train_true_labels, train_predicted_labels))
            train_f1s.append(f1_score(train_true_labels, train_predicted_labels, average='macro'))

            # Validation
            model.eval()
            epoch_val_loss = 0
            val_true_labels, val_predicted_labels = [], []

            with torch.no_grad():
                for seq, features, labels in val_loader:
                    logits = model(seq, features).squeeze(-1)
                    loss = criterion(logits, labels)
                    epoch_val_loss += loss.item()
                    predicted = (torch.sigmoid(logits) > 0.5).float()
                    val_true_labels.extend(labels.tolist())
                    val_predicted_labels.extend(predicted.tolist())

            val_losses.append(epoch_val_loss / len(val_loader))
            val_accuracies.append(accuracy_score(val_true_labels, val_predicted_labels))
            val_f1s.append(f1_score(val_true_labels, val_predicted_labels, average='macro'))

            print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]}, Val Loss: {val_losses[-1]}')
            print(f'Train Accuracy: {train_accuracies[-1]}, Val Accuracy: {val_accuracies[-1]}')
            print(f'Train Macro F1: {train_f1s[-1]}, Val Macro F1: {val_f1s[-1]}')
            # evaluate_model leaves the model in eval mode; the next epoch
            # switches back with model.train() at the top of the loop.
            test_true_labels, test_predicted_labels = Trainer.evaluate_model(model, test_loader)
            print("--- Test Set Classification Report ---")
            print(classification_report(test_true_labels, test_predicted_labels, target_names=['False', 'True']))
            print()

        return model, train_losses, val_losses, train_accuracies, val_accuracies, train_f1s, val_f1s

    @staticmethod
    def evaluate_model(model, loader):
        """Collect true labels and thresholded predictions over ``loader``.

        The model is put in eval mode and run without gradient tracking.

        Args:
            model: Module called as ``model(seq, features)``.
            loader: Iterable of ``(seq, features, labels)`` batches.

        Returns:
            tuple: ``(true_labels, predicted_labels)`` as flat Python lists.
        """
        model.eval()
        true_labels = []
        predicted_labels = []
        with torch.no_grad():
            for seq, features, labels in loader:
                # Keep the batch dimension even for single-sample batches so that
                # .tolist() below always yields a list for list.extend().
                logits = model(seq, features).squeeze(-1)
                predicted = (torch.sigmoid(logits) > 0.5).float()  # Convert to binary predictions
                true_labels.extend(labels.tolist())
                predicted_labels.extend(predicted.tolist())
        return true_labels, predicted_labels

def plot_metrics(train_losses, val_losses, train_accuracies, val_accuracies, train_f1s, val_f1s):
    """Plot per-epoch training vs. validation loss, accuracy and macro F1.

    Draws three vertically stacked panels in one 12x8 figure and shows it.
    This single definition replaces two same-named functions; the earlier
    two-panel version was shadowed by the later one and never reachable.

    Args:
        train_losses: Training loss values, one per epoch.
        val_losses: Validation loss values, one per epoch.
        train_accuracies: Training accuracy values, one per epoch.
        val_accuracies: Validation accuracy values, one per epoch.
        train_f1s: Training macro-F1 values, one per epoch.
        val_f1s: Validation macro-F1 values, one per epoch.
    """
    # (y-axis label, train series, train legend, val series, val legend)
    panels = (
        ('Loss', train_losses, 'Train Loss', val_losses, 'Val Loss'),
        ('Accuracy', train_accuracies, 'Train Accuracy', val_accuracies, 'Val Accuracy'),
        ('Macro F1', train_f1s, 'Train Macro F1', val_f1s, 'Val Macro F1'),
    )

    fig, axes = plt.subplots(len(panels), 1, figsize=(12, 8))
    for ax, (ylabel, train_values, train_label, val_values, val_label) in zip(axes, panels):
        ax.plot(train_values, label=train_label)
        ax.plot(val_values, label=val_label)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()

    fig.tight_layout()
    plt.show()

# Main Execution
# Load and preprocess data
train_data = DataProcessor.read_jsonl('data/train_sm.jsonl')
val_data = DataProcessor.read_jsonl('data/validation_sm.jsonl')
test_data = DataProcessor.read_jsonl('data/test_sm.jsonl')

# Extract features and create DataFrames
train_features_list = extract_features_from_messages(train_data)
val_features_list = extract_features_from_messages(val_data)
test_features_list = extract_features_from_messages(test_data)

train_df = create_dataframe(train_features_list)
val_df = create_dataframe(val_features_list)
test_df = create_dataframe(test_features_list)

# Sanity check: (rows, columns) of the training feature table
print(train_df.shape)

# Prepare features and labels
X_train = train_df.drop('sender_annotation', axis=1)
y_train = train_df['sender_annotation']
X_val = val_df.drop('sender_annotation', axis=1)
y_val = val_df['sender_annotation']
X_test = test_df.drop('sender_annotation', axis=1)
y_test = test_df['sender_annotation']

# One-hot encode categorical features (each split is encoded independently)
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_val_encoded = pd.get_dummies(X_val, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Align columns: keep only the columns present in all three splits.
# The list follows the training column order instead of iterating a set
# intersection: set order of strings changes between interpreter sessions
# (hash randomization), which would silently permute the feature columns and
# make them inconsistent with weights saved from a previous session.
common_cols = [
    col for col in X_train_encoded.columns
    if col in X_val_encoded.columns and col in X_test_encoded.columns
]
X_train_encoded = X_train_encoded[common_cols].fillna(0)
X_val_encoded = X_val_encoded[common_cols].fillna(0)
X_test_encoded = X_test_encoded[common_cols].fillna(0)

# Convert boolean columns to integers
bool_columns_train = X_train_encoded.select_dtypes(include=['bool']).columns
bool_columns_val = X_val_encoded.select_dtypes(include=['bool']).columns
bool_columns_test = X_test_encoded.select_dtypes(include=['bool']).columns
X_train_encoded[bool_columns_train] = X_train_encoded[bool_columns_train].astype(int)
X_val_encoded[bool_columns_val] = X_val_encoded[bool_columns_val].astype(int)
X_test_encoded[bool_columns_test] = X_test_encoded[bool_columns_test].astype(int)

# Convert to PyTorch tensors
train_features = torch.tensor(X_train_encoded.values, dtype=torch.float32)
train_labels = torch.tensor(y_train.values, dtype=torch.float32)
val_features = torch.tensor(X_val_encoded.values, dtype=torch.float32)
val_labels = torch.tensor(y_val.values, dtype=torch.float32)
test_features = torch.tensor(X_test_encoded.values, dtype=torch.float32)
test_labels = torch.tensor(y_test.values, dtype=torch.float32)

# Tokenize and preprocess messages (the vocabulary is built from the training split only)
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, [entry['message'] for entry in train_data]), specials=['<unk>', '<pad>'])
# Out-of-vocabulary tokens map to '<unk>'
vocab.set_default_index(vocab['<unk>'])

train_sequences = DataProcessor.preprocess_messages([entry['message'] for entry in train_data], tokenizer, vocab)
val_sequences = DataProcessor.preprocess_messages([entry['message'] for entry in val_data], tokenizer, vocab)
test_sequences = DataProcessor.preprocess_messages([entry['message'] for entry in test_data], tokenizer, vocab)

# Load GloVe embeddings
glove_path = 'glove.6B/glove.6B.50d.txt'
embeddings_index = DataProcessor.load_glove_embeddings(glove_path)
embedding_matrix = DataProcessor.generate_embeddings(vocab, embeddings_index)

# Convert sequences to tensors.
# dtype is forced to long: torch.tensor([]) on an empty token list would
# otherwise produce a float32 tensor, which cannot be used as embedding indices.
train_sequences = [torch.tensor(seq, dtype=torch.long) for seq in train_sequences]
val_sequences = [torch.tensor(seq, dtype=torch.long) for seq in val_sequences]
test_sequences = [torch.tensor(seq, dtype=torch.long) for seq in test_sequences]

# Create datasets and data loaders (only the training loader shuffles)
train_dataset = CustomDataset(train_sequences, train_features, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

val_dataset = CustomDataset(val_sequences, val_features, val_labels)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

test_dataset = CustomDataset(test_sequences, test_features, test_labels)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akshat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(13132, 33)


In [2]:
class CombinedLSTMModel(nn.Module):
    """LSTM text encoder whose final hidden state is joined with extra features.

    Frozen pretrained embeddings feed a single-layer, batch-first LSTM; its
    final hidden state is concatenated with the per-example feature vector
    and passed through two linear layers with no activation in between.
    """

    def __init__(self, embedding_matrix, num_features, hidden_dim, output_dim):
        """Create the layers.

        Args:
            embedding_matrix: 2-D tensor of pretrained embeddings; its second
                dimension is the LSTM input size. The weights are frozen.
            num_features: Width of the extra feature vector per example.
            hidden_dim: LSTM hidden size and output width of the first
                linear layer.
            output_dim: Output width of the final linear layer.
        """
        super().__init__()
        embedding_dim = embedding_matrix.size(1)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim + num_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, seq, features):
        """Return the raw (pre-sigmoid) outputs for a batch.

        Args:
            seq: Integer token indices, shape (batch_size, seq_len).
            features: Extra features, one row per example.
        """
        token_vectors = self.embedding(seq)  # (batch_size, seq_len, embedding_dim)
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (final_hidden, _) = self.lstm(token_vectors)
        # Drop the leading layer dimension -> (batch_size, hidden_dim)
        sentence_vector = torch.squeeze(final_hidden, dim=0)

        joined = torch.cat([sentence_vector, features], dim=1)
        return self.fc2(self.fc1(joined))

In [None]:
# Restore the serialized test DataLoader from disk; this replaces the
# `test_loader` built in the first cell.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load
# files from a trusted source.
data_loader_path = 'data/data_loaders/test_loader.pt'
test_loader = torch.load(data_loader_path)

# Rebuild the LSTM architecture (hidden size 128, one output logit) and load
# the trained weights into it. The feature width comes from the encoded
# training frame, so the first cell must have been run beforehand.
model_folder = 'Final Models'
model_test = CombinedLSTMModel(embedding_matrix, X_train_encoded.shape[1], 128, 1)
model_test.load_state_dict(torch.load(f'{model_folder}/combined_lstm_model.pth'))
model_test.eval()

# Score the restored model on the test set and report the metrics.
test_true_labels, test_predicted_labels = Trainer.evaluate_model(model_test, test_loader)
print("--- Test Set Classification Report ---")
print(classification_report(test_true_labels, test_predicted_labels, target_names=['False', 'True']))
print(f"Test Accuracy: {accuracy_score(test_true_labels, test_predicted_labels)}")

--- Test Set Classification Report ---
              precision    recall  f1-score   support

       False       0.29      0.23      0.26       240
        True       0.93      0.95      0.94      2501

    accuracy                           0.88      2741
   macro avg       0.61      0.59      0.60      2741
weighted avg       0.87      0.88      0.88      2741

Test Accuracy: 0.883983947464429
