In [None]:
! pip install pandas
! pip install numpy
import pandas as pd
import numpy as np
import gensim
import gensim.downloader
import nltk
nltk.download('punkt')
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import ParameterGrid
import random

## Data Preprocessing

In [None]:
# Only run this cell if you want to generate a new validation set.
# It de-duplicates the original training data on 'text', moves a fixed-size random
# sample of rows into a validation split, and writes both splits to trec/generated/.
# TODO: check w group if we need this (removal of duplicate texts from train)
df = pd.read_csv("trec/original/train.csv").drop_duplicates(subset='text', keep='first')

# Pick the rows that will form the validation split; the fixed seed keeps the
# selection identical across runs.
num_rows_to_drop = 500
np.random.seed(42)
rows_to_drop = np.random.choice(df.index, num_rows_to_drop, replace=False)

# Validation split: the sampled rows, re-indexed from 0, exported to csv
validation_df = df.loc[rows_to_drop].reset_index(drop=True)
validation_df.to_csv("trec/generated/validation.csv", index=None)

# Training split: everything that was not sampled, re-indexed from 0, exported to csv
df_copy = df.drop(rows_to_drop).reset_index(drop=True)
df_copy.to_csv("trec/generated/train.csv", index=None)

In [None]:
# Run this cell if you're generating a new validation set for sanity checking
def check_unique_texts(train_csv_file, validation_csv_file):
    """Print whether any validation 'text' value also occurs in the training csv.

    Args:
        train_csv_file: path to a csv with a 'text' column (training split).
        validation_csv_file: path to a csv with a 'text' column (validation split).

    Prints a confirmation when the two splits share no text; otherwise prints the
    overlapping validation rows. Returns None.
    """
    seen_in_train = pd.read_csv(train_csv_file)['text']
    held_out = pd.read_csv(validation_csv_file)['text']

    # Validation texts that can also be found among the training texts
    overlap = held_out.loc[held_out.isin(seen_in_train)]

    if not overlap.empty:
        print("Common values found in the 'text' column:")
        print(overlap)
        return
    print("Validation set and train sets are unique")

# Sanity-check the generated splits: report any validation text that also occurs in train.
validation_csv_file = "trec/generated/validation.csv"
train_csv_file = "trec/generated/train.csv"
check_unique_texts(
    train_csv_file=train_csv_file,
    validation_csv_file=validation_csv_file,
)

In [None]:
# De-duplicate the original test split on 'text' (the first occurrence is kept)
# and save the result next to the generated train/validation files.
df = pd.read_csv("trec/original/test.csv").drop_duplicates(subset='text', keep='first')
df.to_csv('trec/generated/test.csv', index=None)

In [None]:
# Load the three generated splits. The 'label-fine' column is discarded right away;
# the following cells work with 'label-coarse' only.
train_df = pd.read_csv('trec/generated/train.csv').drop(columns='label-fine')
val_df = pd.read_csv('trec/generated/validation.csv').drop(columns='label-fine')
test_df = pd.read_csv('trec/generated/test.csv').drop(columns='label-fine')

In [None]:
# Merge two randomly chosen coarse classes into a single 'OTHERS' class, then re-encode
# the remaining labels as contiguous integer ids shared by train/validation/test.
#
# NOTE: this cell mutates the three frames in place, so re-running it without first
# reloading the csv files merges two *more* classes.

# Fixed seed: without it a different pair of classes is merged on every run, which makes
# results (and the accuracies reported at the end of the notebook) irreproducible.
CLASS_MERGE_SEED = 42

# Sort before sampling so the choice depends only on the seed, not on row order.
classes = sorted(train_df['label-coarse'].unique().tolist(), key=str)
merged_classes = random.Random(CLASS_MERGE_SEED).sample(classes, 2)

for split_df in (train_df, val_df, test_df):
    split_df['label-coarse'] = split_df['label-coarse'].apply(
        lambda x: 'OTHERS' if x in merged_classes else x
    )

# Build the label -> id mapping from a *sorted* list. Iterating a set that mixes the
# original labels with the string 'OTHERS' has no stable order across Python processes
# (string hashing is randomised), which silently changed the label ids between sessions.
remaining_classes = sorted(train_df['label-coarse'].unique().tolist(), key=str)
mapping_dict = {item: idx for idx, item in enumerate(remaining_classes)}

# Direct dict indexing is kept on purpose: a label that is absent from the training
# split raises KeyError instead of being encoded as NaN.
for split_df in (train_df, val_df, test_df):
    split_df['label-coarse'] = split_df['label-coarse'].apply(lambda x: mapping_dict[x])

train_df['label-coarse'].unique()

## word2vec

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably downloaded on first use and served from a local cache
# afterwards — confirm before running offline.
word2vec_model = gensim.downloader.load('word2vec-google-news-300')

def text_to_word2vec(text, word2vec_model):
    """Turn ``text`` into a list of embedding vectors, one per space-separated token.

    The text is split on single spaces, so consecutive spaces produce empty-string
    tokens that are looked up like any other token.

    Args:
        text: the sentence to embed.
        word2vec_model: object supporting ``token in model``, ``model[token]`` and
            a ``vector_size`` attribute.

    Returns:
        A list with one vector per token, in order. Tokens missing from the model
        are represented by an all-zero float32 vector of length ``vector_size``.
    """
    def embed(token):
        # Out-of-vocabulary tokens fall back to a zero vector
        if token not in word2vec_model:
            return np.zeros(word2vec_model.vector_size, dtype=np.float32)
        return word2vec_model[token]

    return [embed(token) for token in text.split(" ")]

## Dataloader

In [None]:
class CustomDataset(Dataset):
    """Dataset that yields (embedded sentence, label) pairs of fixed sequence length.

    Each row of ``data`` must unpack into exactly two values, label first and text
    second. The text is embedded with ``text_to_word2vec`` and then zero-padded at
    the end, or truncated, to ``max_seq_length`` token vectors.
    """

    def __init__(self, data, max_seq_length,word2vec_model):
        self.data = data
        self.max_seq_length = max_seq_length
        self.word2vec_model = word2vec_model

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, sentence = self.data.iloc[idx]
        vectors = np.array(
            text_to_word2vec(sentence, self.word2vec_model), dtype=np.float32
        )

        # Positive: number of padding rows needed; negative: rows to cut off
        missing = self.max_seq_length - vectors.shape[0]
        if missing > 0:
            filler = np.zeros((missing, vectors.shape[1]), dtype=np.float32)
            vectors = np.concatenate([vectors, filler], axis=0)
        elif missing < 0:
            vectors = vectors[:self.max_seq_length]
        return torch.Tensor(vectors), label
    
def intialise_loaders(df, max_seq_length, batch_size, shuffle=True):
    """Wrap ``df`` in a ``CustomDataset`` and return a ``DataLoader`` over it.

    Args:
        df: frame whose rows unpack as (label, text), as ``CustomDataset`` expects.
        max_seq_length: number of token vectors every sample is padded/truncated to.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data each epoch. Defaults to True
            (the previous, hard-coded behaviour); pass False for validation/test
            loaders where the order does not need to change.

    Returns:
        A ``DataLoader`` yielding (embedded text batch, label batch) pairs.

    Note: embeddings come from the module-level ``word2vec_model``, which must be
    loaded before this function is called.
    """
    dataset = CustomDataset(df, max_seq_length, word2vec_model)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

## Model architecture

In [None]:
class AveragePooling(nn.Module):
    """Sentence classifier: average the token embeddings, then apply a 3-layer MLP.

    Args:
        input_size: size of each token embedding.
        hidden_size: width of the two hidden linear layers.
        output_size: number of classes.

    ``forward`` takes ``x`` of shape (batch, seq_len, input_size) and returns raw,
    unnormalised class scores (logits) of shape (batch, output_size).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # No final Softmax: the notebook trains with nn.CrossEntropyLoss, which applies
        # log-softmax itself and expects logits. Feeding it probabilities applies
        # softmax twice, which squashes the scores and weakens the gradients.
        # argmax over logits gives the same predicted class as argmax over softmax.
        self.layers = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),  # Average pooling over the sequence
            nn.Flatten(),  # (batch, input_size, 1) -> (batch, input_size)
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        # The pooling layer reduces the last dimension, so move the sequence axis there:
        # (batch, seq_len, input_size) -> (batch, input_size, seq_len)
        x = x.permute(0, 2, 1)
        return self.layers(x)

In [None]:
class MaxPooling(nn.Module):
    """Sentence classifier: element-wise max over the token embeddings, then a 2-layer MLP.

    ``forward`` takes ``x`` of shape (batch, seq_len, input_size) and returns class
    scores of shape (batch, output_size).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Collapse the sequence axis to a single vector per sample
        pooling = [nn.AdaptiveMaxPool1d(1), nn.Flatten()]
        # Classify the pooled vector
        classifier = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.layers = nn.Sequential(*pooling, *classifier)

    def forward(self, x):
        # The pooling layer reduces the last dimension, so the sequence axis goes last
        channels_first = x.permute(0, 2, 1)
        return self.layers(channels_first)

In [None]:
class LSTM(nn.Module):
    """Sentence classifier: single-layer LSTM followed by a linear layer.

    Args:
        input_size: size of each token embedding.
        hidden_size: size of the LSTM hidden state.
        output_size: number of classes.

    ``forward`` takes ``x`` of shape (batch, seq_len, input_size) and returns raw
    class scores (logits) of shape (batch, output_size).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True is required: the data loader yields (batch, seq_len, dim)
        # tensors, while nn.LSTM defaults to (seq_len, batch, dim). With the default,
        # the batch axis was treated as time, so samples of one batch were chained
        # together and only the last sequence position was ever used.
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) # LSTM layer
        self.fc = nn.Linear(hidden_size, output_size) # Linear layer for classification

    def forward(self, x):
        lstm_out, _ = self.lstm(x) # (batch, seq_len, hidden_size)
        # Take the output from the last time step.
        # NOTE: CustomDataset pads short sentences with zero vectors at the end, so for
        # those sentences this is the state after the LSTM has also read the padding.
        last_hidden_state = lstm_out[:, -1, :]
        output = self.fc(last_hidden_state) # Apply the linear layer for classification
        return output

## Train/Test

In [None]:
class EarlyStopper:
    """Tracks the best validation loss and signals when training should stop.

    Args:
        patience: number of counted non-improving checks after which
            ``early_stop`` returns True.
        min_delta: a loss is only counted as non-improving when it exceeds the best
            loss seen so far by more than this margin.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0  # counted non-improving checks since the last new best loss
        self.min_validation_loss = np.inf  # best (lowest) loss seen so far

    def early_stop(self, validation_loss):
        """Record ``validation_loss``; return True once patience is exhausted."""
        # New best loss: remember it and start counting from scratch
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        # Within the tolerated margin of the best loss: neither counted nor reset
        tolerated = self.min_validation_loss + self.min_delta
        if not validation_loss > tolerated:
            return False

        self.counter += 1
        return self.counter >= self.patience

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch of ``model`` over ``dataloader``.

    Args:
        dataloader: iterable of (inputs, targets) batches exposing a ``dataset``
            attribute with a length.
        model: module to train; it is switched to training mode.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar loss.
        optimizer: optimizer holding the model's parameters.

    Prints the loss of every 100th batch (starting with the first). Returns None.
    """
    size = len(dataloader.dataset)
    model.train()
    for step, (features, targets) in enumerate(dataloader):
        batch_loss = loss_fn(model(features), targets)

        # Backpropagation, parameter update, then clear gradients for the next batch
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 100:
            continue
        loss, current = batch_loss.item(), (step + 1) * len(features)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

In [None]:
# Grid search over hidden size and batch size. Every configuration is trained with
# early stopping on the validation loss; the configuration that reaches the lowest
# validation loss is reported.
max_seq_length = 50
input_size = word2vec_model.vector_size
output_size = len(mapping_dict)
epochs=100

hyperparameter_grid = {
    'hidden_size': [8,16,32,64,128,256,512],
    'batch_size': [8,16, 32, 64]
}

# np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0
best_loss = np.inf
best_hyperparameters = None

# Iterate over the parameter grid
for params in ParameterGrid(hyperparameter_grid):
    early_stopper=EarlyStopper()

    hidden_size = params['hidden_size']
    batch_size = params['batch_size']

    # Initialize and train your model using the current hyperparameters
    #model = AveragePooling(input_size, hidden_size, output_size)
    #model = MaxPooling(input_size, hidden_size, output_size)
    model = LSTM(input_size, hidden_size, output_size)
    criterion = nn.CrossEntropyLoss()
    # NOTE(review): the search uses lr=0.01 while the final-training cell uses
    # lr=0.001 — confirm which one is intended.
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Training and validation data loaders
    train_dataloader = intialise_loaders(train_df, max_seq_length, batch_size)
    val_dataloader = intialise_loaders(val_df, max_seq_length, batch_size)

    for epoch in range(epochs):
        train(train_dataloader, model, criterion, optimizer)
        val_loss = test(val_dataloader, model, criterion)
        if early_stopper.early_stop(val_loss):
            print(f'Early Stopping at epoch {epoch+1}')
            break

    # Compare configurations on the lowest validation loss each one reached. The loss
    # of the last epoch is not a fair score: when early stopping triggers, that epoch
    # is by construction one that failed to improve on the best loss.
    config_best_loss = early_stopper.min_validation_loss
    if config_best_loss < best_loss:
        best_loss = config_best_loss
        best_hyperparameters = params

print("Best Hyperparameters:", best_hyperparameters)

In [None]:
# Train the final model with the chosen hyperparameters (early stopping on the
# validation loss), then evaluate it once on the test split.
# The values below are hard-coded; uncomment the next two lines to use the result of
# the grid search instead.
# best_hidden_size = best_hyperparameters['hidden_size']
# best_batch_size = best_hyperparameters['batch_size']
best_hidden_size =512
best_batch_size =32

# np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0
best_loss = np.inf

#best_model = AveragePooling(input_size, best_hidden_size, output_size)
#best_model = MaxPooling(input_size, best_hidden_size, output_size)
best_model = LSTM(input_size, best_hidden_size, output_size)


print(best_model)
train_dataloader = intialise_loaders(train_df, max_seq_length, best_batch_size)
val_dataloader = intialise_loaders(val_df, max_seq_length, best_batch_size)

criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=0.001 here, while the grid-search cell trains with lr=0.01 —
# confirm which one is intended.
optimizer = optim.Adam(best_model.parameters(), lr=0.001)

early_stopper=EarlyStopper()
for epoch in range(epochs):
    train(train_dataloader, best_model, criterion, optimizer)
    val_loss = test(val_dataloader, best_model, criterion)
    if early_stopper.early_stop(val_loss):
        print(f'Early Stopping at epoch {epoch+1}')
        break

# Final evaluation on the held-out test split
test_dataloader = intialise_loaders(test_df, max_seq_length, best_batch_size)
test(test_dataloader, best_model, criterion)


### Test accuracies:
- 84% avg pooling
- 72.6% max pooling