In [None]:
%%capture
!pip install datasets
!pip install optuna

In [None]:
# The value for the seed and paths
# SEED is the default seed used by set_seed() so that runs are repeatable.
SEED = 168
# Pre-trained Word2Vec binary. The *_DRIVE path is read when the notebook runs on
# Google Colab (after Drive is mounted); the *_LOCAL path is read otherwise.
# NOTE(review): the *_LOCAL paths are absolute, machine-specific Windows paths;
# change them before running on a different machine.
PATH_TO_W2V_MODEL_DRIVE = '/content/drive/MyDrive/NLP 24-25 Sem1/GoogleNews-vectors-negative300.bin'
PATH_TO_W2V_MODEL_LOCAL = r'C:/Users/60178/Downloads/GoogleNews-vectors-negative300.bin'
# Pre-trained fastText binary, following the same DRIVE/LOCAL naming convention.
# NOTE(review): not referenced in the part of the notebook visible here —
# presumably consumed by a later section; confirm.
PATH_TO_FASTTEXT_MODEL_DRIVE = '/content/drive/MyDrive/NLP 24-25 Sem1/cc.en.300.bin'
PATH_TO_FASTTEXT_MODEL_LOCAL = r'C:/Users/60178/Downloads/cc.en.300.bin/cc.en.300.bin'

In [None]:
# Standard library imports
from collections import Counter

# General third-party libraries import
import copy
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_model
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import random
import string
import sys

# Import optuna for hyperparams tuning
import optuna

# Import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary nltk resources
nltk.download('wordnet')
nltk.download('punkt')  # Ensure punkt is downloaded for tokenization
nltk.download('stopwords')  # Ensure stopwords are also downloaded if not already

# Import pytorch
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.nn.utils.rnn import pad_sequence

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Method to set random seed to ensure consistency
def set_seed(seed = SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed; defaults to the notebook-wide ``SEED``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and switch off its auto-tuner: with
    # benchmarking enabled cuDNN may select a different algorithm from run to
    # run, which defeats the seeding above.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Factory to create the dataloader
def dataloader_factory(X_train_sequence, X_val_sequence, X_test_sequence, Y_train, Y_val, Y_test, batch_size, device):
    """Wrap the three splits in ``DataLoader`` objects whose tensors live on ``device``.

    Args:
        X_train_sequence, X_val_sequence, X_test_sequence: Tensors of model
            inputs, one row per example.
        Y_train, Y_val, Y_test: Label sequences aligned with the inputs; they
            are converted to ``torch.long`` tensors.
        batch_size: Number of examples per batch for every loader.
        device: Device on which inputs and labels are placed.

    Returns:
        ``(train_loader, val_loader, test_loader)``. Only the training loader
        shuffles; the evaluation loaders keep the original example order so
        that evaluation neither depends on nor consumes the global RNG.
    """
    def _as_dataset(sequences, labels):
        # Inputs and labels are moved to the target device once, up front.
        return TensorDataset(
            sequences.to(device),
            torch.tensor(labels, dtype=torch.long, device=device),
        )

    train_loader = DataLoader(_as_dataset(X_train_sequence, Y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(_as_dataset(X_val_sequence, Y_val), batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(_as_dataset(X_test_sequence, Y_test), batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

# Early Stopper
class EarlyStopper:
    """Signal when validation accuracy has stopped improving.

    The best accuracy seen so far is tracked. An evaluation that falls more
    than ``delta`` below that best counts as a "bad" evaluation; after
    ``patience`` bad evaluations without a new best in between,
    ``early_stop`` returns ``True``. Evaluations inside the tolerance band
    ``[best - delta, best]`` neither reset nor advance the counter.

    Args:
        patience: Number of bad evaluations tolerated before stopping.
        delta: Allowed drop below the best accuracy before an evaluation is
            counted as bad.
    """

    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.max_validation_accuracy = 0

    def early_stop(self, validation_accuracy):
        """Record one validation accuracy and report whether to stop training.

        Args:
            validation_accuracy: Accuracy measured after the latest epoch.

        Returns:
            ``True`` once ``patience`` bad evaluations have accumulated since
            the last new best, otherwise ``False``.
        """
        if validation_accuracy > self.max_validation_accuracy:
            # New best: remember it and start counting afresh.
            self.max_validation_accuracy = validation_accuracy
            self.counter = 0
        elif validation_accuracy < (self.max_validation_accuracy - self.delta):
            # The tolerance is subtracted from the best value. Adding it made
            # this condition true for every non-improving epoch, so delta had
            # no effect.
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

# Factory to provide required optimizer which specified parameters
def optimizer_factory(optimizer_name, model, learning_rate, momentum = 0.8):
    """Build the optimizer called ``optimizer_name`` over ``model``'s parameters.

    Args:
        optimizer_name: One of ``'Adam'``, ``'SGD'`` or ``'RMSprop'``.
        model: Module whose ``parameters()`` will be optimized.
        learning_rate: Learning rate passed to the optimizer.
        momentum: Momentum term; only used by ``'SGD'``.

    Returns:
        The configured ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``optimizer_name`` is not a supported name. Previously
            the function fell through and returned ``None``, which only failed
            later at ``optimizer.zero_grad()``.
    """
    if optimizer_name == 'Adam':
        return torch.optim.Adam(model.parameters(), lr = learning_rate)
    if optimizer_name == 'SGD':
        return torch.optim.SGD(model.parameters(), lr = learning_rate, momentum = momentum)
    if optimizer_name == 'RMSprop':
        return torch.optim.RMSprop(model.parameters(), lr = learning_rate)
    raise ValueError(
        f"Unsupported optimizer {optimizer_name!r}; expected 'Adam', 'SGD' or 'RMSprop'"
    )

# Method to train the model
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches.
        model: Module producing one probability per example.
        loss_fn: Loss taking ``(predictions, float_targets)``.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        The mean of the per-batch losses for this epoch (a Python float).
    """
    model.train()  # Set model to training mode
    train_loss = 0  # To accumulate loss for this epoch

    for inputs, labels in dataloader:
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass

        # Flatten to one prediction per example. A bare squeeze() would also
        # drop the batch axis when the final batch holds a single example,
        # leaving a 0-d tensor whose shape no longer matches the labels.
        loss = loss_fn(outputs.reshape(-1), labels.float())
        loss.backward()  # Backpropagation

        # # Apply gradient clipping
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

        optimizer.step()  # Update parameters

        train_loss += loss.item()  # Accumulate loss

    # Calculate average loss for the epoch
    train_loss /= len(dataloader)
    return train_loss

# Method to evaluate the model
def test_loop(dataloader, model):
    model.eval() # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation
        for inputs, labels in dataloader:
            outputs = model(inputs)  # Forward pass
            predicted = (outputs.squeeze() > 0.5).float()  # Apply threshold

            total += labels.size(0)
            correct += (predicted == labels.float()).sum().item()  # Count correct predictions

    accuracy = correct / total
    return accuracy

# Method to plot the performance graph
def plot_performance(train_losses, val_accuracies, num_epochs):
    """Plot training loss and validation accuracy per epoch, side by side.

    Args:
        train_losses: Training loss recorded after each completed epoch.
        val_accuracies: Validation accuracy recorded after each completed epoch.
        num_epochs: Number of epochs that were scheduled. Kept for interface
            compatibility; the x-axis is derived from how many values were
            actually recorded, so a history cut short by early stopping no
            longer causes an x/y length mismatch.
    """
    def _draw_panel(position, values, title, ylabel, color):
        # One subplot: values against 1-based epoch numbers.
        epochs_run = len(values)
        plt.subplot(1, 2, position)
        plt.plot(range(1, epochs_run + 1), values, label=title, color=color)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
        plt.xticks(range(1, epochs_run + 1, 10))
        plt.legend()

    plt.figure(figsize=(12, 5))
    _draw_panel(1, train_losses, 'Training Loss', 'Loss', 'blue')
    _draw_panel(2, val_accuracies, 'Validation Accuracy', 'Accuracy', 'orange')

    plt.tight_layout()
    plt.show()

In [None]:
# Seed the RNGs, then fetch the Rotten Tomatoes dataset and keep a handle on each
# of its three predefined splits.
# NOTE(review): this cell is repeated verbatim under the "Load Dataset" heading
# further down; one of the two copies is redundant.
set_seed()
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

# **Part 0. Dataset Preparation**

**Part 0. Dataset Preparation.** We will be using the movie review dataset introduced in https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.README.1.0.txt. To load this dataset, you need to install the `datasets` library via `pip install datasets`. Then you can use the following code snippet:

```python
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']
```

Using the original train-valid-test split provided in the above code, you will perform model training on the training dataset, configure your model (e.g., learning rate, batch size, number of training epochs) on the validation dataset, and conduct evaluation on the test dataset.

# **Load Dataset**

In [None]:
# Seed first so everything downstream starts from a known RNG state, then pull
# the three predefined splits out of the downloaded dataset.
set_seed()
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
# Sanity check: report how many examples each split holds and show one raw
# training record so the field names ('text', 'label') are visible.
print(f"train_dataset size: {len(train_dataset)}")
print(f"validation_dataset size: {len(validation_dataset)}")
print(f"test_dataset size: {len(test_dataset)}")
print(f"data format: {train_dataset[0]}")

train_dataset size: 8530
validation_dataset size: 1066
test_dataset size: 1066
data format: {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}


In [None]:
# Report whether a GPU is visible to PyTorch, then pick the device used for all
# tensors and models: the GPU when present, otherwise the CPU.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    print("CUDA is available!")
    print(f"The name of the CUDA device is: {torch.cuda.get_device_name(torch.cuda.current_device())}")

device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

CUDA is available!
The name of the CUDA device is: Tesla P100-PCIE-16GB


# **Load word2vec Model**

In [None]:
# Load the pre-trained Word2Vec model
# Colab is detected by the presence of the google.colab module in sys.modules.
if 'google.colab' in sys.modules:
    print("Running on Google Colab's hosted runtime")
    # Imported here because the package only exists inside Colab.
    from google.colab import drive
    # Drive must be mounted before the model file under /content/drive can be read.
    drive.mount('/content/drive')
    w2v_model = KeyedVectors.load_word2vec_format(PATH_TO_W2V_MODEL_DRIVE, binary=True)
else:
    print("Running on a local runtime")
    w2v_model = KeyedVectors.load_word2vec_format(PATH_TO_W2V_MODEL_LOCAL, binary=True)

# Get the dimension of the embeddings
vector_dim = w2v_model.vector_size
print(f"Word2Vec Dimension: {vector_dim}")


Running on a local runtime
Word2Vec Dimension: 300


# **Data Preprocessing**

In [None]:
# this code was previously used when running file on kaggle
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Initialize the lemmatizer and stopwords
# Shared WordNet lemmatizer used by preprocess_dataset.
lemmatizer = WordNetLemmatizer()
# Single-character punctuation marks; tokens equal to one of these are dropped
# by preprocess_dataset when lemmatization is enabled.
punctuation = set(string.punctuation)
# NOTE(review): stop_words is only referenced by the commented-out filter inside
# preprocess_dataset, so in the code visible here it is currently unused.
stop_words = set(stopwords.words('english'))

# Dataset Preprocessing to tokenize and/or lemmatize
def preprocess_dataset(data, lemmatization = True):
    """Tokenize every record's text and optionally normalise the tokens.

    Args:
        data: Iterable of records exposing the raw sentence under ``'text'``.
        lemmatization: When true, tokens that are a single punctuation mark
            are dropped and the remaining tokens are lower-cased and
            lemmatized. Stop words are kept. When false, the raw
            ``word_tokenize`` output is returned untouched.

    Returns:
        A list with one token list per record, in input order.
    """
    def _normalise(raw_tokens):
        # The punctuation test runs on the raw token, before lower-casing.
        return [
            lemmatizer.lemmatize(token.lower())
            for token in raw_tokens
            if token not in punctuation
        ]

    processed_sentences = []
    for entry in data:
        raw_tokens = word_tokenize(entry['text'])
        processed_sentences.append(_normalise(raw_tokens) if lemmatization else raw_tokens)

    return processed_sentences

# Get vocab and oov words
def get_vocab_OOV(sentences, w2v_model):
    """Collect the vocabulary of ``sentences`` and its out-of-vocabulary subset.

    Args:
        sentences: Iterable of token lists.
        w2v_model: Container supporting ``word in w2v_model`` membership tests.

    Returns:
        ``(vocabulary, oov_words)``: the set of all distinct tokens, and the
        subset of those tokens that ``w2v_model`` does not contain.
    """
    vocabulary = {token for sentence in sentences for token in sentence}
    oov_words = {token for token in vocabulary if token not in w2v_model}
    return vocabulary, oov_words

# Compare vocabulary size and OOV count on the training split with raw tokens
# (lemmatization=False) versus normalised tokens (default lemmatization=True).
vocabulary, oov_words = get_vocab_OOV(preprocess_dataset(train_dataset, False), w2v_model)
# lemmatized_vocabulary is reused later as the training vocabulary.
lemmatized_vocabulary, lemmatized_oov_words = get_vocab_OOV(preprocess_dataset(train_dataset), w2v_model)

print(f"Before lemmatization: Vocabulary size: {len(vocabulary)}, OOV words: {len(oov_words)}")
print(f"After lemmatization: Vocabulary size: {len(lemmatized_vocabulary)}, OOV words: {len(lemmatized_oov_words)}")

Before lemmatization: Vocabulary size: 18035, OOV words: 3619
After lemmatization: Vocabulary size: 16571, OOV words: 3590




1.   Lemmatize, case-fold and remove punctuation for the train, validation and test sets (stop-word removal is currently disabled in `preprocess_dataset`)
2.   Drop words not found in the training vocabulary from the validation and test sets




In [None]:
# Apply the same default (lemmatizing) preprocessing to each split, in the
# order train, validation, test.
preprocessed_train_dataset, preprocessed_validation_dataset, preprocessed_test_dataset = (
    preprocess_dataset(split)
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
def drop_oov(preprocessed_dataset, w2v_model):
    """Remove every token that ``w2v_model`` does not contain.

    Args:
        preprocessed_dataset: Iterable of token lists.
        w2v_model: Container supporting ``word in w2v_model``.

    Returns:
        A new list of token lists; sentence order and token order are kept.
    """
    filtered_sentences = []
    for sentence in preprocessed_dataset:
        filtered_sentences.append([token for token in sentence if token in w2v_model])
    return filtered_sentences

def drop_non_train_vocab(dataset, train_vocab):
    """Keep only the tokens that appear in ``train_vocab``.

    Args:
        dataset: Iterable of token lists.
        train_vocab: Container supporting ``word in train_vocab``.

    Returns:
        A new list of token lists; a sentence may become empty.
    """
    return [
        list(filter(lambda token: token in train_vocab, sentence))
        for sentence in dataset
    ]

# Training inputs are used as preprocessed; every training token is in the
# training vocabulary by construction, so no filtering is applied.
X_train = preprocessed_train_dataset
Y_train = [entry['label'] for entry in train_dataset]

# Validation/test tokens never seen in training are dropped, which can leave a
# sentence empty.
X_val = drop_non_train_vocab(preprocessed_validation_dataset, lemmatized_vocabulary)
Y_val = [entry['label'] for entry in validation_dataset]

X_test = drop_non_train_vocab(preprocessed_test_dataset, lemmatized_vocabulary)
Y_test = [entry['label'] for entry in test_dataset]


# **Generate Embedding**

In [None]:
def get_word2index(train_vocab):
    """Assign every vocabulary word a stable, 1-based integer index.

    Index 0 is left free for padding. Words are enumerated in sorted order
    rather than in the iteration order of ``train_vocab``: iterating a ``set``
    of strings can differ between interpreter runs, which would silently change
    every index (and any embedding row tied to it) even with fixed seeds.

    Args:
        train_vocab: Iterable of words (typically a ``set`` of strings).

    Returns:
        Dict mapping each distinct word to an index in ``1..len(vocab)``.
    """
    return {word: idx for idx, word in enumerate(sorted(set(train_vocab)), start=1)}

# Word -> index lookup over the lemmatized training vocabulary (index 0 is padding).
word2index = get_word2index(lemmatized_vocabulary)

In [None]:
# The mapping has one entry per vocabulary word, so its size equals the vocabulary size.
print(f"word2index size = vocab size: {len(word2index)}")

word2index size = vocab size: 16571


In [None]:
def create_embedding_matrix(w2v_model, word2index, embedding_dim=300):
    """Build the initial embedding table for the vocabulary in ``word2index``.

    Row 0 is left as zeros (padding). A word present in ``w2v_model`` gets its
    pre-trained vector; any other word gets a vector drawn from a normal
    distribution whose per-dimension mean and standard deviation are those of
    all vectors in ``w2v_model``.

    Args:
        w2v_model: Pre-trained model exposing ``vectors``, ``word in model``
            and ``model[word]``.
        word2index: Dict mapping each word to its row index (1-based).
        embedding_dim: Width of the word vectors (default: 300).

    Returns:
        NumPy array of shape ``(len(word2index) + 1, embedding_dim)``.
    """
    row_count = len(word2index) + 1  # one extra row for the padding index 0
    matrix = np.zeros((row_count, embedding_dim))

    # Statistics of the pre-trained vectors, used to initialise unknown words.
    column_mean = np.mean(w2v_model.vectors, axis=0)
    column_std = np.std(w2v_model.vectors, axis=0)

    for token, row in word2index.items():
        if token not in w2v_model:
            matrix[row] = np.random.normal(
                loc=column_mean, scale=column_std, size=(embedding_dim,)
            )
            continue
        matrix[row] = w2v_model[token]

    return matrix

# Width of the pre-trained vectors (the same value as vector_dim computed when the model was loaded).
embedding_dim = w2v_model.vector_size  # Get the dimension of the word vectors in Word2Vec
# One row per vocabulary word plus the padding row at index 0.
embedding_matrix = create_embedding_matrix(w2v_model, word2index, embedding_dim)
print(f"embedding_matrix shape: {embedding_matrix.shape}")


embedding_matrix shape: (16572, 300)


# **Sentence2sequence + Padding**

In [None]:
# Get maximum sentence length
def get_max_sentence_length(*datasets):
    """Return the length of the longest sentence across ``datasets``.

    As a side effect, the shortest sentence length of each dataset is printed,
    which makes sentences emptied by vocabulary filtering easy to spot.

    Args:
        *datasets: Any number of lists of tokenized sentences.

    Returns:
        The maximum sentence length over all datasets (0 when none are given).
    """
    longest = 0
    for index, dataset in enumerate(datasets):
        lengths = [len(sentence) for sentence in dataset]
        longest = max(longest, max(lengths))
        print(f"Min sentence length in dataset {index}: {min(lengths)}")
    return longest

# Longest sentence over all three splits; this value becomes the default padded
# length of convert_to_sequence_data, which is defined after this cell.
max_length = get_max_sentence_length(X_train, X_val, X_test)
print(f"Maximum sentence length of three datasets: {max_length}")

Min sentence length in dataset 0: 1
Min sentence length in dataset 1: 0
Min sentence length in dataset 2: 1
Maximum sentence length of three datasets: 53


In [None]:
def convert_to_sequence_data(tokenized_data, word2index, max_length=max_length):
    """Turn tokenized sentences into a fixed-width tensor of word indices.

    Args:
        tokenized_data: List of tokenized sentences.
        word2index: Dictionary mapping words to indices.
        max_length: Width of every output row. The default is the value the
            module-level ``max_length`` had when this function was defined.

    Returns:
        ``torch.long`` tensor with one row per sentence and ``max_length``
        columns. Longer sentences are truncated, shorter ones are
        right-padded with 0; words missing from ``word2index`` also map to 0.
    """
    rows = []
    for sentence in tokenized_data:
        # Unknown words share index 0 with padding.
        ids = [word2index.get(token, 0) for token in sentence]

        if len(ids) > max_length:
            ids = ids[:max_length]
        else:
            ids = ids + [0] * (max_length - len(ids))

        rows.append(torch.tensor(ids, dtype=torch.long))

    return torch.stack(rows)

# Convert to sequence
# Index and pad each split to the shared default max_length so all three
# tensors have the same number of columns.
X_train_sequence = convert_to_sequence_data(X_train, word2index)
X_val_sequence = convert_to_sequence_data(X_val , word2index)
X_test_sequence = convert_to_sequence_data(X_test, word2index)


# **CNN Model Architecture**

In [None]:
class SimpleCNN(nn.Module):
    """1-D convolutional sentence classifier with optional self-attention.

    Pipeline: embedding lookup -> stacked ``Conv1d`` + ReLU layers ->
    (optional) multi-head self-attention over sequence positions -> max or
    mean pooling over the sequence -> dropout -> linear layer -> sigmoid.

    Args:
        embedding_matrix: 2-D array of initial embedding weights; row 0 is the
            padding row. The embeddings are fine-tuned during training.
        num_filters: Output channels of every convolution.
        num_conv_layers: Number of stacked convolution layers.
        filter_size: Kernel width of every convolution.
        aggregation: ``'max_pool'`` or ``'mean_pool'``.
        attn: Whether to apply self-attention to the convolutional features.
        num_heads: Number of attention heads (must divide ``num_filters``).

    Raises:
        ValueError: If ``aggregation`` is not a supported pooling name.
    """

    _AGGREGATIONS = ('max_pool', 'mean_pool')

    def __init__(self, embedding_matrix, num_filters=64, num_conv_layers=1, filter_size=3, aggregation='mean_pool', attn = False, num_heads=1):
        super(SimpleCNN, self).__init__()
        # Fail at construction time; previously an unknown name only surfaced
        # as an UnboundLocalError inside forward().
        if aggregation not in self._AGGREGATIONS:
            raise ValueError(
                f"Unsupported aggregation {aggregation!r}; expected one of {self._AGGREGATIONS}"
            )

        embedding_dim = embedding_matrix.shape[1]
        self.aggregation = aggregation
        self.attn = attn

        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            padding_idx=0,  # Index 0 is the padding index
            freeze=False  # Embeddings stay trainable (they are NOT frozen)
        )

        self.conv1d_layers = nn.ModuleList([
            nn.Sequential(*[
                nn.Conv1d(in_channels=(embedding_dim if i == 0 else num_filters),  # Use embedding_dim for the first layer
                          out_channels=num_filters,
                          kernel_size=filter_size),
                nn.ReLU(),
            ]) for i in range(num_conv_layers)
        ])
        if self.attn:
            self.attn_head = nn.MultiheadAttention(num_filters, num_heads, batch_first = True)

        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(num_filters, 1)  # Output size is 1 for binary classification

    def forward(self, x):
        """Return one probability per example.

        Args:
            x: Integer tensor of word indices, one row per sentence.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``(0, 1)``.
        """
        embedded = self.embedding(x)
        # Conv1d expects channels before the sequence axis.
        features = embedded.permute(0, 2, 1)

        for conv1d_layer in self.conv1d_layers:
            features = conv1d_layer(features)

        if self.attn:
            # Attend over sequence positions within each sentence. Feeding the
            # pooled (batch, num_filters) tensor to MultiheadAttention would be
            # interpreted as a single unbatched sequence, letting the examples
            # of a batch attend to one another.
            positions = features.permute(0, 2, 1)  # (batch, positions, num_filters)
            positions, _ = self.attn_head(positions, positions, positions)
            features = positions.permute(0, 2, 1)

        if self.aggregation == 'max_pool':
            agg_out, _ = torch.max(features, dim=2)  # Max-pooling over the sequence length
        else:
            agg_out = torch.mean(features, dim=2)  # Mean-pooling over the sequence length

        out = self.dropout(agg_out)
        out = self.fc(out)
        out = torch.sigmoid(out)

        return out

# **CNN Best Hyperparameter Test**

In [None]:
# Train a SimpleCNN with a fixed hyperparameter set, tracking per-epoch training
# loss and validation accuracy, then evaluate the best-validation snapshot on
# the test split.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Reset the seed to ensure fairness when comparing performances
set_seed()

# Best parameter for CNN with Max Pool
num_filters = 256
num_conv_layers = 1
filter_size = 7
learning_rate = 0.0005
batch_size = 32
num_epochs = 40
aggregation = 'max_pool'
optimizer_name = 'Adam'

# NOTE(review): the two string blocks below are disabled alternative
# configurations. They assign `optimizer`, whereas the code further down reads
# `optimizer_name`, so pasting them in as-is would not change the optimizer that
# gets built. The attention configuration also sets `num_heads`, but the model
# below is constructed with attention switched off and without `num_heads`.
'''
# Best parameter for CNN with Mean Pool
num_filters = 128
num_conv_layers = 2
filter_size = 3
learning_rate = 0.001
batch_size = 32
num_epochs = 50
aggregation = 'mean_pool'
optimizer = 'RMSprop'
'''

'''
# Best parameter for CNN + Attention
num_filters = 64
num_conv_layers = 1
filter_size = 5
learning_rate = 0.0001
batch_size = 32
num_epochs = 50
num_heads = 1
aggregation = 'max_pool'
optimizer = 'RMSprop'
'''


# Create data loaders
train_loader, val_loader, test_loader = dataloader_factory(X_train_sequence, X_val_sequence, X_test_sequence, Y_train, Y_val, Y_test, batch_size, device)

# Initialize the model
# The final positional argument (False) is `attn`, i.e. no attention layer.
model = SimpleCNN(embedding_matrix, num_filters, num_conv_layers, filter_size, aggregation, False)
model.to(device)
criterion = nn.BCELoss()  # Use BCELoss for binary classification

# Build the optimizer named by optimizer_name above
optimizer = optimizer_factory(optimizer_name, model, learning_rate)

early_stopper = EarlyStopper(patience = 20, delta = 0.05)
# Lists to store loss and accuracy for visualization
train_losses = []
val_accuracies = []

# Snapshot of the model at its highest validation accuracy so far.
# NOTE(review): best_model stays None if the loop never records an accuracy
# above 0 (e.g. num_epochs = 0), in which case the test evaluation below fails.
best_val_accuracy = 0
best_model = None

for epoch in range(num_epochs):
    loss = train_loop(train_loader, model, criterion, optimizer)
    train_losses.append(loss)

    # Evaluate on validation set
    val_accuracy = test_loop(val_loader, model)
    val_accuracies.append(val_accuracy)
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        # Deep copy so later epochs cannot overwrite the snapshot's weights.
        best_model = copy.deepcopy(model)

    # The break happens before this epoch's summary line is printed, so the
    # metrics of the epoch that triggers early stopping are not shown.
    if early_stopper.early_stop(val_accuracy):
        print("Early stopping triggered!")
        break

    print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Evaluate on test data
test_accuracy = test_loop(test_loader, best_model)

# Print out the performance
print(f"Using Best Model, Test Accuracy: {test_accuracy:.4f}")

Epoch [1/40], Train Loss: 0.5344, Validation Accuracy: 0.7814
Epoch [2/40], Train Loss: 0.3094, Validation Accuracy: 0.7720
Epoch [3/40], Train Loss: 0.1515, Validation Accuracy: 0.7964
Epoch [4/40], Train Loss: 0.0604, Validation Accuracy: 0.7861
Epoch [5/40], Train Loss: 0.0228, Validation Accuracy: 0.7777
Epoch [6/40], Train Loss: 0.0094, Validation Accuracy: 0.7861
Epoch [7/40], Train Loss: 0.0047, Validation Accuracy: 0.7795
Epoch [8/40], Train Loss: 0.0030, Validation Accuracy: 0.7767
Epoch [9/40], Train Loss: 0.0019, Validation Accuracy: 0.7833
Epoch [10/40], Train Loss: 0.0013, Validation Accuracy: 0.7777
Epoch [11/40], Train Loss: 0.0010, Validation Accuracy: 0.7786
Epoch [12/40], Train Loss: 0.0007, Validation Accuracy: 0.7805
Epoch [13/40], Train Loss: 0.0006, Validation Accuracy: 0.7805
Epoch [14/40], Train Loss: 0.0005, Validation Accuracy: 0.7739
Epoch [15/40], Train Loss: 0.0004, Validation Accuracy: 0.7739
Epoch [16/40], Train Loss: 0.0003, Validation Accuracy: 0.7739
E