## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

import torch
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# Read the NYT articles CSV, then preview the first rows followed by the (rows, columns) shape.
df = pd.read_csv('/nyt.csv')
for preview in (df.head(), df.shape):
    print(preview)

                                                text   label
0  (reuters) - carlos tevez sealed his move to ju...  sports
1  if professional pride and strong defiance can ...  sports
2  palermo, sicily — roberta vinci beat top-seede...  sports
3  spain's big two soccer teams face a pair of it...  sports
4  the argentine soccer club san lorenzo complete...  sports
(11519, 2)


## Data Cleaning

In [None]:
# Map each string category to an integer id and store the ids next to the original labels.
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['label'])

# Show which integer ids are present after encoding.
df['encoded_labels'].unique()

array([2, 0, 1])

In [None]:
# List the distinct string categories to confirm how many classes there are.
pd.unique(df['label'])

array(['sports', 'business', 'politics'], dtype=object)

In [None]:
# Count the articles per encoded class to see how imbalanced the dataset is.
df.loc[:, 'encoded_labels'].value_counts()

Unnamed: 0_level_0,count
encoded_labels,Unnamed: 1_level_1
2,8639
1,1451
0,1429


In [None]:
# Split the raw texts BEFORE any vectorisation so that every vectoriser is fitted on training data only.

texts = df['text'].values
labels = df['encoded_labels'].values

# Options shared by both splits: fixed seed for reproducibility, shuffling enabled.
split_kwargs = dict(random_state=42, shuffle=True)

# Step 1: 80% training vs. 20% held out (validation + test).
# The classes are imbalanced, so the split is stratified on the labels.
x_train, x_val_test, y_train, y_val_test = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, **split_kwargs
)

# Step 2: halve the held-out 20% into separate validation and test sets, again stratified.
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=0.5, stratify=y_val_test, **split_kwargs
)


## Part 1: Bag of Words

(a) binary-valued vector

(b) frequency vector

(c) tf-idf vector

In [None]:
# (a) binary-valued vector

# binary=True records only whether a term occurs in an article, not how often
vectorizer = CountVectorizer(binary=True)

# the vocabulary is learned from the training split only, which avoids data leakage
x_train_vec = vectorizer.fit_transform(x_train)

# validation and test splits are encoded with that training vocabulary
x_val_vec, x_test_vec = (vectorizer.transform(split) for split in (x_val, x_test))

In [None]:
# i will define a function to try different hyperparameters to find the optimal logistic regression model

def find_best_model(x_train, x_val, y_train_labels=None, y_val_labels=None,
                    C_values=(0.001, 0.01, 0.1, 1, 10)):
    """Pick the logistic-regression C with the best validation macro F1.

    One LogisticRegression (random_state=42, max_iter=1000) is fitted per
    candidate C on the training features and scored on the validation
    features. Macro F1 is used so minority classes count as much as the
    majority class.

    Args:
        x_train: training feature matrix.
        x_val: validation feature matrix.
        y_train_labels: training labels; defaults to the notebook-level `y_train`.
        y_val_labels: validation labels; defaults to the notebook-level `y_val`.
        C_values: candidate inverse regularisation strengths, tried in order.

    Returns:
        The C with the highest validation macro F1. On a tie the earliest
        candidate wins. Returns None only if `C_values` is empty.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if y_train_labels is None:
        y_train_labels = y_train
    if y_val_labels is None:
        y_val_labels = y_val

    # Start below any attainable score: with 0 and a strict '>' comparison,
    # a run where every candidate scored 0 would have returned None.
    best_val_f1 = float('-inf')
    best_C = None

    for C in C_values:
        model = LogisticRegression(C=C, random_state=42, max_iter=1000)
        model.fit(x_train, y_train_labels)

        val_preds = model.predict(x_val)
        val_f1_macro = f1_score(y_val_labels, val_preds, average='macro')

        print(f"For C={C}, validation f1 score (macro) = {val_f1_macro}")

        if val_f1_macro > best_val_f1:
            best_val_f1 = val_f1_macro
            best_C = C

    return best_C

In [None]:
# select the C that gives the best validation macro f1 for the current bag-of-words features
best_C = find_best_model(x_train=x_train_vec, x_val=x_val_vec)

For C=0.001, validation f1 score (macro) = 0.9296806602174538
For C=0.01, validation f1 score (macro) = 0.9553736740573259
For C=0.1, validation f1 score (macro) = 0.9641629561318424
For C=1, validation f1 score (macro) = 0.9641629561318424
For C=10, validation f1 score (macro) = 0.9618685417800322


In [None]:
# next, i will define a function to retrain the model on the best C, and print the performance stats

def print_performance(best_C, x_train, x_test, y_train_labels=None, y_test_labels=None):
    """Refit logistic regression with the chosen C and print test-set metrics.

    Args:
        best_C: inverse regularisation strength to use for the final model.
        x_train: training feature matrix.
        x_test: test feature matrix.
        y_train_labels: training labels; defaults to the notebook-level `y_train`.
        y_test_labels: test labels; defaults to the notebook-level `y_test`.

    Returns:
        None. Accuracy, macro F1 and micro F1 on the test set are printed.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if y_train_labels is None:
        y_train_labels = y_train
    if y_test_labels is None:
        y_test_labels = y_test

    # training the final model with the best hyperparameter C
    final_model = LogisticRegression(C=best_C, random_state=42, max_iter=1000)
    final_model.fit(x_train, y_train_labels)

    # evaluating the model performance on the test set
    test_preds = final_model.predict(x_test)
    test_accuracy = accuracy_score(y_test_labels, test_preds)
    test_f1_macro = f1_score(y_test_labels, test_preds, average='macro')
    test_f1_micro = f1_score(y_test_labels, test_preds, average='micro')

    print(f"Testing accuracy = {test_accuracy:.4f}")
    print(f"Test macro f1 score: {test_f1_macro:.4f}")
    print(f"Test micro f1 score: {test_f1_micro:.4f}")

In [None]:
# refit with the selected C and report the test-set metrics
print_performance(best_C=best_C, x_train=x_train_vec, x_test=x_test_vec)

Testing accuracy = 0.9870
Test macro f1 score: 0.9690
Test micro f1 score: 0.9870


In [None]:
# (b) frequency vector
# with the default binary=False, each entry is the raw count of a term in an article
vectorizer = CountVectorizer()

# the vocabulary is learned from the training split only, which avoids data leakage
x_train_vec = vectorizer.fit_transform(x_train)

# validation and test splits are encoded with that training vocabulary
x_val_vec, x_test_vec = map(vectorizer.transform, (x_val, x_test))

# select the C that gives the best validation macro f1 for these features
best_C = find_best_model(x_train=x_train_vec, x_val=x_val_vec)

For C=0.001, validation f1 score (macro) = 0.9422420668938205
For C=0.01, validation f1 score (macro) = 0.9641888928524152
For C=0.1, validation f1 score (macro) = 0.9641888928524152
For C=1, validation f1 score (macro) = 0.9628580201033726
For C=10, validation f1 score (macro) = 0.960571429889416


In [None]:
# refit with the selected C and report the test-set metrics
print_performance(best_C=best_C, x_train=x_train_vec, x_test=x_test_vec)

Testing accuracy = 0.9878
Test macro f1 score: 0.9724
Test micro f1 score: 0.9878


In [None]:
# (c) tf-idf vector
vectorizer = TfidfVectorizer()

# vocabulary and idf weights are learned from the training split only, which avoids data leakage
x_train_vec = vectorizer.fit_transform(x_train)

# validation and test splits are encoded with the statistics fitted on the training split
x_val_vec, x_test_vec = map(vectorizer.transform, (x_val, x_test))

# select the C that gives the best validation macro f1 for these features
best_C = find_best_model(x_train=x_train_vec, x_val=x_val_vec)

For C=0.001, validation f1 score (macro) = 0.2857142857142857
For C=0.01, validation f1 score (macro) = 0.2857142857142857
For C=0.1, validation f1 score (macro) = 0.8876118256933653
For C=1, validation f1 score (macro) = 0.9568308323126699
For C=10, validation f1 score (macro) = 0.9705033871076241


In [None]:
# refit with the selected C and report the test-set metrics
print_performance(best_C=best_C, x_train=x_train_vec, x_test=x_test_vec)

Testing accuracy = 0.9913
Test macro f1 score: 0.9798
Test micro f1 score: 0.9913


## Part 2: Word Embeddings (GloVe and Word2Vec)

In [None]:
# (i) Using glove embeddings

# word -> vector lookup, filled from the 100 dimensional glove file
glove_embeddings = {}
file_path = 'glove.6B.100d.txt' # file name for the 100 dimensional embedding file

# I have used Claude AI to generate the next few lines of code in this cell (to load the glove embeddings) as i am unfamiliar with glove's embedding format
# each line of the file is: the word, followed by its whitespace-separated vector components
with open(file_path, 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        parts = row.split()
        glove_embeddings[parts[0]] = np.array(parts[1:], dtype='float32')

print(f"Loaded {len(glove_embeddings)} word vectors")

Loaded 400000 word vectors


In [None]:
# i will define a function to convert each article's text into its averaged vector
def document_to_vector(text, embeddings, dim=None):
    """Represent a text as the mean of the embeddings of its known words.

    The text is lower-cased and split on whitespace. Words missing from
    `embeddings` are ignored.

    Args:
        text: the document as a single string.
        embeddings: mapping from word to its embedding vector.
        dim: length of the zero vector returned when no word of `text` is in
            `embeddings`. If None, it is taken from an entry of `embeddings`.

    Returns:
        A 1-D array: the element-wise mean of the matched word vectors, or an
        all-zero vector when nothing matched.
    """
    words = text.lower().split() # tokenising words from text
    word_vectors = [embeddings[word] for word in words if word in embeddings]

    if word_vectors:
        return np.mean(word_vectors, axis=0) # averaging the word vectors

    # No known word: np.mean([]) would give a scalar NaN plus a RuntimeWarning,
    # and that scalar would break the stacked 2-D feature matrix downstream.
    if dim is not None:
        return np.zeros(dim, dtype='float32')
    if embeddings:
        return np.zeros_like(next(iter(embeddings.values())))
    return np.zeros(0, dtype='float32')

# converting every split into a matrix with one averaged glove vector per article
x_train_glove, x_val_glove, x_test_glove = (
    np.array([document_to_vector(article, glove_embeddings) for article in split])
    for split in (x_train, x_val, x_test)
)

In [None]:
# select the C that gives the best validation macro f1 for the averaged glove features
best_C = find_best_model(x_train=x_train_glove, x_val=x_val_glove)

For C=0.001, validation f1 score (macro) = 0.2857142857142857
For C=0.01, validation f1 score (macro) = 0.8293433326835634
For C=0.1, validation f1 score (macro) = 0.9081110872777539
For C=1, validation f1 score (macro) = 0.945777565768855
For C=10, validation f1 score (macro) = 0.9501629925220065


In [None]:
# refit with the selected C and report the test-set metrics
print_performance(best_C=best_C, x_train=x_train_glove, x_test=x_test_glove)

Testing accuracy = 0.9878
Test macro f1 score: 0.9706
Test micro f1 score: 0.9878


In [None]:
# (ii) Training Word2Vec on the given data

# tokenising by creating a list of the list of words in each TRAINING article.
# The embeddings are learned from x_train only: training on `texts` (all splits) would let
# validation/test articles shape the features, the same leakage the vectorisers above avoid.
# Validation/test words not seen in training are skipped by the membership check in w2v_vectorisation.
tokens = [text.lower().split() for text in x_train]

# training the word2vec model on generated tokens with context size = 5, 4 cpu cores and CBOW training for every word present in the documents
# seed=42 fixes the initialisation; NOTE(review): per the gensim docs, fully deterministic
# runs additionally need workers=1 and a fixed PYTHONHASHSEED
w2v_model = Word2Vec(sentences=tokens, vector_size=100, window=5, min_count=1, workers=4, sg=0, epochs=10, seed=42)

print(f"Vocabulary size = {len(w2v_model.wv)}")

Vocabulary size = 262246


In [None]:
# converting documents to 100-dim vectors

# i will define a function to convert the words in each article's text into vectors
def w2v_vectorisation(text, w2v_model):
    """Represent a text as the mean of the Word2Vec vectors of its known words.

    The text is lower-cased and split on whitespace. Words missing from the
    model's vocabulary (`w2v_model.wv`) are ignored.

    Args:
        text: the document as a single string.
        w2v_model: trained model exposing word vectors through `.wv`.

    Returns:
        A 1-D array: the element-wise mean of the matched word vectors, or an
        all-zero vector of length `w2v_model.wv.vector_size` when nothing matched.
    """
    tokens = text.lower().split() # tokenising

    # extracting word vectors from the word2vec model
    word_vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]

    # No known word: np.mean([]) would give a scalar NaN plus a RuntimeWarning,
    # and that scalar would break the stacked 2-D feature matrix downstream.
    if not word_vectors:
        return np.zeros(w2v_model.wv.vector_size, dtype='float32')

    # averaging all the word vectors to get the document vector
    return np.mean(word_vectors, axis=0)

# vectorising every split into a matrix with one averaged word2vec vector per article
x_train_w2v, x_val_w2v, x_test_w2v = (
    np.array([w2v_vectorisation(article, w2v_model) for article in split])
    for split in (x_train, x_val, x_test)
)

print("Successfully vectorised all 3 text datasets.")

Successfully vectorised all 3 text datasets.


In [None]:
# select the C that gives the best validation macro f1 for the averaged word2vec features
best_C = find_best_model(x_train=x_train_w2v, x_val=x_val_w2v)

For C=0.001, validation f1 score (macro) = 0.8346177921855383
For C=0.01, validation f1 score (macro) = 0.9034057763754094
For C=0.1, validation f1 score (macro) = 0.9370720698450686
For C=1, validation f1 score (macro) = 0.9468289223824221
For C=10, validation f1 score (macro) = 0.9466447408060996


In [None]:
# refit with the selected C and report the test-set metrics
print_performance(best_C=best_C, x_train=x_train_w2v, x_test=x_test_w2v)

Testing accuracy = 0.9844
Test macro f1 score: 0.9594
Test micro f1 score: 0.9844


## Part 3: BERT

In [None]:
# this fine-tuning task is run on Google Colab's A100 GPU to speed things up

# the tokeniser and the encoder are loaded from the same pretrained checkpoint
bert_checkpoint = 'bert-base-uncased'

tokeniser = BertTokenizer.from_pretrained(bert_checkpoint)
bert = BertModel.from_pretrained(bert_checkpoint)

In [None]:
# number of distinct classes found in the training labels
num_classes = np.unique(y_train).size

# i will create a simple model with BERT and one classification layer on top

class BertClassifier(nn.Module):
    """A BERT encoder followed by a single linear classification layer.

    Args:
        bert_model: encoder module. It is called with `input_ids` and
            `attention_mask` keywords and must return an object with a
            `pooler_output` attribute.
        num_classes: number of output classes (size of the logits).
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        # Take the encoder width from the model config so the head also fits other
        # BERT sizes; 768 (bert-base's output size) is kept as the fallback.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask): # prediction function
        """Return unnormalised class scores (logits), one row per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask) # getting BERT output

        # pooler_output is the pooled [CLS] representation: per the transformers docs it is
        # the first token's final hidden state after an extra Linear + Tanh layer,
        # not the raw [CLS] hidden state.
        cls_output = outputs.pooler_output

        logits = self.classifier(cls_output) # passing through the classification layer
        return logits

In [None]:
# wrap the pretrained encoder in the classifier, then place the whole model on the GPU
model = BertClassifier(bert_model=bert, num_classes=num_classes)
model.to(torch.device('cuda'))

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
# next i will define a dataset class to convert text to BERT's format
class TextDataset(Dataset):
  """Pairs each raw text with its label; no tokenisation happens in this class."""

  def __init__(self, texts, labels):
    # Both sequences are stored as given and indexed in parallel.
    self.texts, self.labels = texts, labels

  def __len__(self):
    """Number of samples, taken from the texts sequence."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the (text, label) pair stored at position `idx`."""
    return self.texts[idx], self.labels[idx]

# one TextDataset per split, pairing the raw texts with their encoded labels
train_bert, val_bert, test_bert = (
    TextDataset(split_texts, split_labels)
    for split_texts, split_labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [None]:
# DataLoaders with batch size = 16 for every split
batch_size = 16

# only the training loader shuffles, since it is the one used for fine-tuning
train_loader = DataLoader(train_bert, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_bert, test_bert)
)

# training setup: AdamW over all model parameters, cross-entropy loss on the logits
optimiser = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# i will define a function to train/fine-tune the model for a single epoch

def train_epoch(model, train_loader, optimiser, loss_fn):
    """Fine-tune the model for one pass over the training data.

    Each batch of raw texts is tokenised with the notebook-level `tokeniser`
    (padded/truncated to 64 tokens) and used for one optimiser step.

    Args:
        model: classifier called as `model(input_ids, attention_mask)`.
        train_loader: iterable yielding `(texts, labels)` batches.
        optimiser: optimiser over the model's parameters.
        loss_fn: loss called as `loss_fn(logits, labels)`.

    Returns:
        The mean batch loss over the epoch.
    """
    model.train() # setting the model to training mode
    # Use the device the model already lives on instead of hard-coding 'cuda'.
    device = next(model.parameters()).device
    total_loss = 0 # initialising loss

    for texts, labels in tqdm(train_loader, desc='Training'): # process each batch with progress bar
        tokens = tokeniser(list(texts), max_length=64, padding='max_length', truncation=True, return_tensors='pt') # tokenising

        # moving the batch to the model's device
        input_ids = tokens['input_ids'].to(device)
        attention_mask = tokens['attention_mask'].to(device)
        # torch.tensor(labels) on an existing tensor copies it and emits the UserWarning
        # seen in the training output; as_tensor converts without the warning.
        labels = torch.as_tensor(labels, dtype=torch.long).to(device)

        optimiser.zero_grad() # clearing gradients
        logits = model(input_ids, attention_mask) # forward pass

        loss = loss_fn(logits, labels) # computing loss
        loss.backward() # backpropagation

        optimiser.step() # updating weights
        total_loss += loss.item() # adding to total loss

    avg_loss = total_loss / len(train_loader)
    return avg_loss # returning the average loss


In [None]:
# now i will define the evaluation function
def evaluate(model, dataloader):
  """Run the model over a dataloader and collect predictions with their labels.

  Each batch of raw texts is tokenised with the notebook-level `tokeniser`
  (padded/truncated to 64 tokens). Gradients are disabled and the model is
  put in evaluation mode.

  Args:
    model: classifier called as `model(input_ids, attention_mask)`.
    dataloader: iterable yielding `(texts, labels)` batches.

  Returns:
    A tuple `(preds, labels)` of NumPy arrays: the argmax class per sample and
    the corresponding true labels, in dataloader order.
  """
  model.eval() # setting to evaluation mode
  # Use the device the model already lives on instead of hard-coding 'cuda'.
  device = next(model.parameters()).device
  preds = []
  labels = []

  with torch.no_grad(): # disabling gradients
    for texts, batch_labels in tqdm(dataloader, desc='Evaluating'):
      tokens = tokeniser(list(texts), max_length=64, padding='max_length', truncation=True, return_tensors='pt') # tokenising

      # moving the batch to the model's device
      input_ids = tokens['input_ids'].to(device)
      attention_mask = tokens['attention_mask'].to(device)

      # get predictions
      logits = model(input_ids, attention_mask)
      batch_preds = torch.argmax(logits, dim=1)

      # Store plain Python ints rather than 0-dim tensors, so the final
      # np.array conversion is explicit and cheap.
      preds.extend(batch_preds.cpu().tolist())
      labels.extend(torch.as_tensor(batch_labels).tolist())

  return np.array(preds), np.array(labels)

In [None]:
# fine-tune for 3 epochs, validating after each one and checkpointing the best model
epochs = 3
best_val_f1 = 0

for epoch in range(epochs):
  print(f"Epoch #{epoch+1}")

  # one pass over the training data; the mean batch loss is reported
  train_loss = train_epoch(model, train_loader, optimiser, loss_fn)
  print(f'Training loss: {train_loss:.4f}')

  # score the current weights on the validation split
  val_preds, val_labels = evaluate(model, val_loader)
  val_accuracy = accuracy_score(val_labels, val_preds)
  val_f1_macro, val_f1_micro = (
      f1_score(val_labels, val_preds, average=averaging) for averaging in ('macro', 'micro')
  )

  print(f"Validation accuracy = {val_accuracy:.4f}")
  print(f"Validation macro f1 score: {val_f1_macro:.4f}")
  print(f"Validation micro f1 score: {val_f1_micro:.4f}")

  # nothing to save unless this epoch strictly beats the best validation macro f1 so far
  if not val_f1_macro > best_val_f1:
    continue

  best_val_f1 = val_f1_macro
  torch.save(model.state_dict(), 'best_bert_model.pt')


Epoch #1


  labels = torch.tensor(labels).to('cuda')
Training: 100%|██████████| 576/576 [06:35<00:00,  1.45it/s]


Training loss: 0.1328


Evaluating: 100%|██████████| 72/72 [00:39<00:00,  1.85it/s]


Validation accuracy = 0.9644
Validation macro f1 score: 0.9184
Validation micro f1 score: 0.9644
Epoch #2


  labels = torch.tensor(labels).to('cuda')
Training: 100%|██████████| 576/576 [06:39<00:00,  1.44it/s]


Training loss: 0.0395


Evaluating: 100%|██████████| 72/72 [00:42<00:00,  1.69it/s]


Validation accuracy = 0.9774
Validation macro f1 score: 0.9528
Validation micro f1 score: 0.9774
Epoch #3


  labels = torch.tensor(labels).to('cuda')
Training: 100%|██████████| 576/576 [06:42<00:00,  1.43it/s]


Training loss: 0.0139


Evaluating: 100%|██████████| 72/72 [00:42<00:00,  1.70it/s]

Validation accuracy = 0.9722
Validation macro f1 score: 0.9392
Validation micro f1 score: 0.9722





In [None]:
# restore the checkpoint saved at the best validation macro f1, then score it on the test split
model.load_state_dict(torch.load('best_bert_model.pt'))

# predictions and true labels for the test split
test_preds, test_labels = evaluate(model, test_loader)

# accuracy plus both f1 averages
test_accuracy = accuracy_score(test_labels, test_preds)
test_f1_macro, test_f1_micro = (
    f1_score(test_labels, test_preds, average=averaging) for averaging in ('macro', 'micro')
)

print(f"Test accuracy: {test_accuracy:.4f}")
print(f"Test macro f1 score: {test_f1_macro:.4f}")
print(f"Test micro f1 score: {test_f1_micro:.4f}")

Evaluating: 100%|██████████| 72/72 [00:42<00:00,  1.68it/s]

Test accuracy: 0.9835
Test macro f1 score: 0.9634
Test micro f1 score: 0.9835



