In [1]:
!git clone https://github.com/scottjaymsu/nlp-autograder.git
%cd nlp-autograder

Cloning into 'nlp-autograder'...
remote: Enumerating objects: 186, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 186 (delta 73), reused 21 (delta 10), pack-reused 70 (from 1)[K
Receiving objects: 100% (186/186), 113.21 MiB | 13.29 MiB/s, done.
Resolving deltas: 100% (93/93), done.
Updating files: 100% (30/30), done.
/content/nlp-autograder


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Subset

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

import os
from tqdm import tqdm
from transformers import BertTokenizer, BertModel



In [None]:
# load and process data
data = pd.read_csv('train.csv').dropna()
X = data.iloc[:, 1]
y = data.iloc[:, 2].astype(int)
max_score = y.max()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')


'\ncwd = os.getcwd()\nembed = cwd + "/sentence_embeddings_train.pt"\nif os.path.exists(embed):\n    X_train = torch.load(embed, weights_only = True)\nelse:\n    print("No train embeddings found")\n\nembed = cwd + "/sentence_embeddings_test.pt"\nif os.path.exists(embed):\n    X_test = torch.load(embed, weights_only=True)\nelse:\n    print("No test embeddings found")\n\nembed = cwd + "/sentence_embeddings_val.pt"\nif os.path.exists(embed):\n    X_val = torch.load(embed, map_location=torch.device(\'cpu\'), weights_only=True)\n    X_val = X_val.to(device)\nelse:\n    print("No val embeddings found")\n'

In [None]:
def get_bert_embeddings(texts, device='cuda', save_name=None):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bert = BertModel.from_pretrained("bert-base-uncased").to(device)
    bert.eval()

    embeddings = []

    for text in tqdm(texts, desc="Processing texts", unit="text"):
        with torch.no_grad():
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
            outputs = bert(**inputs)
            sentence_embedding = outputs.last_hidden_state[:, 0, :]  
            embeddings.append(sentence_embedding.squeeze(0))

    embeddings_tensor = torch.stack(embeddings)

    if save_name:
        torch.save(embeddings_tensor, f"{save_name}.pt")

    return embeddings_tensor

In [None]:
# generate BERT embeddings
features_tensor = get_bert_embeddings(X_train)
features_tensor_test = get_bert_embeddings(X_test)

labels_tensor = torch.tensor(y_train.values - 1, dtype=torch.long)
labels_tensor_test = torch.tensor(y_test.values - 1, dtype=torch.long)


Processing texts: 100%|██████████| 6808/6808 [03:15<00:00, 34.87text/s]
Processing texts: 100%|██████████| 1702/1702 [00:46<00:00, 36.76text/s]


In [28]:
# fully connected NN
class FCNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FCNN, self).__init__()

        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


In [50]:
# train model
def train_model(model, train_loader, loss_fn, optimizer, epochs=1000):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()

    model.eval()
    y_actual, y_pred = [], []
    with torch.no_grad():
      for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_actual.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

    train_accuracy = accuracy_score(y_actual, y_pred)
    print(f"\nTrain Set Accuracy: {train_accuracy}")



train_dataset_full = TensorDataset(features_tensor, labels_tensor)
train_loader_full = DataLoader(train_dataset_full, batch_size=64, shuffle=True)

model = FCNN(input_dim=features_tensor.shape[1], hidden_dim=256, output_dim=max_score).to(device)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

train_model(model, train_loader_full, loss_fn, optimizer, epochs=750)



Train Set Accuracy: 0.7804054054054054


In [51]:
# cross validate model using 5 folds
k_folds = 5
kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

fold_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(features_tensor, labels_tensor)):
    print(f"Fold {fold + 1}/{k_folds}")

    train_features, val_features = features_tensor[train_idx], features_tensor[val_idx]
    train_labels, val_labels = labels_tensor[train_idx], labels_tensor[val_idx]

    train_dataset = TensorDataset(train_features, train_labels)
    val_dataset = TensorDataset(val_features, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

    model = FCNN(input_dim=features_tensor.shape[1], hidden_dim=256, output_dim=max_score).to(device)
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    train_model(model, train_loader, loss_fn, optimizer, epochs=500)

    model.eval()
    y_actual, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_actual.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    fold_accuracy = accuracy_score(y_actual, y_pred)
    fold_accuracies.append(fold_accuracy)
    print(f"Fold {fold + 1} Accuracy: {fold_accuracy}")


Fold 1/5

Train Set Accuracy: 0.6836210062431142
Fold 1 Accuracy: 0.5323054331864905
Fold 2/5

Train Set Accuracy: 0.6733382298934998
Fold 2 Accuracy: 0.5418502202643172
Fold 3/5

Train Set Accuracy: 0.6852735952993022
Fold 3 Accuracy: 0.5205580029368576
Fold 4/5

Train Set Accuracy: 0.6945107398568019
Fold 4 Accuracy: 0.5429831006612784
Fold 5/5

Train Set Accuracy: 0.6787222324215164
Fold 5 Accuracy: 0.5576781778104335


In [52]:
# average validation accuracy
average_accuracy = np.mean(fold_accuracies)
print(f"\nAverage accuracy across {k_folds} folds: {average_accuracy}")



Average accuracy across 5 folds: 0.5390749869718754


In [63]:
# retrain and test
train_dataset_full = TensorDataset(features_tensor, labels_tensor)
train_loader_full = DataLoader(train_dataset_full, batch_size=64, shuffle=True)

final_model = FCNN(input_dim=features_tensor.shape[1], hidden_dim=256, output_dim=len(np.unique(y))).to(device)
final_optimizer = optim.Adam(final_model.parameters(), lr=0.0001)

train_model(final_model, train_loader_full, loss_fn, final_optimizer, epochs=750)

test_dataset = TensorDataset(features_tensor_test, labels_tensor_test)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

final_model.eval()
y_actual_test, y_pred_test = [], []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = final_model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_actual_test.extend(labels.cpu().numpy())
        y_pred_test.extend(predicted.cpu().numpy())

test_accuracy = accuracy_score(y_actual_test, y_pred_test)
print(f"\nTest Set Accuracy: {test_accuracy}")



Train Set Accuracy: 0.777027027027027

Test Set Accuracy: 0.5558166862514688


In [64]:
from sklearn.metrics import cohen_kappa_score
kappa_score = cohen_kappa_score(y_actual_test, y_pred_test)
print(f"\nQuadratic Kappa Score: {kappa_score}")


Quadratic Kappa Score: 0.402699455763871


In [65]:
from sklearn.metrics import precision_score, recall_score, fbeta_score, accuracy_score

# metrics calculated for each class and averaged
precision_macro = precision_score(y_actual_test, y_pred_test, average='macro')
recall_macro = recall_score(y_actual_test, y_pred_test, average='macro')
fscore_macro = fbeta_score(y_actual_test, y_pred_test, beta=1, average='macro')

# uses sums of tp, fp, and fn to perform calculations
precision_micro = precision_score(y_actual_test, y_pred_test, average='micro')
recall_micro = recall_score(y_actual_test, y_pred_test, average='micro')
fscore_micro = fbeta_score(y_actual_test, y_pred_test, beta=1, average='micro')

# weighted average of metrics calculated independently for each class
precision_weighted = precision_score(y_actual_test, y_pred_test, average='weighted')
recall_weighted = recall_score(y_actual_test, y_pred_test, average='weighted')
fscore_weighted = fbeta_score(y_actual_test, y_pred_test, beta=1, average='weighted')

# overall accuracy
accuracy = accuracy_score(y_actual_test, y_pred_test)

print("Precision (Macro):", precision_macro)
print("Recall (Macro):", recall_macro)
print("F1 Score (Macro):", fscore_macro)
print("Precision (Micro):", precision_micro)
print("Recall (Micro):", recall_micro)
print("F1 Score (Micro):", fscore_micro)
print("Precision (Weighted):", precision_weighted)
print("Recall (Weighted):", recall_weighted)
print("F1 Score (Weighted):", fscore_weighted)
print("Overall Accuracy: ", accuracy)

Precision (Macro): 0.4370801250511031
Recall (Macro): 0.4481733064272384
F1 Score (Macro): 0.43976220824315626
Precision (Micro): 0.5558166862514688
Recall (Micro): 0.5558166862514688
F1 Score (Micro): 0.5558166862514688
Precision (Weighted): 0.5652378924371716
Recall (Weighted): 0.5558166862514688
F1 Score (Weighted): 0.5568321765680497
Overall Accuracy:  0.5558166862514688
