In [1]:
!git clone https://github.com/scottjaymsu/nlp-autograder.git
%cd nlp-autograder

Cloning into 'nlp-autograder'...
remote: Enumerating objects: 62, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 62 (delta 15), reused 35 (delta 9), pack-reused 16 (from 1)[K
Receiving objects: 100% (62/62), 110.50 MiB | 13.37 MiB/s, done.
Resolving deltas: 100% (18/18), done.
Updating files: 100% (28/28), done.
/content/nlp-autograder


In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from gensim.models import Word2Vec

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


In [26]:
# Step 1: Load and preprocess the data
data = pd.read_csv('train.csv').dropna()
X = data.iloc[:, 1]
y = data.iloc[:, 2].astype(int)
max_score = y.max()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

def preprocess_text(text):
    """Preprocesses a single essay."""

    # cleaning text
    text = text.lower()

    text = re.sub(r'[^\w\s]', '', text)

    #tokenization
    tokens = word_tokenize(text)

    # other processing
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

X_train_preprocessed = X_train.apply(preprocess_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [28]:
# embeddings for train set
w2v_model = Word2Vec(sentences=[essay.split() for essay in X_train_preprocessed], vector_size=50, window=5, min_count=1, workers=4)

def get_feature_vector(essay, w2v_model):
    words = essay.split()
    word_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(w2v_model.vector_size)

# create feature vectors for train set
feature_vectors = np.array([get_feature_vector(essay, w2v_model) for essay in X_train_preprocessed])

In [29]:
features_tensor = torch.tensor(feature_vectors, dtype=torch.float32)
labels_tensor = torch.tensor(y_train.values-1, dtype=torch.long)

# fully connected NN
class FCNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FCNN, self).__init__()

        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )


    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


In [31]:
    # train model
    def train_model(model, train_loader, loss_fn, optimizer, epochs=1000):
        model.train()
        for epoch in range(epochs):
            running_loss = 0.0
            for inputs, labels in train_loader:
                optimizer.zero_grad()  # Zero the gradients
                outputs = model(inputs)  # Forward pass
                loss = loss_fn(outputs, labels)  # Compute loss
                running_loss += loss.item()
                loss.backward()  # Backward pass
                optimizer.step()  # Update weights

        model.eval()
        y_actual, y_pred = [], []
        with torch.no_grad():
          for inputs, labels in train_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_actual.extend(labels.numpy())
            y_pred.extend(predicted.numpy())

        train_accuracy = accuracy_score(y_actual, y_pred)
        print(f"\nTrain Set Accuracy: {train_accuracy}")



train_dataset_full = TensorDataset(features_tensor, labels_tensor)
train_loader_full = DataLoader(train_dataset_full, batch_size=64, shuffle=True)

model = FCNN(input_dim=features_tensor.shape[1], hidden_dim=256, output_dim=max_score)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y)
class_weights = torch.tensor(class_weights, dtype=torch.float)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

train_model(model, train_loader_full, loss_fn, optimizer, epochs=1000)



Train Set Accuracy: 0.57975910693302


In [38]:
# cross validate model using 5 folds
k_folds = 5
kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

fold_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(features_tensor, labels_tensor)):
    print(f"Fold {fold + 1}/{k_folds}")

    train_features, val_features = features_tensor[train_idx], features_tensor[val_idx]
    train_labels, val_labels = labels_tensor[train_idx], labels_tensor[val_idx]

    train_dataset = TensorDataset(train_features, train_labels)
    val_dataset = TensorDataset(val_features, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

    model = FCNN(input_dim=features_tensor.shape[1], hidden_dim=256, output_dim=max_score)
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
    class_weights = torch.tensor(class_weights, dtype=torch.float)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    train_model(model, train_loader, loss_fn, optimizer, epochs=1000)

    model.eval()
    y_actual, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_actual.extend(labels.numpy())
            y_pred.extend(predicted.numpy())

    fold_accuracy = accuracy_score(y_actual, y_pred)
    fold_accuracies.append(fold_accuracy)
    print(f"Fold {fold + 1} Accuracy: {fold_accuracy}")


Fold 1/5

Train Set Accuracy: 0.577120822622108
Fold 1 Accuracy: 0.4185022026431718
Fold 2/5

Train Set Accuracy: 0.5829966948218877
Fold 2 Accuracy: 0.3957415565345081
Fold 3/5

Train Set Accuracy: 0.5806096217407272
Fold 3 Accuracy: 0.4236417033773862
Fold 4/5

Train Set Accuracy: 0.5889480447953002
Fold 4 Accuracy: 0.3997060984570169
Fold 5/5

Train Set Accuracy: 0.5849091242885992
Fold 5 Accuracy: 0.4019103600293902


In [35]:
# average validation accuracy
average_accuracy = np.mean(fold_accuracies)
print(f"\nAverage accuracy across {k_folds} folds: {average_accuracy}")



Average accuracy across 5 folds: 0.2773179002655256


In [36]:
# retrain and test
train_dataset_full = TensorDataset(features_tensor, labels_tensor)
train_loader_full = DataLoader(train_dataset_full, batch_size=64, shuffle=True)

final_model = FCNN(input_dim=features_tensor.shape[1], hidden_dim=256, output_dim=len(np.unique(y)))
final_optimizer = optim.Adam(final_model.parameters(), lr=0.0001)


train_model(final_model, train_loader_full, loss_fn, final_optimizer, epochs=1000)


X_test_preprocessed = X_test.apply(preprocess_text)

feature_vectors_test = np.array([get_feature_vector(essay, w2v_model) for essay in X_test_preprocessed])
features_tensor_test = torch.tensor(feature_vectors_test, dtype=torch.float32)
labels_tensor_test = torch.tensor(y_test.values-1, dtype=torch.long)

test_dataset = TensorDataset(features_tensor_test, labels_tensor_test)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# evaluate final model
final_model.eval()
y_actual_test, y_pred_test = [], []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = final_model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_actual_test.extend(labels.numpy())
        y_pred_test.extend(predicted.numpy())

test_accuracy = accuracy_score(y_actual_test, y_pred_test)
print(f"\nTest Set Accuracy: {test_accuracy}")



Train Set Accuracy: 0.5900411280846063

Test Set Accuracy: 0.4112808460634548
