In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, cohen_kappa_score


import sys
import os


parent_dir = os.path.dirname(os.getcwd())  
sys.path.append(parent_dir)  

from src.preprocessing import essay_to_sentences, clean_sentence
from src.lstm_model import build_lstm_model, reshape_for_lstm


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load training data
train_df = pd.read_csv(
    "C:/Users/Vijay/Apy/A_LSTM-BERT/data/training_set_rel3.tsv",
    sep="\t",
    encoding="ISO-8859-1"
)

train_df = train_df[["essay", "domain1_score"]].dropna()

# Load test data (NO labels)
test_df = pd.read_csv(
    "C:/Users/Vijay/Apy/A_LSTM-BERT/data/test_set.tsv",
    sep="\t",
    encoding="ISO-8859-1"
)

test_df = test_df[["essay"]].dropna()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Train shape: (12976, 2)
Test shape: (4254, 1)


In [3]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df["essay"],
    train_df["domain1_score"],
    test_size=0.2,
    random_state=42
)

train_essays = X_train.tolist()
val_essays = X_val.tolist()
test_essays = test_df["essay"].tolist()


In [4]:
train_sentences = []
for essay in train_essays:
    train_sentences.extend(essay_to_sentences(essay))


In [5]:
WORD_DIM = 300

w2v_model = Word2Vec(
    sentences=train_sentences,
    vector_size=WORD_DIM,
    window=10,
    min_count=40,
    workers=4,
    sample=1e-3,
    seed=42
)

w2v_model.wv.save_word2vec_format(
    "C:/Users/Vijay/Apy/A_LSTM-BERT/models/word2vec_lstm.bin",
    binary=True
)


In [6]:
def essay_to_vector(words, model, dim):
    valid_words = [w for w in words if w in model.wv]
    if not valid_words:
        return np.zeros(dim)
    return np.mean(model.wv[valid_words], axis=0)


def vectorize_essays(essays, model, dim):
    vectors = np.zeros((len(essays), dim))
    for i, essay in enumerate(essays):
        tokens = clean_sentence(essay)
        vectors[i] = essay_to_vector(tokens, model, dim)
    return vectors


In [7]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True
)

In [8]:
X_train_vec = vectorize_essays(train_essays, w2v_model, WORD_DIM)
X_val_vec   = vectorize_essays(val_essays, w2v_model, WORD_DIM)
X_test_vec  = vectorize_essays(test_essays, w2v_model, WORD_DIM)

X_train_vec = reshape_for_lstm(X_train_vec)
X_val_vec   = reshape_for_lstm(X_val_vec)
X_test_vec  = reshape_for_lstm(X_test_vec)


In [9]:
lstm_model = build_lstm_model(WORD_DIM)

lstm_model.fit(
    X_train_vec,
    y_train,
    validation_data=(X_val_vec, y_val),
    batch_size=64,
    epochs=100,
    verbose=1
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x142b8174c70>

In [11]:
y_val_pred = lstm_model.predict(X_val_vec)
y_val_round = np.round(y_val_pred).astype(int)

rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
kappa = cohen_kappa_score(y_val, y_val_round)

print("Validation RMSE:", rmse)
print("Validation QWK:", kappa)


Validation RMSE: 2.2019970376466884
Validation QWK: 0.34304723908269186


In [12]:
test_predictions = lstm_model.predict(X_test_vec)
test_predictions = np.round(test_predictions).astype(int)

test_df["predicted_score"] = test_predictions
test_df.head()




Unnamed: 0,essay,predicted_score
0,I believe that computers have a positive effec...,9
1,"Dear @CAPS1, I know some problems have came up...",10
2,"Dear to whom it @MONTH1 concern, Computers are...",7
3,"Dear @CAPS1 @CAPS2, @CAPS3 has come to my atte...",10
4,"Dear Local newspaper, I think that people have...",9


In [13]:
test_df.to_csv(
    "C:/Users/Vijay/Apy/A_LSTM-BERT/results/test_set_predictions_lstm.csv",
    index=False
)
