In [5]:
import numpy as np
import pandas as pd
import nltk
import re

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from gensim.models import Word2Vec

from keras.models import Sequential, load_model, model_from_json
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
import keras.backend as K

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, cohen_kappa_score

# Use numpy's triu function
triu = np.triu




In [6]:
import numpy as np
import sys

# Patch scipy.linalg.triu to numpy.triu if it's being imported somewhere
import types
fake_linalg = types.SimpleNamespace(triu=np.triu)
sys.modules['scipy.linalg'] = fake_linalg


In [7]:
import numpy as np
triu = np.triu

matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
upper_triangle = triu(matrix)

print(upper_triangle)


[[1 2 3]
 [0 5 6]
 [0 0 9]]


**Preparing Dataset**

In [8]:
import pandas as pd

# Load the TSV file with proper separator and encoding
df = pd.read_csv("Dataset/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')

# Drop columns that are completely NaN
df.dropna(axis=1, inplace=True)

# Drop specific columns safely
columns_to_drop = ['domain1_score', 'rater1_domain1', 'rater2_domain1']
for col in columns_to_drop:
    if col in df.columns:
        df.drop(columns=col, inplace=True)

# Display first few rows
print(df.head())

# Load processed data
temp = pd.read_csv("Processed_data.csv")

# Drop 'Unnamed: 0' only if it exists
if 'Unnamed: 0' in temp.columns:
    temp.drop("Unnamed: 0", inplace=True, axis=1)

# Optional: Show columns to confirm
print("Columns in temp:", temp.columns)


   essay_id  essay_set                                              essay
0         1          1  Dear local newspaper, I think effects computer...
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...
4         5          1  Dear @LOCATION1, I know having computers has a...
Columns in temp: Index(['essay_id', 'essay_set', 'essay', 'final_score', 'clean_essay',
       'char_count', 'word_count', 'sent_count', 'avg_word_len',
       'spell_err_count', 'noun_count', 'adj_count', 'verb_count',
       'adv_count'],
      dtype='object')


In [9]:
# Ensure both dataframes have the same number of rows
if len(df) == len(temp):
    df = df.copy()  # Avoid SettingWithCopyWarning
    df['domain1_score'] = temp['final_score'].values  # assign as numpy array to avoid index issues
    print(df.head())
else:
    print("Mismatch in row counts: df =", len(df), ", temp =", len(temp))


   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5          1  Dear @LOCATION1, I know having computers has a...   

   domain1_score  
0              6  
1              7  
2              5  
3              8  
4              6  


In [10]:
df['essay'][0]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [11]:
temp.head(1)

Unnamed: 0,essay_id,essay_set,essay,final_score,clean_essay,char_count,word_count,sent_count,avg_word_len,spell_err_count,noun_count,adj_count,verb_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",6,Dear local newspaper I think effects computer...,1441,344,16,4.188953,11,76,75,18,24


In [12]:
#Make Dataset
y = df['domain1_score']
df.drop('domain1_score',inplace=True,axis=1)
X=df

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
X_train.shape

(9083, 3)

**PREPROCESSING**

In [15]:
train_e = X_train['essay'].tolist()
test_e = X_test['essay'].tolist()

In [16]:
train_sents=[]
test_sents=[]

stop_words = set(stopwords.words('english')) 
def sent2word(x):
    x=re.sub("[^A-Za-z]"," ",x)
    x.lower()
    filtered_sentence = [] 
    words=x.split()
    for w in words:
        if w not in stop_words: 
            filtered_sentence.append(w)
    return filtered_sentence

def essay2word(essay):
    essay = essay.strip()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw = tokenizer.tokenize(essay)
    final_words=[]
    for i in raw:
        if(len(i)>0):
            final_words.append(sent2word(i))
    return final_words

for i in train_e:
    train_sents+=essay2word(i)

for i in test_e:
    test_sents+=essay2word(i)

In [17]:
len(train_sents)

116500

In [18]:
train_sents[0]

['It',
 'first',
 'day',
 'high',
 'school',
 'gut',
 'full',
 'butterflies',
 'make',
 'want',
 'run',
 'bathrooms',
 'hide',
 'world']

**Preparing WORD2VEC and LSTM Model**

In [19]:
def get_model():
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

In [22]:
from gensim.models import Word2Vec
import gensim

num_features = 300 
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

model = Word2Vec(train_sents, 
                 vector_size=num_features,  # modern gensim usage
                 workers=num_workers, 
                 min_count=min_word_count, 
                 window=context, 
                 sample=downsampling)

model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)



In [26]:
def makeVec(words, model, num_features):
    vec = np.zeros((num_features,), dtype="float32")
    noOfWords = 0.
    index2word_set = set(model.wv.index_to_key)  # Updated for Gensim 4.x
    for i in words:
        if i in index2word_set:
            noOfWords += 1
            vec = np.add(vec, model.wv.get_vector(i))  # ✅ updated here
    
    # Avoid division by zero, return zero vector if no valid words were found
    if noOfWords > 0:
        vec = np.divide(vec, noOfWords)
    else:
        vec = np.zeros((num_features,), dtype="float32")  # Handle no valid words

    return vec


In [27]:
training_vectors.shape

(9083, 300)

In [28]:
training_vectors = np.array(training_vectors)
testing_vectors = np.array(testing_vectors)

# Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
training_vectors = np.reshape(training_vectors, (training_vectors.shape[0], 1, training_vectors.shape[1]))
testing_vectors = np.reshape(testing_vectors, (testing_vectors.shape[0], 1, testing_vectors.shape[1]))
lstm_model = get_model()

  super().__init__(**kwargs)


In [29]:
training_vectors.shape

(9083, 1, 300)


**TRAINING AND PREDICTION**







In [30]:
lstm_model.fit(training_vectors, y_train, batch_size=64, epochs=150)

Epoch 1/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 12.6394 - mae: 2.8072
Epoch 2/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 5.1904 - mae: 1.8012
Epoch 3/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 5.0471 - mae: 1.7716
Epoch 4/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 4.7019 - mae: 1.7025
Epoch 5/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 4.5792 - mae: 1.6836
Epoch 6/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 4.4861 - mae: 1.6636
Epoch 7/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 4.2830 - mae: 1.6141
Epoch 8/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 4.2508 - mae: 1.6231
Epoch 9/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x16604dfd0>

In [31]:
lstm_model.save('final_lstm.h5')
y_pred = lstm_model.predict(testing_vectors)
y_pred = np.around(y_pred)
y_pred



[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step


array([[3.],
       [6.],
       [6.],
       ...,
       [8.],
       [8.],
       [9.]], dtype=float32)