In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score

In [2]:
# Preparing Dataset
df = pd.read_csv("Dataset/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')
df.dropna(axis=1, inplace=True)
df.drop(columns=['domain1_score', 'rater1_domain1', 'rater2_domain1'], inplace=True, axis=1)
df.head()

temp = pd.read_csv("Processed_data.csv")
temp.drop("Unnamed: 0", inplace=True, axis=1)
df['domain1_score'] = temp['final_score']
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",8
4,5,1,"Dear @LOCATION1, I know having computers has a...",6


In [3]:
# Make Dataset
y = df['domain1_score']
df.drop('domain1_score', inplace=True, axis=1)
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape

(9083, 3)

In [4]:
# PREPROCESSING
train_e = X_train['essay'].tolist()
test_e = X_test['essay'].tolist()
train_sents = []
test_sents = []

stop_words = set(stopwords.words('english')) 

def sent2word(x):
    x = re.sub("[^A-Za-z]", " ", x)
    x.lower()
    filtered_sentence = [] 
    words = x.split()
    for w in words:
        if w not in stop_words: 
            filtered_sentence.append(w)
    return filtered_sentence

def essay2word(essay):
    essay = essay.strip()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw = tokenizer.tokenize(essay)
    final_words = []
    for i in raw:
        if len(i) > 0:
            final_words.append(sent2word(i))
    return final_words

for i in train_e:
    train_sents += essay2word(i)

for i in test_e:
    test_sents += essay2word(i)

len(train_sents)
train_sents[0]


['It',
 'first',
 'day',
 'high',
 'school',
 'gut',
 'full',
 'butterflies',
 'make',
 'want',
 'run',
 'bathrooms',
 'hide',
 'world']

In [5]:
# Preparing WORD2VEC and GRU Model
def get_model():
    model = Sequential()
    model.add(GRU(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(GRU(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

In [6]:
# Training Word2Vec model
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

model = Word2Vec(train_sents, 
                 workers=num_workers, 
                 vector_size=num_features,  # Changed from size to vector_size
                 min_count=min_word_count, 
                 window=context, 
                 sample=downsampling)

model.init_sims(replace=True)
model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

def makeVec(words, model, num_features):
    vec = np.zeros((num_features,), dtype="float32")
    noOfWords = 0.
    index2word_set = set(model.wv.index_to_key)  # Updated this line
    for i in words:
        if i in index2word_set:
            noOfWords += 1
            vec = np.add(vec, model.wv[i])  # Use model.wv[i] to get the vector
    if noOfWords > 0:  # Avoid division by zero
        vec = np.divide(vec, noOfWords)
    return vec


def getVecs(essays, model, num_features):
    c = 0
    essay_vecs = np.zeros((len(essays), num_features), dtype="float32")
    for i in essays:
        essay_vecs[c] = makeVec(i, model, num_features)
        c += 1
    return essay_vecs

  model.init_sims(replace=True)


In [7]:
# Clean and vectorize the training essays
clean_train = []
for i in train_e:
    clean_train.append(sent2word(i))
training_vectors = getVecs(clean_train, model, num_features)

In [8]:
# Clean and vectorize the test essays
clean_test = []
for i in test_e:
    clean_test.append(sent2word(i))
testing_vectors = getVecs(clean_test, model, num_features)
training_vectors.shape
training_vectors = np.array(training_vectors)
testing_vectors = np.array(testing_vectors)

In [9]:
# Reshaping train and test vectors to 3 dimensions. (1 represents one timestep)
training_vectors = np.reshape(training_vectors, (training_vectors.shape[0], 1, training_vectors.shape[1]))
testing_vectors = np.reshape(testing_vectors, (testing_vectors.shape[0], 1, testing_vectors.shape[1]))


In [10]:
gru_model = get_model()


  super().__init__(**kwargs)


In [11]:
# Training and Prediction
gru_model.fit(training_vectors, y_train, batch_size=64, epochs=150)
gru_model.save('final_gru.h5')

y_pred = gru_model.predict(testing_vectors)
y_pred = np.around(y_pred)
print(y_pred)

Epoch 1/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 13.8019 - mae: 2.9443
Epoch 2/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.3280 - mae: 1.8328
Epoch 3/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 5.3015 - mae: 1.8100
Epoch 4/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.0878 - mae: 1.7808
Epoch 5/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.2080 - mae: 1.8203
Epoch 6/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.1009 - mae: 1.7808
Epoch 7/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.1381 - mae: 1.7859
Epoch 8/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.0523 - mae: 1.7754
Epoch 9/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m



[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[[2.]
 [5.]
 [6.]
 ...
 [7.]
 [7.]
 [9.]]
