In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK resources this notebook relies on: the punkt sentence
# tokenizer and the stop-word lists.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the essay training set: a tab-separated file decoded as ISO-8859-1.
# NOTE(review): the path is absolute ('/content/...' looks like a Colab
# working directory) — adjust it when running elsewhere.
data = pd.read_csv('/content/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')

In [None]:
data.head(1)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,rater1_trait1,rater1_trait2,rater1_trait3,rater1_trait4,rater1_trait5,rater1_trait6,rater2_trait1,rater2_trait2,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,,,,,,,,,,,,,,,,,,


In [None]:
data.isnull().any()

essay_id          False
essay_set         False
essay             False
rater1_domain1    False
rater2_domain1    False
rater3_domain1     True
domain1_score     False
rater1_domain2     True
rater2_domain2     True
domain2_score      True
rater1_trait1      True
rater1_trait2      True
rater1_trait3      True
rater1_trait4      True
rater1_trait5      True
rater1_trait6      True
rater2_trait1      True
rater2_trait2      True
rater2_trait3      True
rater2_trait4      True
rater2_trait5      True
rater2_trait6      True
rater3_trait1      True
rater3_trait2      True
rater3_trait3      True
rater3_trait4      True
rater3_trait5      True
rater3_trait6      True
dtype: bool

In [None]:
# Keep only the columns that have no missing values, then discard the two
# per-rater scores so that domain1_score is the only score column left.
# errors='ignore' keeps this cell re-runnable: on a second execution the rater
# columns are already gone and a plain drop() would raise KeyError.
data = data.dropna(axis=1).drop(
    columns=['rater1_domain1', 'rater2_domain1'], errors='ignore'
)

In [None]:
data.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [None]:
# Select features and target by column name rather than by position, so the
# split cannot silently pick up the wrong columns if the set of columns that
# survives the earlier clean-up ever changes.
# y is kept as a one-column DataFrame (double brackets), matching the original
# `iloc[:, 3:4]` shape.
x = data[['essay_id', 'essay_set', 'essay']]
y = data[['domain1_score']]

In [None]:
x

Unnamed: 0,essay_id,essay_set,essay
0,1,1,"Dear local newspaper, I think effects computer..."
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu..."
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl..."
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that..."
4,5,1,"Dear @LOCATION1, I know having computers has a..."
...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...
12972,21628,8,I never understood the meaning laughter is th...
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ..."
12974,21630,8,Trippin' on fen...


In [None]:
y

Unnamed: 0,domain1_score
0,8
1,9
2,7
3,10
4,8
...,...
12971,35
12972,32
12973,40
12974,40


In [None]:
# Score range of every essay set (set ids 1..8): each set is graded on its
# own scale, which the two printed lists make visible.
scores_by_set = [
    data.loc[data['essay_set'] == set_id, 'domain1_score']
    for set_id in range(1, 9)
]
minsrc = [set_scores.min() for set_scores in scores_by_set]
maxsrc = [set_scores.max() for set_scores in scores_by_set]
print("minimum score= ",minsrc )
print("maximum score= ",maxsrc )

minimum score=  [2, 1, 0, 0, 0, 0, 2, 10]
maximum score=  [12, 6, 3, 3, 4, 4, 24, 60]


In [None]:
def essay_to_wordlist(essay_v):
    """Turn a piece of text into a list of lowercase, stop-word-free tokens.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lowercased and split on whitespace, and tokens found in NLTK's English
    stop-word list are dropped.

    Args:
        essay_v: the raw text (a whole essay or a single sentence).

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    english_stops = set(stopwords.words("english"))
    tokens = []
    for token in letters_only.lower().split():
        if token not in english_stops:
            tokens.append(token)
    return tokens

#Split an essay into sentences, then tokenize each sentence with essay_to_wordlist().
def essay_to_sentences(essay_v):
    """Return the essay as a list of sentences, each one a list of cleaned words.

    The essay is stripped of surrounding whitespace, split into sentences with
    NLTK's English punkt tokenizer, and every non-empty sentence is passed
    through essay_to_wordlist().

    Args:
        essay_v: the raw essay text.

    Returns:
        list of list of str: one token list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(sentence)
        for sentence in punkt.tokenize(essay_v.strip())
        if len(sentence) > 0
    ]

In [None]:
#Feature vector is made from the words list of an essay.
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one essay.

    Args:
        words: iterable of tokens (strings) belonging to a single essay.
        model: object exposing ``model.wv.index2entity`` (the vocabulary) and
            ``model.wv[word]`` (vector lookup).
        num_features: length of each word vector.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the vectors
        of all words found in the vocabulary. When none of the words is in the
        vocabulary (or ``words`` is empty) the all-zero vector is returned
        instead of dividing by zero, which used to yield a NaN-filled vector.
    """
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index2entity)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model.wv[word])
    if num_words == 0:
        # Nothing to average: keep the zero vector rather than emit NaNs.
        return featureVec
    return np.divide(featureVec,num_words)

#Build the matrix of averaged word vectors, one row per essay.
def getAvgFeatureVecs(essays, model, num_features):
    """Compute the averaged feature vector of every essay.

    Args:
        essays: sized collection of token lists, one per essay.
        model: the model handed through to makeFeatureVec().
        num_features: length of each feature vector.

    Returns:
        float32 array of shape (len(essays), num_features); row i is
        makeFeatureVec(essays[i], model, num_features).
    """
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for row, essay in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay, model, num_features)
    return essayFeatureVecs

In [None]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 0)

In [None]:
x_train.shape

(10380, 3)

In [None]:
x_test.shape

(2596, 3)

In [None]:
train_essays = x_train['essay']
test_essays = x_test['essay']

In [None]:
# Gather the tokenized sentences of every training essay into one flat list.
sentences = []
for essay in train_essays:
    sentences.extend(essay_to_sentences(essay))

In [None]:
sentences[0]

['essay', 'author', 'talking', 'journey']

In [None]:
# Train Word2Vec on the sentences collected from the training essays, using
# 300-dimensional vectors (the same 300 is hard-coded in later cells).
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 calls it
# `vector_size`) — confirm the installed version.
# NOTE: the name `model` is rebound to the Keras network in a later cell.
model = Word2Vec(sentences, workers = 4, size= 300, min_count= 40, window = 10, sample = 1e-3)

In [None]:
model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

In [None]:
# Tokenize every training essay as one flat word list (no sentence splitting)
# and average its word vectors into a single 300-dimensional row.
# NOTE: `model` must still be the Word2Vec model here; a later cell rebinds
# the name to the Keras network, after which this cell cannot be re-run.
clean_train_essays = []
for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v))
trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, 300)
 
# Same processing for the held-out test essays.
# NOTE: the loop variable essay_v stays bound at notebook scope afterwards.
clean_test_essays = []
for essay_v in test_essays:
    clean_test_essays.append(essay_to_wordlist( essay_v))
testDataVecs = getAvgFeatureVecs( clean_test_essays, model, 300 )
    
# getAvgFeatureVecs already returns NumPy arrays, so these calls only copy them.
trainDataVecs = np.array(trainDataVecs)
testDataVecs = np.array(testDataVecs)

# Reshape train and test vectors to 3 dimensions: (samples, 1, features),
# where the 1 represents a single timestep.
trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

In [None]:
model = Sequential()

In [None]:
# Two stacked LSTM layers followed by dropout and a single ReLU output unit.
# Each input sample is one timestep of 300 features.
model = Sequential([
    LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1,300], return_sequences=True),
    LSTM(64, recurrent_dropout=0.4),
    Dropout(0.5),
    Dense(1, activation='relu'),
])

In [None]:
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])

In [None]:
# Fit the network on the averaged essay vectors: 100 epochs, batches of 64.
# NOTE(review): no validation data is passed, so over-fitting is not
# monitored while training.
model.fit(trainDataVecs, y_train, batch_size=64, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f7e61bdddd0>

In [None]:
testDataVecs.shape

(2596, 1, 300)

In [None]:
y_train.shape

(10380, 1)

In [None]:
ypred = model.predict(testDataVecs)

In [None]:
ypred

array([[12.615318 ],
       [15.47917  ],
       [ 3.1516871],
       ...,
       [ 1.917428 ],
       [ 2.8116941],
       [ 2.8596215]], dtype=float32)

In [None]:
y_test

Unnamed: 0,domain1_score
10823,15
11018,17
2263,4
11074,20
12491,33
...,...
6906,2
8418,3
5226,2
8992,3


In [None]:
model.save('final_lstm.h5')

In [None]:
from sklearn.metrics import r2_score
# NOTE: despite the variable name, r2_score returns the coefficient of
# determination (R^2) of the regression, not a classification accuracy.
accuracy = r2_score(y_test,ypred)
accuracy

0.939725818586611

In [None]:

import gensim.models.keyedvectors as word2vec

In [None]:
testsen='''Dear local newspaper, I\'ve heard that not many people think computers benefit society. I disagree with that. Computers benefit society by teaching hand-eye coordination, allowing people to learn about foregin places, and allow people to communicate with others online. Some people were basically born with hand-eye coordination, but most people weren\'t, like me, for example. But, thanks to computers, my hand-eye coordination has gone up! "On average, computers can increase a persons hand-eye coordination by up to @PERCENT1 says @PERSON2. Think of it, just by learning to tye, your hand-eye coordination can go up @PERCENT1.! That\'s amazing! Having good hand-eye coordination will get you far in life, and helps with small, everyday things such as playing an instrument, or doing household chores. It also helps in sports. You need good hand-eye coordination to throw, catch, and aim. If more people spend more time on the computer, then the population\'s hand-eye coordination would increase, and who wouldn\'t want thats. Have you ever heard of @LOCATION1? It is a country in @LOCATION2 that no one really knows much about. By looking it up on the computer, you can become an expert on the country! The computer gives us access to a lot of knowledge, and will make us more of people around the world. This can improve our travel. Instead of going to the same place every vacation, you can go to a place you saw online that looks just as exciting. Also, by learning about other countries and the people in them, we gain respect for them. We @MONTH1 also learn about their traditions, and culture. "The internet gives us access to the world," says @PERSON1, "we can learn so many new things that will make us more of people, countries, and the environment." @CAPS1 is right, we can also be aware of the environment. Since global warming is a problem, we can learn how o preserve foreign places by learning about them on the internet. 
With a click of a mouse you can virtually have the whose world at your fingertips! In a recent study @PERCENT2 of people said they don\'t have time to meet with old friends. The internet, however has made it quick and easy to talk to people online. Websites like @CAPS2, @CAPS3, and @CAPS4 can keep your social life in tact, and can even reunite yu with old friends. For example, @CAPS2 suggest family. You can keep in touch wih many people trough @CAPS2! Also, you can make new friends. Most teens say they are always willing to meet new people, and the internet is a great way to do so! Sites like @CAPS7, and @CAPS8 can can help you find friends by connecting you in a chat room to a random person, just to talk. I once talked with someone on @CAPS7 for over an hour! We found that we both have @CAPS4 accounts and keep in touch there! This also helps you gain confidence because you @MONTH1 have the courage to say something over the computer that you would\'t say real life. The computer can really amp up your social life. As you can see. computers don\'t just "your barain," they imprdove hand-eye coordination, help you learn about far away places, and improve your social life by allowing you to talk to others. Computers benefit the society more than anymore will ever know!'''

In [None]:
# Reload the word vectors from 'word2vecmodel.bin' (the file written earlier
# with save_word2vec_format) and keep the vocabulary as a set for fast
# membership tests.
model1 = word2vec.KeyedVectors.load_word2vec_format('word2vecmodel.bin', binary=True)
index2word_set = set(model1.index2entity)

In [None]:
# Build the feature vector of the sample essay the same way the training
# features were built: clean and tokenize with essay_to_wordlist(), then take
# the MEAN of the vectors of the in-vocabulary words.
# The previous version (a) overwrote the cleaned text with a stale `essay_v`
# left over from an earlier cell, (b) looped over single characters instead of
# words, and (c) summed the vectors without averaging them.
testsen2 = essay_to_wordlist(testsen)
featureVec = np.zeros((300,),dtype="float32")
num_words = 0
for word in testsen2:
    if word in index2word_set:
        num_words += 1
        featureVec = np.add(featureVec,model1[word])
if num_words:
    # Skip the division when nothing matched, to avoid a NaN vector.
    featureVec = np.divide(featureVec, num_words)

In [None]:
featureVec.shape
avc=featureVec.reshape(1,1,300)

In [None]:
from keras.models  import load_model
model2 = load_model("final_lstm.h5")

In [None]:
y_pred = model2.predict(avc)

In [None]:
y_pred

array([[4.0548315]], dtype=float32)