In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [0]:
from google.colab import files
uploaded=files.upload()

Saving training_set_rel3.tsv to training_set_rel3.tsv


In [0]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
dataset=pd.read_csv('training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
dataset.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,rater1_trait1,rater1_trait2,rater1_trait3,rater1_trait4,rater1_trait5,rater1_trait6,rater2_trait1,rater2_trait2,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,,,,,,,,,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,,,,,,,,,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,,,,,,,,,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,,,,,,,,,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,,,,,,,,,,,,,,,,,,


**DATA PREPROCESSING**

In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize


stopwords=set(stopwords.words("english"))

def cleanup_tokenize(essay):
  
  """
  1) remove punctuations , numbers, stopwords 
  """
  sentence_tokens=sent_tokenize(essay)
  sentence=[]
  for essay in sentence_tokens:
    essay=re.sub("[^a-zA-Z]"," ",essay)
    essay=essay.lower().split()
    words=[word for word in essay if not word in stopwords ]
    sentence.append(words)
  
  return sentence




**WORD2VEC**

In [0]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords

stopwords=set(stopwords.words("english"))


def word2vec_model(sentences,essays):
  size=300
  word2vec=Word2Vec(sentences=sentences,
                workers=4,
                min_count=20,
                size=size,
                window=10,
                sample=0.001)

  word2vec.init_sims(replace=True)
  word2vec.wv.save_word2vec_format("word2vecmodel.bin",binary=True)

  clean_essays=[]
  for essay in essays:
    essay=re.sub("[^a-zA-Z]"," ",essay)
    essay=essay.lower().split()
    words=[word for word in essay if not word in stopwords ]
    clean_essays.append(words)

  # Building feature vectors here 

  i=0
  feature_vector_essay=np.zeros((len(clean_essays),size),dtype="float32")
  for e in clean_essays:
    feature_vector=np.zeros((size,),dtype="float32")
    number=0
    index2word_set=set(word2vec.wv.index2word)
    for word in e:
        if word in index2word_set:
          num_words=+1
          feature_vector=np.add(feature_vector,word2vec[word])
    feature_vector=np.divide(feature_vector,number)
    feature_vector_essay[i]=feature_vector
    i=i+1

  trained_data=np.array(feature_vector_essay)
  trained_data=np.reshape(trained_data,(trained_data.shape[0], 1, trained_data.shape[1]))
  
  print(trained_data.shape)

  return trained_data


DEEP LEARNING MODEL - RNN(**LSTM**)

In [0]:
from keras.layers import Embedding,LSTM,Dense,Dropout,Flatten
from keras.models import Sequential

def lstm_model():

  model=Sequential()
  model.add(LSTM(300,dropout=0.4,recurrent_dropout=0.4,input_shape=[1,300],return_sequences=True))
  model.add(LSTM(64,recurrent_dropout=0.4))
  model.add(Dropout(0.5))
  model.add(Dense(1,activation="relu")) # change this to sigmoid and check


  model.compile(
                loss="mean_squared_error",
                optimizer="rmsprop",
                metrics=["accuracy","mae"]
                )
  return model

Using TensorFlow backend.


In [0]:
X=dataset
y=X["domain1_score"]
X=X.dropna(axis=1)
X=X.drop(columns=["rater1_domain1","rater2_domain1"])

from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score


cross_validation=KFold(n_splits=5,shuffle=True)
results=[]
y_pred_list=[]

for traincv,testcv in cross_validation.split(X):
  X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]

  print(X_test.shape)
  print(X_train.shape)
  print(y_test.shape)
  print(y_train.shape)

  train_essays = X_train['essay']
  test_essays = X_test['essay']

  sentences=[]


  for essay in train_essays:
    sentences=sentences+cleanup_tokenize(essay)


  trained_data_vec=word2vec_model(sentences,train_essays)
  test_data_vec=word2vec_model(sentences,test_essays)


  model=lstm_model()
  model.fit( trained_data_vec,y_train,batch_size=64,epochs=50)
  y_pred=model.predict(test_data_vec)
  y_pred=np.around(y_pred)

  print(y_pred.shape)
  print(y_test.values.shape)
  
  result=cohen_kappa_score(y_test.values,y_pred,weights="quadratic")
  print(result)
  results.append(result)


print("Average Kappa score after a 3-fold cross validation: ",np.around(np.array(results).mean(),decimals=4))

(2596, 4)
(10380, 4)
(2596,)
(10380,)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


(10380, 1, 300)
(2596, 1, 300)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
(2596, 1)
(2596,)
0.0
(2595, 4)
(10381, 4)
(2595,)
(10381,)


KeyboardInterrupt: ignored

In [0]:

GLOVE_DIR = './glove.6B/'
SAVE_DIR = './'

import os
import pandas as pd

X = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
y = X['domain1_score']
X = X.dropna(axis=1)
X = X.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [0]:
minimum_scores = [-1, 2, 1, 0, 0, 0, 0, 0, 0]
maximum_scores = [-1, 12, 6, 3, 3, 4, 4, 30, 60]

In [0]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model[word])        
    featureVec = np.divide(featureVec,num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for word2vec model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

In [0]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [0]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

cv = KFold(n_splits=5, shuffle=True)
results = []
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    
    sentences = []
    
    for essay in train_essays:
            # Obtaining all sentences from the training essays.
            sentences += essay_to_sentences(essay, remove_stopwords = True)
            
    # Initializing variables for word2vec model.
    num_features = 300 
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []
    
    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)
    
    # Save any one of the 8 models.
    if count == 5:
         lstm_model.save('./model_weights/final_lstm.h5')
    
    # Round y_pred to the nearest integer.
    y_pred = np.around(y_pred)
    print(y_pred.shape)
    print(y_test.values.shape)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1


--------Fold 1--------

Training Word2Vec Model...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_23 (LSTM)               (None, 1, 300)            721200    
_________________________________________________________________
lstm_24 (LSTM)               (None, 64)                93440     
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 65        
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Ep

KeyboardInterrupt: ignored