# **Automated Essay Scoring using Neural Networks**

In [None]:
# Have to use java8 for language_check

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version

In [None]:
!pip install language-check
!pip install skll
import nltk
nltk.download('stopwords')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
from string import punctuation
import re

from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,StandardScaler
from sklearn.model_selection import cross_val_score,KFold,train_test_split

from gensim.models.word2vec import Word2Vec

from nltk.corpus import stopwords
import language_check
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from skll.metrics import kappa

from scipy.sparse import csr_matrix

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, LSTM, Embedding, Bidirectional, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.optimizers import SGD, Adam
from keras.wrappers.scikit_learn import KerasRegressor
import en_core_web_sm


nlp = en_core_web_sm.load()
stopwords = stopwords.words('english')

In [None]:
def mean_quadratic_weighted_kappa(kappas, weights=None):
    """
    Calculates the mean of the quadratic
    weighted kappas after applying Fisher's r-to-z transform, which is
    approximately a variance-stabilizing transformation.  This
    transformation is undefined if one of the kappas is 1.0, so all kappa
    values are capped in the range (-0.999, 0.999).  The reverse
    transformation is then applied before returning the result.
    
    mean_quadratic_weighted_kappa(kappas), where kappas is a vector of
    kappa values
    mean_quadratic_weighted_kappa(kappas, weights), where weights is a vector
    of weights that is the same size as kappas.  Weights are applied in the
    z-space
    """
    kappas = np.array(kappas, dtype=float)
    if weights is None:
        weights = np.ones(np.shape(kappas))
    else:
        weights = weights / np.mean(weights)

    # ensure that kappas are in the range [-.999, .999]
    kappas = np.array([min(x, .999) for x in kappas])
    kappas = np.array([max(x, -.999) for x in kappas])
    
    z = 0.5 * np.log( (1+kappas)/(1-kappas) ) * weights
    z = np.mean(z)
    kappa = (np.exp(2*z)-1) / (np.exp(2*z)+1)
    return kappa

In [None]:
# Using language tool to correct most spelling and grammatical errors.

def correct_language(df):
    tool = language_check.LanguageTool('en-US')
    df['matches'] = df['essay'].apply(lambda txt:tool.check(txt))
    df['corrections'] = df.apply(lambda l:len(l['matches']),axis=1)
    df['corrected'] = df.apply(lambda l:language_check.correct(l['essay'],l['matches']),axis=1)
    return df

In [None]:
# read essays from training_set
training_set = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Data asap aes/training_set_rel3.tsv',sep='\t',encoding="ISO-8859-1")\
            .rename(columns={'essay_set': 'topic', 'domain1_score': 'target_score', 'domain2_score': 'topic2_target'})

# Count characters and words for each essay
training_set['word_count'] = training_set['essay'].str.strip().str.split().str.len()

# apply spelling and grammar corrections
training_set = correct_language(training_set)

**NLP with Spacy**

In [None]:
sents = []
tokens = []
lemma = []
pos = []
ner = []

stop_words = set(STOP_WORDS)
stop_words.update(punctuation)

for essay in nlp.pipe(training_set['corrected'], batch_size = 100, n_threads = 3):
    if essay.is_parsed:
        tokens.append([e.text for e in essay])
        sents.append([sent.string.strip() for sent in essay.sents])
        pos.append([e.pos_ for e in essay])
        ner.append([e.text for e in essay.ents])
        lemma.append([n.lemma_ for n in essay])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)
        sents.append(None)
        ner.append(None)

training_set['tokens'] = tokens
training_set['lemma'] = lemma
training_set['pos'] = pos
training_set['sents'] = sents
training_set['ner'] = ner

In [None]:
# For each topic, essay with highest available target_score has been chosen.
# All other essays will be compared to these.

reference_essays = {1: 161, 2: 3022, 3: 5263, 4: 5341, 5: 7209, 6: 8896, 7: 11796, 8: 12340} # topic: essay_id

references = {}

for topic, index in reference_essays.items():
    references[topic] = nlp(training_set.iloc[index]['essay'])

# generate document similarity for each essay compared to topic reference
training_set['similarity'] = training_set.apply(lambda row: nlp(row['essay']).similarity(references[row['topic']]), axis=1)

In [None]:
# Various other features are counted.

training_set['token_count'] = training_set.apply(lambda x: len(x['tokens']), axis=1)
training_set['unique_token_count'] = training_set.apply(lambda x: len(set(x['tokens'])), axis=1)
training_set['nostop_count'] = training_set \
            .apply(lambda x: len([token for token in x['tokens'] if token not in stop_words]), axis=1)
training_set['sent_count'] = training_set.apply(lambda x: len(x['sents']), axis=1)
training_set['ner_count'] = training_set.apply(lambda x: len(x['ner']), axis=1)
training_set['comma'] = training_set.apply(lambda x: x['corrected'].count(','), axis=1)
training_set['question'] = training_set.apply(lambda x: x['corrected'].count('?'), axis=1)
training_set['exclamation'] = training_set.apply(lambda x: x['corrected'].count('!'), axis=1)
training_set['quotation'] = training_set.apply(lambda x: x['corrected'].count('"') + x['corrected'].count("'"), axis=1)
training_set['organization'] = training_set.apply(lambda x: x['corrected'].count(r'@ORGANIZATION'), axis=1)
training_set['caps'] = training_set.apply(lambda x: x['corrected'].count(r'@CAPS'), axis=1)
training_set['person'] = training_set.apply(lambda x: x['corrected'].count(r'@PERSON'), axis=1)
training_set['location'] = training_set.apply(lambda x: x['corrected'].count(r'@LOCATION'), axis=1)
training_set['money'] = training_set.apply(lambda x: x['corrected'].count(r'@MONEY'), axis=1)
training_set['time'] = training_set.apply(lambda x: x['corrected'].count(r'@TIME'), axis=1)
training_set['date'] = training_set.apply(lambda x: x['corrected'].count(r'@DATE'), axis=1)
training_set['percent'] = training_set.apply(lambda x: x['corrected'].count(r'@PERCENT'), axis=1)
training_set['noun'] = training_set.apply(lambda x: x['pos'].count('NOUN'), axis=1)
training_set['adj'] = training_set.apply(lambda x: x['pos'].count('ADJ'), axis=1)
training_set['pron'] = training_set.apply(lambda x: x['pos'].count('PRON'), axis=1)
training_set['verb'] = training_set.apply(lambda x: x['pos'].count('VERB'), axis=1)
training_set['noun'] = training_set.apply(lambda x: x['pos'].count('NOUN'), axis=1)
training_set['cconj'] = training_set.apply(lambda x: x['pos'].count('CCONJ'), axis=1)
training_set['adv'] = training_set.apply(lambda x: x['pos'].count('ADV'), axis=1)
training_set['det'] = training_set.apply(lambda x: x['pos'].count('DET'), axis=1)
training_set['propn'] = training_set.apply(lambda x: x['pos'].count('PROPN'), axis=1)
training_set['num'] = training_set.apply(lambda x: x['pos'].count('NUM'), axis=1)
training_set['part'] = training_set.apply(lambda x: x['pos'].count('PART'), axis=1)
training_set['intj'] = training_set.apply(lambda x: x['pos'].count('INTJ'), axis=1)

In [None]:
training_set.info()

In [None]:
training_set.to_pickle('/content/drive/My Drive/Colab Notebooks/Data asap aes/training_features.pkl')

In [None]:
training_set = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/Data asap aes/training_features.pkl')

In [None]:
# read essays from validation and test sets

valid_set = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Data asap aes/valid_set.tsv',sep='\t',encoding="ISO-8859-1")\
            .rename(columns={'essay_set':'topic'})

test_set = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Data asap aes/test_set.tsv',sep='\t',encoding="ISO-8859-1")\
          .rename(columns={'essay_set':'topic'})

combo_set = pd.concat([valid_set,test_set],sort = False)

# apply spelling and grammar corrections
combo_set = correct_language(combo_set)


In [None]:
combo_set = pd.concat([combo_set,training_set],sort = False)
combo_set.to_pickle('/content/drive/My Drive/Colab Notebooks/Data asap aes/combo_set.pkl')
len(combo_set)

21448

In [None]:
combo_set = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/Data asap aes/combo_set.pkl')

**Generate word embeddings with Word2Vec**

In [None]:
# Clean training_set essays before feeding them to the Word2Vec model.

punctuations = string.punctuation

# Function for cleaning text by removing personal pronouns, stopwords, and puncuation
def cleanup_essays(essays,logging = False):
    texts = []
    counter = 1
    for essay in essays.corrected:
        counter += 1
        essay = nlp(essay,disable=['parser','ner'])
        tokens = [tok.lemma_.lower().strip() for tok in essay if tok.lemma_ != '-PRON-']
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = ' '.join(tokens)
        texts.append(tokens)
    return pd.Series(texts) 

In [None]:
train_cleaned = cleanup_essays(training_set,True)

In [None]:
# Function to preprocess text for a word2vec model
def cleanup_essays_word2vec(essays, logging=False):
    sentences = []
    counter = 1
    for essay in essays:
        essay = nlp(essay,disable = ['tagger'])
        essay = " ".join([tok.lemma_.lower() for tok in essay])
        essay = re.split("[\.?!;] ", essay)
        essay = [re.sub("[\.,;:!?]", "", sent) for sent in essay]
        essay = [sent.split() for sent in essay]
        sentences += essay
        counter += 1
    return sentences

In [None]:
cleaned_word2vec = cleanup_essays_word2vec(combo_set['corrected'],logging = True)

In [None]:
text_dim = 300
wordvec_model = Word2Vec(cleaned_word2vec, size=text_dim, window=5, min_count=3, workers=4, sg=1)
wordvec_model.save('wordvec_model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Function to create averaged word vectors given a cleaned text.
def create_average_vec(essay):
    average = np.zeros((text_dim,), dtype='float32')
    num_words = 0.
    for word in essay.split():
        if word in wordvec_model.wv.vocab:
            average = np.add(average, wordvec_model.wv[word])
            num_words += 1.
    if num_words != 0.:
        average = np.divide(average, num_words)
    return average

In [None]:
# Create word vectors
cleaned_vec = np.zeros((training_set.shape[0], text_dim), dtype="float32")  
for i in range(len(train_cleaned)):
    cleaned_vec[i] = create_average_vec(train_cleaned[i])

print("Word vectors for all essays in the training data set are of shape:", cleaned_vec.shape)

Word vectors for all essays in the training data set are of shape: (12976, 300)


**Neural Network Models**

In [None]:
feature_list = [
                'word_count',
                'corrections',
                'similarity',
                'token_count',
                'unique_token_count',
                'nostop_count',
                'sent_count',
                'ner_count',
                'comma',
                'question',
                'exclamation',
                'quotation',
                'organization',
                'caps',
                'person',
                'location',
                'money',
                'time',
                'date',
                'percent',
                'noun',
                'adj',
                'pron',
                'verb',
                'cconj',
                'adv',
                'det',
                'propn',
                'num',
                'part',
                'intj'
                ]

additional_features = training_set[feature_list]

stdscaler = StandardScaler()
additional_features = stdscaler.fit_transform(additional_features)
additional_features.shape

(12976, 31)

In [None]:
# Combine topic number, target score, additional features and cleaned word vectors
all_data = pd.concat([training_set[['topic','target_score']],pd.DataFrame(additional_features), pd.DataFrame(cleaned_vec)], axis=1)
all_data.shape

(12976, 333)

In [None]:
# Build model
output_dim = 1
input_dim = all_data.shape[1] - 2
model = None
dropout = 0.2
model = Sequential()
model.add(Dense(14,activation='relu',kernel_initializer='he_normal',input_dim=input_dim))
model.add(Dropout(dropout))
model.add(Dense(output_dim))
model.summary()

# Compile the model
adam = Adam(lr=0.0001,beta_1=0.9,beta_2=0.999,epsilon=10e-8,decay=0.0,amsgrad=False)
model.compile(optimizer=adam,loss='mse',metrics=['mae','mse'])

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 14)                4648      
_________________________________________________________________
dropout_4 (Dropout)          (None, 14)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 15        
Total params: 4,663
Trainable params: 4,663
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Run each topic individually through neural network

kappa_list = []
weights = []
epochs = 100

for topic in range(1,9):
    X = all_data[all_data.topic==topic].drop(['topic','target_score'],axis=1)
    y = all_data[all_data.topic==topic].target_score.to_frame()
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=26)
    estimator = model.fit(X_train,y_train,epochs=epochs,batch_size=15,verbose=0)

    # get predictions
    y_pred = pd.DataFrame(model.predict(X_test).reshape(-1))

    # get topic kappa score
    kappa_list.append(kappa(y_test.values,y_pred.round(0).astype(int).values,weights='quadratic'))

    # get weights
    weights.append(y_test.shape[0]/all_data.shape[0])   

# get weighted average kappa
qwk = mean_quadratic_weighted_kappa(kappa_list, weights=1) # weights)
print(qwk)

0.7332403066846499


In [None]:
# Cross-validation

kappa_dict = {}
for topic in range(1,9):
    model = None
    # Create the model
    model = Sequential()
    model.add(Dense(14,activation='relu',kernel_initializer='he_normal',input_dim=input_dim))
    model.add(Dropout(dropout))
    model.add(Dense(1))

    # Compile the model
    model.compile(loss = 'mean_squared_error',optimizer = adam)

    X = all_data[all_data.topic == topic].drop(['topic','target_score'],axis=1)
    y = all_data[all_data.topic == topic].target_score.to_frame()
    kf = KFold(n_splits =5,random_state = 26)
    kappa_list = []
    for train,test in kf.split(X):
        X_train,X_test = X.iloc[train],X.iloc[test]
        y_train,y_test = y.iloc[train],y.iloc[test]
        model.fit(X_train,y_train,epochs=200,batch_size=15,verbose=0)
        y_pred = pd.DataFrame(model.predict(X_test).reshape(-1))
        kappa_list.append(kappa(y_pred.round(0).astype(int).values,
                               y.iloc[test].values,
                               weights='quadratic'))
    print("Kappa for topic", topic, ": {:.3f}%".format(np.mean(kappa_list)))
    kappa_dict[topic] = np.mean(kappa_list)
    
mqwk = mean_quadratic_weighted_kappa(list(kappa_dict.values()), weights=1) # weights)
print(mqwk)



Kappa for topic 1 : 0.818%
Kappa for topic 2 : 0.711%
Kappa for topic 3 : 0.712%
Kappa for topic 4 : 0.775%
Kappa for topic 5 : 0.819%
Kappa for topic 6 : 0.811%
Kappa for topic 7 : 0.779%
Kappa for topic 8 : 0.629%
0.7631138266404067
