In [1]:
import os, glob, string, spacy, math
import numpy as np
import pandas as pd
from math import sqrt
from numpy import dot
from numpy.linalg import norm
from scipy import spatial

import en_core_web_sm

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVR
from sklearn.linear_model import Lasso

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm
tqdm.pandas()

In [2]:
# Loading Spacy

nlp = en_core_web_sm.load()

In [3]:
# Load InferSent model

%load_ext autoreload
%autoreload 2
%matplotlib inline
import torch
from models import InferSent

V = 2
MODEL_PATH = 'models/InferSent/encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'models/InferSent/fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

infersent.build_vocab_k_words(K=100000)

Vocab size : 100000


In [5]:
# Load BERT

import tensorflow as tf
import transformers
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [6]:
from rouge_score import rouge_scorer

rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])

In [7]:
# Utility functions

w2v_size = 300
infersent_size = 4096
bert_size = 768
deconf_size = 300

max_emb_size = infersent_size

lemma_pos_tags = ['ADJ', 'ADV', 'NOUN', 'VERB', 'NUM', 'PROPN']

def get_lemmas(text):
    if text == '':
        return []
    return list(filter(None, [t.split('\t')[2] for t in text.split('\n') if t.split('\t')[1] in lemma_pos_tags]))

def get_emb_from_string(text):
    return [float(c) for c in text.split()[1:]]

def get_mean_emb(embeddings):
    if len(embeddings)==0:
        return np.zeros(max_emb_size)
    return np.mean(embeddings, axis=0)

def process(text):  
    processed = nlp(text)
    sentence = list()
        
    for word in processed:
        item = list()
        item.append(str(word))
        item.append(word.pos_)
        item.append(word.lemma_)        
        sentence.append('\t'.join(item))
    return '\n'.join(sentence)

def get_w2v(text):
    lemmatized = get_lemmas(text) 
    embeddings = list()
    for lemma in lemmatized:
        map_emb = w2v_mapping.get(lemma)
        if map_emb:
            emb = get_emb_from_string(w2v_flines[map_emb])
            embeddings.append(emb)
    return get_mean_emb(embeddings)

def get_infersent(text):
    lemmatized = get_lemmas(text) 
    emb = infersent.encode([' '.join(lemmatized)], bsize=128, tokenize=False)[0]
    if len(emb) == 0:
        emb = np.zeros(max_emb_size)
    return emb

def get_bert(text):
    lemmatized = get_lemmas(text) 
    inputs = tokenizer(' '.join(lemmatized), return_tensors="pt")
    outputs = bert(**inputs)
    lhs = outputs.last_hidden_state
    attention = inputs['attention_mask'].reshape((lhs.size()[0], lhs.size()[1], -1)).expand(-1, -1, 768)
    embeddings = torch.mul(lhs, attention)
    denominator = (embeddings != 0).sum(dim=1)
    summation = torch.sum(embeddings, dim=1)
    mean_embedding = torch.div(summation, denominator)[0].detach().numpy()
    if len(mean_embedding) == 0:
        mean_embedding = np.zeros(max_emb_size)
    return mean_embedding

In [8]:
# Similarity functions

def cos(a, b):
    return 1 - spatial.distance.cosine(a, b)

def jaccard_sim(l1, l2): 
    if len(l2) == 0:
        return 0
    a = set(l1) 
    b = set(l2)
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def word_overlap(l1, l2):
    if len(l2) == 0 or len(l1) == 0:
        return 0
    a = set(l1) 
    b = set(l2)
    c = a.intersection(b)
    return float(len(c)) / min(len(a),len(b))

def get_rouges(t1, t2):
    return rouge.score(t1, t2)

In [9]:
# Dataframe utility functions

def process_df(df):
    print('- text processing')
    df['processed'] = df['text'].progress_apply(lambda x: process(x))

def get_w2v4df(df):
    print('- computing Word2Vec embeddings')
    df['w2v'] = df['processed'].progress_apply(lambda x: get_w2v(x))  
    
def get_infersent4df(df):
    print('- computing InferSent embeddings')
    df['infersent'] = df['processed'].progress_apply(lambda x: get_infersent(x)) 
    
def get_bert4df(df):
    print('- computing BERT embeddings')
    df['bert'] = df['processed'].progress_apply(lambda x: get_bert(x)) 

In [23]:
# Feature generation and grade prediction functions

def generate_embedding_feats(ans_df, ref_df, que_df, emb):
    
    col_suf = '_' + emb
    zeros = np.zeros(len(ans_df.index))
    emb_size = globals()[emb + '_size']
    
    ans_df['Diff' + col_suf] = ''
    
    for i in ans_df.index:
        ans_row = ans_df.loc[i]
        ref_row = ref_df[ref_df.key==ans_row.key]
        que_row = que_df[que_df.key==ans_row.key]
        
        ans_emb = ans_row[emb]
        ref_emb = ref_row[emb].values
            
        if len(ans_emb) > 0 and len(ref_emb) > 0:
            
            ans_emb = ans_emb[:emb_size]
            ref_emb = ref_emb[0][:emb_size]
            
            if np.all(ans_emb==0):
                ans_df.at[i, 'Diff' + col_suf] = np.zeros(emb_size)
            else:
                diff = ans_emb - ref_emb

                ans_df.at[i, 'Diff'  + col_suf] = diff
                
def generate_text_feats(ans_df, ref_df, que_df):
    
    zeros = np.zeros(len(ans_df.index))
    
    ans_df['Jaccard'] = zeros
    ans_df['Overlap']  = zeros
    ans_df['NumNouns'] = zeros
    ans_df['NumVerbs'] = zeros
    ans_df['Rouge1'] = zeros
    ans_df['RougeL'] = zeros
        
    for i in tqdm(ans_df.index):
        ans_row = ans_df.loc[i]
        ref_row = ref_df[ref_df.key==ans_row.key]
        que_row = que_df[que_df.key==ans_row.key]

        if ans_row['processed'] == '' or ref_row['processed'].values[0] == '':
            continue
            
        ans_lemmas = [t.split('\t')[2] for t in ans_row['processed'].split('\n') 
                      if t.split('\t')[1] in lemma_pos_tags]
        ref_lemmas = [t.split('\t')[2] for t in ref_row['processed'].values[0].split('\n') 
                      if t.split('\t')[1] in lemma_pos_tags]
        que_lemmas = [t.split('\t')[2] for t in que_row['processed'].values[0].split('\n') 
                      if t.split('\t')[1] in lemma_pos_tags]
        
         # question demoting
        for q in que_lemmas:
            if q in ans_lemmas:
                ans_lemmas.remove(q)
                
        if len(ans_lemmas) == 0 or len(ref_lemmas) == 0:
            continue
         
        ans_df.at[i, 'Overlap']  = word_overlap(ans_lemmas, ref_lemmas)
        ans_df.at[i, 'Jaccard'] = jaccard_sim(ans_lemmas, ref_lemmas)

        rouges = get_rouges(' '.join(ans_lemmas), ' '.join(ref_lemmas))
        ans_df.at[i, 'Rouge1'] = rouges['rouge1'].fmeasure
        ans_df.at[i, 'RougeL'] = rouges['rougeL'].fmeasure
            
        num_nouns_ans = sum([1 for t in ans_row['processed'].split('\n') 
                             if t.split('\t')[1] in ['NOUN', 'PROPN']])
        num_verbs_ans = sum([1 for t in ans_row['processed'].split('\n') 
                             if t.split('\t')[1] == 'VERB'])
        ans_df.at[i, 'NumNouns'] = num_nouns_ans
        ans_df.at[i, 'NumVerbs'] = num_verbs_ans

In [17]:
models = {
    'SVR': make_pipeline(StandardScaler(), SVR())
}

def evaluate_model(model, X, y):
    scores = cross_val_predict(model, list(X), list(y), cv=5)
    return scores

def stacked_regr(df, cols, emb, mode=None):
    suf = ''
    if mode:
        suf += '_' + mode
    X = df[cols + ['Diff_' + emb + suf]]
    y = df.score
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    for name, model in models.items():
        scores = evaluate_model(model, X_train['Diff_'+ emb + suf], y_train)
        X_train[name] = scores
        model.fit(list(X_train['Diff_'+ emb + suf]), y_train)
        scores = model.predict(list(X_test['Diff_'+ emb + suf]))
        X_test[name] = scores
        cols+=[name]

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
    
    # Search for optimal Lasso hyperparams
    search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error")
    
    search.fit(list(X_train[cols].values), y_train)
    
    coefficients = search.best_estimator_.named_steps['model'].coef_
    importance = np.abs(coefficients)
    
    # Select important features
    new_cols = X_train[cols].columns[importance>0]
    
    print('Regression results - {}:'.format(emb))
    print('- best features: {}'.format(list(new_cols)))
    
    regressor = make_pipeline(StandardScaler(), SVR())
    regressor.fit(list(X_train[new_cols].values), y_train)
    
    y_pred = regressor.predict(list(X_test[new_cols].values))
    r = np.corrcoef(y_pred,y_test)
    rmse = sqrt(mean_squared_error(y_pred,y_test))
    print('- Pearson\'s r:', round(r[0,1]*100, 2))
    print('- RMSE:', round(rmse*100, 2), '\n')
    return new_cols, pd.DataFrame(list(zip(X_test.index, y_pred, abs(y_test-y_pred))), columns=['', 'pred', 'diff']).set_index('')

In [12]:
# Reading answers and populating dataframes

keys = os.listdir('data/ShortAnswerGrading_v2.0/data/scores')

raw_stud_ans = list()

for key in keys: 
    with open('data/ShortAnswerGrading_v2.0/data/scores/' + key + '/ave', 'r') as f:
        score_lines = f.read().split('\n')
    with open('data/ShortAnswerGrading_v2.0/data/sent/' + key, 'r') as f:
        lines = f.read().split('\n')
        for l in range(0, len(lines)-1):
            line = lines[l].split(' ', 1)
            raw_stud_ans.append([line[0], line[1], float(score_lines[l])])
            
raw_ref_ans = list()

with open('data/ShortAnswerGrading_v2.0/data/sent/answers', 'r') as f:
    lines = f.read().split('\n')
    for l in range(0, len(lines)-1):
        line = lines[l].split(' ', 1)
        raw_ref_ans.append((line[0], line[1]))
        
raw_que = list()

with open('data/ShortAnswerGrading_v2.0/data/sent/questions', 'r') as f:
    lines = f.read().split('\n')
    for l in range(0, len(lines)-1):
        line = lines[l].split(' ', 1)
        raw_que.append((line[0], line[1]))     

# Read Word2Vec
w2v_path = 'models/GoogleNews-vectors-negative300.txt'
with open(w2v_path, 'r', encoding='utf-8') as f:
    f.readline()
    w2v_flines = f.readlines()
w2v_mapping = dict()
for i in range(0, len(w2v_flines)):
    w2v_mapping[w2v_flines[i].split()[0]] = i  

print('Processing questions:')
que_df = pd.DataFrame(raw_que, columns=['key', 'text'])
process_df(que_df)
get_w2v4df(que_df)
get_infersent4df(que_df)
get_bert4df(que_df)

print('Processing reference answers:')
ref_ans_df = pd.DataFrame(raw_ref_ans, columns=['key', 'text'])
process_df(ref_ans_df)
get_w2v4df(ref_ans_df)
get_infersent4df(ref_ans_df)
get_bert4df(ref_ans_df)

print('Processing student answers:')
stud_ans_df = pd.DataFrame(raw_stud_ans, columns=['key', 'text', 'score'])
process_df(stud_ans_df)
get_w2v4df(stud_ans_df)
get_infersent4df(stud_ans_df)
get_bert4df(stud_ans_df)

Processing questions:
- text processing


100%|██████████| 87/87 [00:01<00:00, 45.76it/s]


- computing Word2Vec embeddings


100%|██████████| 87/87 [00:00<00:00, 953.07it/s]


- computing InferSent embeddings


100%|██████████| 87/87 [00:05<00:00, 14.95it/s]


- computing BERT embeddings


100%|██████████| 87/87 [00:06<00:00, 13.21it/s]


Processing reference answers:
- text processing


100%|██████████| 87/87 [00:00<00:00, 94.52it/s] 


- computing Word2Vec embeddings


100%|██████████| 87/87 [00:00<00:00, 655.75it/s]


- computing InferSent embeddings


100%|██████████| 87/87 [00:06<00:00, 12.88it/s]


- computing BERT embeddings


100%|██████████| 87/87 [00:07<00:00, 11.44it/s]


Processing student answers:
- text processing


100%|██████████| 2442/2442 [00:26<00:00, 93.50it/s] 


- computing Word2Vec embeddings


100%|██████████| 2442/2442 [00:04<00:00, 509.37it/s]


- computing InferSent embeddings


100%|██████████| 2442/2442 [04:17<00:00,  9.48it/s]


- computing BERT embeddings


100%|██████████| 2442/2442 [04:48<00:00,  8.45it/s]


In [24]:
# Generating features for UNT

generate_text_feats(stud_ans_df, ref_ans_df, que_df)
generate_embedding_feats(stud_ans_df, ref_ans_df, que_df, 'w2v')
generate_embedding_feats(stud_ans_df, ref_ans_df, que_df, 'infersent')
generate_embedding_feats(stud_ans_df, ref_ans_df, que_df, 'bert')

100%|██████████| 2442/2442 [00:03<00:00, 657.02it/s]


In [25]:
 cols = ['Jaccard','Overlap', 'NumNouns', 'NumVerbs', 'Rouge1', 'RougeL']

In [26]:
# STKD (stacked) model

print('-STKD-\n')

# Stacked regression w2v
feats_w2v, res_df_w2v = stacked_regr(stud_ans_df, cols.copy(), emb='w2v')

# Stacked regression InferSent
feats_infersent, res_df_infersent = stacked_regr(stud_ans_df, cols.copy(), emb='infersent')

# Stacked regression BERT
feats_bert, res_df_bert = stacked_regr(stud_ans_df, cols.copy(), emb='bert')

-STKD-

Regression results - w2v:
- best features: ['Overlap', 'Rouge1', 'SVR']
- Pearson's r: 67.64
- RMSE: 76.28 

Regression results - infersent:
- best features: ['Overlap', 'SVR']
- Pearson's r: 66.74
- RMSE: 77.15 

Regression results - bert:
- best features: ['Overlap', 'SVR']
- Pearson's r: 70.36
- RMSE: 73.35 

