In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
import enchant
from spellchecker import SpellChecker
import time
from nltk.corpus import wordnet
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor

from feature_extractor import *

  from numpy.core.umath_tests import inner1d


In [2]:
essay_set1 = pd.read_pickle('processed_data_files/essay set/essay_set1')

In [3]:
X = essay_set1.drop(['essay_id','essay_set','essay','domain1_score','clean_essay','word_tokens','sent_tokens','corrected_essay','essay_documents','corrected_tokens'],axis=1)
y = essay_set1['domain1_score']

In [4]:
X.reset_index(drop=True,inplace = True)

In [5]:
#-----------TFIDF Model-----------------
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(essay_set1['essay_documents'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist() 
df = pd.DataFrame(denselist, columns=feature_names)

In [6]:
final_df = pd.concat([X,df],axis=1)

In [7]:
final_df.head()

Unnamed: 0,word_count,sent_count,spell_err,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio,syn_overlap,...,yup,zap,zero,zingbobway,zip,zombie,zone,zoning,zoo,zoom
0,344,16,11,78,37,15,28,74.02,0.618182,59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,413,20,16,103,55,10,24,67.08,0.569444,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,276,14,2,74,31,4,18,68.2,0.646154,70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,488,27,24,134,53,24,45,53.34,0.584559,85,...,0.0,0.0,0.0,0.0,0.09828,0.0,0.0,0.0,0.0,0.0
4,469,30,13,117,40,11,27,72.66,0.531818,62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def eval_metrics():
    print("MAE:",metrics.mean_absolute_error(y_test, y_pred))
    print("MSE:",metrics.mean_squared_error(y_test, y_pred))
    print("RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('Cohen\'s kappa score: %.2f' % metrics.cohen_kappa_score(np.rint(y_pred), y_test,weights='quadratic'))

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(final_df,y,test_size=0.3)

In [13]:
regressor=GradientBoostingRegressor()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

In [14]:
eval_metrics()

MAE: 0.593967341789098
MSE: 0.6143690678285865
RMSE: 0.7838169861827354
Cohen's kappa score: 0.84


In [15]:
######### Function for creating features of the essays input by users ###############

def create_data(essay,essay_set,essay_prompt_df,essay_source_df,vectorizer):
    clean_essay = remove_special_char(essay)
    word_tokens = word_tokenizer(clean_essay)
    sent_tokens = sent_tokenizer(essay)
    word_count = count(word_tokens)
    sent_count = count(sent_tokens)
    spell_err , corrected_essay = check_spell(word_tokens)
    essay_documents = create_documents(corrected_essay)
    corrected_tokens = word_tokenizer(essay_documents)
    noun_count,verb_count,adv_count,adj_count = pos_count(corrected_tokens)
    readability_score = compute_redability(essay)
    unique_word_ratio = unique_word_prop(corrected_tokens)
    
    if essay_set in [1,2,7,8]:
        prompt_tokens = essay_prompt_df.loc[essay_prompt_df['essay_set'] == essay_set,'tokens'][essay_set]
        synonyms = essay_prompt_df.loc[essay_prompt_df['essay_set'] == essay_set,'synonyms'][essay_set]
        synonyms_overlap_temp = [word for word in corrected_tokens if word in synonyms]
        prompt_overlap_temp = [word for word in corrected_tokens if word in prompt_tokens]
        synonyms_overlap = (len(synonyms_overlap_temp))
        synonyms_overlap_prop = (len(synonyms_overlap_temp)/(len(corrected_tokens)+1))
        prompt_overlap = (len(prompt_overlap_temp))
        prompt_overlap_prop = (len(prompt_overlap_temp)/(len(corrected_tokens)+1))
        
        X = pd.DataFrame({'word_count' : word_count, 'sent_count' : sent_count, 'spell_err' : spell_err,'noun_count': noun_count,
                        'verb_count' : verb_count,'adv_count':adv_count, 'adj_count':adj_count, 'readability_score':readability_score, 
                        'unique_word_ratio':unique_word_ratio,'syn_overlap':synonyms_overlap, 'syn_overlap_prop':synonyms_overlap_prop,
                        'prompt_overlap':prompt_overlap,'prompt_overlap_prop':prompt_overlap_prop},index=[0])
    
    
    else:
        source_tokens = essay_source_df.loc[essay_source_df['essay_set'] == essay_set,'pos(nouns & verbs)'][essay_set]
        source_overlap_temp = [word for word in corrected_tokens if word in source_tokens]
        source_overlap = (len(source_overlap_temp))
        source_overlap_prop = (len(source_overlap_temp)/(len(corrected_tokens)+1))
        synonyms = essay_source_df.loc[essay_source_df['essay_set'] == essay_set,'synonyms'][essay_set]
        synonyms_overlap_temp = [word for word in corrected_tokens if word in synonyms]    
        synonyms_overlap = (len(synonyms_overlap_temp))
        synonyms_overlap_prop = (len(synonyms_overlap_temp)/(len(corrected_tokens)+1))
        
        X = pd.DataFrame({'word_count' : word_count, 'sent_count' : sent_count, 'spell_err' : spell_err,'noun_count': noun_count,
                        'verb_count' : verb_count,'adv_count':adv_count, 'adj_count':adj_count, 'readability_score':readability_score, 
                        'unique_word_ratio':unique_word_ratio,'syn_overlap':synonyms_overlap, 'syn_overlap_prop':synonyms_overlap_prop,
                        'source_overlap':source_overlap,'source_overlap_prop':source_overlap_prop},index=[0])
    
    vectors = vectorizer.transform([essay_documents])
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist() 
    df = pd.DataFrame(denselist, columns=feature_names)
    
    final_df = pd.concat([X,df],axis=1)
    
    return final_df

In [19]:
########### Score prediction of the user input essay##############

essay = essay_set1['essay'][0]
essay_prompt_df = pd.read_pickle('processed_data_files/essay_prompt_df')
essay_source_df = pd.read_pickle('processed_data_files/essay_source_df')
df = create_data(essay,7,essay_prompt_df,essay_source_df,vectorizer)
pred = regressor.predict(df)
pred = np.rint(pred)
print("Score:",pred[0])

Score: 8.0
