In [None]:
## Essay Scoring: one LightGBM classifier per essay set, trained on word-embedding, POS-tag and length features

In [1]:
#Defining Imports
import pandas as pd
import re
from itertools import chain
import itertools
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import string
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from string import digits
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import operator
from nltk.corpus import stopwords
from textblob import TextBlob
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [None]:
#Defining Objects
# Shared NLP helpers used by the feature-extraction code further down.
nlp = en_core_web_sm.load()
stop = list(stopwords.words('english'))
porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
#Defining features used for model training
class Features:
    """Text feature extractors used to build the model's input matrix.

    The methods keep no instance state. `preprocess_data` relies on the
    module-level `stop` list and `lemmatizer` object.
    """

    def get_part_of_speech_tags(self, sentences):
        """Count part-of-speech tags per text.

        Parameters
        ----------
        sentences : iterable of str
            Texts to tag with TextBlob.

        Returns
        -------
        pandas.DataFrame
            One row per text and one column per tag seen in any text; a tag
            that does not occur in a text is counted as 0.
        """
        all_sample_tags = [
            dict(Counter(tag for _word, tag in TextBlob(text).tags))
            for text in sentences
        ]
        # Tags missing from a text come out as NaN once the rows are aligned.
        return pd.DataFrame(all_sample_tags).fillna(0)

    def get_sen_len(self, sentences):
        """Return, for each text, the number of pieces produced by splitting on a single space."""
        return [len(text.split(' ')) for text in sentences]

    def preprocess_data(self, s):
        """Normalise one text for embedding lookup.

        Lower-cases the text, replaces runs of non-ASCII characters with a
        space, removes punctuation and digits, tokenises with NLTK, drops
        tokens found in `stop` and lemmatises the remainder.

        Returns
        -------
        str
            The surviving lemmas joined by single spaces ('' if none survive).
        """
        s = s.lower()
        data = re.sub(r'[^\x00-\x7F]+', ' ', s)
        final_str = data.translate(str.maketrans('', '', string.punctuation))
        filter_str = final_str.translate(str.maketrans('', '', digits))
        nltk_tokens = nltk.word_tokenize(filter_str)
        # Lemmatise every token that is not a stopword.
        return ' '.join(lemmatizer.lemmatize(w) for w in nltk_tokens if w not in stop)

    def get_vector(self, model, all_data):
        """Average the word vectors of each text.

        Words the model does not know are skipped. A text without a single
        known word gets an all-zero vector, so every returned vector has the
        same length.

        Parameters
        ----------
        model : mapping-like
            Supports `model[word]` and raises `KeyError` for unknown words.
            If it has a `vector_size` attribute, that is used as the vector
            length; otherwise the length is taken from the first vector found.
        all_data : iterable of str
            Whitespace-separated texts.

        Returns
        -------
        list of numpy.ndarray
            One mean vector per input text, in input order.
        """
        dim = getattr(model, 'vector_size', None)
        vector_sen = []
        for d in all_data:
            single_sen_vec = []
            for w in d.split():
                try:
                    single_sen_vec.append(model[w])
                except KeyError:
                    # Out-of-vocabulary word: contributes nothing to the mean.
                    continue
            if single_sen_vec:
                v = np.mean(single_sen_vec, axis=0)
                if dim is None:
                    dim = v.shape[0]
            else:
                # Placeholder, filled in once the vector length is known.
                v = None
            vector_sen.append(v)

        zero_vec = np.zeros(dim if dim else 0)
        return [zero_vec.copy() if v is None else v for v in vector_sen]



In [None]:
#Defining parameters of LGB
def ml_model(X_train, X_test, y_train_internal, y_test_internal,num_cla,num_boost_round=30000):
    """Train a multiclass LightGBM booster.

    Parameters
    ----------
    X_train : feature matrix used for training.
    X_test : accepted for call compatibility; not used here.
    y_train_internal : training labels; passed to LightGBM as `label`.
    y_test_internal : accepted for call compatibility; not used here.
    num_cla : int
        Value passed as LightGBM's `num_class`.
    num_boost_round : int, default 30000
        Number of boosting iterations. Lower it for quick experiments.

    Returns
    -------
    The trained booster returned by `lgb.train`.
    """
    params = {
                    'num_leaves': 60,
                    'max_bin': 110,
                    'num_class':num_cla,
                    'min_data_in_leaf': 50,
                    'learning_rate': 0.01,
                    'min_sum_hessian_in_leaf': 0.000446,
                    'bagging_fraction': 0.60,
                    'bagging_freq': 15,
                    'max_depth': 20,
                    'save_binary': True,
                    'seed': 31452,
                    'feature_fraction_seed': 31415,
                    'feature_fraction': 0.51,
                    'bagging_seed': 31415,
                    'drop_seed': 31415,
                    'data_random_seed': 31415,
                    'objective': 'multiclass',
                    'boosting_type': 'gbdt',
                    'verbose': 1,
                    'metric': 'multi_logloss',
                    'is_unbalance': False,

    }

    d_train = lgb.Dataset(X_train, label=y_train_internal)
    clf = lgb.train(params, d_train, num_boost_round)
    return clf

In [None]:
def feature_eng(single_df):
    """Build the train/test feature matrices for one group of essays.

    Reads the module-level `fe` (a `Features` instance) and `model` (the word
    embedding lookup). `single_df` is not modified.

    Parameters
    ----------
    single_df : pandas.DataFrame
        Needs the columns 'EssayText', 'clarity', 'coherent', 'kitna_aacha'
        (the label) and 'source_' (1 = training row, 0 = test row).

    Returns
    -------
    x_train : pandas.DataFrame
        Features of the rows with source_ == 1.
    y_train : pandas.Series
        'kitna_aacha' of those same rows.
    preprocessed_main_test : pandas.DataFrame
        Features of the rows with source_ == 0.

    Notes
    -----
    Rows that contain a NaN in any feature are dropped before the split.
    NOTE(review): the dropped rows are not reported back, so a caller that
    collects row ids separately can fall out of step with the test matrix.
    """
    raw_essays = list(single_df['EssayText'])
    # POS counts come from the raw text; everything else uses the cleaned text.
    parts_dataset = fe.get_part_of_speech_tags(raw_essays)
    clean_essays = [fe.preprocess_data(essay) for essay in raw_essays]
    sen_len = fe.get_sen_len(clean_essays)
    vectors = fe.get_vector(model, clean_essays)

    new_tfidf_features = pd.DataFrame(vectors)
    new_tfidf_features['clarity'] = list(single_df['clarity'])
    new_tfidf_features['sen_len'] = sen_len
    new_tfidf_features['coherent'] = list(single_df['coherent'])
    new_tfidf_features['kitna_aacha'] = list(single_df['kitna_aacha'])
    new_tfidf_features['source_'] = list(single_df['source_'])
    new_tfidf_features = pd.concat([new_tfidf_features, parts_dataset], axis=1).dropna(axis=0)

    # One-hot encode on the combined frame so train and test share the same columns.
    encoded = pd.get_dummies(new_tfidf_features, columns=['clarity', 'coherent'])
    test_rows = encoded.loc[encoded['source_'] == 0]
    train_rows = encoded.loc[encoded['source_'] == 1]

    y_train = train_rows['kitna_aacha']
    x_train = train_rows.drop(['kitna_aacha', 'source_'], axis=1)
    preprocessed_main_test = test_rows.drop(['source_', 'kitna_aacha'], axis=1)
    return x_train, y_train, preprocessed_main_test


In [None]:
def different_models(model,multi_datasets,holdout_fraction=0.0):
    """Train one classifier per essay set and predict its test rows.

    Parameters
    ----------
    model :
        Kept for call compatibility. It is not used in this function;
        `feature_eng` reads the module-level `model` itself.
    multi_datasets : dict
        Maps an essay-set key to a DataFrame holding both training rows
        (source_ == 1) and test rows (source_ == 0). The frames are not
        modified.
    holdout_fraction : float, default 0.0
        With 0, each model is trained on all training rows. With a value in
        (0, 1), that share of the training rows is held out, the model is
        trained on the rest, and the hold-out accuracy is printed.

    Returns
    -------
    list of list of tuple
        One inner list per essay set, containing (ID, Essayset, predicted
        label) for each test row.
    """
    score_cols = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
    results = []
    for essay_set, group_df in multi_datasets.items():
        print(essay_set)

        # New frame per group: the caller's data stays intact, so this can be re-run.
        single_df = group_df.dropna(axis=0).reset_index(drop=True)

        test_rows = single_df.loc[single_df['source_'] == 0]
        test_id = list(test_rows['ID'])
        es = list(test_rows['Essayset'])

        # Label = rounded mean of the five individual scores.
        single_df['kitna_aacha'] = single_df[score_cols].mean(axis=1).round().astype(int)
        single_df = single_df.drop(score_cols + ['ID', 'min_score', 'max_score', 'Essayset'], axis=1)

        x_train, y_train, preprocessed_main_test = feature_eng(single_df)

        # LightGBM expects labels in [0, num_class), so size by the largest
        # label instead of the number of distinct labels.
        num_cla = int(y_train.max()) + 1
        print('----')
        print(num_cla)

        if holdout_fraction:
            X_train, X_test, y_train_internal, y_test_internal = train_test_split(
                x_train, y_train, test_size=holdout_fraction)
        else:
            X_train, y_train_internal = x_train, y_train
            X_test = y_test_internal = None

        clf = ml_model(X_train, X_test, y_train_internal, y_test_internal, num_cla)

        if X_test is not None:
            holdout_pred = np.argmax(clf.predict(X_test), axis=1)
            print(float(np.mean(np.asarray(y_test_internal) == holdout_pred)))

        # predict() yields one probability row per sample; take the most likely class.
        pred_internal = np.argmax(clf.predict(preprocessed_main_test), axis=1).tolist()
        results.append(list(zip(test_id, es, pred_internal)))
    return results

In [None]:
# Load both splits and tag each row with its origin: 1 = train, 0 = test.
df_test = pd.read_csv('test_dataset.csv')
df_train = pd.read_csv('train_dataset.csv')

df_train['source_'] = 1
df_test['source_'] = 0

# ignore_index gives the combined frame a clean 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)

# Fill missing scores with 1 by assigning back to the frame; in-place fillna
# on a selected column is not reliable in recent pandas.
# NOTE(review): presumably the rows without scores are the test rows, filled
# so the later dropna keeps them; confirm against the CSVs.
score_columns = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
df[score_columns] = df[score_columns].fillna(1)

# One sub-frame per essay set.
multi_datasets = {name: group for name, group in df.groupby('Essayset')}

resul_list = []

In [None]:
from gensim.models.keyedvectors import KeyedVectors
# Word vectors stored in word2vec text format (hence binary=False).
GLOVE_VECTORS_PATH = "gensim_glove_vectors.txt"
model = KeyedVectors.load_word2vec_format(GLOVE_VECTORS_PATH, binary=False)

In [None]:
# `fe` is read as a global by feature_eng, so it must exist before the run starts.
fe = Features()
# Returns one list of (ID, Essayset, predicted score) tuples per essay set.
predictions = different_models(model,multi_datasets)

In [None]:
# Flatten the per-essay-set prediction lists into one list of tuples.
l = list(itertools.chain.from_iterable(predictions))

In [None]:
# One row per predicted test essay; column order matches the tuples in `l`.
res_ = pd.DataFrame(l,columns=['id','essay_set','essay_score'])

In [None]:
# Write the predictions to disk without the DataFrame index column.
res_.to_csv('embed_pos_sen_len.csv',index=False)