### FEATURE EXTRACTION 

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re, collections
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the essay set from the local CSV file; the file is decoded as ISO-8859-1.
data = pd.read_csv('set5.csv', encoding = 'ISO-8859-1')

In [3]:
data 

Unnamed: 0,essay_id,essay,domain1_score,Unnamed: 3,Unnamed: 4
0,11827,"In this memoir of Narciso Rodriguez, @PERSON3'...",2,,
1,11828,Throughout the excerpt from Home the Blueprint...,2,,
2,11829,The mood the author created in the memoir is l...,3,,
3,11830,The mood created by the author is showing how ...,1,,
4,11831,The mood created in the memoir is happiness an...,3,,
...,...,...,...,...,...
1800,13627,The mood of this memoir is nonfiction. The moo...,2,,
1801,13628,The mood was created by the author in the memo...,0,,
1802,13629,"In the memoir ""Narciso Rodriguez"", the mood cr...",4,,
1803,13630,"The mood created @CAPS3 the author, Narciso Ro...",3,,


In [4]:
# Remove, in place, the columns sitting at positions 3 and 4 of the frame.
data.drop(columns=data.columns[3:5], inplace=True)

In [5]:
data

Unnamed: 0,essay_id,essay,domain1_score
0,11827,"In this memoir of Narciso Rodriguez, @PERSON3'...",2
1,11828,Throughout the excerpt from Home the Blueprint...,2
2,11829,The mood the author created in the memoir is l...,3
3,11830,The mood created by the author is showing how ...,1
4,11831,The mood created in the memoir is happiness an...,3
...,...,...,...
1800,13627,The mood of this memoir is nonfiction. The moo...,2
1801,13628,The mood was created by the author in the memo...,0
1802,13629,"In the memoir ""Narciso Rodriguez"", the mood cr...",4
1803,13630,"The mood created @CAPS3 the author, Narciso Ro...",3


In [6]:
def sentence_to_wordlist(raw_sentence):
    """Split one sentence into word tokens.

    Every character that is not an ASCII letter or digit is replaced by a
    space, and the result is passed to ``nltk.word_tokenize``.

    Args:
        raw_sentence: The sentence text.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    alphanumeric_only = re.sub("[^a-zA-Z0-9]"," ", raw_sentence)
    return nltk.word_tokenize(alphanumeric_only)

In [7]:
def tokenize(essay):
    """Break an essay into sentences and each sentence into word tokens.

    The essay is stripped of leading/trailing whitespace, split into sentences
    with ``nltk.sent_tokenize``, and every non-empty sentence is tokenized
    with ``sentence_to_wordlist``.

    Args:
        essay: The essay text.

    Returns:
        A list with one entry per sentence; each entry is that sentence's
        list of word tokens.
    """
    # The stripped text is what gets sentence-split; previously it was
    # computed and then ignored in favour of the raw essay.
    stripped_essay = essay.strip()

    return [
        sentence_to_wordlist(raw_sentence)
        for raw_sentence in nltk.sent_tokenize(stripped_essay)
        if raw_sentence
    ]

In [31]:
# The idea for this feature is taken from reference paper number 1 (the list of reference papers is at the end of this notebook)
def avg_word_len(essay):
    """Return the mean token length of an essay.

    Non-word characters (regex ``\\W``) are replaced by spaces before the text
    is tokenized with ``nltk.word_tokenize``.

    Args:
        essay: The essay text.

    Returns:
        Total characters across all tokens divided by the number of tokens,
        or ``0.0`` when the essay yields no tokens.
    """
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)

    # An empty or punctuation-only essay has no tokens; avoid dividing by zero.
    if not words:
        return 0.0

    return sum(len(word) for word in words) / len(words)

In [26]:
# Feature idea: reference paper number 1 (see the reference list at the end of this notebook)
def word_count(essay):
    """Return the number of tokens in an essay.

    Non-word characters (regex ``\\W``) are replaced by spaces, then the text
    is tokenized with ``nltk.word_tokenize`` and the tokens are counted.
    """
    tokens = nltk.word_tokenize(re.sub(r'\W', ' ', essay))
    return len(tokens)

In [12]:
# Feature idea: reference paper number 1 (see the reference list at the end of this notebook)
def sent_count(essay):
    """Return how many sentences ``nltk.sent_tokenize`` finds in the essay."""
    return len(nltk.sent_tokenize(essay))

In [13]:
# Feature idea: reference paper number 2 (see the reference list at the end of this notebook)
def count_lemmas(essay):
    """Return the number of distinct lemmas found in an essay.

    The essay is tokenized sentence by sentence with ``tokenize``, each
    sentence is POS-tagged with ``nltk.pos_tag``, and every token is
    lemmatized with ``WordNetLemmatizer`` using the WordNet part of speech
    derived from the first letter of its tag.

    Args:
        essay: The essay text.

    Returns:
        The size of the set of lemmas.
    """
    def to_wordnet_pos(tag):
        # Tags starting with J/V/R map to adjective/verb/adverb; every other
        # tag (including those starting with N) is treated as a noun.
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('R'):
            return wordnet.ADV
        return wordnet.NOUN

    sentences = tokenize(essay)
    lemmatizer = WordNetLemmatizer()

    distinct_lemmas = set()
    for sentence in sentences:
        for tagged in nltk.pos_tag(sentence):
            token, tag = tagged[0], tagged[1]
            distinct_lemmas.add(lemmatizer.lemmatize(token, to_wordnet_pos(tag)))

    return len(distinct_lemmas)

In [14]:
# The idea for this feature is taken from reference paper number 1 (the list of reference papers is at the end of this notebook)
# The feature has been modified, although the basic idea comes from the reference given in the line above
import enchant
def count_spell_error(essay):
    """Return the proportion of tokens that fail the en_US spell check.

    Non-word characters (regex ``\\W``) are replaced by spaces, the text is
    tokenized with ``nltk.word_tokenize``, and each token is looked up in an
    ``enchant`` ``en_US`` dictionary.

    Args:
        essay: The essay text.

    Returns:
        Number of tokens rejected by the dictionary divided by the total
        number of tokens, or ``0.0`` when the essay yields no tokens.
    """
    d = enchant.Dict("en_US")

    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)

    # No tokens means no misspellings; also avoids dividing by zero.
    if not words:
        return 0.0

    mispell = sum(1 for word in words if not d.check(word))

    # len(words) is the same count word_count(essay) would produce (identical
    # cleaning and tokenization), so the essay is not tokenized a second time.
    return mispell / len(words)

In [15]:
# The idea for this feature is taken from reference paper number 2 (the list of reference papers is at the end of this notebook)
# The feature has been modified
def count_pos(essay):
    """Return the share of nouns, adjectives, verbs and adverbs in an essay.

    Sentences from ``tokenize`` are tagged with ``nltk.pos_tag``; a token is
    counted as noun/adjective/verb/adverb when its tag starts with
    ``N``/``J``/``V``/``R``. Each count is divided by the number of tokens
    obtained by replacing ``\\W`` characters with spaces and running
    ``nltk.word_tokenize`` on the whole essay.

    Args:
        essay: The essay text.

    Returns:
        A 4-tuple ``(noun, adjective, verb, adverb)`` of ratios. All four are
        ``0.0`` when the essay yields no tokens.
    """
    tokenized_sentences = tokenize(essay)

    clean_essay = re.sub(r'\W', ' ', essay)
    total_words = len(nltk.word_tokenize(clean_essay))

    # Nothing to tag and nothing to divide by.
    if total_words == 0:
        return 0.0, 0.0, 0.0, 0.0

    # Tallies keyed by the first letter of the POS tag.
    tallies = {'N': 0, 'J': 0, 'V': 0, 'R': 0}

    for sentence in tokenized_sentences:
        for token_tuple in nltk.pos_tag(sentence):
            initial = token_tuple[1][:1]
            if initial in tallies:
                tallies[initial] += 1

    return (
        tallies['N'] / total_words,
        tallies['J'] / total_words,
        tallies['V'] / total_words,
        tallies['R'] / total_words,
    )

In [16]:
# Feature idea: reference paper number 4 (see the reference list at the end of this notebook)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def sentiment_tagger(essay):
    """Return the VADER negative, positive and neutral scores of an essay.

    The scores come from ``sid.polarity_scores``; its ``'compound'`` entry is
    not used.

    Args:
        essay: The essay text.

    Returns:
        A 3-tuple ``(neg, pos, neu)``. A score missing from the analyzer's
        result is reported as ``0``.
    """
    scores = sid.polarity_scores(essay)
    negative = scores.get('neg', 0)
    positive = scores.get('pos', 0)
    neutral = scores.get('neu', 0)
    return negative, positive, neutral

In [19]:
def get_tfidf_vectors(essays):
    """Fit a TF-IDF model on lemmatized essays.

    Each essay has its ``\\W`` characters replaced by spaces, is tokenized
    with ``nltk.word_tokenize``, lemmatized token by token with
    ``WordNetLemmatizer`` (default part of speech) and re-joined with spaces.
    The resulting corpus is vectorized with
    ``TfidfVectorizer(stop_words='english')``.

    Args:
        essays: An iterable of essay strings.

    Returns:
        ``(feature_names, vectors)``: the vocabulary terms as a list and the
        matrix returned by ``fit_transform`` (one row per essay).
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    lemmatizer = WordNetLemmatizer()

    corpus = []
    for essay in essays:
        clean_essay = re.sub(r'\W', ' ', essay)
        tokens = nltk.word_tokenize(clean_essay)
        corpus.append(' '.join(lemmatizer.lemmatize(token) for token in tokens))

    vectors = vectorizer.fit_transform(corpus)

    # get_feature_names() is gone from newer scikit-learn releases; prefer its
    # replacement when present and keep returning a plain list either way.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    return feature_names, vectors

In [20]:
# Fit TF-IDF over every essay. vectors_all is read as a notebook-level global
# by get_similarity further down.
feature_names_cv,vectors_all = get_tfidf_vectors(data['essay'])

In [21]:
vectors_all

<1805x4268 sparse matrix of type '<class 'numpy.float64'>'
	with 69696 stored elements in Compressed Sparse Row format>

In [22]:
# Index labels of the essays whose domain1_score equals 4
# (presumably the top score for this set; confirm against the rubric).
index_high = data.index[data['domain1_score'] == 4].tolist()
# Number of such essays; used by get_similarity as the averaging divisor.
n = len(index_high)
n

258

In [23]:
# This feature has not been taken from any of the papers
from sklearn.metrics.pairwise import cosine_similarity
def get_similarity(essay_id):
    """Return the mean cosine similarity between one essay and the essays in ``index_high``.

    Reads the notebook-level globals ``data``, ``vectors_all`` and
    ``index_high``. The essay's TF-IDF row is compared against every row
    listed in ``index_high`` and the similarities are averaged.

    NOTE(review): if the essay itself is listed in ``index_high`` it is
    compared with itself as well; confirm that this is intended.

    Args:
        essay_id: Value to look up in ``data['essay_id']``.

    Returns:
        The average similarity as a Python ``float``.

    Raises:
        ValueError: If ``essay_id`` does not match exactly one row of ``data``.
    """
    # Positional row number(s) of the essay, so the lookup does not depend on
    # the frame's index labels coinciding with matrix row positions.
    rows = np.flatnonzero((data['essay_id'] == essay_id).values)
    if rows.size != 1:
        raise ValueError(
            "expected exactly one row for essay_id %r, found %d" % (essay_id, rows.size)
        )

    # One call compares the essay against all reference essays at once:
    # result has one row per entry of index_high and a single column.
    similarities = cosine_similarity(vectors_all[index_high, :], vectors_all[rows, :])

    # np.asscalar no longer exists in current NumPy; float() yields the same
    # plain Python scalar.
    return float(similarities.sum() / len(index_high))

In [32]:
def extract_features(data):
    """Return a copy of ``data`` with one extra column per extracted feature.

    The input frame is left untouched. Columns are added in this order:
    ``word_count``, ``sent_count``, ``avg_word_len``, ``lemma_count``,
    ``spell_err_count``, ``noun_count``, ``adj_count``, ``verb_count``,
    ``adv_count``, ``neg_score``, ``pos_score``, ``nue_score``, ``similarity``.

    Args:
        data: DataFrame with at least the ``essay`` and ``essay_id`` columns.

    Returns:
        The augmented copy.
    """
    features = data.copy()

    # Features that map one essay text to one value.
    per_essay_extractors = (
        ('word_count', word_count),
        ('sent_count', sent_count),
        ('avg_word_len', avg_word_len),
        ('lemma_count', count_lemmas),
        ('spell_err_count', count_spell_error),
    )
    for column, extractor in per_essay_extractors:
        features[column] = features['essay'].apply(extractor)

    # count_pos yields a 4-tuple per essay; transpose into four columns.
    nouns, adjectives, verbs, adverbs = zip(*features['essay'].map(count_pos))
    features['noun_count'] = nouns
    features['adj_count'] = adjectives
    features['verb_count'] = verbs
    features['adv_count'] = adverbs

    # sentiment_tagger yields a 3-tuple per essay; transpose into three columns.
    negatives, positives, neutrals = zip(*features['essay'].map(sentiment_tagger))
    features['neg_score'] = negatives
    features['pos_score'] = positives
    features['nue_score'] = neutrals

    # Similarity is looked up by essay id rather than by text.
    features['similarity'] = features['essay_id'].apply(get_similarity)

    return features

In [33]:
# Compute every feature column for the loaded essays; `data` itself is not modified
# because extract_features works on a copy.
features_set1 = extract_features(data)

In [35]:
features_set1

Unnamed: 0,essay_id,essay,domain1_score,word_count,sent_count,avg_word_len,lemma_count,spell_err_count,noun_count,adj_count,verb_count,adv_count,neg_score,pos_score,nue_score,similarity
0,11827,"In this memoir of Narciso Rodriguez, @PERSON3'...",2,133,8,4.383459,81,0.067669,0.255639,0.030075,0.218045,0.022556,0.000,0.153,0.847,0.102273
1,11828,Throughout the excerpt from Home the Blueprint...,2,168,7,4.285714,102,0.011905,0.220238,0.083333,0.184524,0.071429,0.009,0.189,0.802,0.098348
2,11829,The mood the author created in the memoir is l...,3,112,6,4.580357,75,0.080357,0.321429,0.080357,0.169643,0.035714,0.000,0.205,0.795,0.169553
3,11830,The mood created by the author is showing how ...,1,75,3,4.226667,44,0.146667,0.240000,0.106667,0.200000,0.013333,0.030,0.138,0.832,0.034662
4,11831,The mood created in the memoir is happiness an...,3,127,8,4.283465,68,0.047244,0.228346,0.086614,0.173228,0.070866,0.000,0.260,0.740,0.158431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1800,13627,The mood of this memoir is nonfiction. The moo...,2,132,7,4.393939,71,0.015152,0.318182,0.053030,0.174242,0.037879,0.000,0.203,0.797,0.106523
1801,13628,The mood was created by the author in the memo...,0,30,1,4.033333,21,0.133333,0.333333,0.000000,0.166667,0.000000,0.000,0.148,0.852,0.061781
1802,13629,"In the memoir ""Narciso Rodriguez"", the mood cr...",4,166,9,4.421687,93,0.066265,0.246988,0.114458,0.222892,0.012048,0.000,0.239,0.761,0.103630
1803,13630,"The mood created @CAPS3 the author, Narciso Ro...",3,132,6,4.492424,86,0.090909,0.318182,0.037879,0.166667,0.060606,0.000,0.195,0.805,0.165012


In [36]:
# Write the feature table to features.csv without the DataFrame index.
features_set1.to_csv('features.csv', index=False)

#### REFERENCES:

1. Mahana, M., Johns, M., & Apte, A. (2012). Automated essay grading using machine learning. Mach. Learn. Session, Stanford University.
2. Suresh, A., & Jha, M. (2018). Automated essay grading using natural language processing and support vector machine. International Journal of Computing and Technology, 5(2), 18-21.
3. Rokade, A., Patil, B., Rajani, S., Revandkar, S., & Shedge, R. (2018, April). Automated Grading System Using Natural Language Processing. In 2018 Second International Conference on Inventive Communication and Computational Technologies (ICICCT) (pp. 1123-1127). IEEE.
4. Song, S., & Zhao, J. (2013). Automated essay scoring using machine learning. Stanford University.