# Auto Grading By Using KNearestNeighborClassifier

The data that is used in this project is obtained from https://www.kaggle.com/c/asap-aes.
However, it is changed easier implemenentation.

### Import packages

In [15]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import cohen_kappa_score

from scipy import sparse

In [16]:
def StemmingWordList(arrayList):
    ps = nltk.PorterStemmer()
    newList = [ps.stem(word) for word in arrayList]
    return ''.join(newList)

svd = TruncatedSVD(n_iter=10, n_components=100)
tfidf = TfidfVectorizer(min_df = 0.01, max_df=0.85, stop_words='english')

### Import data

In [17]:
train = pd.read_excel('./Data/training_set_rel3_set1.xlsx')
test = pd.read_excel('./Data/valid_set_set1.xlsx')
train.set_index('ID')
test.set_index('essay_id')
y_train = train['Score']
y_test = test['Score']
X = pd.concat([train,test])


## Creating 2 addtional features
### "Number of words" and "Number of sentences" improve the accuracy

In [19]:
train_numberOfSentences = X['Essay Content'].apply(lambda x: len(x.split('.')))
train_numberOfWords = X['Essay Content'].apply(lambda x: len(x.split()))

In [20]:
content = X['Essay Content']
content = content.apply(lambda x: re.sub('^[a-zA-Z]', ' ', x))
content = content.apply(lambda x: StemmingWordList(x))

In [21]:
x_transform = tfidf.fit_transform(content)

In [22]:
x_transform = sparse.hstack((x_transform, train_numberOfSentences[:,None]))
x_transform = sparse.hstack((x_transform, train_numberOfWords[:,None]))

In [23]:
x_transform = svd.fit_transform(x_transform)

In [24]:
x_train = x_transform[:len(train)]
x_test = x_transform[len(train):]

In [25]:
neighbors = 6
nearestNeighbors = NearestNeighbors(n_neighbors=neighbors)
nearestNeighbors.fit(x_train)

NearestNeighbors(n_neighbors=6)

In [26]:
test_dist, test_ind = nearestNeighbors.kneighbors(x_test)

# Using custom median to predict score

In [27]:
i_prediction = list()
for i_dist in range(0, len(test_dist)):
    t = 0
    idx = 0
    avg = sum(test_dist[i_dist])/2
    for dist in test_dist[i_dist]:
        t += dist
        if (t < avg):
            idx += 1
        else:
            i_prediction.append(test_ind[i_dist][idx])
            break


In [28]:
prediction_list = list()
for i in i_prediction:
    prediction_list.append(y_train[i])



In [29]:
cohen_kappa_score(y_test, prediction_list,weights='quadratic')  

0.7122312208455323

# Using true median

In [30]:
prediction_list = list()
for val in test_ind:
    prediction_list.append(y_train[val[round(neighbors/2)]])


In [31]:
cohen_kappa_score(y_test, prediction_list,weights='quadratic') 

0.7147278223741393

# Using closest item

In [32]:
prediction_list = list()
for val in test_ind:
    prediction_list.append(y_train[val[0]])

In [33]:
cohen_kappa_score(y_test, prediction_list,weights='quadratic') 

0.7321063545431559

# Using mean of score

In [34]:
prediction_list = list()
for val in test_ind:
    total = 0
    for i in val:
        total += y_train[i]
    avg = round(total / len(val)) 
    prediction_list.append(avg)

In [35]:
cohen_kappa_score(y_test, prediction_list,weights='quadratic') 

0.8507457643725832