In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fasttext/wiki.simple.vec
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.50d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt
/kaggle/input/hp-grading-dataset/Guidelines for Transcribing Student Essays.docx
/kaggle/input/hp-grading-dataset/public_leaderboard.tsv
/kaggle/input/hp-grading-dataset/train.tsv
/kaggle/input/hp-grading-dataset/public_leaderboard_solution.csv
/kaggle/input/hp-grading-dataset/train_rel_2.tsv
/kaggle/input/hp-grading-dataset/private_leaderboard.tsv
/kaggle/input/hp-grading-dataset/test.csv
/kaggle/input/hp-grading-dataset/length_benchmark.csv
/kaggle/input/hp-grading-dataset/bag_of_words_benchmark.csv
/kaggle/input/hp-grading-dataset/public_leaderboard_rel_2.tsv


In [3]:
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 100)

In [4]:
DIR = '/kaggle/input/hp-grading-dataset/'

df = pd.read_csv(DIR + 'train.tsv', sep='\t', encoding="latin1", error_bad_lines=False)

In [5]:
df.head()

Unnamed: 0,Id,EssaySet,Score1,Score2,EssayText
0,1,1,1,1,Some additional information that we would need to replicate the experiment is how much vinegar s...
1,2,1,1,1,"After reading the expirement, I realized that the additional information you need to replicate t..."
2,3,1,1,1,"What you need is more trials, a control set up, and an exact amount of vinegar to pour in each c..."
3,4,1,0,0,The student should list what rock is better and what rock is the worse in the procedure.
4,5,1,2,2,"For the students to be able to make a replicate, they would need to tell use how much vinegar is..."


In [6]:
df['EssaySet'].value_counts()

3     1891
7     1799
8     1799
9     1798
6     1797
5     1795
4     1738
1     1672
10    1640
2     1278
Name: EssaySet, dtype: int64

**Preprocessing**

In [7]:
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
stop_words = set(stopwords.words('english')) 


def preprocess(df):
    #lowercasing
    df['EssayText'] = df['EssayText'].apply(lambda sent: ' '.join(word.lower() for word in sent.split()))
    #Stemming
#     porter_stemmer = PorterStemmer()
#     stem_words = lambda words: ' '.join([porter_stemmer.stem(word=word) for word in words])
#     df['EssayText'] = df['EssayText'].apply(lambda sent: stem_words(sent.split()))
    
    lemmatizer = WordNetLemmatizer()
    lemmatize_words = lambda words: ' '.join([lemmatizer.lemmatize(word=word) for word in words])
    
    df['EssayText'] = df['EssayText'].apply(lambda sent: lemmatize_words(sent.split()))
    
    def removeStopwords(sent):
        words = sent.split()
        result = []
        for word in words:
            if word in stop_words:
                continue
            result.append(word)
        return ' '.join(result)
    
    df['EssayText'] = df['EssayText'].apply(lambda sent: removeStopwords(sent))

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
preprocess(df)
df['EssayText'].head()

0    additional information would need replicate experiment much vinegar placed identical container, ...
1    reading expirement, realized additional information need replicate expireiment one, amant vinega...
2    need trials, control set up, exact amount vinegar pour cup/beaker. could also take check mass ev...
3                                                         student list rock better rock worse procedure.
4    student able make replicate, would need tell use much vinegar used tipe material needed expirement.
Name: EssayText, dtype: object

In [9]:
from ml_metrics import quadratic_weighted_kappa
quadratic_weighted_kappa(df['Score1'], df['Score2'])

0.9125360856314775

In [10]:
# df['Score1'] = df['Score1'].apply(lambda score: 0 if score <= 1 else 1)
# df['Score2'] = df['Score2'].apply(lambda score: 0 if score <= 1 else 1)

# df['Score1'].value_counts()

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import mean_squared_error
np.random.seed(42)

def getResultsForEssay(num, model):
    """
    return Kappa score for particular essay
    """

    essaySetNum = num
    essayDataframe = df[df['EssaySet'] == essaySetNum]
    essayDataframe.reset_index(inplace=True)

    vec = TfidfVectorizer(stop_words='english', min_df=1)
    vec.fit(essayDataframe['EssayText'])
    
    X = vec.transform(essayDataframe['EssayText'])
    y = essayDataframe['Score1']

    clusterModel = model.fit(X)
    closest, _ = pairwise_distances_argmin_min(clusterModel.cluster_centers_, X)
    
    
    closestScores = np.array(essayDataframe.loc[closest]['Score1'])
    
    
    predictedScores = np.zeros(shape=(X.shape[0]))
    for i, label in enumerate(model.labels_):
        predictedScores[i] = closestScores[label]
    actualScores = np.array(y)
    
    score = quadratic_weighted_kappa(predictedScores, actualScores)
    
    print(f'For essay set {num} got kappa: {score}')
    return score

In [12]:
nClusters = list(range(270, 312, 10))
avgKappas = []
for nCluster in nClusters:
    kmeans = KMeans(n_clusters = nCluster)
    scoreSum = 0
    print(f"With n_clusters = {nCluster} ")
    for essayNum in df['EssaySet'].unique():
        scoreSum += getResultsForEssay(essayNum, kmeans)
    print('\n\n\n')
    avgKappa = scoreSum / len(df['EssaySet'].unique())
    avgKappas.append(avgKappa)

With n_clusters = 270 
For essay set 1 got kappa: 0.10630190729542222
For essay set 2 got kappa: 0.2687907965823303
For essay set 3 got kappa: 0.21812985388504336
For essay set 4 got kappa: 0.4021292322648492
For essay set 5 got kappa: 0.5964229814086224
For essay set 6 got kappa: 0.5400009828521636
For essay set 7 got kappa: 0.18778751593640375
For essay set 8 got kappa: 0.03838217634346919
For essay set 9 got kappa: 0.02504878311923009
For essay set 10 got kappa: 0.5175398295665061




With n_clusters = 280 
For essay set 1 got kappa: 0.16156759620856798
For essay set 2 got kappa: 0.27143554316823737
For essay set 3 got kappa: 0.20257764083026253
For essay set 4 got kappa: 0.44938558352297375
For essay set 5 got kappa: 0.587078838311317
For essay set 6 got kappa: 0.39321200581477256
For essay set 7 got kappa: 0.19186103349907235
For essay set 8 got kappa: 0.0644319681815162
For essay set 9 got kappa: 0.02870657669190868
For essay set 10 got kappa: 0.5601628181230266




With n_cluste

In [16]:
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
    return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [17]:
def getResultsForUniversalSentenceEncoder(num, model):
    """
    return Kappa score for particular essay
    """

    essaySetNum = num
    essayDataframe = df[df['EssaySet'] == essaySetNum]
    essayDataframe.reset_index(inplace=True)
    
    
    embeddings = np.zeros((len(essayDataframe), 512))
    for i, sent in enumerate(essayDataframe['EssayText']):
        embeddings[i] = np.array(embed([sent]))[0]
        
        
    
    X = embeddings
    y = essayDataframe['Score1']

    clusterModel = model.fit(X)
    closest, _ = pairwise_distances_argmin_min(clusterModel.cluster_centers_, X)
    
    
    closestScores = np.array(essayDataframe.loc[closest]['Score1'])
    
    
    predictedScores = np.zeros(shape=(X.shape[0]))
    for i, label in enumerate(model.labels_):
        predictedScores[i] = closestScores[label]
    actualScores = np.array(y)
    
    score = quadratic_weighted_kappa(predictedScores, actualScores)
    
    print(f'For essay set {num} got kappa: {score}')
    return score

In [18]:
nClusters = [300]
avgKappas = []
for nCluster in nClusters:
    kmeans = KMeans(n_clusters = nCluster)
    scoreSum = 0
    print(f"With n_clusters = {nCluster} ")
    for essayNum in df['EssaySet'].unique():
        scoreSum += getResultsForUniversalSentenceEncoder(essayNum, kmeans)
    print('\n\n\n')
    avgKappa = scoreSum / len(df['EssaySet'].unique())
    avgKappas.append(avgKappa)

With n_clusters = 300 
For essay set 1 got kappa: 0.5477328055740945
For essay set 2 got kappa: 0.44789639236594747
For essay set 3 got kappa: 0.23309034686077734
For essay set 4 got kappa: 0.5686808820460912
For essay set 5 got kappa: 0.6018905767371673
For essay set 6 got kappa: 0.6394356750603795
For essay set 7 got kappa: 0.4227664771766009
For essay set 8 got kappa: 0.38483267479228145
For essay set 9 got kappa: 0.5845619645680191
For essay set 10 got kappa: 0.6349145275655471






In [19]:
for i in range(1, 11):
    lengthSum = 0
    for sentence in df[df['EssaySet'] == i]['EssayText']:
        lengthSum += len(sentence.split(' '))
    avg = lengthSum / len(df[df['EssaySet'] == i])
    print(f"Average length of EssaySet (num words) {i} is: {avg}")
    print(f"Total number of instances is {len(df[df['EssaySet'] == i])}")

Average length of EssaySet (num words) 1 is: 23.99700956937799
Total number of instances is 1672
Average length of EssaySet (num words) 2 is: 32.13145539906103
Total number of instances is 1278
Average length of EssaySet (num words) 3 is: 25.794288736118457
Total number of instances is 1891
Average length of EssaySet (num words) 4 is: 21.005753739930956
Total number of instances is 1738
Average length of EssaySet (num words) 5 is: 13.76100278551532
Total number of instances is 1795
Average length of EssaySet (num words) 6 is: 13.519755147468002
Total number of instances is 1797
Average length of EssaySet (num words) 7 is: 20.579766536964982
Total number of instances is 1799
Average length of EssaySet (num words) 8 is: 28.675931072818234
Total number of instances is 1799
Average length of EssaySet (num words) 9 is: 25.873192436040046
Total number of instances is 1798
Average length of EssaySet (num words) 10 is: 23.664634146341463
Total number of instances is 1640
