# Import Libraries


In [1]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import util
# Beautiful soup might be useful
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.linear_model import LogisticRegressionCV as LogRegCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import discriminant_analysis as da
from sklearn import tree
# from sklearn.cross_validation import cross_val_predict 
# from sklearn import cross_validation
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score
from nltk.corpus import wordnet
%matplotlib inline

In [2]:
def append_regularized_scores(old_df):
    new_df = old_df.copy()
    new_df['std_score'] = new_df.groupby(['essay_set'])[['score']].apply(lambda x: (x - np.mean(x)) / (np.std(x)))
    return new_df

def create_regularization_data(old_df):
    #getting the number of datasets
    max_essay_set = max(old_df['essay_set'])
    #list of the regularized values
    regularization_data = []
    for i in range(max_essay_set+1):
        mean = np.mean((old_df[old_df['essay_set'] == i + 1])['score'])
        std = np.std((old_df[old_df['essay_set'] == i + 1])['score'])
        regularization_data.append([i + 1, mean, std])
    return regularization_data

In [3]:
# Read in training data
# Note that for essay set 2, score becomes average of 2 domain scores
train_cols = ['essay_id', 'essay_set', 'essay', 'domain1_score', 'domain2_score']
train_df = pd.read_csv('../../data/training_set_rel3.tsv',encoding = "ISO-8859-1", delimiter='\t', usecols=train_cols)
for i in range(train_df.shape[0]):
    if not np.isnan(train_df.get_value(i, 'domain2_score')):
        assert train_df.get_value(i, 'essay_set') == 2
        new_val = train_df.get_value(i, 'domain1_score') + train_df.get_value(i, 'domain2_score')
        train_df.set_value(i, 'domain1_score', new_val) 
train_df = train_df.drop('domain2_score', axis=1)
train_df = train_df.rename(columns={'domain1_score': 'score'})

################
regularization_data = create_regularization_data(train_df)
train_df = append_regularized_scores(train_df)

print ("The regularized data for each essay set = ", regularization_data)
print ("\n")

#print train_df[train_df['essay_set'] == 2].head()
print (train_df.head())
print ("\n")

#validate that the standardization works
max_essay_set = max(train_df['essay_set'])
for i in range (max_essay_set):
    valid = train_df[train_df["essay_set"] == i + 1]["std_score"]
    print ("mean and standard deviation of essay set " + str(i + 1) + " = ", np.mean(valid), ",", np.std(valid))
################

The regularized data for each essay set =  [[1, 8.528323051037576, 1.5381336495587767], [2, 6.749444444444444, 1.3844371990179603], [3, 1.8482039397450754, 0.8149207612821795], [4, 1.4322033898305084, 0.9395167668768533], [5, 2.4088642659279778, 0.9705520523317599], [6, 2.72, 0.970360757656664], [7, 16.062460165710643, 4.583888354164165], [8, 36.95020746887967, 5.749521294509325], [9, nan, nan]]


   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5          1  Dear @LOCATION1, I know having computers has a...   

   score  std_score  
0      8  -0.343483  
1      9   0.306655  
2      7  -0.993622  
3     10   0.956794  
4      8  -0.343483  


me

In [4]:
# Read in validation data
valid_cols = ['essay_id', 'essay_set', 'essay', 'domain1_predictionid', 'domain2_predictionid']
valid_df = pd.read_csv('../../data/valid_set.tsv', delimiter='\t', encoding = "ISO-8859-1",usecols=valid_cols)
valid_df['score'] = pd.Series([0] * valid_df.shape[0], index=valid_df.index)

# scores are stored in separate data set, we'll put them in same one
valid_scores = pd.read_csv('../../data/valid_sample_submission_5_column.csv', encoding = "ISO-8859-1",delimiter=',')

# put each score in our data set, and make sure to handle essay set 2
for i in range(valid_df.shape[0]):
    dom1_predid = valid_df.get_value(i, 'domain1_predictionid')
    row = valid_scores[valid_scores['prediction_id'] == dom1_predid]
    score = row.get_value(row.index[0], 'predicted_score')
    
    dom2_predid = valid_df.get_value(i, 'domain2_predictionid')
    if not np.isnan(dom2_predid):
        assert valid_df.get_value(i, 'essay_set') == 2
        rowB = valid_scores[valid_scores['prediction_id'] == dom2_predid]
        scoreB = rowB.get_value(rowB.index[0], 'predicted_score')
        score += scoreB
        
    valid_df.set_value(i, 'score', score)
        
valid_df = valid_df.drop(['domain1_predictionid', 'domain2_predictionid'], axis=1)
#print valid_df[valid_df['essay_set'] == 2].head()
valid_df.head()

Unnamed: 0,essay_id,essay_set,essay,score
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",7
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,8
2,1790,1,"Dear Local newspaper, Have you been spending a...",9
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",9
4,1792,1,"Dear newspaper, I strongly believe that comput...",9


In [5]:
# returned a copy of old_df, with essays cleaned for count vectorizer
# cleaning returns essay with only lowercase words separated by space
def vectorizer_clean(old_df):
    new_df = old_df.copy()
    for i in range(new_df.shape[0]):
        new_df.set_value(i, 'essay', " ".join(re.sub('[^a-zA-Z\d\s]', '', new_df['essay'].iloc[i]).lower().split())) 
    return new_df

In [6]:
# print essays cleaned for vectorizer (essay is now just lowercase words separated by space) 
vectorizer_train = vectorizer_clean(train_df)
vectorizer_train.head()

Unnamed: 0,essay_id,essay_set,essay,score,std_score
0,1,1,dear local newspaper i think effects computers...,8,-0.343483
1,2,1,dear caps1 caps2 i believe that using computer...,9,0.306655
2,3,1,dear caps1 caps2 caps3 more and more people us...,7,-0.993622
3,4,1,dear local newspaper caps1 i have found that m...,10,0.956794
4,5,1,dear location1 i know having computers has a p...,8,-0.343483


In [7]:
# print essays cleaned for vectorizer (essay is now just lowercase words separated by space) 
vectorizer_valid = vectorizer_clean(valid_df)
vectorizer_valid.head()

Unnamed: 0,essay_id,essay_set,essay,score
0,1788,1,dear organization1 caps1 more and more people ...,7
1,1789,1,dear location1 time caps1 me tell you what i t...,8
2,1790,1,dear local newspaper have you been spending a ...,9
3,1791,1,dear readers caps1 you imagine how life would ...,9
4,1792,1,dear newspaper i strongly believe that compute...,9


In [23]:
vectorizer = TfidfVectorizer(stop_words = 'english')

#Get all the text from data
train_essays = vectorizer_train['essay'].values

#Turn each text into an array of word counts
train_vectors = vectorizer.fit_transform(train_essays).toarray()

#normalizing for y
train_std_scores = np.asarray(vectorizer_train['std_score'], dtype="byte")
print (train_std_scores[:5])

[0 0 0 0 0]


# Different Classification Models

Trying out LDA, QDA, Decision Trees, and Random Forests

# LDA

In [9]:
LDA = da.LinearDiscriminantAnalysis()
LDA.fit(train_vectors, train_std_scores)
valid_vectors = vectorizer.transform(vectorizer_valid['essay'].values).toarray()

KeyboardInterrupt: 

In [None]:
valid_pred_std_scores_lda = LDA.predict(valid_vectors)
# Appending predicted scores to validation data set
valid_df["LDA predicted_scores"] = valid_pred_std_scores_lda

In [None]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l2 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['LDA predicted_scores']
    for value in current_set:
        stand_pred_values_lda.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
# print stand_pred_values_l2

#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_LDA'] = stand_pred_values_lda

In [None]:
###############
#   Scoring   #
###############

#Scoring the predicted values with the actual values
lda_count = 0
for i in range(len(valid_df)):
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_lda']:
        lda_count += 1
        
print "LDA"
print "Number of correct predictions =", lda_count
print "Total number of observations =", len(valid_df)
print "Score =", float(lda_count) / len(valid_df)


# Count Misspelled words

In [24]:
# input is list of words in text, output percentage spelling correct
def percentage_correct_spelling(text):
    text_len = len(text)
    correct = 0
    for word in text:
        try:
            if wordnet.synsets(word):
                correct += 1
        except:
            correct+= 0
    return 1. * correct / text_len

In [25]:
spelling_feature_x = []
for train in train_essays:
    sentence = train.split()
    percent = percentage_correct_spelling(sentence)
    spelling_feature_x.append([percent])


In [26]:
print (spelling_feature_x[:10])

[[0.685459940652819], [0.6945107398568019], [0.6989247311827957], [0.6335877862595419], [0.6752688172043011], [0.6219512195121951], [0.6833667334669339], [0.6804979253112033], [0.7013574660633484], [0.6673306772908366]]


In [27]:
valid_essays = vectorizer_valid['essay'].values
valid_spelling_x = []
for valid in valid_essays:
    sentence = valid.split()
    percent = percentage_correct_spelling(sentence)
    valid_spelling_x.append([percent])
print (valid_spelling_x[:10])

[[0.6956521739130435], [0.6967741935483871], [0.6281179138321995], [0.6620498614958449], [0.7373068432671082], [0.6847826086956522], [0.7149220489977728], [0.6864864864864865], [0.7084745762711865], [0.6654545454545454]]


# L2 Log Regression - spelling

In [28]:
logistic_l2 = LogReg(penalty='l2', solver='liblinear', n_jobs=4)
xs = np.array(spelling_feature_x)
logistic_l2.fit(xs, train_std_scores)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=4,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
# My guess is we will want to denormalize these scores for quadratic weighted k
valid_pred_std_scores_l2 = logistic_l2.predict(valid_spelling_x)
# Appending predicted scores to validation data set
valid_df["Log_L2 predicted_scores"] = valid_pred_std_scores_l2

In [30]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l2 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L2 predicted_scores']
    for value in current_set:
        stand_pred_values_l2.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
#print (stand_pred_values_l2)

#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l2'] = stand_pred_values_l2

# L1 Log Reg - Spelling

In [31]:
logistic_l1 = LogReg(penalty='l1', solver='liblinear', n_jobs=4)
xs = np.array(spelling_feature_x)
logistic_l1.fit(xs, train_std_scores)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=4,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
# My guess is we will want to denormalize these scores for quadratic weighted k
valid_pred_std_scores_l1 = logistic_l1.predict(valid_spelling_x)
# Appending predicted scores to validation data set
valid_df["Log_L1 predicted_scores"] = valid_pred_std_scores_l1

In [33]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l1 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L1 predicted_scores']
    for value in current_set:
        stand_pred_values_l1.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l1'] = stand_pred_values_l1

# Scoring with Spelling as Feature Extraction

In [35]:
###############
#   Scoring   #
###############

#Scoring the predicted values with the actual values
log_l1_count = 0
log_l2_count = 0
for i in range(len(valid_df)):
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l2']:
        log_l2_count += 1
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l1']:
        log_l1_count += 1
        
print ("LOGISTIC L2 using Feature: Spelling")
print ("Number of correct predictions =", log_l2_count)
print ("Total number of observations =", len(valid_df))
print ("Score =", float(log_l2_count) / len(valid_df))

print ("")
print ("LOGISTIC L1 using Feature: Spelling")
print ("Number of correct predictions =", log_l1_count)
print ("Total number of observations =", len(valid_df))
print ("Score =", float(log_l1_count) / len(valid_df))

LOGISTIC L2 using Feature: Spelling
Number of correct predictions = 1273
Total number of observations = 4218
Score = 0.30180180180180183

LOGISTIC L1 using Feature: Spelling
Number of correct predictions = 1273
Total number of observations = 4218
Score = 0.30180180180180183


# Number of Sentences

In [36]:
def append_regularized_sentence_length(old_df):
    new_df = old_df.copy()
    new_df['std_sentence_len'] = new_df.groupby(['essay_set'])[['sentence_length']].apply(lambda x: (x - np.mean(x)) / (np.std(x)))
    return new_df

def create_regularization_sentence_length(old_df):
    #getting the number of datasets
    max_essay_set = max(old_df['essay_set'])
    #list of the regularized values
    regularization_data = []
    for i in range(max_essay_set+1):
        mean = np.mean((old_df[old_df['essay_set'] == i + 1])['sentence_length'])
        std = np.std((old_df[old_df['essay_set'] == i + 1])['sentence_length'])
        regularization_data.append([i + 1, mean, std])
    return regularization_data

In [37]:
def sentences(par):
    split_sent = re.split(r'[.!?]+', par)
    return len(split_sent)

In [38]:
numOfSent_train = []
for essay in train_df['essay']:
    sent = sentences(essay)
    numOfSent_train.append(sent)

In [39]:
numOfSent_valid = []
for essay in valid_df['essay']:
    sent = sentences(essay)
    numOfSent_valid.append(sent)

In [40]:
train_df['sentence_length'] = numOfSent_train
valid_df['sentence_length'] = numOfSent_valid

In [41]:
train_df.head()

Unnamed: 0,essay_id,essay_set,essay,score,std_score,sentence_length
0,1,1,"Dear local newspaper, I think effects computer...",8,-0.343483,17
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,0.306655,21
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,-0.993622,15
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,0.956794,28
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,-0.343483,31


In [42]:
valid_df.head()

Unnamed: 0,essay_id,essay_set,essay,score,Log_L2 predicted_scores,newly_predicted_scores_log_l2,Log_L1 predicted_scores,newly_predicted_scores_log_l1,sentence_length
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",7,0,8,0,8,14
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,8,0,8,0,8,22
2,1790,1,"Dear Local newspaper, Have you been spending a...",9,0,8,0,8,16
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",9,0,8,0,8,25
4,1792,1,"Dear newspaper, I strongly believe that comput...",9,0,8,0,8,35


In [43]:
regularization_data_sentence = create_regularization_sentence_length(train_df)
train_df = append_regularized_sentence_length(train_df)

In [44]:
train_df.head()

Unnamed: 0,essay_id,essay_set,essay,score,std_score,sentence_length,std_sentence_len
0,1,1,"Dear local newspaper, I think effects computer...",8,-0.343483,17,-0.761714
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,0.306655,21,-0.327943
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,-0.993622,15,-0.9786
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,0.956794,28,0.431156
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,-0.343483,31,0.756484


# L2 Log Regression - Number of Sentences

In [45]:
logistic_l2 = LogReg(penalty='l2', solver='liblinear', n_jobs=4)
xs = [[x] for x in np.array(train_df['sentence_length'])]
logistic_l2.fit(xs, train_std_scores)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=4,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l2 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['sentence_length']
    for value in current_set:
        stand_pred_values_l2.append(int(float(value) * float(regularization_data_sentence[i][2]) + (regularization_data_sentence[i][1])))

#adding the denormalizede predicted values to the valid_df dataset
valid_df['new_sentence_length_std'] = stand_pred_values_l2

In [47]:
# My guess is we will want to denormalize these scores for quadratic weighted k
valid_x = [[x] for x in np.array(valid_df['new_sentence_length_std'])]
valid_pred_std_scores_l2 = logistic_l2.predict(valid_x)
# Appending predicted scores to validation data set
valid_df["Log_L2 predicted_scores"] = valid_pred_std_scores_l2

In [48]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l2 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L2 predicted_scores']
    for value in current_set:
        stand_pred_values_l2.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
# print stand_pred_values_l2

#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l2'] = stand_pred_values_l2

# L1 Log Regression - Number of Sentences

In [49]:
logistic_l1 = LogReg(penalty='l1', solver='liblinear', n_jobs=4)
logistic_l1.fit(xs, train_std_scores)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=4,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
# My guess is we will want to denormalize these scores for quadratic weighted k
valid_pred_std_scores_l1 = logistic_l1.predict(valid_x)
# Appending predicted scores to validation data set
valid_df["Log_L1 predicted_scores"] = valid_pred_std_scores_l1

In [51]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l1 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L1 predicted_scores']
    for value in current_set:
        stand_pred_values_l1.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l1'] = stand_pred_values_l1

# Scoring using Log Regression - Number of Sentences

In [53]:
###############
#   Scoring   #
###############

#Scoring the predicted values with the actual values
log_l1_count = 0
log_l2_count = 0
for i in range(len(valid_df)):
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l2']:
        log_l2_count += 1
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l1']:
        log_l1_count += 1
        
print ("LOGISTIC L2 using Feature: Number of Sentences")
print ("Number of correct predictions =", log_l2_count)
print ("Total number of observations =", len(valid_df))
print ("Score =", float(log_l2_count) / len(valid_df))

print ("")
print ("LOGISTIC L1 using Feature: Number of Sentences")
print( "Number of correct predictions =", log_l1_count)
print ("Total number of observations =", len(valid_df))
print ("Score =", float(log_l1_count) / len(valid_df))

LOGISTIC L2 using Feature: Number of Sentences
Number of correct predictions = 974
Total number of observations = 4218
Score = 0.23091512565196776

LOGISTIC L1 using Feature: Number of Sentences
Number of correct predictions = 948
Total number of observations = 4218
Score = 0.22475106685633


In [54]:
import sys  
import pandas as pd
#sys.setdefaultencoding('utf8')
train_cols = ['essay_id', 'essay_set', 'essay', 'domain1_score', 'domain2_score']
train_df = pd.read_csv('../../data/training_set_rel3.tsv', delimiter='\t', encoding = "ISO-8859-1",usecols=train_cols)