In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
import language_check
import re
from nltk.stem import WordNetLemmatizer
import enchant
from spellchecker import SpellChecker
import time
from nltk.corpus import wordnet
import textstat

In [2]:
df = pd.read_csv('data/training_set_rel3.tsv',sep='\t',encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12976 entries, 0 to 12975
Data columns (total 28 columns):
essay_id          12976 non-null int64
essay_set         12976 non-null int64
essay             12976 non-null object
rater1_domain1    12976 non-null int64
rater2_domain1    12976 non-null int64
rater3_domain1    128 non-null float64
domain1_score     12976 non-null int64
rater1_domain2    1800 non-null float64
rater2_domain2    1800 non-null float64
domain2_score     1800 non-null float64
rater1_trait1     2292 non-null float64
rater1_trait2     2292 non-null float64
rater1_trait3     2292 non-null float64
rater1_trait4     2292 non-null float64
rater1_trait5     723 non-null float64
rater1_trait6     723 non-null float64
rater2_trait1     2292 non-null float64
rater2_trait2     2292 non-null float64
rater2_trait3     2292 non-null float64
rater2_trait4     2292 non-null float64
rater2_trait5     723 non-null float64
rater2_trait6     723 non-null float64
rater3_trait1     128

In [5]:
df = df[['essay_id','essay_set','essay','domain1_score']]

In [9]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay
0,1,1,"Dear local newspaper, I think effects computer...",8,Dear local newspaper I think effects computer...
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,Dear I believe that using computers will b...
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,Dear More and more people use computers...
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,Dear Local Newspaper I have found that many...
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,Dear I know having computers has a positive ...


In [111]:
def remove_special_char(essay):
    return re.sub('@\S+|[^A-Za-z0-9]',' ',essay)

In [8]:
df['clean_essay'] = df['essay'].apply(remove_special_char)

In [112]:
def word_count(essay):
    return len(nltk.word_tokenize(essay))

In [11]:
df['word_count'] =df['clean_essay'].apply(word_count)

In [113]:
def sent_count(essay):
    return len(nltk.sent_tokenize(essay))

In [13]:
df['sent_count'] =df['essay'].apply(sent_count)

In [None]:
def check_spell(essay):
    d = enchant.Dict("en_US")
    spell = SpellChecker()
    words = essay.split()
    misspelled = set()
    err_count = 0
    for word in words:
        if d.check(word) == False:
            misspelled.add(word)
            err_count += 1
    corr_dict = {}
    for word in misspelled:
        corr_dict[word] = spell.correction(word)
    essay_df = pd.DataFrame(essay.split())
    essay_df.replace(corr_dict,inplace=True)
    essay = ' '.join(list(essay_df[0]))
    return err_count,essay

In [None]:
df['spell_err'],df['corrected_essay'] = zip(*df['clean_essay'].map(check_spell))

In [92]:
clean_df.to_csv('corr_essay',index=False)

In [2]:
clean_df = pd.read_csv('corr_essay')

In [6]:
clean_df.head()

Unnamed: 0.1,Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_count,sent_count,spell_error,spell_err1,corrected_essay,unique_word_ratio
0,0,1,1,"Dear local newspaper, I think effects computer...",8,Dear local newspaper I think effects computer...,344,16,11,11,Dear local newspaper I think effects computers...,0.468023
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,Dear I believe that using computers will b...,413,20,16,16,Dear I believe that using computers will benef...,0.450363
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,Dear More and more people use computers...,276,14,2,2,Dear More and more people use computers but no...,0.514493
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,Dear Local Newspaper I have found that many...,488,27,24,24,Dear Local Newspaper I have found that many ex...,0.459016
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",8,Dear I know having computers has a positive ...,469,30,13,13,Dear I know having computers has a positive ef...,0.411514


In [115]:
def create_documents(essay):
    stop_words=set(stopwords.words('english'))
    stop_words.remove('not')
    lemmatizer = WordNetLemmatizer()
    essay = essay.lower()
    essay = nltk.word_tokenize(essay)
    essay=[lemmatizer.lemmatize(word) for word in essay if not word in stop_words]
    essay=' '.join(essay)
    return essay

In [17]:
clean_df['essay_documents'] = clean_df['corrected_essay'].apply(create_documents)

In [114]:
def pos_count(essay):
    noun_count = 0
    verb_count = 0
    adv_count = 0
    adj_count = 0
    word_pos = nltk.pos_tag(essay.split())
    for pos in word_pos:
        if pos[1][0] == 'N':
            noun_count += 1
        elif pos[1][0] == 'V':
            verb_count += 1
        elif pos[1][0] == 'J':
            adj_count += 1
        elif pos[1][0] == 'R':
            adv_count += 1
    return noun_count, verb_count, adv_count, adj_count

In [None]:
clean_df['noun_count'], clean_df['verb_count'], clean_df['adv_count'], clean_df['adj_count'] = zip(*clean_df['essay_documents'].map(pos_count))

In [37]:
def compute_redability(essay):
    return textstat.flesch_reading_ease(essay)

In [20]:
clean_df['readability_score'] = clean_df['essay'].apply(compute_redability)

In [4]:
def unique_word_ratio(essay):
    tokens = nltk.word_tokenize(essay.lower())
    ratio = len(set(tokens))/len(tokens)
    return ratio

In [53]:
clean_df['unique_word_ratio'] = clean_df['essay_documents'].apply(unique_word_ratio)

In [55]:
clean_df.to_csv('clean_df',index=False)

In [62]:
clean_df = pd.read_csv("clean_df")

In [63]:
clean_df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_count,sent_count,spell_err1,corrected_essay,essay_documents,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio
0,1,1,"Dear local newspaper, I think effects computer...",8,Dear local newspaper I think effects computer...,344,16,11,Dear local newspaper I think effects computers...,dear local newspaper think effect computer peo...,78,37,15,28,74.02,0.618182
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,Dear I believe that using computers will b...,413,20,16,Dear I believe that using computers will benef...,dear believe using computer benefit u many way...,103,55,10,24,67.08,0.569444
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,Dear More and more people use computers...,276,14,2,Dear More and more people use computers but no...,dear people use computer not everyone agrees b...,74,31,4,18,68.2,0.646154
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,Dear Local Newspaper I have found that many...,488,27,24,Dear Local Newspaper I have found that many ex...,dear local newspaper found many expert say com...,134,53,24,45,53.34,0.584559
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,Dear I know having computers has a positive ...,469,30,13,Dear I know having computers has a positive ef...,dear know computer positive effect people comp...,117,40,11,27,72.66,0.531818


In [58]:
persuasive_essay = clean_df[(clean_df['essay_set']== 1) | (clean_df['essay_set']== 2) | (clean_df['essay_set']== 7) | (clean_df['essay_set']== 8)]

In [59]:
persuasive_essay.reset_index(drop=True,inplace=True)

In [104]:
persuasive_essay.tail()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_count,sent_count,spell_err1,corrected_essay,essay_documents,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio,syn_overlap,syn_overlap_prop,prompt_overlap,prompt_overlap_prop
5870,21626,8,In most stories mothers and daughters are eit...,35,In most stories mothers and daughters are eit...,806,27,1,In most stories mothers and daughters are eith...,story mother daughter either enemy friend stor...,132,73,52,60,56.05,0.605479,52,0.142077,33,0.090164
5871,21628,8,I never understood the meaning laughter is th...,32,I never understood the meaning laughter is th...,526,35,5,I never understood the meaning laughter is the...,never understood meaning laughter shortest dis...,93,53,40,45,50.0,0.51626,33,0.133603,25,0.101215
5872,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40,When you laugh is out of habit or is cau...,777,41,7,When you laugh is out of habit or is cause Wha...,laugh habit cause cause laughing even thing ca...,137,88,31,60,60.79,0.723837,27,0.078261,20,0.057971
5873,21630,8,Trippin' on fen...,40,Trippin on fen...,555,39,3,tripping on fences I am years young and in tho...,tripping fence year young short year ever reme...,79,56,38,45,72.05,0.643777,26,0.111111,21,0.089744
5874,21633,8,Many people believe that laughter can improve...,40,Many people believe that laughter can improve...,460,29,1,Many people believe that laughter can improve ...,many people believe laughter improve life laug...,80,52,29,46,72.05,0.665138,37,0.16895,25,0.114155


In [131]:
prompt1 = open("Prompt1.txt",'r',encoding='utf-8')
prompt1 = prompt1.read()
prompt2 = open("Prompt2.txt",'r',encoding='utf-8')
prompt2 = prompt2.read()
prompt7 = open("Prompt7.txt",'r',encoding='utf-8')
prompt7 = prompt7.read()
prompt8 = open("Prompt8.txt",'r',encoding='utf-8')
prompt8 = prompt8.read()

In [132]:
persuasive_set = [1,2,7,8]
prompts = [prompt1,prompt2,prompt7,prompt8]
essay_prompt_df = pd.DataFrame({'essay_set':persuasive_set,'prompt':prompts},index=persuasive_set)

In [133]:
essay_prompt_df['clean_prompt'] = essay_prompt_df['prompt'].apply(remove_special_char)
essay_prompt_df['documents'] = essay_prompt_df['clean_prompt'].apply(create_documents)

In [134]:
essay_prompt_df

Unnamed: 0,essay_set,prompt,clean_prompt,documents
1,1,"More and more people use computers, but not ev...",More and more people use computers but not ev...,people use computer not everyone agrees benefi...
2,2,"Censorship in the Libraries\n""All of us can th...",Censorship in the Libraries All of us can thi...,censorship library u think book hope none chil...
7,7,Write about patience. Being patient means that...,Write about patience Being patient means that...,write patience patient mean understanding tole...
8,8,We all understand the benefits of laughter. Fo...,We all understand the benefits of laughter Fo...,understand benefit laughter example someone sa...


In [118]:
def get_synonyms(prompt):
    synonyms = set()
    tokens = nltk.word_tokenize(prompt)
    for word in tokens:
        synset = nltk.wordnet.wordnet.synsets(word)
        for ss in synset:
            for swords in ss.lemma_names():
                synonyms.add(swords.lower())
    return list(synonyms)

In [119]:
essay_prompt_df['synonyms'] = essay_prompt_df['documents'].apply(get_synonyms)

In [120]:
essay_prompt_df

Unnamed: 0,essay_set,prompt,clean_prompt,documents,synonyms
1,1,"More and more people use computers, but not ev...",More and more people use computers but not ev...,people use computer not everyone agrees benefi...,"[spill_the_beans, proofreader, accompaniment, ..."
2,2,Censorship in the Libraries All of us can thin...,Censorship in the Libraries All of us can thin...,censorship library u think book hope none chil...,"[subroutine_library, shelf, rule, drive, run, ..."
7,7,Write about patience. Being patient means that...,Write about patience Being patient means that...,write patience patient mean understanding tole...,"[pursuit, chase, means, meter, mortal, unmatch..."
8,8,We all understand the benefits of laughter. Fo...,We all understand the benefits of laughter Fo...,understand benefit laughter example someone sa...,"[mortal, unmatchable, severalise, partly, offi..."


In [121]:
essay_prompt_df.to_pickle('essay_prompt_df')

In [122]:
essay_prompt_df = pd.read_pickle('essay_prompt_df')

In [61]:
synonyms_overlap = []
synonyms_overlap_prop = []
prompt_overlap = []
prompt_overlap_prop = []

for i in range(len(persuasive_essay)):    
    essay_set = persuasive_essay['essay_set'][i]
    essay_tokens = nltk.word_tokenize(persuasive_essay['essay_documents'][i])
    prompt_tokens = nltk.word_tokenize(essay_prompt_df.loc[essay_prompt_df['essay_set'] == essay_set,'documents'][essay_set])
    synonyms = essay_prompt_df.loc[essay_prompt_df['essay_set'] == essay_set,'synonyms'][essay_set]
    synonyms_overlap_temp = [word for word in essay_tokens if word in synonyms]
    prompt_overlap_temp = [word for word in essay_tokens if word in prompt_tokens]
    
    synonyms_overlap.append(len(synonyms_overlap_temp))
    synonyms_overlap_prop.append(len(synonyms_overlap_temp)/(len(essay_tokens)+1))
    prompt_overlap.append(len(prompt_overlap_temp))
    prompt_overlap_prop.append(len(prompt_overlap_temp)/(len(essay_tokens)+1))

In [62]:
persuasive_essay['syn_overlap'] = synonyms_overlap
persuasive_essay['syn_overlap_prop'] = synonyms_overlap_prop
persuasive_essay['prompt_overlap'] = prompt_overlap
persuasive_essay['prompt_overlap_prop'] = prompt_overlap_prop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [63]:
persuasive_essay.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_count,sent_count,spell_err1,corrected_essay,essay_documents,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio,syn_overlap,syn_overlap_prop,prompt_overlap,prompt_overlap_prop
0,1,1,"Dear local newspaper, I think effects computer...",8,Dear local newspaper I think effects computer...,344,16,11,Dear local newspaper I think effects computers...,dear local newspaper think effect computer peo...,78,37,15,28,74.02,0.618182,59,0.355422,46,0.277108
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,Dear I believe that using computers will b...,413,20,16,Dear I believe that using computers will benef...,dear believe using computer benefit u many way...,103,55,10,24,67.08,0.569444,56,0.258065,39,0.179724
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,Dear More and more people use computers...,276,14,2,Dear More and more people use computers but no...,dear people use computer not everyone agrees b...,74,31,4,18,68.2,0.646154,70,0.534351,62,0.473282
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,Dear Local Newspaper I have found that many...,488,27,24,Dear Local Newspaper I have found that many ex...,dear local newspaper found many expert say com...,134,53,24,45,53.34,0.584559,85,0.311355,55,0.201465
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,Dear I know having computers has a positive ...,469,30,13,Dear I know having computers has a positive ef...,dear know computer positive effect people comp...,117,40,11,27,72.66,0.531818,62,0.280543,51,0.230769


In [64]:
persuasive_essay.to_csv('persuasive_essay',index=False)

In [4]:
persuasive_essay = pd.read_csv('persuasive_essay')

In [5]:
persuasive_essay.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_count,sent_count,spell_err1,corrected_essay,essay_documents,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio,syn_overlap,syn_overlap_prop,prompt_overlap,prompt_overlap_prop
0,1,1,"Dear local newspaper, I think effects computer...",8,Dear local newspaper I think effects computer...,344,16,11,Dear local newspaper I think effects computers...,dear local newspaper think effect computer peo...,78,37,15,28,74.02,0.618182,59,0.355422,46,0.277108
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,Dear I believe that using computers will b...,413,20,16,Dear I believe that using computers will benef...,dear believe using computer benefit u many way...,103,55,10,24,67.08,0.569444,56,0.258065,39,0.179724
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,Dear More and more people use computers...,276,14,2,Dear More and more people use computers but no...,dear people use computer not everyone agrees b...,74,31,4,18,68.2,0.646154,70,0.534351,62,0.473282
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,Dear Local Newspaper I have found that many...,488,27,24,Dear Local Newspaper I have found that many ex...,dear local newspaper found many expert say com...,134,53,24,45,53.34,0.584559,85,0.311355,55,0.201465
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,Dear I know having computers has a positive ...,469,30,13,Dear I know having computers has a positive ef...,dear know computer positive effect people comp...,117,40,11,27,72.66,0.531818,62,0.280543,51,0.230769


In [6]:
essay_set1 = persuasive_essay[persuasive_essay['essay_set']==1]
essay_set2 = persuasive_essay[persuasive_essay['essay_set']==2]
essay_set7 = persuasive_essay[persuasive_essay['essay_set']==7]
essay_set8 = persuasive_essay[persuasive_essay['essay_set']==8]

In [7]:
essay_set1.reset_index(inplace=True,drop=True)
essay_set2.reset_index(inplace=True,drop=True)
essay_set7.reset_index(inplace=True,drop=True)
essay_set8.reset_index(inplace=True,drop=True)

In [105]:
source_essay = clean_df[(clean_df['essay_set']== 3) | (clean_df['essay_set']== 4) | (clean_df['essay_set']== 5) | (clean_df['essay_set']== 6)]

In [107]:
source_essay.reset_index(drop=True,inplace=True)

In [162]:
source_essay.tail()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_count,sent_count,spell_err1,corrected_essay,essay_documents,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio
7096,16629,6,The one obstacle the builders had when trying ...,0,The one obstacle the builders had when trying ...,152,8,10,The one obstacle the builders had when trying ...,one obstacle builder trying build building not...,36,13,10,9,77.27,0.706667
7097,16630,6,Some of the problems with the constructing of ...,2,Some of the problems with the constructing of ...,66,3,0,Some of the problems with the constructing of ...,problem constructing docking dirigible natural...,13,8,4,11,57.3,0.925
7098,16631,6,The builders of the Empire State building face...,3,The builders of the Empire State building face...,105,5,1,The builders of the Empire State building face...,builder empire state building faced obstacle a...,30,8,5,9,58.62,0.814815
7099,16632,6,The obstacles the builders of the Empire State...,2,The obstacles the builders of the Empire State...,68,2,1,The obstacles the builders of the Empire State...,obstacle builder empire state building could n...,18,7,3,5,53.89,0.810811
7100,16633,6,You want me to tell you what they had to go th...,2,You want me to tell you what they had to go th...,157,9,2,You want me to tell you what they had to go th...,want tell go attempt allow dirigible dock well...,33,19,4,14,64.78,0.632911


In [130]:
source3 = open("Source3.txt",'r',encoding='utf-8')
source3 = source3.read()
source4 = open("Source4.txt",'r',encoding='utf-8')
source4 = source4.read()
source5 = open("Source5.txt",'r',encoding='utf-8')
source5 = source5.read()
source6 = open("Source6.txt",'r',encoding='utf-8')
source6 = source6.read()

In [135]:
source_set = [3,4,5,6]
source = [source3,source4,source5,source6]
essay_source_df = pd.DataFrame({'essay_set':source_set,'source':source},index=source_set)

In [137]:
essay_source_df['clean_source'] = essay_source_df['source'].apply(remove_special_char)
essay_source_df['documents'] = essay_source_df['clean_source'].apply(create_documents)

In [155]:
def source_pos_tags(source):
    word_pos = nltk.pos_tag(source.split())
    pos_tags = set()
    for pos in word_pos:
        if (pos[1][0] == 'N') | (pos[1][0] == 'V'):
            pos_tags.add(pos[0])
    return list(pos_tags)

In [156]:
essay_source_df['pos(nouns & verbs)'] = essay_source_df['documents'].apply(source_pos_tags)

In [167]:
# synonyms_overlap = []
# synonyms_overlap_prop = []
source_overlap = []
source_overlap_prop = []

for i in range(len(source_essay)):    
    essay_set = source_essay['essay_set'][i]
    essay_tokens = nltk.word_tokenize(source_essay['essay_documents'][i])
    source_tokens = essay_source_df.loc[essay_source_df['essay_set'] == essay_set,'pos(nouns & verbs)'][essay_set]
#     synonyms = essay_source_df.loc[essay_source_df['essay_set'] == essay_set,'synonyms'][essay_set]
#     synonyms_overlap_temp = [word for word in essay_tokens if word in synonyms]
    source_overlap_temp = [word for word in essay_tokens if word in source_tokens]
    
#     synonyms_overlap.append(len(synonyms_overlap_temp))
#     synonyms_overlap_prop.append(len(synonyms_overlap_temp)/(len(essay_tokens)+1))
    source_overlap.append(len(source_overlap_temp))
    source_overlap_prop.append(len(source_overlap_temp)/(len(essay_tokens)+1))
source_essay['source_overlap'] = source_overlap
source_essay['source_overlap_prop'] = source_overlap_prop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [64]:
source_essay.head()

NameError: name 'source_essay' is not defined

In [49]:
X = essay_set1.drop(['domain1_score','essay_id','essay_set','essay','clean_essay','corrected_essay','essay_documents'],axis=1)
y = essay_set1['domain1_score']

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(essay_set1['essay_documents'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist() 
df = pd.DataFrame(denselist, columns=feature_names)

In [51]:
df.head()

Unnamed: 0,aa,abandon,abandoned,abbreviated,abbreviation,abd,abdication,abducted,abduction,abe,...,yup,zap,zero,zingbobway,zip,zombie,zone,zoning,zoo,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.09828,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
final_df = pd.concat([X,df],axis=1)

In [53]:
final_df.head()

Unnamed: 0,word_count,sent_count,spell_err1,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio,syn_overlap,...,yup,zap,zero,zingbobway,zip,zombie,zone,zoning,zoo,zoom
0,344,16,11,78,37,15,28,74.02,0.618182,59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,413,20,16,103,55,10,24,67.08,0.569444,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,276,14,2,74,31,4,18,68.2,0.646154,70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,488,27,24,134,53,24,45,53.34,0.584559,85,...,0.0,0.0,0.0,0.0,0.09828,0.0,0.0,0.0,0.0,0.0
4,469,30,13,117,40,11,27,72.66,0.531818,62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X.values,y.values,test_size=0.3)

In [55]:
X_train

array([[4.29000000e+02, 3.00000000e+01, 1.40000000e+01, ...,
        2.21238938e-01, 3.90000000e+01, 1.72566372e-01],
       [2.79000000e+02, 2.10000000e+01, 1.00000000e+01, ...,
        2.55813953e-01, 2.20000000e+01, 1.70542636e-01],
       [4.45000000e+02, 2.60000000e+01, 5.00000000e+00, ...,
        2.80373832e-01, 4.20000000e+01, 1.96261682e-01],
       ...,
       [2.77000000e+02, 1.20000000e+01, 1.10000000e+01, ...,
        2.63565891e-01, 2.40000000e+01, 1.86046512e-01],
       [3.83000000e+02, 2.70000000e+01, 9.00000000e+00, ...,
        2.28723404e-01, 2.70000000e+01, 1.43617021e-01],
       [3.36000000e+02, 2.00000000e+01, 2.00000000e+00, ...,
        2.58620690e-01, 2.70000000e+01, 1.55172414e-01]])

In [56]:
def eval_metrics():
    from sklearn import metrics
    print("MAE:",metrics.mean_absolute_error(y_test, y_pred))
    print("MSE:",metrics.mean_squared_error(y_test, y_pred))
    print("RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('Cohen\'s kappa score: %.2f' % metrics.cohen_kappa_score(np.rint(y_pred), y_test,weights='quadratic'))

In [48]:
start = time.time()
from sklearn.model_selection import GridSearchCV
GBR=GradientBoostingRegressor()
optimization_dict = {'max_depth': [2,4,6],
                     'n_estimators': [100,200,300,500,1000],
                    'learning_rate':[0.01,0.05,0.1,0.5],
                    'max_features':['auto'],
                    'min_samples_split':[2,4,10,15],
                    'min_samples_leaf':[2,4,10,15]}

gridsearch = GridSearchCV(GBR, optimization_dict, verbose=2, cv=3,n_jobs=3)
gridsearch.fit(X_train,y_train)
print("best_accuracy:",gridsearch.best_score_)
print("best_parameters:",gridsearch.best_params_)
print(time.time()-start)

Fitting 3 folds for each of 960 candidates, totalling 2880 fits


[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    3.6s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   15.8s
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed:   48.2s
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done 1007 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1452 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 1979 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done 2586 tasks      | elapsed:  7.6min
[Parallel(n_jobs=3)]: Done 2880 out of 2880 | elapsed:  8.4min finished


best_accuracy: 0.7246536104167913
best_parameters: {'learning_rate': 0.05, 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
502.8731334209442


In [61]:
start = time.time()
from sklearn.ensemble import GradientBoostingRegressor
GBR=GradientBoostingRegressor()
GBR.fit(X_train,y_train)
y_pred = GBR.predict(X_test)
eval_metrics()
print(time.time()-start)

MAE: 0.6636596716243375
MSE: 0.7020185224452309
RMSE: 0.8378654560520029
Cohen's kappa score: 0.83
0.09994387626647949


In [328]:
y_pred = GBR.predict(final_df)

In [329]:
prediction = pd.DataFrame({"y_pred":np.rint(y_pred),'y_act':y})

In [331]:
prediction

Unnamed: 0,y_pred,y_act
0,8.0,8
1,9.0,9
2,8.0,7
3,10.0,10
4,9.0,8
5,7.0,8
6,10.0,10
7,10.0,10
8,9.0,9
9,9.0,9
