In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
import enchant
from spellchecker import SpellChecker
import time
from nltk.corpus import wordnet
import textstat

from feature_extractor import *

In [2]:
df = pd.read_csv('data/training_set_rel3.tsv',sep='\t',encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12976 entries, 0 to 12975
Data columns (total 28 columns):
essay_id          12976 non-null int64
essay_set         12976 non-null int64
essay             12976 non-null object
rater1_domain1    12976 non-null int64
rater2_domain1    12976 non-null int64
rater3_domain1    128 non-null float64
domain1_score     12976 non-null int64
rater1_domain2    1800 non-null float64
rater2_domain2    1800 non-null float64
domain2_score     1800 non-null float64
rater1_trait1     2292 non-null float64
rater1_trait2     2292 non-null float64
rater1_trait3     2292 non-null float64
rater1_trait4     2292 non-null float64
rater1_trait5     723 non-null float64
rater1_trait6     723 non-null float64
rater2_trait1     2292 non-null float64
rater2_trait2     2292 non-null float64
rater2_trait3     2292 non-null float64
rater2_trait4     2292 non-null float64
rater2_trait5     723 non-null float64
rater2_trait6     723 non-null float64
rater3_trait1     128

In [5]:
df = df[['essay_id','essay_set','essay','domain1_score']]

In [6]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [7]:
#---------------- Removing sepecial characters from essay -------
df['clean_essay'] = df['essay'].apply(remove_special_char)

In [8]:
#-------------- Tokenizing the essay into words ------
df['word_tokens'] =df['clean_essay'].apply(word_tokenizer)

In [9]:
#------------- Tokenizing the essay into sentences-----
df['sent_tokens'] =df['essay'].apply(sent_tokenizer)

In [10]:
#---------- Finding the number of words in the essay---------
df['word_count'] = df['word_tokens'].apply(count)

In [11]:
#---------- Finding the number of sentences in the essay---------
df['sent_count'] = df['sent_tokens'].apply(count)

In [12]:
#----------- Finding spelling errors and replacing the errors with correct words----------
df['spell_err'],df['corrected_essay'] = zip(*df['word_tokens'].map(check_spell))

In [13]:
#----------- creating essay documents for Tfidf model------------
df['essay_documents'] = df['corrected_essay'].apply(create_documents)

In [14]:
# ------------ tokeninzing essay documents----------
df['corrected_tokens'] = df['essay_documents'].apply(word_tokenizer)

In [15]:
#----------POS tags-------------
df['noun_count'], df['verb_count'], df['adv_count'], df['adj_count'] = zip(*df['corrected_tokens'].map(pos_count))

In [16]:
#----------Readability score computation----------
df['readability_score'] = df['essay'].apply(compute_redability)

In [17]:
#-------------ratio of unique words to the total words---------
df['unique_word_ratio'] = df['corrected_tokens'].apply(unique_word_prop)

In [18]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_tokens,sent_tokens,word_count,sent_count,spell_err,corrected_essay,essay_documents,corrected_tokens,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio
0,1,1,"Dear local newspaper, I think effects computer...",8,Dear local newspaper I think effects computer...,"[Dear, local, newspaper, I, think, effects, co...","[Dear local newspaper, I think effects compute...",344,16,11,Dear local newspaper I think effects computers...,dear local newspaper think effect computer peo...,"[dear, local, newspaper, think, effect, comput...",78,37,15,28,74.02,0.618182
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,Dear I believe that using computers will b...,"[Dear, I, believe, that, using, computers, wil...","[Dear @CAPS1 @CAPS2, I believe that using comp...",413,20,16,Dear I believe that using computers will benef...,dear believe using computer benefit u many way...,"[dear, believe, using, computer, benefit, u, m...",103,55,10,24,67.08,0.569444
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,Dear More and more people use computers...,"[Dear, More, and, more, people, use, computers...","[Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...",276,14,2,Dear More and more people use computers but no...,dear people use computer not everyone agrees b...,"[dear, people, use, computer, not, everyone, a...",74,31,4,18,68.2,0.646154
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,Dear Local Newspaper I have found that many...,"[Dear, Local, Newspaper, I, have, found, that,...","[Dear Local Newspaper, @CAPS1 I have found tha...",488,27,24,Dear Local Newspaper I have found that many ex...,dear local newspaper found many expert say com...,"[dear, local, newspaper, found, many, expert, ...",134,53,24,45,53.34,0.584559
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,Dear I know having computers has a positive ...,"[Dear, I, know, having, computers, has, a, pos...","[Dear @LOCATION1, I know having computers has ...",469,30,13,Dear I know having computers has a positive ef...,dear know computer positive effect people comp...,"[dear, know, computer, positive, effect, peopl...",117,40,11,27,72.66,0.531818




### Feature extraction based on the Prompt and Source of the Essay

Feature extraction based on the prompt and source have been done by using different approches
for Persuasive and Source type of essay.

For the persuasive type of essay, synonyms of all words(except stopwords) in the prompt have been 
collected and then finds how many words in the essay are present in the synonyms of the prompt 
words and actual prompts and the ratio of the matched words to the total words.

For Source type of essays only Nouns and Verbs are used instead of all words in the souce because of 
the size of the source.


In [19]:
persuasive_essay = df[(df['essay_set']== 1) | (df['essay_set']== 2) | (df['essay_set']== 7) | (df['essay_set']== 8)]
persuasive_essay.reset_index(drop=True,inplace=True)
persuasive_essay.tail()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_tokens,sent_tokens,word_count,sent_count,spell_err,corrected_essay,essay_documents,corrected_tokens,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio
5870,21626,8,In most stories mothers and daughters are eit...,35,In most stories mothers and daughters are eit...,"[In, most, stories, mothers, and, daughters, a...",[ In most stories mothers and daughters are ei...,806,27,1,In most stories mothers and daughters are eith...,story mother daughter either enemy friend stor...,"[story, mother, daughter, either, enemy, frien...",132,73,52,60,56.05,0.605479
5871,21628,8,I never understood the meaning laughter is th...,32,I never understood the meaning laughter is th...,"[I, never, understood, the, meaning, laughter,...",[ I never understood the meaning laughter is t...,526,35,5,I never understood the meaning laughter is the...,never understood meaning laughter shortest dis...,"[never, understood, meaning, laughter, shortes...",93,53,40,45,50.0,0.51626
5872,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40,When you laugh is out of habit or is cau...,"[When, you, laugh, is, out, of, habit, or, is,...","[When you laugh, is @CAPS5 out of habit, or is...",777,41,7,When you laugh is out of habit or is cause Wha...,laugh habit cause cause laughing even thing ca...,"[laugh, habit, cause, cause, laughing, even, t...",137,88,31,60,60.79,0.723837
5873,21630,8,Trippin' on fen...,40,Trippin on fen...,"[Trippin, on, fences, I, am, years, young, and...",[ Trippin' on fe...,555,39,3,tripping on fences I am years young and in tho...,tripping fence year young short year ever reme...,"[tripping, fence, year, young, short, year, ev...",79,56,38,45,72.05,0.643777
5874,21633,8,Many people believe that laughter can improve...,40,Many people believe that laughter can improve...,"[Many, people, believe, that, laughter, can, i...",[ Many people believe that laughter can improv...,460,29,1,Many people believe that laughter can improve ...,many people believe laughter improve life laug...,"[many, people, believe, laughter, improve, lif...",80,52,29,46,72.05,0.665138


In [20]:
prompt1 = open("processed_data_files/prompt/Prompt1.txt",'r',encoding='utf-8')
prompt1 = prompt1.read()
prompt2 = open("processed_data_files/prompt/Prompt2.txt",'r',encoding='utf-8')
prompt2 = prompt2.read()
prompt7 = open("processed_data_files/prompt/Prompt7.txt",'r',encoding='utf-8')
prompt7 = prompt7.read()
prompt8 = open("processed_data_files/prompt/Prompt8.txt",'r',encoding='utf-8')
prompt8 = prompt8.read()

persuasive_set = [1,2,7,8]
prompts = [prompt1,prompt2,prompt7,prompt8]

essay_prompt_df = pd.DataFrame({'essay_set':persuasive_set,'prompt':prompts},index=persuasive_set)

In [21]:
essay_prompt_df['clean_prompt'] = essay_prompt_df['prompt'].apply(remove_special_char)
essay_prompt_df['documents'] = essay_prompt_df['clean_prompt'].apply(create_documents)
essay_prompt_df['tokens'] = essay_prompt_df['documents'].apply(word_tokenizer)
essay_prompt_df['synonyms'] = essay_prompt_df['tokens'].apply(get_synonyms)

In [22]:
essay_prompt_df

Unnamed: 0,essay_set,prompt,clean_prompt,documents,tokens,synonyms
1,1,"More and more people use computers, but not ev...",More and more people use computers but not ev...,people use computer not everyone agrees benefi...,"[people, use, computer, not, everyone, agrees,...","[overconfident, fellowship, place, clip, see, ..."
2,2,"Censorship in the Libraries\n""All of us can th...",Censorship in the Libraries All of us can thi...,censorship library u think book hope none chil...,"[censorship, library, u, think, book, hope, no...","[remove, depart, book, place, cerebrate, clip,..."
7,7,Write about patience. Being patient means that...,Write about patience Being patient means that...,write patience patient mean understanding tole...,"[write, patience, patient, mean, understanding...","[fib, patient, average, receive, unitary, see,..."
8,8,We all understand the benefits of laughter. Fo...,We all understand the benefits of laughter Fo...,understand benefit laughter example someone sa...,"[understand, benefit, laughter, example, someo...","[fib, set_forth, depart, unretentive, unitary,..."


In [23]:
synonyms_overlap = []
synonyms_overlap_prop = []
prompt_overlap = []
prompt_overlap_prop = []

for i in range(len(persuasive_essay)):    
    essay_set = persuasive_essay['essay_set'][i]
    essay_tokens = persuasive_essay['corrected_tokens'][i]
    prompt_tokens = essay_prompt_df.loc[essay_prompt_df['essay_set'] == essay_set,'tokens'][essay_set]
    synonyms = essay_prompt_df.loc[essay_prompt_df['essay_set'] == essay_set,'synonyms'][essay_set]
    synonyms_overlap_temp = [word for word in essay_tokens if word in synonyms]
    prompt_overlap_temp = [word for word in essay_tokens if word in prompt_tokens]
    
    synonyms_overlap.append(len(synonyms_overlap_temp))
    synonyms_overlap_prop.append(len(synonyms_overlap_temp)/(len(essay_tokens)+1))
    prompt_overlap.append(len(prompt_overlap_temp))
    prompt_overlap_prop.append(len(prompt_overlap_temp)/(len(essay_tokens)+1))

persuasive_essay['syn_overlap'] = synonyms_overlap
persuasive_essay['syn_overlap_prop'] = synonyms_overlap_prop
persuasive_essay['prompt_overlap'] = prompt_overlap
persuasive_essay['prompt_overlap_prop'] = prompt_overlap_prop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
persuasive_essay.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_tokens,sent_tokens,word_count,sent_count,spell_err,...,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio,syn_overlap,syn_overlap_prop,prompt_overlap,prompt_overlap_prop
0,1,1,"Dear local newspaper, I think effects computer...",8,Dear local newspaper I think effects computer...,"[Dear, local, newspaper, I, think, effects, co...","[Dear local newspaper, I think effects compute...",344,16,11,...,78,37,15,28,74.02,0.618182,59,0.355422,46,0.277108
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,Dear I believe that using computers will b...,"[Dear, I, believe, that, using, computers, wil...","[Dear @CAPS1 @CAPS2, I believe that using comp...",413,20,16,...,103,55,10,24,67.08,0.569444,56,0.258065,39,0.179724
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,Dear More and more people use computers...,"[Dear, More, and, more, people, use, computers...","[Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...",276,14,2,...,74,31,4,18,68.2,0.646154,70,0.534351,62,0.473282
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,Dear Local Newspaper I have found that many...,"[Dear, Local, Newspaper, I, have, found, that,...","[Dear Local Newspaper, @CAPS1 I have found tha...",488,27,24,...,134,53,24,45,53.34,0.584559,85,0.311355,55,0.201465
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,Dear I know having computers has a positive ...,"[Dear, I, know, having, computers, has, a, pos...","[Dear @LOCATION1, I know having computers has ...",469,30,13,...,117,40,11,27,72.66,0.531818,62,0.280543,51,0.230769


In [25]:
source_essay = df[(df['essay_set']== 3) | (df['essay_set']== 4) | (df['essay_set']== 5) | (df['essay_set']== 6)]
source_essay.reset_index(drop=True,inplace=True)
source_essay.tail()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_tokens,sent_tokens,word_count,sent_count,spell_err,corrected_essay,essay_documents,corrected_tokens,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio
7096,16629,6,The one obstacle the builders had when trying ...,0,The one obstacle the builders had when trying ...,"[The, one, obstacle, the, builders, had, when,...",[The one obstacle the builders had when trying...,152,8,10,The one obstacle the builders had when trying ...,one obstacle builder trying build building not...,"[one, obstacle, builder, trying, build, buildi...",36,13,10,9,77.27,0.706667
7097,16630,6,Some of the problems with the constructing of ...,2,Some of the problems with the constructing of ...,"[Some, of, the, problems, with, the, construct...",[Some of the problems with the constructing of...,66,3,0,Some of the problems with the constructing of ...,problem constructing docking dirigible natural...,"[problem, constructing, docking, dirigible, na...",13,8,4,11,57.3,0.925
7098,16631,6,The builders of the Empire State building face...,3,The builders of the Empire State building face...,"[The, builders, of, the, Empire, State, buildi...",[The builders of the Empire State building fac...,105,5,1,The builders of the Empire State building face...,builder empire state building faced obstacle a...,"[builder, empire, state, building, faced, obst...",30,8,5,9,58.62,0.814815
7099,16632,6,The obstacles the builders of the Empire State...,2,The obstacles the builders of the Empire State...,"[The, obstacles, the, builders, of, the, Empir...",[The obstacles the builders of the Empire Stat...,68,2,1,The obstacles the builders of the Empire State...,obstacle builder empire state building could n...,"[obstacle, builder, empire, state, building, c...",18,7,3,5,53.89,0.810811
7100,16633,6,You want me to tell you what they had to go th...,2,You want me to tell you what they had to go th...,"[You, want, me, to, tell, you, what, they, had...",[You want me to tell you what they had to go t...,157,9,2,You want me to tell you what they had to go th...,want tell go attempt allow dirigible dock well...,"[want, tell, go, attempt, allow, dirigible, do...",33,19,4,14,64.78,0.632911


In [26]:
source3 = open("processed_data_files/source/Source3.txt",'r',encoding='utf-8')
source3 = source3.read()
source4 = open("processed_data_files/source/Source4.txt",'r',encoding='utf-8')
source4 = source4.read()
source5 = open("processed_data_files/source/Source5.txt",'r',encoding='utf-8')
source5 = source5.read()
source6 = open("processed_data_files/source/Source6.txt",'r',encoding='utf-8')
source6 = source6.read()

source_set = [3,4,5,6]
source = [source3,source4,source5,source6]

essay_source_df = pd.DataFrame({'essay_set':source_set,'source':source},index=source_set)

In [27]:
essay_source_df['clean_source'] = essay_source_df['source'].apply(remove_special_char)
essay_source_df['documents'] = essay_source_df['clean_source'].apply(create_documents)
essay_source_df['tokens'] = essay_source_df['documents'].apply(word_tokenizer)
essay_source_df['pos(nouns & verbs)'] = essay_source_df['tokens'].apply(nouns_and_verbs_pos)
essay_source_df['synonyms'] = essay_source_df['pos(nouns & verbs)'].apply(get_synonyms)

In [28]:
essay_source_df

Unnamed: 0,essay_set,source,clean_source,documents,tokens,pos(nouns & verbs),synonyms
3,3,ROUGH ROAD AHEAD: Do Not Exceed Posted Speed L...,ROUGH ROAD AHEAD Do Not Exceed Posted Speed L...,rough road ahead not exceed posted speed limit...,"[rough, road, ahead, not, exceed, posted, spee...","[joe, face, hope, jaunt, place, energy, bait, ...","[face, twenty_dollar_bill, energy, mental_capa..."
4,4,"Winter Hibiscus by Minfong Ho\nSaeng, a teenag...",Winter Hibiscus by Minfong Ho Saeng a teenage...,winter hibiscus minfong ho saeng teenage girl ...,"[winter, hibiscus, minfong, ho, saeng, teenage...","[delicate, strand, lanna, moment, yes, afterno...","[mutter, shard, face, bemire, lap, equilibrise..."
5,5,Narciso Rodriguez\nfrom Home: The Blueprints o...,Narciso Rodriguez from Home The Blueprints of...,narciso rodriguez home blueprint life parent o...,"[narciso, rodriguez, home, blueprint, life, pa...","[extended, arnold, blond, building, solidarity...","[face, twenty_dollar_bill, go_past, clip, warm..."
6,6,The Mooring Mast\nby Marcia Amidon Lüsted\nWhe...,The Mooring Mast by Marcia Amidon L sted When ...,mooring mast marcia amidon l sted empire state...,"[mooring, mast, marcia, amidon, l, sted, empir...","[weighted, designing, chrysler, learning, conc...","[desexualize, encyclopedism, ship's_company, f..."


In [29]:
synonyms_overlap = []
synonyms_overlap_prop = []

source_overlap = []
source_overlap_prop = []

for i in range(len(source_essay)):
    
    essay_set = source_essay['essay_set'][i]
    essay_tokens = source_essay['corrected_tokens'][i]
    
    source_tokens = essay_source_df.loc[essay_source_df['essay_set'] == essay_set,'pos(nouns & verbs)'][essay_set]
    source_overlap_temp = [word for word in essay_tokens if word in source_tokens]
    source_overlap.append(len(source_overlap_temp))
    source_overlap_prop.append(len(source_overlap_temp)/(len(essay_tokens)+1))

    synonyms = essay_source_df.loc[essay_source_df['essay_set'] == essay_set,'synonyms'][essay_set]
    synonyms_overlap_temp = [word for word in essay_tokens if word in synonyms]    
    synonyms_overlap.append(len(synonyms_overlap_temp))
    synonyms_overlap_prop.append(len(synonyms_overlap_temp)/(len(essay_tokens)+1))

source_essay['source_overlap'] = source_overlap
source_essay['source_overlap_prop'] = source_overlap_prop
source_essay['synonyms_overlap'] = synonyms_overlap
source_essay['synonyms_overlap_prop'] = synonyms_overlap_prop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
source_essay.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,clean_essay,word_tokens,sent_tokens,word_count,sent_count,spell_err,...,noun_count,verb_count,adv_count,adj_count,readability_score,unique_word_ratio,source_overlap,source_overlap_prop,synonyms_overlap,synonyms_overlap_prop
0,5978,3,The features of the setting affect the cyclist...,1,The features of the setting affect the cyclist...,"[The, features, of, the, setting, affect, the,...",[The features of the setting affect the cyclis...,51,3,1,...,11,5,0,5,71.14,0.714286,8,0.363636,14,0.636364
1,5979,3,The features of the setting affected the cycli...,2,The features of the setting affected the cycli...,"[The, features, of, the, setting, affected, th...",[The features of the setting affected the cycl...,174,12,11,...,39,15,6,17,62.92,0.8,41,0.476744,40,0.465116
2,5980,3,Everyone travels to unfamiliar places. Sometim...,1,Everyone travels to unfamiliar places Sometim...,"[Everyone, travels, to, unfamiliar, places, So...","[Everyone travels to unfamiliar places., Somet...",96,8,0,...,19,12,7,9,72.36,0.816327,19,0.38,24,0.48
3,5981,3,I believe the features of the cyclist affected...,1,I believe the features of the cyclist affected...,"[I, believe, the, features, of, the, cyclist, ...",[I believe the features of the cyclist affecte...,87,3,10,...,15,9,3,12,53.21,0.95122,10,0.238095,15,0.357143
4,5982,3,The setting effects the cyclist because of the...,2,The setting effects the cyclist because of the...,"[The, setting, effects, the, cyclist, because,...",[The setting effects the cyclist because of th...,133,3,8,...,27,16,3,14,51.89,0.661538,19,0.287879,28,0.424242


### Saving all processed data into pickle files

In [31]:
df.to_pickle('processed_data_files/clean_df')

essay_prompt_df.to_pickle('processed_data_files/essay_prompt_df')

persuasive_essay.to_pickle('processed_data_files/persuasive_essay')

for i in [1,2,7,8]:
    essay_set = persuasive_essay[persuasive_essay['essay_set']==i]
    essay_set.to_pickle('processed_data_files/essay set/essay_set' + str(i))

essay_source_df.to_pickle('processed_data_files/essay_source_df')
source_essay.to_pickle('processed_data_files/source_essay')


for i in [3,4,5,6]:
    essay_set = source_essay[source_essay['essay_set']==i]
    essay_set.to_pickle('processed_data_files/essay set/' +'essay_set' + str(i))