In [108]:
%matplotlib inline
from __future__ import unicode_literals 
from spacy.en import English
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn

import MySQLdb as mdb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches

import sys  
#reload(sys)  
#sys.setdefaultencoding('utf8')

# Helper Functions 

### Fetch Data From MySQL 

In [15]:
def fetch(cols,table,where=None,group_by=None,order_by=None,Desc=False,limit=None):
    """
    Purpose: Makes the MySQL quesries more efficitent and faster.
    
    Note: All inputs have to be in STRING format except the 'limit' which is an int
    """
    
    # Opening a Connection to donorsChoose Database
    con = mdb.connect('localhost', 'idx', 'donorsChoose', 'donors');
    cur = con.cursor()
    
    # Creating a query
    query = 'SELECT ' + cols + ' FROM ' + table + ' '
    if where: query += ' WHERE ' + where + ' '
    if group_by: query += 'GROUP BY ' + group_by + ' '
    if order_by: query += 'ORDER BY ' + order_by + ' '
    if Desc: query += 'DESC '
    if limit: query += 'LIMIT ' + str(limit)
    
    # fetching the data and turn it into a pandas dataframe
    cur.execute(query)
    output = np.array(cur.fetchall())
    colm = [f[0] for f in cur.description]
    output = pd.DataFrame(output)
    output.columns = colm

    # Closing the connection to the database
    con.close()
    
    return output

## Finding The Essays for LA Projects 

It turned out that the essay data is super messy and not clean. It's not possible to create a MySQL database from the CSV file as there are commas within an essay and split up a paragraph into several columns. Not fun!

So, let's start simple. Let's find all the LA projects essays using the projects_ID. Next, we will delve into cleaning the dataset.

In [45]:
la_projectID = fetch('_projectid','projects',where="school_county='Los Angeles'")

Instead of grabbing the LA projects essay from MySQL, I open a connection to the CSV file. The CSV file is several GB so I cannot just read the whole file.

In [63]:
con = open('opendata_essays.csv','r')

Need to know how many essays are avaliable. That will be the number of iterations. The number of essays is equal to the number of projects.

In [57]:
numberofEssays = fetch('count(_projectid)','projects')

Look up for sets are much faster so let's convert the projects ID list to a set. 

In [58]:
la_projectID = set(la_projectID.iloc[:,0]) ## Look up in set is much faster I believe

In the essay table, projects ID are written as """porjectID""". Therefore, I use regular expression for extracting the projectID in each line. 

In [64]:
la_essays = []

header = con.readline() # First line is the header

for i in xrange(numberofEssays.iloc[0,0]):  # numberofEssays is 1x1 Data Frame!
    aLine = con.readline()
    try:
        # Finding each project done in LA
        if re.search('("""[^"]*""")',aLine).group(1) in la_projectID:
            la_essays.append(aLine)
    except:
        pass
con.close()

In [67]:
la_essays[0]

'"""13c49e14e9f1fab543d57df3cb462417""","""da5e9d2e9ccb8192a62309841135d10d""","""Kinesthetic Approach To Math Illiteracy""","""Many children are frustrated by mathematics  because they are unable to grasp the connections between various mathematical concepts and the real world.   During the development of 6th grade number sense, students must comprehend the relationship between fractions,...""","""The cost of Fraction, Percent, and Decimal Tower Activity Sets from Delta Education is $576, including shipping and <a target=""""new"""" href=""""http://www.donorschoose.org/html/fulfillment.htm"""" onclick=""""g_openWindow(\'http://www.donorschoose.org/html/fulfillment.htm\', 300, 800, \'fulfillwindow\');return false;"""">fulfillment</a>.""","""Many children are frustrated by mathematics  because they are unable to grasp the connections between various mathematical concepts and the real world.   During the development of 6th grade number sense, students must comprehend the relationship bet

Time to cleanup this mess!

# Cleaning Data 

Keeping a copy of not cleaned data around just in case.

In [69]:
la_essays_dirty = la_essays[:]

Most of the trial and errors for parsing are not included in this notebook to keep it easier to follow the flow. Below are the necessary steps.

### Removing HTML tags

In [70]:
for i in xrange(len(la_essays_dirty)):
    proposal = la_essays_dirty[i]
    try:
        la_essays_dirty[i] = re.sub("<[^>]*>", "",proposal)
    except:
        pass

### Removing Extra Quotation Marks

Removing 3 to 7 quotation marks in the text. Different number of Quotation marks are used for quoting in a proposal. Not consistent!

In [71]:
for i in xrange(len(la_essays_dirty)):
    proposal = la_essays_dirty[i]
    try:
        la_essays_dirty[i] = re.sub('[^,]"{3,7}[^,]', "",proposal)
    except:
        pass

### Parsing 

The key is undertanding the pattern. It is """TEXT""","""TEXT""" to the end. But the problem is that the TEXT may contain """, in it. That's why I removed HTML and [^,]"{3,7}[^,]. 

In [73]:
pattern = re.compile(r'("""[^"]*"""),("""[^"]*"""),("""[^"]*""")?,("""[^"]*""")?,\
("""[^"]*""")?,("""[^"]*""")?,("""[^"]*""")?,("""[^"]*""")?,("""[^"]*""")?,\
("""[^"]*""")?,("""[^"]*""")?,("""[^"]*""")?')

In [74]:
not_follow_the_pattern = []
parsed = []

# Let's find all that match the pattern
for i in xrange(len(la_essays_dirty)):
    proposal = la_essays_dirty[i]
    try:
        results = pattern.match(proposal)
        parsed.append(results.groups())
    except:
        not_follow_the_pattern.append(i)

In [78]:
len(not_follow_the_pattern)/float(len(la_essays))

0.0872241689553935

Still there are 10% won't be recovered but that's okay. I will look into one I have more time!

### Parsed Data to DataFrame 

In [80]:
essays_la_clean = pd.DataFrame(parsed)

# Correcting the Columns Names
essays_la_clean.columns =['_projectid','_teacher_acctid','title','short_description','need_statement','essay'
                          ,'paragraph1','paragraph2','paragraph3','paragraph4','thankyou_note','impact_letter']

Impact letter and essay were used only for few years and are outdates. Also, thank you note will not be part of the analysis.

In [84]:
essays_la_clean = essays_la_clean.drop(['thankyou_note','impact_letter','essay'],axis=1)
# To make life easier
essays_la_clean = essays_la_clean.dropna()
essays_la_clean.head()

Unnamed: 0,_projectid,_teacher_acctid,title,short_description,need_statement,paragraph1,paragraph2,paragraph3,paragraph4
1096,"""""""2128b81b571d16dcba07bcd12b84fbe4""""""","""""""a44ab9dfc3abd165344b69d20cfa6fc2""""""","""""""Math Literacy in the Classroom""""""","""""""I have requested two books titledath Doesn'...","""""""My 50 students need opportunities to read ...","""""""Math Literacy Counts.\r\nI am a 6th grade t...","""""""Integrating literacy in Math and Science ar...","""""""I have requested two books titledath Doesn'...","""""""Donating to this project will make a big di..."
1307,"""""""49757e8d0a75080330f66399d8a7dbad""""""","""""""653b846827143f24d85c5b8411fcb0f4""""""","""""""Loose DVD Round-Up: File Cabinet Needed!""""""","""""""We don't have room to organize and store ou...","""""""My students need a metal file cabinet.""""""","""""""At-risk teens are turning their lives aroun...","""""""The good news? Recently, DonorsChoose made ...","""""""The solution to our problem is easy for a g...","""""""Your donation of a locking file cabinet wil..."
1522,"""""""110b2c2e211ed90d0dcf7977a9e8b7b4""""""","""""""0e874ded725f79084fcee5038d08b83c""""""","""""""If You Give A Child A Book...Great Authors ...","""""""If you give a child a book, he or she will ...","""""""My project needs ten books by the author La...","""""""If you give a child a book, he or she will ...","""""""Last year our school purchased the Accelera...","""""""The kinds of books that inspire my students...","""""""Your generous book donation will make it po..."
2085,"""""""22a6e1647a14e4ea45aa34d9a762fcb9""""""","""""""2d48efb095521aa0374ee34f680fea70""""""","""""""Racquet Sports for Seventh Graders""""""","""""""We have the racquets and a teacher with the...","""""""My project needs practice tennis balls.""""""","""""""I am a seventh grade Physical Education tea...","""""""In my class students are taught to face the...","""""""My students would like to learn how to play...","""""""Your donation would help introduce this exc..."
2135,"""""""5d0b532db5283bb6dffff3d7cfe48dd3""""""","""""""6286d37992e286cabaad02ae33c444ab""""""","""""""Stop Iliteracy """"""","""""""I would like to provide my students with a ...","""""""My students need a classroom library set of...","""""""I am a second grade teacher at an inner cit...","""""""My students face many obstacles in their ho...","""""""I believe these materials will be an excell...","""""""Your help will ensure that my students get ..."


In [85]:
essays_la_clean.shape[0]/float(len(la_essays))

0.8450989677052752

I have lost 15% of the essays. There are about 45K essays cleaned and ready to be used. If there are some amazing features, I will spend more time on retrieving that 15%.

# Feature Engineering

Need to add the funding status to the essay dataset to examine the predictive value of a feature.

In [86]:
funding_status = fetch('_projectid,funding_status','projects',where="school_county='Los Angeles'")

In [89]:
essays_la_clean = pd.merge(essays_la_clean, funding_status, how='inner', on='_projectid')

In [91]:
essays_la_clean.head(2)

Unnamed: 0,_projectid,_teacher_acctid,title,short_description,need_statement,paragraph1,paragraph2,paragraph3,paragraph4,funding_status
0,"""""""2128b81b571d16dcba07bcd12b84fbe4""""""","""""""a44ab9dfc3abd165344b69d20cfa6fc2""""""","""""""Math Literacy in the Classroom""""""","""""""I have requested two books titledath Doesn'...","""""""My 50 students need opportunities to read ...","""""""Math Literacy Counts.\r\nI am a 6th grade t...","""""""Integrating literacy in Math and Science ar...","""""""I have requested two books titledath Doesn'...","""""""Donating to this project will make a big di...",completed
1,"""""""49757e8d0a75080330f66399d8a7dbad""""""","""""""653b846827143f24d85c5b8411fcb0f4""""""","""""""Loose DVD Round-Up: File Cabinet Needed!""""""","""""""We don't have room to organize and store ou...","""""""My students need a metal file cabinet.""""""","""""""At-risk teens are turning their lives aroun...","""""""The good news? Recently, DonorsChoose made ...","""""""The solution to our problem is easy for a g...","""""""Your donation of a locking file cabinet wil...",completed


All right, it's been added. Good!

## Comparing the length of different parts of Essays 

In [94]:
nlp = English()
def lengthS(aString): 
    aString = unicode(str(aString), "utf-8")  # nlp works only on unicodes
    aString = re.sub('["]','',aString)          # Removing the "
    aString = re.sub('','',aString)          # Removing the "
    return len(nlp(aString))

In [95]:
essays_la_clean['lengthTitle'] = map(lengthS,essays_la_clean.title)
essays_la_clean['lengthNeed'] = map(lengthS,essays_la_clean.need_statement)
essays_la_clean['lengthPar1'] = map(lengthS,essays_la_clean.paragraph1)
essays_la_clean['lengthPar2'] = map(lengthS,essays_la_clean.paragraph2)
essays_la_clean['lengthPar3'] = map(lengthS,essays_la_clean.paragraph3)
essays_la_clean['lengthPar4'] = map(lengthS,essays_la_clean.paragraph4)

In [111]:
print essays_la_clean.groupby('funding_status').lengthTitle.mean()
print essays_la_clean.groupby('funding_status').lengthNeed.mean()
print essays_la_clean.groupby('funding_status').lengthPar1.mean()
print essays_la_clean.groupby('funding_status').lengthPar2.mean()
print essays_la_clean.groupby('funding_status').lengthPar3.mean()
print essays_la_clean.groupby('funding_status').lengthPar4.mean()

funding_status 
completed    5.881984 
expired      5.793045 
Name: lengthTitle, dtype: float64 
funding_status
completed    21.631494
expired      21.483938
Name: lengthNeed, dtype: float64
funding_status
completed    45.319851
expired      45.438403
Name: lengthPar1, dtype: float64
funding_status
completed    92.315343
expired      91.761163
Name: lengthPar2, dtype: float64
funding_status
completed    111.097808
expired      110.138130
Name: lengthPar3, dtype: float64
funding_status
completed    62.027017
expired      62.303405
Name: lengthPar4, dtype: float64

basically, noooooo difference!

## Sentiment Analysis 

In [12]:
def sentiment_scores(text):
    """
    Purpose: estimating how positive an essay is by measureing 
    the ratio of positive score to the negative score
    """
    
    # Spiliting the essay into sentences
    sentences = re.findall(r"[\w']+|[.,!?;]", text)
    
    pos_total, neg_total, obj_total, word_count = 0, 0, 0, 0
    for sentence in sentences:
        positive_score, negative_score, objective_score = 0, 0, 0 
        wordSent = swn.senti_synsets(sentence)
        if wordSent:
            word_count += 1
            for eachVersion in wordSent:
                positive_score += eachVersion.pos_score()
                negative_score += eachVersion.neg_score()
                objective_score += eachVersion.obj_score()
            pos_total += positive_score/len(wordSent)
            neg_total += negative_score/len(wordSent)
            obj_total += objective_score/len(wordSent)

    if neg_total:
        return pos_total/neg_total
    elif pos_total:
        return 100
    else:
        return -1

In [13]:
essays_la_clean['senti_Title'] = map(sentiment_scores,essays_la_clean.title)
essays_la_clean['senti_Need'] = map(sentiment_scores,essays_la_clean.need_statement)
essays_la_clean['senti_Par1'] = map(sentiment_scores,essays_la_clean.paragraph1)
essays_la_clean['senti_Par2'] = map(sentiment_scores,essays_la_clean.paragraph2)
essays_la_clean['senti_Par3'] = map(sentiment_scores,essays_la_clean.paragraph3)
essays_la_clean['senti_Par4'] = map(sentiment_scores,essays_la_clean.paragraph4)

In [19]:
print essays_la_clean.groupby('funding_status').senti_Title.mean()
print essays_la_clean.groupby('funding_status').senti_Need.mean()
print essays_la_clean.groupby('funding_status').senti_Par1.mean()
print essays_la_clean.groupby('funding_status').senti_Par2.mean()
print essays_la_clean.groupby('funding_status').senti_Par3.mean()
print essays_la_clean.groupby('funding_status').senti_Par4.mean()

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 27.2 µs
funding_status
completed    16.388922
expired      18.027007
Name: senti_Title, dtype: float64
funding_status
completed    1.376904
expired      1.401063
Name: senti_Need, dtype: float64
funding_status
completed    1.982552
expired      1.904928
Name: senti_Par1, dtype: float64
funding_status
completed    1.699763
expired      1.661424
Name: senti_Par2, dtype: float64
funding_status
completed    1.984542
expired      2.013779
Name: senti_Par3, dtype: float64
funding_status
completed    2.391677
expired      2.483846
Name: senti_Par4, dtype: float64


## Flesch Reading Ease

In [47]:
from textstat.textstat import textstat

In [48]:
def Flesch_Reading_Ease(text):
    try:
        return textstat.flesch_reading_ease(text)
    except:
        return -1

In [49]:
%%time

essays_la_clean['flesch_ease_Title'] = map(Flesch_Reading_Ease,essays_la_clean.title)
essays_la_clean['flesch_ease_Need'] = map(Flesch_Reading_Ease,essays_la_clean.need_statement)
essays_la_clean['flesch_ease_Par1'] = map(Flesch_Reading_Ease,essays_la_clean.paragraph1)
essays_la_clean['flesch_ease_Par2'] = map(Flesch_Reading_Ease,essays_la_clean.paragraph2)
essays_la_clean['flesch_ease_Par3'] = map(Flesch_Reading_Ease,essays_la_clean.paragraph3)
essays_la_clean['flesch_ease_Par4'] = map(Flesch_Reading_Ease,essays_la_clean.paragraph4)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


In [50]:
print essays_la_clean.groupby('funding_status').flesch_ease_Title.mean()
print essays_la_clean.groupby('funding_status').flesch_ease_Need.mean()
print essays_la_clean.groupby('funding_status').flesch_ease_Par1.mean()
print essays_la_clean.groupby('funding_status').flesch_ease_Par2.mean()
print essays_la_clean.groupby('funding_status').flesch_ease_Par3.mean()
print essays_la_clean.groupby('funding_status').flesch_ease_Par4.mean()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.11 µs
funding_status
completed    50.605404
expired      46.783612
Name: flesch_ease_Title, dtype: float64
funding_status
completed    64.017153
expired      63.466531
Name: flesch_ease_Need, dtype: float64
funding_status
completed    65.439093
expired      64.972581
Name: flesch_ease_Par1, dtype: float64
funding_status
completed    61.001218
expired      61.101375
Name: flesch_ease_Par2, dtype: float64
funding_status
completed    60.260241
expired      59.849520
Name: flesch_ease_Par3, dtype: float64
funding_status
completed    61.688667
expired      60.782079
Name: flesch_ease_Par4, dtype: float64


## Flesch-Kincaid Grade Level

In [52]:
def Flesch_Kincaid_grade(text):
    try:
        return textstat.flesch_kincaid_grade(text)
    except:
        return -1

In [53]:
%%time

essays_la_clean['flesch_grade_Title'] = map(Flesch_Kincaid_grade,essays_la_clean.title)
essays_la_clean['flesch_grade_Need'] = map(Flesch_Kincaid_grade,essays_la_clean.need_statement)
essays_la_clean['flesch_grade_Par1'] = map(Flesch_Kincaid_grade,essays_la_clean.paragraph1)
essays_la_clean['flesch_grade_Par2'] = map(Flesch_Kincaid_grade,essays_la_clean.paragraph2)
essays_la_clean['flesch_grade_Par3'] = map(Flesch_Kincaid_grade,essays_la_clean.paragraph3)
essays_la_clean['flesch_grade_Par4'] = map(Flesch_Kincaid_grade,essays_la_clean.paragraph4)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


In [56]:
print essays_la_clean.groupby('funding_status').flesch_grade_Title.mean()
print essays_la_clean.groupby('funding_status').flesch_grade_Need.mean()
print essays_la_clean.groupby('funding_status').flesch_grade_Par1.mean()
print essays_la_clean.groupby('funding_status').flesch_grade_Par2.mean()
print essays_la_clean.groupby('funding_status').flesch_grade_Par3.mean()
print essays_la_clean.groupby('funding_status').flesch_grade_Par4.mean()

funding_status
completed    3.571826
expired      3.809441
Name: flesch_grade_Title, dtype: float64
funding_status
completed    8.692135
expired      8.779853
Name: flesch_grade_Need, dtype: float64
funding_status
completed    8.161552
expired      8.214502
Name: flesch_grade_Par1, dtype: float64
funding_status
completed    8.890653
expired      8.797715
Name: flesch_grade_Par2, dtype: float64
funding_status
completed    9.77302
expired      9.81520
Name: flesch_grade_Par3, dtype: float64
funding_status
completed    9.273483
expired      9.371073
Name: flesch_grade_Par4, dtype: float64
CPU times: user 19.5 ms, sys: 502 µs, total: 20 ms
Wall time: 23.8 ms


## FOG index

In [58]:
def fog_index(text):
    try:
        return textstat.gunning_fog(text)
    except:
        return -1

In [59]:
%%time

essays_la_clean['fog_index_Title'] = map(fog_index,essays_la_clean.title)
essays_la_clean['fog_index_Need'] = map(fog_index,essays_la_clean.need_statement)
essays_la_clean['fog_index_Par1'] = map(fog_index,essays_la_clean.paragraph1)
essays_la_clean['fog_index_Par2'] = map(fog_index,essays_la_clean.paragraph2)
essays_la_clean['fog_index_Par3'] = map(fog_index,essays_la_clean.paragraph3)
essays_la_clean['fog_index_Par4'] = map(fog_index,essays_la_clean.paragraph4)

CPU times: user 7min 18s, sys: 3.07 s, total: 7min 21s
Wall time: 8min 27s


In [60]:
print essays_la_clean.groupby('funding_status').fog_index_Title.mean()
print essays_la_clean.groupby('funding_status').fog_index_Need.mean()
print essays_la_clean.groupby('funding_status').fog_index_Par1.mean()
print essays_la_clean.groupby('funding_status').fog_index_Par2.mean()
print essays_la_clean.groupby('funding_status').fog_index_Par3.mean()
print essays_la_clean.groupby('funding_status').fog_index_Par4.mean()

funding_status
completed    3.435590
expired      3.350491
Name: fog_index_Title, dtype: float64
funding_status
completed    9.013146
expired      9.034215
Name: fog_index_Need, dtype: float64
funding_status
completed    8.453722
expired      8.433761
Name: fog_index_Par1, dtype: float64
funding_status
completed    8.617303
expired      8.503852
Name: fog_index_Par2, dtype: float64
funding_status
completed    9.865758
expired      9.825000
Name: fog_index_Par3, dtype: float64
funding_status
completed    9.386503
expired      9.346261
Name: fog_index_Par4, dtype: float64


## SMOG Index

In [69]:
def smog_index(text):
    try:
        return textstat.smog_index(text)
    except:
        return -1

In [70]:
%%time

essays_la_clean['smog_index_Title'] = map(smog_index,essays_la_clean.title)
essays_la_clean['smog_index_Need'] = map(smog_index,essays_la_clean.need_statement)
essays_la_clean['smog_index_Par1'] = map(smog_index,essays_la_clean.paragraph1)
essays_la_clean['smog_index_Par2'] = map(smog_index,essays_la_clean.paragraph2)
essays_la_clean['smog_index_Par3'] = map(smog_index,essays_la_clean.paragraph3)
essays_la_clean['smog_index_Par4'] = map(smog_index,essays_la_clean.paragraph4)

CPU times: user 1min 45s, sys: 767 ms, total: 1min 46s
Wall time: 2min 5s


In [74]:
#print essays_la_clean.groupby('funding_status').smog_index_Title.mean()
print essays_la_clean.groupby('funding_status').smog_index_Need.mean()
print essays_la_clean.groupby('funding_status').smog_index_Par1.mean()
print essays_la_clean.groupby('funding_status').smog_index_Par2.mean()
print essays_la_clean.groupby('funding_status').smog_index_Par3.mean()
print essays_la_clean.groupby('funding_status').smog_index_Par4.mean()

funding_status
completed    7.510286
expired      7.241304
Name: smog_index_Need, dtype: float64
funding_status
completed    9.272929
expired      9.357267
Name: smog_index_Par1, dtype: float64
funding_status
completed    10.683811
expired      10.583881
Name: smog_index_Par2, dtype: float64
funding_status
completed    11.297554
expired      11.247198
Name: smog_index_Par3, dtype: float64
funding_status
completed    10.412469
expired      10.442699
Name: smog_index_Par4, dtype: float64


## Difficult Words 

In [75]:
def Difficult_word(text):
    try:
        return textstat.difficult_words(text)
    except:
        return -1

In [76]:
%%time

essays_la_clean['difficult_words_Title'] = map(Difficult_word,essays_la_clean.title)
essays_la_clean['difficult_words_Need'] = map(Difficult_word,essays_la_clean.need_statement)
essays_la_clean['difficult_words_Par1'] = map(Difficult_word,essays_la_clean.paragraph1)
essays_la_clean['difficult_words_Par2'] = map(Difficult_word,essays_la_clean.paragraph2)
essays_la_clean['difficult_words_Par3'] = map(Difficult_word,essays_la_clean.paragraph3)
essays_la_clean['difficult_words_Par4'] = map(Difficult_word,essays_la_clean.paragraph4)

CPU times: user 7min 53s, sys: 2.71 s, total: 7min 55s
Wall time: 8min 34s


In [77]:
print essays_la_clean.groupby('funding_status').difficult_words_Title.mean()
print essays_la_clean.groupby('funding_status').difficult_words_Need.mean()
print essays_la_clean.groupby('funding_status').difficult_words_Par1.mean()
print essays_la_clean.groupby('funding_status').difficult_words_Par2.mean()
print essays_la_clean.groupby('funding_status').difficult_words_Par3.mean()
print essays_la_clean.groupby('funding_status').difficult_words_Par4.mean()

funding_status
completed    2.248104
expired      2.305325
Name: difficult_words_Title, dtype: float64
funding_status
completed    6.415801
expired      6.506609
Name: difficult_words_Need, dtype: float64
funding_status
completed    9.881601
expired      9.928248
Name: difficult_words_Par1, dtype: float64
funding_status
completed    20.929284
expired      20.420128
Name: difficult_words_Par2, dtype: float64
funding_status
completed    24.738413
expired      24.553814
Name: difficult_words_Par3, dtype: float64
funding_status
completed    13.810744
expired      14.018127
Name: difficult_words_Par4, dtype: float64


## sentence count

In [79]:
def sent_count(text):
    try:
        return textstat.sentence_count(text)
    except:
        return -1

In [81]:
%%time

#essays_la_clean['sent_count_Title'] = map(sentence_count,essays_la_clean.title)
essays_la_clean['sent_count_Need'] = map(sent_count,essays_la_clean.need_statement)
essays_la_clean['sent_count_Par1'] = map(sent_count,essays_la_clean.paragraph1)
essays_la_clean['sent_count_Par2'] = map(sent_count,essays_la_clean.paragraph2)
essays_la_clean['sent_count_Par3'] = map(sent_count,essays_la_clean.paragraph3)
essays_la_clean['sent_count_Par4'] = map(sent_count,essays_la_clean.paragraph4)

CPU times: user 38.5 s, sys: 157 ms, total: 38.7 s
Wall time: 39 s


In [82]:
#print essays_la_clean.groupby('funding_status').sent_count_Title.mean()
print essays_la_clean.groupby('funding_status').sent_count_Need.mean()
print essays_la_clean.groupby('funding_status').sent_count_Par1.mean()
print essays_la_clean.groupby('funding_status').sent_count_Par2.mean()
print essays_la_clean.groupby('funding_status').sent_count_Par3.mean()
print essays_la_clean.groupby('funding_status').sent_count_Par4.mean()

funding_status
completed    1.129073
expired      1.120468
Name: sent_count_Need, dtype: float64
funding_status
completed    2.630829
expired      2.628776
Name: sent_count_Par1, dtype: float64
funding_status
completed    5.063343
expired      5.057024
Name: sent_count_Par2, dtype: float64
funding_status
completed    5.235674
expired      5.215634
Name: sent_count_Par3, dtype: float64
funding_status
completed    3.189466
expired      3.214879
Name: sent_count_Par4, dtype: float64


## Syllable Count

In [84]:
def syllab_count(text):
    try:
        return textstat.syllable_count(text)
    except:
        return -1

In [85]:
%%time

essays_la_clean['syllab_count_Title'] = map(syllab_count,essays_la_clean.title)
essays_la_clean['syllab_count_Need'] = map(syllab_count,essays_la_clean.need_statement)
essays_la_clean['syllab_count_Par1'] = map(syllab_count,essays_la_clean.paragraph1)
essays_la_clean['syllab_count_Par2'] = map(syllab_count,essays_la_clean.paragraph2)
essays_la_clean['syllab_count_Par3'] = map(syllab_count,essays_la_clean.paragraph3)
essays_la_clean['syllab_count_Par4'] = map(syllab_count,essays_la_clean.paragraph4)

CPU times: user 8.65 s, sys: 72.2 ms, total: 8.73 s
Wall time: 10.6 s


In [86]:
print essays_la_clean.groupby('funding_status').syllab_count_Title.mean()
print essays_la_clean.groupby('funding_status').syllab_count_Need.mean()
print essays_la_clean.groupby('funding_status').syllab_count_Par1.mean()
print essays_la_clean.groupby('funding_status').syllab_count_Par2.mean()
print essays_la_clean.groupby('funding_status').syllab_count_Par3.mean()
print essays_la_clean.groupby('funding_status').syllab_count_Par4.mean()

funding_status
completed    8.175983
expired      8.228285
Name: syllab_count_Title, dtype: float64
funding_status
completed    28.469101
expired      28.516994
Name: syllab_count_Need, dtype: float64
funding_status
completed    59.489045
expired      59.376133
Name: syllab_count_Par1, dtype: float64
funding_status
completed    126.324087
expired      123.920506
Name: syllab_count_Par2, dtype: float64
funding_status
completed    149.842837
expired      148.443353
Name: syllab_count_Par3, dtype: float64
funding_status
completed    83.407022
expired      84.060801
Name: syllab_count_Par4, dtype: float64


## lexicon count 

In [88]:
def Lexicon_count(text):
    try:
        return textstat.lexicon_count(text)
    except:
        return -1

In [89]:
%%time

essays_la_clean['lexicon_count_Title'] = map(Lexicon_count,essays_la_clean.title)
essays_la_clean['lexicon_count_Need'] = map(Lexicon_count,essays_la_clean.need_statement)
essays_la_clean['lexicon_count_Par1'] = map(Lexicon_count,essays_la_clean.paragraph1)
essays_la_clean['lexicon_count_Par2'] = map(Lexicon_count,essays_la_clean.paragraph2)
essays_la_clean['lexicon_count_Par3'] = map(Lexicon_count,essays_la_clean.paragraph3)
essays_la_clean['lexicon_count_Par4'] = map(Lexicon_count,essays_la_clean.paragraph4)

CPU times: user 30 s, sys: 236 ms, total: 30.3 s
Wall time: 34.2 s


In [90]:
print essays_la_clean.groupby('funding_status').lexicon_count_Title.mean()
print essays_la_clean.groupby('funding_status').lexicon_count_Need.mean()
print essays_la_clean.groupby('funding_status').lexicon_count_Par1.mean()
print essays_la_clean.groupby('funding_status').lexicon_count_Par2.mean()
print essays_la_clean.groupby('funding_status').lexicon_count_Par3.mean()
print essays_la_clean.groupby('funding_status').lexicon_count_Par4.mean()

funding_status
completed    5.205407
expired      5.087613
Name: lexicon_count_Title, dtype: float64
funding_status
completed    19.252949
expired      19.206571
Name: lexicon_count_Need, dtype: float64
funding_status
completed    40.502107
expired      40.234517
Name: lexicon_count_Par1, dtype: float64
funding_status
completed    83.342556
expired      81.625378
Name: lexicon_count_Par2, dtype: float64
funding_status
completed    100.456952
expired       99.092523
Name: lexicon_count_Par3, dtype: float64
funding_status
completed    56.133146
expired      56.143882
Name: lexicon_count_Par4, dtype: float64


## Automated Readability Index

In [92]:
def auto_readability_index(text):
    try:
        return textstat.automated_readability_index(text)
    except:
        return -1    

In [93]:
%%time

essays_la_clean['auto_readability_Title'] = map(auto_readability_index,essays_la_clean.title)
essays_la_clean['auto_readability_Need'] = map(auto_readability_index,essays_la_clean.need_statement)
essays_la_clean['auto_readability_Par1'] = map(auto_readability_index,essays_la_clean.paragraph1)
essays_la_clean['auto_readability_Par2'] = map(auto_readability_index,essays_la_clean.paragraph2)
essays_la_clean['auto_readability_Par3'] = map(auto_readability_index,essays_la_clean.paragraph3)
essays_la_clean['auto_readability_Par4'] = map(auto_readability_index,essays_la_clean.paragraph4)

CPU times: user 1min 15s, sys: 554 ms, total: 1min 15s
Wall time: 1min 21s


In [94]:
print essays_la_clean.groupby('funding_status').auto_readability_Title.mean()
print essays_la_clean.groupby('funding_status').auto_readability_Need.mean()
print essays_la_clean.groupby('funding_status').auto_readability_Par1.mean()
print essays_la_clean.groupby('funding_status').auto_readability_Par2.mean()
print essays_la_clean.groupby('funding_status').auto_readability_Par3.mean()
print essays_la_clean.groupby('funding_status').auto_readability_Par4.mean()

funding_status
completed    8.494319
expired      8.569316
Name: auto_readability_Title, dtype: float64
funding_status
completed    13.179178
expired      13.095582
Name: auto_readability_Need, dtype: float64
funding_status
completed    10.492065
expired      10.486027
Name: auto_readability_Par1, dtype: float64
funding_status
completed    10.894986
expired      10.700869
Name: auto_readability_Par2, dtype: float64
funding_status
completed    12.303069
expired      12.297338
Name: auto_readability_Par3, dtype: float64
funding_status
completed    11.708736
expired      11.746847
Name: auto_readability_Par4, dtype: float64


## The Coleman-Liau Index

In [96]:
def coleman_index(text):
    try:
        return textstat.coleman_liau_index(text)
    except:
        return -1    

In [97]:
%%time

essays_la_clean['coleman_index_Title'] = map(coleman_index,essays_la_clean.title)
essays_la_clean['coleman_index_Need'] = map(coleman_index,essays_la_clean.need_statement)
essays_la_clean['coleman_index_Par1'] = map(coleman_index,essays_la_clean.paragraph1)
essays_la_clean['coleman_index_Par2'] = map(coleman_index,essays_la_clean.paragraph2)
essays_la_clean['coleman_index_Par3'] = map(coleman_index,essays_la_clean.paragraph3)
essays_la_clean['coleman_index_Par4'] = map(coleman_index,essays_la_clean.paragraph4)

CPU times: user 1min 49s, sys: 1.03 s, total: 1min 50s
Wall time: 2min 11s


In [98]:
print essays_la_clean.groupby('funding_status').coleman_index_Title.mean()
print essays_la_clean.groupby('funding_status').coleman_index_Need.mean()
print essays_la_clean.groupby('funding_status').coleman_index_Par1.mean()
print essays_la_clean.groupby('funding_status').coleman_index_Par2.mean()
print essays_la_clean.groupby('funding_status').coleman_index_Par3.mean()
print essays_la_clean.groupby('funding_status').coleman_index_Par4.mean()

funding_status
completed    21.139509
expired      22.031798
Name: coleman_index_Title, dtype: float64
funding_status
completed    14.011942
expired      13.883053
Name: coleman_index_Need, dtype: float64
funding_status
completed    11.373425
expired      11.384670
Name: coleman_index_Par1, dtype: float64
funding_status
completed    11.663623
expired      11.577883
Name: coleman_index_Par2, dtype: float64
funding_status
completed    11.753972
expired      11.765655
Name: coleman_index_Par3, dtype: float64
funding_status
completed    11.661158
expired      11.767491
Name: coleman_index_Par4, dtype: float64


## Linsear Write Formula

In [100]:
def linsear_formula(text):
    try:
        return textstat.linsear_write_formula(text)
    except:
        return -1    

In [101]:
%%time

essays_la_clean['linsear_formula_Title'] = map(linsear_formula,essays_la_clean.title)
essays_la_clean['linsear_formula_Need'] = map(linsear_formula,essays_la_clean.need_statement)
essays_la_clean['linsear_formula_Par1'] = map(linsear_formula,essays_la_clean.paragraph1)
essays_la_clean['linsear_formula_Par2'] = map(linsear_formula,essays_la_clean.paragraph2)
essays_la_clean['linsear_formula_Par3'] = map(linsear_formula,essays_la_clean.paragraph3)
essays_la_clean['linsear_formula_Par4'] = map(linsear_formula,essays_la_clean.paragraph4)

CPU times: user 45min 42s, sys: 18.4 s, total: 46min 1s
Wall time: 1h 43s


In [102]:
print essays_la_clean.groupby('funding_status').linsear_formula_Title.mean()
print essays_la_clean.groupby('funding_status').linsear_formula_Need.mean()
print essays_la_clean.groupby('funding_status').linsear_formula_Par1.mean()
print essays_la_clean.groupby('funding_status').linsear_formula_Par2.mean()
print essays_la_clean.groupby('funding_status').linsear_formula_Par3.mean()
print essays_la_clean.groupby('funding_status').linsear_formula_Par4.mean()

funding_status
completed    1.331110
expired      1.308252
Name: linsear_formula_Title, dtype: float64
funding_status
completed    8.233673
expired      8.263973
Name: linsear_formula_Need, dtype: float64
funding_status
completed    7.638308
expired      7.612255
Name: linsear_formula_Par1, dtype: float64
funding_status
completed    7.808111
expired      7.636046
Name: linsear_formula_Par2, dtype: float64
funding_status
completed    9.182900
expired      9.191937
Name: linsear_formula_Par3, dtype: float64
funding_status
completed    8.862956
expired      8.878493
Name: linsear_formula_Par4, dtype: float64


## Dale-Chall Readability Score

In [104]:
def DCH_readability(text):
    try:
        return textstat.dale_chall_readability_score(text)
    except:
        return -1    

In [105]:
%%time

essays_la_clean['DCH_readability_Title'] = map(DCH_readability,essays_la_clean.title)
essays_la_clean['DCH_readability_Need'] = map(DCH_readability,essays_la_clean.need_statement)
essays_la_clean['DCH_readability_Par1'] = map(DCH_readability,essays_la_clean.paragraph1)
essays_la_clean['DCH_readability_Par2'] = map(DCH_readability,essays_la_clean.paragraph2)
essays_la_clean['DCH_readability_Par3'] = map(DCH_readability,essays_la_clean.paragraph3)
essays_la_clean['DCH_readability_Par4'] = map(DCH_readability,essays_la_clean.paragraph4)

CPU times: user 7min 19s, sys: 2.91 s, total: 7min 22s
Wall time: 8min 22s


In [106]:
print essays_la_clean.groupby('funding_status').DCH_readability_Title.mean()
print essays_la_clean.groupby('funding_status').DCH_readability_Need.mean()
print essays_la_clean.groupby('funding_status').DCH_readability_Par1.mean()
print essays_la_clean.groupby('funding_status').DCH_readability_Par2.mean()
print essays_la_clean.groupby('funding_status').DCH_readability_Par3.mean()
print essays_la_clean.groupby('funding_status').DCH_readability_Par4.mean()

funding_status
completed    7.439379
expired      7.458289
Name: DCH_readability_Title, dtype: float64
funding_status
completed     9.921313
expired      10.029196
Name: DCH_readability_Need, dtype: float64
funding_status
completed    8.346935
expired      8.391208
Name: DCH_readability_Par1, dtype: float64
funding_status
completed    8.487501
expired      8.451828
Name: DCH_readability_Par2, dtype: float64
funding_status
completed    8.550086
expired      8.565842
Name: DCH_readability_Par3, dtype: float64
funding_status
completed    8.495435
expired      8.551924
Name: DCH_readability_Par4, dtype: float64


# Writing To CSV

It seems that none of the created features have any predictive power. Too bad!

In [112]:
essays_la_clean.to_csv('essay_preProcessed.csv',index=False)