In [1]:
import pandas as pd

import spacy

import textdescriptives as td

from sklearn import linear_model

Import prepared `training_set_cleaned.csv` to DataFrame:
* Only essays 1, 2, 7, 8.
* Score converted to percentile.
* Grade level added.

Source:
* The Hewlett Foundation: Automated Essay Scoring
* https://www.kaggle.com/c/asap-aes/data

In [2]:
df = pd.read_csv('data/training_set_cleaned.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5875 entries, 0 to 5874
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   essay_id            5875 non-null   int64  
 1   essay_set           5875 non-null   int64  
 2   essay               5875 non-null   object 
 3   percentile_score    5875 non-null   float64
 4   actual grade level  5875 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 229.6+ KB


In [4]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,percentile_score,actual grade level
0,1,1,"Dear local newspaper, I think effects computer...",0.666667,8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",0.75,8
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",0.583333,8
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",0.833333,8
4,5,1,"Dear @LOCATION1, I know having computers has a...",0.666667,8


***
## Feature Selection via textdescriptives
* Create spaCy pipeline using all metrics in textdescriptives.
* Process first essay and store as new DataFrame.

## Compile time is ~15 minutes, loading pre-compiled from csv file provided below.

In [None]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives") 
doc = nlp(df.essay[0])
tdDF = td.extract_df(doc)

In [None]:
tdDF

* Process all remaining essays and append to dataframe.

In [None]:
for n in range(1, len(df)):
    tdDF = tdDF.append(
        td.extract_df( nlp(df.essay[n]) ),
        ignore_index = True )

In [None]:
tdDF

***
# Load processed essays via CSV

In [5]:
tdDF = pd.read_csv('data/training_set_processed.csv', index_col = 0)

***
`textdescriptives` valid `metrics` are:
* `all` - default
* `descriptive_stats`
* `readability`
* `dependency_distance`
* `pos_stats`


`NaN` cleaning
* Some metrics may not be there (i.e. PoS) so replacing with 0 corrects this issue.

In [6]:
tdDF[tdDF.columns[tdDF.isna().any()]]
tdDF.fillna(0, inplace = True)

In [7]:
tdDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5875 entries, 0 to 5874
Data columns (total 45 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   text                                    5875 non-null   object 
 1   token_length_mean                       5875 non-null   float64
 2   token_length_median                     5875 non-null   float64
 3   token_length_std                        5875 non-null   float64
 4   sentence_length_mean                    5875 non-null   float64
 5   sentence_length_median                  5875 non-null   float64
 6   sentence_length_std                     5875 non-null   float64
 7   syllables_per_token_mean                5875 non-null   float64
 8   syllables_per_token_median              5875 non-null   float64
 9   syllables_per_token_std                 5875 non-null   float64
 10  n_tokens                                5875 non-null   int6

In [8]:
tdDF.head()

Unnamed: 0,text,token_length_mean,token_length_median,token_length_std,sentence_length_mean,sentence_length_median,sentence_length_std,syllables_per_token_mean,syllables_per_token_median,syllables_per_token_std,...,pos_prop_SCONJ,pos_prop_PART,pos_prop_DET,pos_prop_PROPN,pos_prop_CCONJ,pos_prop_ADV,pos_prop_INTJ,pos_prop_NUM,pos_prop_X,pos_prop_SPACE
0,"Dear local newspaper, I think effects computer...",4.286127,4.0,2.259644,21.625,19.0,13.341078,1.225434,1.0,0.544059,...,0.032911,0.037975,0.050633,0.007595,0.035443,0.050633,0.005063,0.0,0.0,0.0
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",4.352381,4.0,2.351214,20.0,15.0,12.067272,1.297619,1.0,0.616823,...,0.015351,0.02193,0.078947,0.008772,0.039474,0.057018,0.0,0.010965,0.0,0.0
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.405018,4.0,2.383661,17.4375,15.0,10.885591,1.293907,1.0,0.638939,...,0.01634,0.03268,0.091503,0.006536,0.052288,0.04902,0.0,0.009804,0.0,0.0
3,"Dear Local Newspaper, @CAPS1 I have found that...",4.912713,4.0,2.601264,19.518519,17.0,11.830189,1.388994,1.0,0.768517,...,0.022569,0.039931,0.078125,0.041667,0.029514,0.046875,0.0,0.001736,0.0,0.0
4,"Dear @LOCATION1, I know having computers has a...",4.372591,4.0,2.325342,15.566667,14.0,6.581202,1.314775,1.0,0.632334,...,0.023166,0.034749,0.110039,0.005792,0.030888,0.07529,0.0,0.005792,0.0,0.0


In [9]:
tdDF.columns

Index(['text', 'token_length_mean', 'token_length_median', 'token_length_std',
       'sentence_length_mean', 'sentence_length_median', 'sentence_length_std',
       'syllables_per_token_mean', 'syllables_per_token_median',
       'syllables_per_token_std', 'n_tokens', 'n_unique_tokens',
       'proportion_unique_tokens', 'n_characters', 'n_sentences',
       'flesch_reading_ease', 'flesch_kincaid_grade', 'smog', 'gunning_fog',
       'automated_readability_index', 'coleman_liau_index', 'lix', 'rix',
       'dependency_distance_mean', 'dependency_distance_std',
       'prop_adjacent_dependency_relation_mean',
       'prop_adjacent_dependency_relation_std', 'pos_prop_ADJ',
       'pos_prop_NOUN', 'pos_prop_PUNCT', 'pos_prop_PRON', 'pos_prop_VERB',
       'pos_prop_ADP', 'pos_prop_AUX', 'pos_prop_SYM', 'pos_prop_SCONJ',
       'pos_prop_PART', 'pos_prop_DET', 'pos_prop_PROPN', 'pos_prop_CCONJ',
       'pos_prop_ADV', 'pos_prop_INTJ', 'pos_prop_NUM', 'pos_prop_X',
       'pos_prop_SPACE']

***
## Exploratory analysis for feature selection from `textdescriptives` results.
https://hlasse.github.io/TextDescriptives/

In [10]:
tdFeatures = list(tdDF.columns)[1:]

Store possible **Token** Features in list  
https://hlasse.github.io/TextDescriptives/descriptivestats.html  

0. 'token_length_mean'
1. 'token_length_median'
2. 'token_length_std'
3. 'syllables_per_token_mean'
4. 'syllables_per_token_median'
5. 'syllables_per_token_std'
6. 'n_tokens'
7. 'n_unique_tokens'
8. 'proportion_unique_tokens'
9. 'n_characters' (?)

In [11]:
tokenFeatures = tdFeatures[0:3] + tdFeatures[7:12] + tdFeatures[12:13]

Store possible **Sentence** Features in list  
https://hlasse.github.io/TextDescriptives/descriptivestats.html  

0. 'sentence_length_mean'
1. 'sentence_length_median'
2. 'sentence_length_std'
3. 'n_sentences'
4. 'n_characters' (?)

In [12]:
sentenceFeatures = tdFeatures[4:7] + tdFeatures[13:14] + tdFeatures[12:13]

Store possible **Readability** Features in list  
https://hlasse.github.io/TextDescriptives/readability.html  

0. 'flesch_reading_ease'
1. 'flesch_kincaid_grade'
2. 'smog'
3. 'gunning_fog'
4. 'automated_readability_index'
5. 'coleman_liau_index'
6. 'lix'
7. 'rix'

In [13]:
readabilityFeatures = tdFeatures[14:22]

Store possible **Dependency Distance** Features in list  
https://hlasse.github.io/TextDescriptives/dependencydistance.html  

0. 'dependency_distance_mean'
1. 'dependency_distance_std'
2. 'prop_adjacent_dependency_relation_mean'
3. 'prop_adjacent_dependency_relation_std'

In [14]:
dependencyFeatures = tdFeatures[22:26]

Store possible **Part of Speech** Features in list  
https://hlasse.github.io/TextDescriptives/posstats.html  
https://universaldependencies.org/u/pos/all.html  

0. 'pos_prop_ADJ'
1. 'pos_prop_NOUN'
2. 'pos_prop_PUNCT'
3. 'pos_prop_PRON'
4. 'pos_prop_VERB'
5. 'pos_prop_ADP'
6. 'pos_prop_AUX'
7. 'pos_prop_SYM'
8. 'pos_prop_SCONJ'
9. 'pos_prop_PART'
10. 'pos_prop_DET'
11. 'pos_prop_PROPN'
12. 'pos_prop_CCONJ'
13. 'pos_prop_ADV'
14. 'pos_prop_INTJ'
15. 'pos_prop_NUM'
16. 'pos_prop_X'
17. 'pos_prop_SPACE'

In [15]:
posFeatures = tdFeatures[26:]

***
Feature Selection via Filter
* tokenFeatures
* sentenceFeatures
* readabilityFeatures
* dependencyFeatures
* posFeatures

`percentile_score` is key feature we will be correlating with during feature selection.

In [16]:
tdDF['percentile_score'] = df['percentile_score']

Simple function to provide possible features to correlate & correlation coeficient threshold value.
* `features` is list of features to correlate
* `corValue` is float threshold for correlation coeficient

In [17]:
def suggestedFeatures(corFeatures, corValue = 0.5, feature = 'percentile_score'):
    corFeatures.append(feature)
    cor = tdDF[corFeatures].corr()
    cor_target = abs(cor[feature])
    relevant_features = cor_target[cor_target >= corValue]
    return(relevant_features)

Compare **tokenFeatures** and identify those with correlation $\geq 0.5$ for **percentile_score**.

In [18]:
tokenFeatures = tdFeatures[0:3] + tdFeatures[7:12] + tdFeatures[12:13]
suggestedFeatures(tokenFeatures)

token_length_mean    0.506202
n_tokens             0.567294
n_unique_tokens      0.628279
n_characters         0.612752
percentile_score     1.000000
Name: percentile_score, dtype: float64

* `n_unique_tokens` and `n_characters` have the two largest values but correlate highly to one another so we will choose `n_unique_tokens`.
* `n_unique_tokens` and `n_tokens` can be argued to be roughly equivalent metrics without data and the correlation value is *very* high, again we choose `n_unique_tokens`.
* `n_unique_tokens` and `token_length_mean` do not have a high correlation value so we will also use `token_length_mean`.

In [43]:
comparison = ['n_tokens', 'n_characters', 'n_unique_tokens', 'token_length_mean']
tdDF[comparison].corr()

Unnamed: 0,n_tokens,n_characters,n_unique_tokens,token_length_mean
n_tokens,1.0,0.989236,0.951958,0.332113
n_characters,0.989236,1.0,0.960595,0.443103
n_unique_tokens,0.951958,0.960595,1.0,0.413749
token_length_mean,0.332113,0.443103,0.413749,1.0


In [19]:
comparison = ['n_unique_tokens', 'n_characters']
tdDF[comparison].corr()

Unnamed: 0,n_unique_tokens,n_characters
n_unique_tokens,1.0,0.960595
n_characters,0.960595,1.0


In [20]:
comparison = ['n_unique_tokens', 'n_tokens']
tdDF[comparison].corr()

Unnamed: 0,n_unique_tokens,n_tokens
n_unique_tokens,1.0,0.951958
n_tokens,0.951958,1.0


In [21]:
comparison = ['n_unique_tokens', 'token_length_mean']
tdDF[comparison].corr()

Unnamed: 0,n_unique_tokens,token_length_mean
n_unique_tokens,1.0,0.413749
token_length_mean,0.413749,1.0


In [22]:
lrFeatures = ['n_unique_tokens', 'token_length_mean']

Compare **sentenceFeatures** and identify those with correlation $\geq 0.5$ for **percentile_score**.

In [23]:
sentenceFeatures = tdFeatures[4:7] + tdFeatures[13:14] + tdFeatures[12:13]
suggestedFeatures(sentenceFeatures)

n_sentences         0.546956
n_characters        0.612752
percentile_score    1.000000
Name: percentile_score, dtype: float64

* while `n_sentences` has a lower value than `n_characters` they both correlate highly together and we previously discarded `n_characters` so `n_sentences` is being selected.

In [24]:
comparison = ['n_sentences', 'n_characters']
tdDF[comparison].corr()

Unnamed: 0,n_sentences,n_characters
n_sentences,1.0,0.856189
n_characters,0.856189,1.0


In [25]:
lrFeatures.append('n_sentences')

Compare **readabilityFeatures** and identify those with correlation $\geq 0.5$ for **percentile_score**.

In [26]:
readabilityFeatures = tdFeatures[14:22]
suggestedFeatures(readabilityFeatures)

smog                0.520889
percentile_score    1.000000
Name: percentile_score, dtype: float64

In [27]:
readabilityFeatures = tdFeatures[14:22]
suggestedFeatures(readabilityFeatures, 0.25)

flesch_reading_ease    0.288028
smog                   0.520889
coleman_liau_index     0.487597
rix                    0.263844
percentile_score       1.000000
Name: percentile_score, dtype: float64

In [28]:
tdDF[['smog', 'coleman_liau_index']].corr()

Unnamed: 0,smog,coleman_liau_index
smog,1.0,0.769504
coleman_liau_index,0.769504,1.0


In [29]:
lrFeatures.append('smog')
lrFeatures.append('coleman_liau_index')

* Looking at these results we see that `smog` meets our threshold and the `coleman_liau_index` moves very close.  Due to our prior work showing that that *any* readability metric will fail given cases of irregular grammar a `min(colman_liau_index, flesch_kincaid_garde)` was a viable solution.

* Including `coleman_liau_index` and `min_readability` into the correlation we see that `min_readability` has less correlation with `percentile_score` than `coleman_liau_index` and correlates fairly well with `smog` (0.74) and `coleman_liau_index` (0.81).  The latter is not surpising since it includes the `coleman_liau_index` as a possible value.
* To avoid overrepresentation the features representing **readabilityFeatures** will be `smog` and `coleman_liau_index`.

Compare **dependencyFeatures** and identify those with correlation $\geq 0.5$ for **percentile_score**.

In [31]:
dependencyFeatures = tdFeatures[22:26]
suggestedFeatures(dependencyFeatures)

percentile_score    1.0
Name: percentile_score, dtype: float64

In [32]:
dependencyFeatures = tdFeatures[22:26]
suggestedFeatures(dependencyFeatures, 0.25)

percentile_score    1.0
Name: percentile_score, dtype: float64

In [33]:
dependencyFeatures = tdFeatures[22:26]
suggestedFeatures(dependencyFeatures, -1)

dependency_distance_mean                  0.074835
dependency_distance_std                   0.057266
prop_adjacent_dependency_relation_mean    0.090763
prop_adjacent_dependency_relation_std     0.143326
percentile_score                          1.000000
Name: percentile_score, dtype: float64

Although dependency distance is tied to syntactic complexity, which was suggested as an a metric for this project we do not see any feature pass the threshold much less have a value that we could make an argument for the inclusion.

Compare **posFeature** and identify those with correlation $\geq 0.5$ for **percentile_score**.

In [34]:
posFeatures = tdFeatures[26:]
suggestedFeatures(posFeatures)

percentile_score    1.0
Name: percentile_score, dtype: float64

In [35]:
posFeatures = tdFeatures[26:]
suggestedFeatures(posFeatures, 0.25)

pos_prop_NOUN       0.286037
pos_prop_PRON       0.401154
pos_prop_VERB       0.292548
percentile_score    1.000000
Name: percentile_score, dtype: float64

In [36]:
posFeatures = tdFeatures[26:]
suggestedFeatures(posFeatures, 0.2)

pos_prop_NOUN       0.286037
pos_prop_PUNCT      0.242250
pos_prop_PRON       0.401154
pos_prop_VERB       0.292548
pos_prop_ADP        0.212126
percentile_score    1.000000
Name: percentile_score, dtype: float64

In [37]:
headers = ['pos_prop_NOUN', 'pos_prop_PUNCT', 'pos_prop_PRON', 'pos_prop_VERB', 'pos_prop_ADP']
tdDF[headers].corr()

Unnamed: 0,pos_prop_NOUN,pos_prop_PUNCT,pos_prop_PRON,pos_prop_VERB,pos_prop_ADP
pos_prop_NOUN,1.0,0.011669,-0.656155,-0.432177,0.338659
pos_prop_PUNCT,0.011669,1.0,-0.223553,-0.324296,-0.108026
pos_prop_PRON,-0.656155,-0.223553,1.0,0.470273,-0.297702
pos_prop_VERB,-0.432177,-0.324296,0.470273,1.0,-0.315742
pos_prop_ADP,0.338659,-0.108026,-0.297702,-0.315742,1.0


In [39]:
tdDF[['n_unique_tokens', 'token_length_mean', 'n_sentences', 'smog', 'coleman_liau_index',
      'percentile_score']].corr()

Unnamed: 0,n_unique_tokens,token_length_mean,n_sentences,smog,coleman_liau_index,percentile_score
n_unique_tokens,1.0,0.413749,0.848128,0.436616,0.437235,0.628279
token_length_mean,0.413749,1.0,0.273734,0.728698,0.965531,0.506202
n_sentences,0.848128,0.273734,1.0,0.200088,0.204427,0.546956
smog,0.436616,0.728698,0.200088,1.0,0.769504,0.520889
coleman_liau_index,0.437235,0.965531,0.204427,0.769504,1.0,0.487597
percentile_score,0.628279,0.506202,0.546956,0.520889,0.487597,1.0


***
## MVLR Creation

In [92]:
target = pd.DataFrame(df['percentile_score'], columns = ['percentile_score'])
X = tdDF[lrFeatures].fillna(value = 0)
y = target['percentile_score']
lm = linear_model.LinearRegression()
model = lm.fit(X, y)
predictions = lm.predict(X)
features = list(tdDF.columns)[1:]

In [128]:
lrFeatures

['n_unique_tokens',
 'token_length_mean',
 'n_sentences',
 'smog',
 'coleman_liau_index']

In [142]:
tdDF['mvlr_score'] = \
      lm.coef_[0]*tdDF['n_unique_tokens']\
    + lm.coef_[1]*tdDF['token_length_mean']\
    + lm.coef_[2]*tdDF['n_sentences']\
    + lm.coef_[3]*tdDF['smog']\
    + lm.coef_[4]*tdDF['coleman_liau_index']\
    + lm.intercept_

In [141]:
tdDF['percent_error'] = abs(tdDF['percentile_score'] - tdDF['mvlr_score']) / tdDF['percentile_score']

In [139]:
lrCols = ['percentile_score', 'mvlr_score', 'percent_error']

In [140]:
tdDF[lrCols].describe()

Unnamed: 0,percentile_score,mvlr_score,percent_error
count,5875.0,5875.0,5875.0
mean,0.641257,0.641257,0.157704
std,0.152019,0.107795,0.205068
min,0.066667,0.277237,2.4e-05
25%,0.566667,0.569177,0.051665
50%,0.666667,0.651005,0.10664
75%,0.75,0.716056,0.188696
max,1.0,1.018221,4.342278


***
## grade scratchwork

In [101]:
gradeDF = pd.DataFrame(df['actual grade level'], columns = ['actual grade level'])
gradeDF.rename(columns={'actual grade level':'grade_level'}, inplace = True)
gradeDF['smog'] = tdDF['smog']
gradeDF['coleman_liau_index'] = tdDF['coleman_liau_index']
gradeDF['actual_score'] = tdDF['percentile_score']
gradeDF['mvlr_score'] = tdDF['mvlr_score']

* Assign letter grade to numeric grade based on U.S. Department of Education, Institute of Education Sciences, National Center for Education Statistics, The 2009 High School Transcript Study.
    * https://nces.ed.gov/nationsreportcard/hsts/howgpa.aspx
* View algorithm scores within $\pm$15, $\pm$10, and $\pm$5 of actual score.

In [102]:
def letter_grade(value):
    grades = {
        .90: 'A',
        .80: 'B',
        .70: 'C',
        .60: 'D',
        0: 'F'
    }
    for key, letter in grades.items():
        if value >= key:
            return(letter)

In [103]:
gradeDF['actual_letter'] = gradeDF['actual_score'].map(letter_grade)
gradeDF['mvlr_letter'] = gradeDF['mvlr_score'].map(letter_grade)

In [104]:
def letter_score(value):
    grades = {
        'A': .9,
        'B': .8,
        'C': .7,
        'D': .6,
        'F': .5
    }
    for key, score in grades.items():
        if value == key:
            return(score)

In [105]:
gradeDF['actual_letter_score'] = gradeDF['actual_letter'].map(letter_score)
gradeDF['mvlr_letter_score'] = gradeDF['mvlr_letter'].map(letter_score)

In [106]:
headers = ['actual_score', 'actual_letter', 'mvlr_letter']
gradeDF[headers]

Unnamed: 0,actual_score,actual_letter,mvlr_letter
0,0.666667,D,D
1,0.750000,C,C
2,0.583333,F,D
3,0.833333,B,B
4,0.666667,D,C
...,...,...,...
5870,0.583333,F,B
5871,0.533333,F,D
5872,0.666667,D,B
5873,0.666667,D,C


### grade analysis

Actual Letter Grade = Machine Letter Grade

In [144]:
headers = ['actual_letter', 'mvlr_letter']
gradeDF[(gradeDF.actual_letter_score == gradeDF.mvlr_letter_score)][headers]

Unnamed: 0,actual_letter,mvlr_letter
,,
0,D,D
1,C,C
3,B,B
5,D,D
8,C,C
...,...,...
5855,D,D
5856,B,B
5864,D,D


actual $\pm$1 machine

In [67]:
headers = ['actual_letter', 'mvlr_letter']
gradeDF[(gradeDF.actual_letter_score == gradeDF.mvlr_letter_score +.1) | \
        (gradeDF.actual_letter_score == gradeDF.mvlr_letter_score -.1)][headers]

Unnamed: 0,actual_letter,mvlr_letter
2,F,D
4,D,C
9,C,D
10,D,C
13,F,D
...,...,...
5857,D,C
5863,D,C
5871,F,D
5873,D,C


actual $\pm$2 machine

In [69]:
headers = ['actual_letter', 'mvlr_letter']
gradeDF[(gradeDF.actual_letter_score == gradeDF.mvlr_letter_score +.2) | \
        (gradeDF.actual_letter_score == gradeDF.mvlr_letter_score -.2)][headers]

Unnamed: 0,actual_letter,mvlr_letter
70,B,D
77,B,D
78,B,D
103,B,D
128,B,D
...,...,...
5095,B,D
5146,B,D
5163,C,A
5427,C,A


actual $\pm$3 machine

In [71]:
headers = ['actual_letter', 'mvlr_letter']
gradeDF[(gradeDF.actual_letter_score == gradeDF.mvlr_letter_score +.3) | \
        (gradeDF.actual_letter_score == gradeDF.mvlr_letter_score -.3)][headers]

Unnamed: 0,actual_letter,mvlr_letter
2405,B,F
2441,B,F
3604,B,F
3650,B,F
3672,B,F
3714,B,F
3778,B,F
4020,B,F
4034,B,F
4068,B,F


In [107]:
headers = ['grade_level', 'actual_score', 'mvlr_score', 'actual_letter', 'mvlr_letter']
len(gradeDF[(gradeDF.actual_score > gradeDF.mvlr_score) & (gradeDF.actual_letter == gradeDF.mvlr_letter)][headers])

960

In [108]:
headers = ['actual_score', 'actual_letter', 'mvlr_letter']
gradeDF[gradeDF.actual_letter_score > gradeDF.mvlr_letter_score][headers]

Unnamed: 0,actual_score,actual_letter,mvlr_letter
6,0.833333,B,C
7,0.833333,B,C
9,0.750000,C,D
15,1.000000,A,B
22,0.833333,B,C
...,...,...,...
5563,0.716667,C,D
5580,0.700000,C,D
5615,0.833333,B,C
5769,0.833333,B,C


In [109]:
headers = ['actual_score', 'actual_letter', 'mvlr_letter']
gradeDF[gradeDF.actual_letter_score > gradeDF.mvlr_letter_score][headers]

Unnamed: 0,actual_score,actual_letter,mvlr_letter
6,0.833333,B,C
7,0.833333,B,C
9,0.750000,C,D
15,1.000000,A,B
22,0.833333,B,C
...,...,...,...
5563,0.716667,C,D
5580,0.700000,C,D
5615,0.833333,B,C
5769,0.833333,B,C


In [110]:
def review(row):
    if row['smog'] >= 12:
        val = True
    elif row['coleman_liau_index'] >= 12:
        val = True
    else:
        val = False
    return val

In [111]:
gradeDF['review'] = gradeDF.apply(review, axis=1)

In [143]:
headers = ['grade_level', 'actual_letter', 'mvlr_letter', 'review', 'smog', 'coleman_liau_index']
gradeDF[headers][gradeDF.review == True]

Unnamed: 0,grade_level,actual_letter,mvlr_letter,review,smog,coleman_liau_index
,,,,,,
55.0,8.0,B,B,True,12.94862,12.454607
144.0,8.0,B,C,True,10.411451,12.375275
214.0,8.0,B,B,True,11.421723,12.218228
309.0,8.0,B,C,True,11.781823,12.064481
415.0,8.0,A,B,True,10.71804,12.121131
546.0,8.0,D,C,True,10.864195,12.141261
592.0,8.0,D,C,True,9.210783,13.020215
878.0,8.0,D,C,True,8.548687,12.474519
914.0,8.0,F,D,True,9.150863,12.180816


***
# Save grades to csv
* `essay_id` is original `essay_id` from raw dataset.
* `grade_level` is provided to sort (i.e. could be period number or class section
* `actual_letter` is based off human scored grades
* `mvlr_letter` is based off of feature selection grading without Parts of Speech
* `mvlr_letter_pos` is based off of feature selection grading with Parts of Speech
* `review` is **True** or **False** based off the `smog` or `coleman_liau_index` being above grade 12 level.

In [127]:
gradeDF['essay_id'] = df['essay_id']
headers = ['essay_id', 'grade_level', 'actual_letter', 'mvlr_letter', 'review']
gradeDF[headers].to_csv(path_or_buf='data/grades.csv')

In [152]:
len(tdDF.columns)

50