# proof of concept
## 15 papers from grade 9 students

In [1]:
import pdftotext
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
from pathlib import Path
from readability import Readability
import numpy as np

***
## All files in directory

In [2]:
# location of the essay PDFs; change this to wherever the files live
directory = 'p3c_docs/'

# lazily walk the directory tree for every PDF underneath it
pdf_folder = Path(directory).rglob('*.pdf')

# materialise the generator so the paths can be sorted and indexed later
files = list(pdf_folder)

In [3]:
# sort the paths in place so every later positional index (corpus rows,
# grades, names) refers to the same, repeatable file order
files.sort()

In [4]:
# tokens       : every word token from every essay (grade tokens removed)
# multi_corpus : one raw text string per essay, in `files` order
# actual_grade : the teacher-assigned grade parsed from the end of each essay
tokens = []
multi_corpus = []
stopWords = set(stopwords.words('english'))
actual_grade = []

# iterate every file in directory
for file in files:
    with open(file, 'rb') as f:
        # conversion with pdftotext; join the pages once and reuse the text
        multi_pdf = pdftotext.PDF(f)
        essay_text = ''.join(multi_pdf)

    # NOTE(review): the raw text kept for TF-IDF/readability still contains
    # the trailing grade token; only the token list has it stripped.
    multi_corpus.append(essay_text)

    essay_tokens = nltk.word_tokenize(essay_text)
    if not essay_tokens:
        # without this guard the grade would silently be read from the
        # previous essay's last token
        raise ValueError(f'no text could be extracted from {file}')

    # final token in each essay is the grade; its last 4 characters are
    # stripped before conversion (presumably a fixed suffix; verify against
    # the source PDFs)
    grade_token = essay_tokens.pop()
    actual_grade.append(int(grade_token[:-4]))
    tokens += essay_tokens

# update tokens by setting all to lowercase,
# removing stopwords,
# removing non-alphabetic tokens
tokens_removed = [word.lower() for word in tokens
                  if word.lower() not in stopWords
                  and word.isalpha()]

In [5]:
# how many of the most frequent corpus words to keep
top_n_words = 10
# frequency table over the cleaned corpus tokens
fd = nltk.FreqDist(tokens_removed)
# rank words by descending count (ties keep first-seen order) and keep the top N
ranked_counts = sorted(fd.items(), key = lambda pair: pair[1], reverse = True)
target_words = [word for word, _ in ranked_counts[:top_n_words]]

In [6]:
# fit TF-IDF over the raw essay texts: one row per essay, one column per term
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(multi_corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back only on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
# densify the sparse matrix so it can be labelled and inspected as a DataFrame
dense = vectors.todense()
denselist = dense.tolist()
dfM = pd.DataFrame(denselist, columns=feature_names)

In [7]:
# every TF-IDF term that is not one of the corpus-wide top words
all_feature_names = set(feature_names)
dropped_columns = list(all_feature_names.difference(target_words))
# preview the matrix restricted to the top words (dfM itself is unchanged)
dfM.drop(columns = dropped_columns)

Unnamed: 0,computers,information,schools,students,tablets,technology,text,textbooks,use,would
0,0.050334,0.066137,0.036451,0.05822,0.308168,0.103377,0.0,0.171204,0.072901,0.018225
1,0.0,0.02185,0.036127,0.019234,0.118777,0.020491,0.0,0.033936,0.018063,0.144506
2,0.141418,0.049551,0.020482,0.116318,0.134683,0.007745,0.074327,0.153924,0.034137,0.102411
3,0.081497,0.095186,0.059018,0.146634,0.258721,0.111586,0.071389,0.166321,0.078691,0.039345
4,0.0,0.009209,0.053293,0.14592,0.114427,0.0,0.110509,0.178792,0.02284,0.015227
5,0.022032,0.096496,0.047865,0.084944,0.329727,0.0181,0.077197,0.299751,0.03191,0.03191
6,0.097102,0.0,0.035159,0.037438,0.214681,0.179485,0.085059,0.132111,0.01758,0.070319
7,0.076605,0.050328,0.041607,0.044303,0.312674,0.0,0.033552,0.260562,0.110951,0.013869
8,0.102062,0.053642,0.044346,0.204623,0.194404,0.083846,0.0,0.124974,0.029564,0.0
9,0.0,0.0,0.039149,0.041686,0.196136,0.029608,0.03157,0.294204,0.052199,0.01305


In [8]:
# map each positional row index to its essay's file name; Path.name returns
# the final path component on every platform, whereas splitting str(path)
# on '/' fails with Windows separators
names = {position: path.name for position, path in enumerate(files)}

In [9]:
# relabel the positional row index with the essay file names, modifying dfM in place
dfM.rename(index=names, inplace = True)

In [10]:
# display the top-word TF-IDF columns again, now labelled by file name;
# drop() returns a new frame here, so dfM keeps every feature column
dfM.drop(columns = dropped_columns)

Unnamed: 0,computers,information,schools,students,tablets,technology,text,textbooks,use,would
essay01.pdf,0.050334,0.066137,0.036451,0.05822,0.308168,0.103377,0.0,0.171204,0.072901,0.018225
essay02.pdf,0.0,0.02185,0.036127,0.019234,0.118777,0.020491,0.0,0.033936,0.018063,0.144506
essay03.pdf,0.141418,0.049551,0.020482,0.116318,0.134683,0.007745,0.074327,0.153924,0.034137,0.102411
essay04.pdf,0.081497,0.095186,0.059018,0.146634,0.258721,0.111586,0.071389,0.166321,0.078691,0.039345
essay05.pdf,0.0,0.009209,0.053293,0.14592,0.114427,0.0,0.110509,0.178792,0.02284,0.015227
essay06.pdf,0.022032,0.096496,0.047865,0.084944,0.329727,0.0181,0.077197,0.299751,0.03191,0.03191
essay07.pdf,0.097102,0.0,0.035159,0.037438,0.214681,0.179485,0.085059,0.132111,0.01758,0.070319
essay08.pdf,0.076605,0.050328,0.041607,0.044303,0.312674,0.0,0.033552,0.260562,0.110951,0.013869
essay09.pdf,0.102062,0.053642,0.044346,0.204623,0.194404,0.083846,0.0,0.124974,0.029564,0.0
essay10.pdf,0.0,0.0,0.039149,0.041686,0.196136,0.029608,0.03157,0.294204,0.052199,0.01305


In [11]:
# empty summary table: one row per target word will be added afterwards
summary_fields = ('word', 'max TF-IDF value', 'file')
dfSummary = pd.DataFrame({field: [] for field in summary_fields})

In [12]:
# For each corpus-wide top word record its highest TF-IDF value and the essay
# (row label of dfM) where that value occurs.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# rows are collected first and added in a single step.
summary_rows = [
    {'word': word, 'max TF-IDF value': dfM[word].max(), 'file': dfM[word].idxmax()}
    for word in target_words
]
new_summary_rows = pd.DataFrame(summary_rows, columns = dfSummary.columns)
# keep the original "append to whatever is already there" semantics, but skip
# the concat when the existing frame is empty
if dfSummary.empty:
    dfSummary = new_summary_rows
else:
    dfSummary = pd.concat([dfSummary, new_summary_rows], ignore_index = True)

***
## 10 Most common words sorted by max TF-IDF value and doc containing said value.

In [13]:
# display the summary ordered from highest to lowest max TF-IDF value;
# sort_values returns a sorted copy, dfSummary itself keeps its row order
dfSummary.sort_values(by = 'max TF-IDF value', ascending = False)

Unnamed: 0,word,max TF-IDF value,file
0,tablets,0.329727,essay06.pdf
1,textbooks,0.299751,essay06.pdf
2,students,0.204623,essay09.pdf
9,technology,0.179485,essay07.pdf
6,information,0.171348,essay13.pdf
4,would,0.144506,essay02.pdf
5,computers,0.141418,essay03.pdf
3,use,0.134139,essay11.pdf
7,text,0.117311,essay14.pdf
8,schools,0.1159,essay13.pdf


***

In [14]:
# Per-essay scores, all in `multi_corpus` order:
#   fk_score     - Flesch-Kincaid Grade Level
#   cl_score     - Coleman Liau Index
#   corpus_score - mean TF-IDF of the corpus-wide top_n_words within the essay
#   doc_score    - mean TF-IDF of the essay's own top_n_words
fk_score = []
cl_score = []
corpus_score = []
doc_score = []

# both are identical for every essay, so build them once outside the loop
corpus_top_tfidf = dfM.drop(columns = dropped_columns)
all_feature_names = set(feature_names)

# iterate every essay text (one per file in directory)
for file_index, essay_text in enumerate(multi_corpus):
    # readability metrics are only computed for essays over 100
    # space-separated words
    if len(essay_text.split(' ')) > 100:
        # analyse the text once and reuse the result for both metrics
        readability = Readability(essay_text)
        fk_score.append(readability.flesch_kincaid().score)
        cl_score.append(readability.coleman_liau().score)
    else:
        # future use np.NaN? (0 keeps the later integer casts working)
        fk_score.append(0)
        cl_score.append(0)

    # tokenize the essay, lowercase, drop stopwords and non-alphabetic tokens
    target_tokens = [word.lower() for word in nltk.word_tokenize(essay_text)
                     if word.lower() not in stopWords and word.isalpha()]

    # corpus score: this essay's row of the corpus top-word columns
    corpus_score.append(corpus_top_tfidf.iloc[file_index].mean())

    # doc score: same idea, but using the essay's own most frequent words
    fdDoc = nltk.FreqDist(target_tokens)
    target_doc_words = sorted(fdDoc, key = fdDoc.get, reverse = True)[:top_n_words]
    non_doc_columns = list(all_feature_names.difference(target_doc_words))
    doc_score.append(dfM.drop(columns = non_doc_columns).iloc[file_index].mean())

In [15]:
# assemble one row per essay from the four parallel score lists
score_columns = ['Coleman Liau', 'Flesch-Kincaid', 'Corpus TF-IDF', 'Doc TF-IDF']
score_rows = list(zip(cl_score, fk_score, corpus_score, doc_score))
dfScore = pd.DataFrame(score_rows, columns = score_columns)
# label the rows with the essay file names
dfScore = dfScore.rename(index=names)

***
## Display DataFrame of all essays with
* `Coleman Liau` Index Score
* `Flesch-Kincaid` Grade Level Score
* Average TF-IDF score of 10 most frequent words in corpus for file (`Corpus TF-IDF`)
* Average TF-IDF score of 10 most frequent words in file (`Doc TF-IDF`)

In [16]:
# display the readability and TF-IDF scores for every essay
dfScore

Unnamed: 0,Coleman Liau,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF
essay01.pdf,11.044751,8.790555,0.088502,0.120488
essay02.pdf,8.438284,7.798837,0.041298,0.090532
essay03.pdf,9.887097,9.637066,0.0835,0.111753
essay04.pdf,11.933522,10.14739,0.110839,0.119473
essay05.pdf,9.686873,7.522859,0.065022,0.115118
essay06.pdf,9.866667,7.635331,0.103993,0.13225
essay07.pdf,11.821176,11.469634,0.086893,0.11333
essay08.pdf,9.757314,8.535339,0.094445,0.156098
essay09.pdf,9.899103,9.362091,0.083746,0.135828
essay10.pdf,11.01887,8.371108,0.06976,0.131819


***

## Readability Score Conversion
* High School grade level score is 9 - 12 (0.75 to 1)
  * Scores below or above are possible
* `F-K%` and `CL%` are the `Flesch-Kincaid` and `Coleman Liau` scores divided by 12 to create a proportion
* `TF-IDF Mean` is the average of the two vocab metrics above

In [17]:
# grade 12 is treated as the top of the scale, so grade level / 12 is a proportion
top_grade_level = 12
for pct_column, level_column in (('F-K%', 'Flesch-Kincaid'), ('CL%', 'Coleman Liau')):
    dfScore[pct_column] = dfScore[level_column].div(top_grade_level)
# row-wise average of the two vocabulary metrics
vocab_columns = ['Corpus TF-IDF', 'Doc TF-IDF']
dfScore['TF-IDF Mean'] = dfScore[vocab_columns].mean(axis = 'columns')

* F-K Grade and CL Grade are of 100 point scale
  * F-K% or CL% is summed with the TF-IDF Mean then the resultant sum is multiplied by 100
  * As a conservative estimation the floor function is applied to the product
  * The datatype is then cast to int64 to match `actual_grade` list entries from directory processing

In [18]:
# grade = floor(100 * (readability proportion + vocabulary mean)), stored as int64
for pct_column, grade_column in (('F-K%', 'F-K Grade'), ('CL%', 'CL Grade')):
    combined = dfScore[pct_column] + dfScore['TF-IDF Mean']
    dfScore[grade_column] = np.floor(100 * combined).astype(np.int64)
# teacher-assigned grades collected while reading the PDFs
dfScore['Actual Grade'] = actual_grade

***
## DataFrame observations
* `Flesch-Kincaid` has possible cascading errors due to structure of paper
  * grade level implies students are ***extremely*** advanced if no errors
    * `mean` = 18 th grade level
    * `max` = 151 st grade level
  * `F-K%` and `F-K Grade` have similarly high `mean` and `max` values as a result
* `Coleman Liau` appears numerically plausible
  * grade level implies students tend to be contained within the grade 9-12 expected result of a High School paper
    * `mean` = 10 th grade level
    * `max` = 12 th grade level
  * `CL%` and `CL Grade` have similarly plausible grades
  * `CL Grade` has a potential issue, with `mean` and `max` possibly indicating overly high scores
    * `mean` = 92% average for all students
      * could indicate grading is too generous, the class is advanced, or the assignment is easy
    * `max` = 110%
      * could indicate an extremely advanced student, an easy assignment, generous grading, or that weighting is needed in the grade calculation

In [19]:
# summary statistics (count, mean, std, quartiles, min, max) for every numeric column
dfScore.describe()

Unnamed: 0,Coleman Liau,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF,F-K%,CL%,TF-IDF Mean,F-K Grade,CL Grade,Actual Grade
count,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0
mean,9.948876,18.478622,0.077786,0.127574,1.539885,0.829073,0.10268,163.733333,92.6,83.733333
std,1.347383,36.741405,0.02221,0.020344,3.061784,0.112282,0.017012,305.785047,12.240448,9.866731
min,7.456336,5.975351,0.033715,0.090532,0.497946,0.621361,0.065915,60.0,69.0,66.0
25%,9.062579,8.084972,0.062958,0.114224,0.673748,0.755215,0.093848,77.0,84.5,76.0
50%,9.887097,9.003249,0.083746,0.131819,0.750271,0.823925,0.104495,83.0,93.0,83.0
75%,11.031811,9.892228,0.092583,0.138466,0.824352,0.919318,0.114394,94.0,101.5,91.0
max,11.933522,151.171855,0.110839,0.15882,12.597655,0.99446,0.126623,1268.0,110.0,100.0


## Possible Solution
* Investigate results using grading on `Coleman Liau`, `Flesch-Kincaid`, and `min` of both for grading compared to actual grades

In [20]:
# for each derived column take the row-wise minimum of its F-K / CL pair
min_sources = {
    'Min Score': ['Flesch-Kincaid', 'Coleman Liau'],
    'Min%': ['F-K%', 'CL%'],
    'Min Grade': ['F-K Grade', 'CL Grade'],
}
for min_column, source_columns in min_sources.items():
    dfScore[min_column] = dfScore[source_columns].min(axis='columns')

* Grade Level Scores using `min` of `Flesch-Kincaid` and `Coleman Liau` produces `mean` $\approx$9, `min` $\approx$6, and `max` $\approx$ 11.
  * Students ***average*** grade level of 9 with student(s) below and above grade level (but not above high school level) which feels plausible.

In [21]:
# compare the distribution of both grade-level metrics with their row-wise minimum
dfScore[['Flesch-Kincaid', 'Coleman Liau', 'Min Score']].describe()

Unnamed: 0,Flesch-Kincaid,Coleman Liau,Min Score
count,15.0,15.0,15.0
mean,18.478622,9.948876,8.780541
std,36.741405,1.347383,1.465265
min,5.975351,7.456336,5.975351
25%,8.084972,9.062579,7.717084
50%,9.003249,9.887097,8.535339
75%,9.892228,11.031811,9.591842
max,151.171855,11.933522,11.469634


* Algorithm Grade using `min` of `Flesch-Kincaid` and `Coleman Liau` produces `mean` $\approx$ 83, `min` $\approx$ 60, and `max` = 105.
 * The `mean` falls in the B letter grade range (passing) and a class average that would not read as concerning in a quick scan.
 * The `min` of 60 falls in the lower bound of the D letter grade range which is passing but, would certainly warrant a follow-up meeting with the student in question or a human review of the paper if processed by machine first.
 * The `max` of 105 is above the upper bound of the A letter grade range which is passing and should indicate a human review to be done.  Depending on the student in question the teacher may have the human knowledge that this is a student with advanced writing skills that could ***break*** the computer scale to produce larger grades than normally expected.

In [22]:
# compare the distribution of the three 100-point algorithm grades
dfScore[['F-K Grade', 'CL Grade', 'Min Grade']].describe()

Unnamed: 0,F-K Grade,CL Grade,Min Grade
count,15.0,15.0,15.0
mean,163.733333,92.6,82.866667
std,305.785047,12.240448,12.888903
min,60.0,69.0,60.0
25%,77.0,84.5,73.0
50%,83.0,93.0,83.0
75%,94.0,101.5,91.0
max,1268.0,110.0,105.0


***
## Percent Error
$ 100\cdot\frac{|\text{accepted value}-\text{experimental value}|}{\text{accepted value}}  $

In [23]:
# percent error of each algorithm grade against the teacher's grade:
# 100 * |algorithm - actual| / actual
actual = dfScore['Actual Grade']
error_sources = (
    ('MG Error%', 'Min Grade'),
    ('F-K Error%', 'F-K Grade'),
    ('CL Error%', 'CL Grade'),
)
for error_column, grade_column in error_sources:
    dfScore[error_column] = (dfScore[grade_column] - actual).abs().mul(100).div(actual)

In [24]:
# show each algorithm grade next to its percent error, with the actual grade first
dfScore[['Actual Grade', 'Min Grade', 'MG Error%', 'F-K Grade', 'F-K Error%', 'CL Grade', 'CL Error%']]

Unnamed: 0,Actual Grade,Min Grade,MG Error%,F-K Grade,F-K Error%,CL Grade,CL Error%
essay01.pdf,75,83,10.666667,83,10.666667,102,36.0
essay02.pdf,70,71,1.428571,71,1.428571,76,8.571429
essay03.pdf,100,90,10.0,90,10.0,92,8.0
essay04.pdf,83,96,15.662651,96,15.662651,110,32.53012
essay05.pdf,93,71,23.655914,71,23.655914,89,4.301075
essay06.pdf,83,75,9.638554,75,9.638554,94,13.253012
essay07.pdf,95,105,10.526316,105,10.526316,108,13.684211
essay08.pdf,83,83,0.0,83,0.0,93,12.048193
essay09.pdf,95,88,7.368421,88,7.368421,93,2.105263
essay10.pdf,66,79,19.69697,79,19.69697,101,53.030303


* `MG Error%`
    * `mean` $\approx$ 13% is lower than the `CL Error%` of 17% and the `F-K Error%` of 119%
        * `F-K Error%` is result of potential scoring issues discussed above.
    * `min` = 0%
        * Other metrics have reasonable (0% and 2%) values, use of `min` for each metric provides best value.
    * `max` $\approx$ 28%
        * Although the value is certainly above a desired result it is far better than the `Flesch-Kincaid` value and nearly half that of the `Coleman Liau` value.

In [25]:
# summary statistics of the three percent-error columns
dfScore[['MG Error%', 'F-K Error%', 'CL Error%']].describe()

Unnamed: 0,MG Error%,F-K Error%,CL Error%
count,15.0,15.0,15.0
mean,13.202512,118.75557,17.367276
std,8.3149,407.280203,14.792705
min,0.0,0.0,2.105263
25%,8.503488,8.751861,6.150538
50%,10.666667,10.666667,13.253012
75%,19.588745,21.676442,26.820371
max,27.710843,1590.666667,53.030303


***
## Performance
* Assign letter grade to numeric grade based on U.S. Department of Education, Institute of Education Sciences, National Center for Education Statistics, The 2009 High School Transcript Study.
    * https://nces.ed.gov/nationsreportcard/hsts/howgpa.aspx
* View algorithm scores within $\pm$15, $\pm$10, and $\pm$5 of actual score.

In [26]:
# minimum numeric grade required for each letter
grades = {
    90: 'A',
    80: 'B',
    70: 'C',
    60: 'D',
    0: 'F',
}


def letter_grade(value):
    """Return the letter for the highest cutoff in ``grades`` that ``value`` reaches.

    Cutoffs are checked from highest to lowest regardless of the order in
    which they were added to ``grades``, so the result does not depend on
    dict insertion order.

    Returns ``None`` when ``value`` is below every cutoff (e.g. negative).
    """
    for cutoff in sorted(grades, reverse=True):
        if value >= cutoff:
            return grades[cutoff]
    return None

In [27]:
# convert both the teacher's grade and the algorithm's grade to letters
letter_sources = (('Actual Letter', 'Actual Grade'), ('Min Letter', 'Min Grade'))
for letter_column, grade_column in letter_sources:
    dfScore[letter_column] = dfScore[grade_column].map(letter_grade)

In [28]:
# side-by-side view of actual vs. algorithm ('Min') letter and numeric grades
dfScore[['Actual Letter', 'Actual Grade','Min Letter', 'Min Grade']]

Unnamed: 0,Actual Letter,Actual Grade,Min Letter,Min Grade
essay01.pdf,C,75,B,83
essay02.pdf,C,70,C,71
essay03.pdf,A,100,A,90
essay04.pdf,B,83,A,96
essay05.pdf,A,93,C,71
essay06.pdf,B,83,C,75
essay07.pdf,A,95,A,105
essay08.pdf,B,83,B,83
essay09.pdf,A,95,B,88
essay10.pdf,D,66,C,79


In [29]:
# signed gap between the algorithm grade and the teacher's grade
dfScore['Min - Actual'] = dfScore['Min Grade'].sub(dfScore['Actual Grade'])
# flag essays whose absolute gap falls inside each tolerance band
point_gap = dfScore['Min - Actual'].abs()
for flag_column, tolerance in (('In 15', 15), ('In 10', 10), ('In 5', 5)):
    dfScore[flag_column] = point_gap.le(tolerance)

In [30]:
# grades, signed gap and the three tolerance flags for every essay
dfScore[['Actual Letter', 'Actual Grade','Min Letter', 'Min Grade', 'Min - Actual', 'In 15', 'In 10', 'In 5']]

Unnamed: 0,Actual Letter,Actual Grade,Min Letter,Min Grade,Min - Actual,In 15,In 10,In 5
essay01.pdf,C,75,B,83,8,True,True,False
essay02.pdf,C,70,C,71,1,True,True,True
essay03.pdf,A,100,A,90,-10,True,True,False
essay04.pdf,B,83,A,96,13,True,False,False
essay05.pdf,A,93,C,71,-22,False,False,False
essay06.pdf,B,83,C,75,-8,True,True,False
essay07.pdf,A,95,A,105,10,True,True,False
essay08.pdf,B,83,B,83,0,True,True,True
essay09.pdf,A,95,B,88,-7,True,True,False
essay10.pdf,D,66,C,79,13,True,False,False


## Results & Observations
* 12 scores $\pm$15 points of actual
* 8 scores $\pm$10 points of actual
* 3 scores $\pm$5 points of actual
* 1 score $\pm$0 points of actual

In [31]:
# columns shown in every comparison table below
sub_columns = [
    'Actual Letter',
    'Min Letter',
    'Actual Grade',
    'Min Grade',
    'Min - Actual',
    'Min Score',
    'TF-IDF Mean',
]

In [32]:
# essays where the algorithm reproduced the teacher's numeric grade exactly
exact_grade = dfScore['Actual Grade'].eq(dfScore['Min Grade'])
dfScore.loc[exact_grade, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay08.pdf,B,B,83,83,0,8.535339,0.125272


In [33]:
# number of essays inside each tolerance band (the 'In N' columns are boolean)
tuple(len(dfScore[dfScore[flag_column]]) for flag_column in ('In 15', 'In 10', 'In 5'))

(12, 8, 3)

* 5 papers ($\frac{1}{3}$ of corpus) achieved same letter grade as actual

In [34]:
# essays where the algorithm's letter grade equals the teacher's letter grade
same_letter = dfScore['Actual Letter'].eq(dfScore['Min Letter'])
dfScore.loc[same_letter, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay02.pdf,C,C,70,71,1,7.798837,0.065915
essay03.pdf,A,A,100,90,-10,9.637066,0.097626
essay07.pdf,A,A,95,105,10,11.469634,0.100112
essay08.pdf,B,B,83,83,0,8.535339,0.125272
essay12.pdf,C,C,75,78,3,8.353183,0.08706


* 3 papers (essay05.pdf, essay11.pdf, essay14.pdf) that are not $\pm$15 of actual have been scored significantly lower, in each case $\approx$20 points lower than actual grade.
    * Potential reason could be this reflects improvement in student work over course of time rather than only work existing in the vacuum of this assignment.
    * Looking at algorithm values all three were lower than grade level with low vocabulary mean.
      * Could support previous idea but, certainly, also indicate weighting should be investigated for components `Min Score` and `TF-IDF Mean`
          * Likely also for the `Corpus TF-IDF` and `Doc TF-IDF`.

In [35]:
# file holding the maximum of each column: grade level, actual grade, algorithm grade
tuple(dfScore[column].idxmax() for column in ('Min Score', 'Actual Grade', 'Min Grade'))

('essay07.pdf', 'essay03.pdf', 'essay07.pdf')

In [36]:
# essays whose algorithm grade is more than 15 points from the actual grade
# ('In 15' is a boolean column, so it can be negated directly)
outside_15 = ~dfScore['In 15']
dfScore.loc[outside_15, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay05.pdf,A,C,93,71,-22,7.522859,0.09007
essay11.pdf,B,D,89,69,-20,7.456336,0.078395
essay14.pdf,B,D,83,60,-23,5.975351,0.107148


* Lower rated than actual showed no letter change in 1 and full letter drop in 2.
* Higher rated than actual showed no letter change in 3 and full letter raise in 5.
* Use of $\pm$ in 100 point to Letter conversion could lessen this impact.
    * Where grade is defined as -, neutral, + one could then investigate $\frac{1}{3}$, $\frac{2}{3}$, and whole grade moves.

In [37]:
# row labels of essays whose letter grades agree
same_letter = dfScore['Actual Letter'] == dfScore['Min Letter']
letter_match_index = dfScore.index[same_letter]
# row labels of essays within 15 points of the actual grade
in_15_index = dfScore.index[dfScore['In 15']]
# within 15 points, yet assigned a different letter
letter_differs = ~dfScore.index.isin(letter_match_index)
within_15 = dfScore.index.isin(in_15_index)
dfScore.loc[letter_differs & within_15, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay01.pdf,C,B,75,83,8,8.790555,0.104495
essay04.pdf,B,A,83,96,13,10.14739,0.115156
essay06.pdf,B,C,83,75,-8,7.635331,0.118122
essay09.pdf,A,B,95,88,-7,9.362091,0.109787
essay10.pdf,D,C,66,79,13,8.371108,0.10079
essay13.pdf,B,A,89,103,14,11.106414,0.113632
essay15.pdf,C,A,77,92,15,9.546617,0.126623


* 7 papers $\pm$15 points of actual score without the `Actual Letter` and `Min Letter` being the same.

In [38]:
# within 15 points, different letter, and the algorithm graded lower than the teacher
letter_differs = ~dfScore.index.isin(letter_match_index)
within_15 = dfScore.index.isin(in_15_index)
marked_lower = dfScore['Actual Grade'] > dfScore['Min Grade']
dfScore.loc[letter_differs & within_15 & marked_lower, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay06.pdf,B,C,83,75,-8,7.635331,0.118122
essay09.pdf,A,B,95,88,-7,9.362091,0.109787


* 2 papers marked lower than actual

In [39]:
# within 15 points, different letter, and the algorithm graded higher than the teacher
letter_differs = ~dfScore.index.isin(letter_match_index)
within_15 = dfScore.index.isin(in_15_index)
marked_higher = dfScore['Actual Grade'] < dfScore['Min Grade']
dfScore.loc[letter_differs & within_15 & marked_higher, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay01.pdf,C,B,75,83,8,8.790555,0.104495
essay04.pdf,B,A,83,96,13,10.14739,0.115156
essay10.pdf,D,C,66,79,13,8.371108,0.10079
essay13.pdf,B,A,89,103,14,11.106414,0.113632
essay15.pdf,C,A,77,92,15,9.546617,0.126623


* 5 papers marked higher than actual

# scratch work

In [40]:
# within 15 points and numerically identical to the actual grade
within_15 = dfScore['In 15']
exact_grade = dfScore['Actual Grade'].eq(dfScore['Min Grade'])
dfScore.loc[within_15 & exact_grade, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay08.pdf,B,B,83,83,0,8.535339,0.125272


In [41]:
# within 15 points, graded lower than actual, letter unchanged
within_15 = dfScore['In 15']
marked_lower = dfScore['Actual Grade'].gt(dfScore['Min Grade'])
same_letter = dfScore['Actual Letter'].eq(dfScore['Min Letter'])
dfScore.loc[within_15 & marked_lower & same_letter, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay03.pdf,A,A,100,90,-10,9.637066,0.097626


In [42]:
# within 15 points, graded lower than actual, letter changed
within_15 = dfScore['In 15']
marked_lower = dfScore['Actual Grade'].gt(dfScore['Min Grade'])
letter_changed = dfScore['Actual Letter'].ne(dfScore['Min Letter'])
dfScore.loc[within_15 & marked_lower & letter_changed, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay06.pdf,B,C,83,75,-8,7.635331,0.118122
essay09.pdf,A,B,95,88,-7,9.362091,0.109787


In [43]:
# within 15 points, graded higher than actual, letter unchanged
within_15 = dfScore['In 15']
marked_higher = dfScore['Actual Grade'].lt(dfScore['Min Grade'])
same_letter = dfScore['Actual Letter'].eq(dfScore['Min Letter'])
dfScore.loc[within_15 & marked_higher & same_letter, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay02.pdf,C,C,70,71,1,7.798837,0.065915
essay07.pdf,A,A,95,105,10,11.469634,0.100112
essay12.pdf,C,C,75,78,3,8.353183,0.08706


In [44]:
# within 15 points, graded at or above actual, letter changed
within_15 = dfScore['In 15']
not_marked_lower = dfScore['Actual Grade'].le(dfScore['Min Grade'])
letter_changed = dfScore['Actual Letter'].ne(dfScore['Min Letter'])
dfScore.loc[within_15 & not_marked_lower & letter_changed, sub_columns]

Unnamed: 0,Actual Letter,Min Letter,Actual Grade,Min Grade,Min - Actual,Min Score,TF-IDF Mean
essay01.pdf,C,B,75,83,8,8.790555,0.104495
essay04.pdf,B,A,83,96,13,10.14739,0.115156
essay10.pdf,D,C,66,79,13,8.371108,0.10079
essay13.pdf,B,A,89,103,14,11.106414,0.113632
essay15.pdf,C,A,77,92,15,9.546617,0.126623
