In [1]:
import pdftotext
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
from pathlib import Path
from readability import Readability

***
***
## Process a directory of PDF

In [2]:
# adjust directory to point to location of files
directory = 'hf_pdf/'

# create file list of pdf in directory
pdf_folder = Path(directory).rglob('*.pdf')

# create list of files and verify contents
files = [file for file in pdf_folder]
#files

## Iterate through each file and
* Tokenize file text
* Create consistent case `.lower()` for each token
* Remove tokens from `nltk` library `english` stopwords
* Remove non-`.isalpha()` tokens

In [4]:
tokens = []
multi_corpus = []
stopWords = set(stopwords.words('english'))

# iterate every file in directory
for file in files:
    # open file
    with open(file, 'rb') as f:
        # conversion with pdftotext
        multi_pdf = pdftotext.PDF(f)
        multi_corpus.append(''.join(multi_pdf))
        # place current pdf text into list of tokens
        tokens += nltk.word_tokenize(''.join(multi_pdf))
        #corpus.append(tokens)

# update tokens by setting all to lowercase,
# removing stopwords,
# removing non-alphanumeric
tokens_removed = [word.lower() for word in tokens
                  if word.lower() not in stopWords
                  and word.isalpha()]

## based on `top_n_words` to search for of `tokens_removed` (no stopwords) create a frequency distribution `fd` and place that number of words in list `target_words`

In [5]:
top_n_words = 10
fd = nltk.FreqDist(tokens_removed)
target_words = sorted(fd, key = fd.get, reverse = True)[:top_n_words]

***
# Clustering
***
## TF-IDF
* take unique tokens from each pdf being fed as input
* store each token as a string in the corpus

In [6]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(multi_corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
dfM = pd.DataFrame(denselist, columns=feature_names)

### set properties will allow us to remove every column that is not a targeted word from the early NLTK selection


In [7]:
dropped_columns = list(set(feature_names).difference(target_words))
dfM.drop(columns = dropped_columns)

Unnamed: 0,also,computer,computers,family,friends,get,go,people,time,use
0,0.071953,0.056562,0.224392,0.040466,0.000000,0.000000,0.000000,0.278647,0.070110,0.081230
1,0.038443,0.045331,0.194820,0.064861,0.020262,0.038645,0.019941,0.074438,0.037459,0.043400
2,0.014957,0.058789,0.023322,0.000000,0.031532,0.000000,0.000000,0.046338,0.000000,0.016885
3,0.056539,0.207410,0.073467,0.042396,0.000000,0.000000,0.039104,0.102178,0.018363,0.063828
4,0.000000,0.082640,0.163922,0.059123,0.000000,0.123289,0.018177,0.094993,0.034144,0.059340
...,...,...,...,...,...,...,...,...,...,...
602,0.000000,0.207015,0.000000,0.069696,0.065315,0.000000,0.016071,0.191968,0.301880,0.000000
603,0.000000,0.045736,0.030240,0.109069,0.081771,0.058485,0.040240,0.015021,0.018897,0.043788
604,0.034119,0.107285,0.106404,0.000000,0.000000,0.000000,0.035397,0.026426,0.033245,0.000000
605,0.014097,0.044326,0.043962,0.079280,0.014859,0.028341,0.000000,0.032755,0.068679,0.000000


### create dictionary of k:v pair index : pdf filename to rename pandas rows for readability

In [9]:
str(files[10])[7:-4]

'essay_181'

In [10]:
names = {}
for x in range(len(files)):
    names[x] = str(files[x]).split('/')[-1]

0 essay_208
1 essay_546
2 essay_220
3 essay_234
4 essay_552
5 essay_585
6 essay_591
7 essay_156
8 essay_142
9 essay_195
10 essay_181
11 essay_368
12 essay_432
13 essay_354
14 essay_340
15 essay_17
16 essay_426
17 essay_397
18 essay_383
19 essay_382
20 essay_396
21 essay_16
22 essay_341
23 essay_427
24 essay_433
25 essay_355
26 essay_369
27 essay_180
28 essay_194
29 essay_143
30 essay_157
31 essay_590
32 essay_584
33 essay_235
34 essay_553
35 essay_547
36 essay_221
37 essay_209
38 essay_579
39 essay_551
40 essay_237
41 essay_223
42 essay_545
43 essay_592
44 essay_586
45 essay_169
46 essay_141
47 essay_155
48 essay_182
49 essay_196
50 essay_419
51 essay_28
52 essay_425
53 essay_343
54 essay_14
55 essay_357
56 essay_431
57 essay_380
58 essay_394
59 essay_395
60 essay_381
61 essay_356
62 essay_430
63 essay_424
64 essay_15
65 essay_342
66 essay_418
67 essay_29
68 essay_197
69 essay_183
70 essay_154
71 essay_140
72 essay_168
73 essay_587
74 essay_593
75 essay_222
76 essay_544
77 essay_550
78

In [11]:
dfM.rename(index=names, inplace = True)

In [12]:
dfM.drop(columns = dropped_columns)

Unnamed: 0,also,computer,computers,family,friends,get,go,people,time,use
essay_208,0.071953,0.056562,0.224392,0.040466,0.000000,0.000000,0.000000,0.278647,0.070110,0.081230
essay_546,0.038443,0.045331,0.194820,0.064861,0.020262,0.038645,0.019941,0.074438,0.037459,0.043400
essay_220,0.014957,0.058789,0.023322,0.000000,0.031532,0.000000,0.000000,0.046338,0.000000,0.016885
essay_234,0.056539,0.207410,0.073467,0.042396,0.000000,0.000000,0.039104,0.102178,0.018363,0.063828
essay_552,0.000000,0.082640,0.163922,0.059123,0.000000,0.123289,0.018177,0.094993,0.034144,0.059340
...,...,...,...,...,...,...,...,...,...,...
essay_239,0.000000,0.207015,0.000000,0.069696,0.065315,0.000000,0.016071,0.191968,0.301880,0.000000
essay_577,0.000000,0.045736,0.030240,0.109069,0.081771,0.058485,0.040240,0.015021,0.018897,0.043788
essay_211,0.034119,0.107285,0.106404,0.000000,0.000000,0.000000,0.035397,0.026426,0.033245,0.000000
essay_205,0.014097,0.044326,0.043962,0.079280,0.014859,0.028341,0.000000,0.032755,0.068679,0.000000


In [13]:
dfSummary = pd.DataFrame({'word': [], 'max TF-IDF value' : [], 'file' : []})

In [14]:
for word in target_words:
    dfSummary = dfSummary.append({'word': word, 'max TF-IDF value' : dfM[word].max(), 'file' : dfM[word].idxmax()}, ignore_index = True)

In [15]:
dfSummary.sort_values(by = 'max TF-IDF value', ascending = False)

Unnamed: 0,word,max TF-IDF value,file
0,computers,0.357421,essay_41
1,people,0.328015,essay_192
3,time,0.30188,essay_239
2,computer,0.281669,essay_353
9,use,0.262015,essay_58
6,also,0.19658,essay_298
4,get,0.190251,essay_423
7,go,0.186935,essay_182
5,friends,0.180627,essay_248
8,family,0.177341,essay_182


## KMeans
* Predict the cluster for `search_text`
* access `dfM` dataframe utilizing prediction to determine pdf file where target text is likely to be

In [16]:
num = len(multi_pdf) # set clusters to number of documents being scrubbed?
kmeans = KMeans(n_clusters = num, init = 'k-means++', max_iter = 500, n_init = 1)
kmeans.fit(vectors)
centroids = kmeans.cluster_centers_
print(centroids) #This will print cluster centroids as tf-idf vectors

[[1.15244307e-04 7.13756349e-05 1.94047282e-04 ... 1.13972938e-04
  1.20163930e-04 2.14075734e-04]]


In [17]:
kmeans.predict(vectors)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [18]:
search_text = "how to run on the water"
predicted = kmeans.predict(vectorizer.transform([search_text]))
dfM.loc[ names[ predicted[0] ] ].name

'essay_208'

## Scoring
* Find Flesch-Kincaid Grade Level.
* Find TF-IDF average of scored document from `top_n_words` `nltk` Frequency Distribution of corpus.
* Find TF-IDF average of scored document from `top_n_words` `nltk` Frequency Distribution of scored document.

In [22]:
# all files from original scan
files

[PosixPath('test_pdf/MedvedevaEtAl2019.pdf'),
 PosixPath('test_pdf/KDD97-003.pdf'),
 PosixPath('test_pdf/P99-1001.pdf'),
 PosixPath('test_pdf/10.1007978-3-319-67056-018.pdf'),
 PosixPath('test_pdf/dummy_test.pdf')]

In [66]:
files[0]

PosixPath('hf_pdf/essay_208.pdf')

In [61]:
# string conversion and indexing produces path
str(files[125])

'hf_pdf/essay_231.pdf'

In [102]:
# set path to string conversion of desired file
#path = str(files[0])
path = 'hf_pdf/essay_125.pdf'
with open(path, "rb") as f:
    pdf = pdftotext.PDF(f)

In [103]:
# initialize text string, iterate through pdf, append text
text = ''
for page in pdf:
    text+=page

In [111]:
test = text.replace('\n', ' ')
test

'Computers donâ€™t have any affect on kids we just love going on cause we use it for help and this persuade the readers of the local newspaper cause we need to be able to communicate also do writing essays and doing social studies or science homework my ideas are let us go computers cause were not bothering u can just leave us alone and let us do what you need to do cause what computers are what give us information for we have to do and were to do wat we gotta do and u people can just leave us alone cause arent addicting to me or anyone and if we were it still would it matter cause a computers a computer u dont punish it because just punish us from the computer punish us because of it cause its the computer fault it can be addicting cause the computer is device that gives us wat we need and the information we also the computer does favors for us the computer is a amazing thing \x0c'

In [106]:
text

'Computers donâ€™t have any affect on kids we just love going on cause we use it for help and this persuade the\nreaders of the local newspaper cause we need to be able to communicate also do writing essays and doing social\nstudies or science homework my ideas are let us go computers cause were not bothering u can just leave us alone\nand let us do what you need to do cause what computers are what give us information for we have to do and were to\ndo wat we gotta do and u people can just leave us alone cause arent addicting to me or anyone and if we were it\nstill would it matter cause a computers a computer u dont punish it because just punish us from the computer\npunish us because of it cause its the computer fault it can be addicting cause the computer is device that gives\nus wat we need and the information we also the computer does favors for us the computer is a amazing thing\n\x0c'

In [104]:
len(text.split(' '))

168

In [130]:
# score text via Flesch-Kincaid Grade Level metric
r = Readability(text)
fk = r.flesch_kincaid()
fk.score, fk.grade_level

(68.29157303370786, '68')

In [151]:
r = Readability(text)
metric = r.coleman_liau()
metric.score, metric.grade_level

(7.619775280898875, '8')

# Text with very awkward grammar or spelling causes artifically high scores?

In [75]:
# mean score of top_n_words in corpus for doc in files index 0
corpus_score = dfM.drop(columns = dropped_columns).iloc[0].mean()
corpus_score

0.08233593011175473

In [76]:
# tokenize file from index 0 document and remove stopwords
target_tokens = nltk.word_tokenize(''.join(multi_corpus[0]))
target_tokens_removed = [word.lower() for word in target_tokens
                         if word.lower() not in stopWords and word.isalpha()]

In [77]:
# utilize previously defined top_n_words to find FreqDist for current document with tokens removed
fdDoc = nltk.FreqDist(target_tokens_removed)
target_doc_words = sorted(fdDoc, key = fdDoc.get, reverse = True)[:top_n_words]

In [78]:
target_doc_words

['people',
 'computers',
 'think',
 'effect',
 'negative',
 'positive',
 'helps',
 'one',
 'countries',
 'also']

In [79]:
dfM.drop(columns = list(set(feature_names).difference(target_doc_words)))

Unnamed: 0,also,computers,countries,effect,helps,negative,one,people,positive,think
essay_208,0.071953,0.224392,0.185879,0.235273,0.218664,0.249094,0.123199,0.278647,0.200931,0.160684
essay_546,0.038443,0.194820,0.000000,0.000000,0.000000,0.000000,0.021941,0.074438,0.000000,0.021463
essay_220,0.014957,0.023322,0.000000,0.000000,0.000000,0.000000,0.000000,0.046338,0.000000,0.000000
essay_234,0.056539,0.073467,0.097372,0.000000,0.000000,0.043496,0.043025,0.102178,0.035086,0.042087
essay_552,0.000000,0.163922,0.000000,0.028645,0.000000,0.000000,0.000000,0.094993,0.000000,0.039128
...,...,...,...,...,...,...,...,...,...,...
essay_239,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017682,0.191968,0.000000,0.034594
essay_577,0.000000,0.030240,0.000000,0.000000,0.000000,0.000000,0.066412,0.015021,0.000000,0.021655
essay_211,0.034119,0.106404,0.000000,0.000000,0.000000,0.000000,0.000000,0.026426,0.000000,0.000000
essay_205,0.014097,0.043962,0.000000,0.000000,0.000000,0.097604,0.048274,0.032755,0.000000,0.047221


In [80]:
# mean score of top_n_words in doc in files index 0 to corpus
doc_score = dfM.drop(columns = list(set(feature_names).difference(target_doc_words))).iloc[0].mean()
doc_score

0.19487154650889085

In [81]:
fk.score, corpus_score, doc_score

(68.29157303370786, 0.08233593011175473, 0.19487154650889085)

In [32]:
100/12

8.333333333333334

In [82]:
(fk.score/12) + ((corpus_score + doc_score)/2)

5.829568157785978

## Batch processing

In [38]:
len(multi_corpus[0].split(' '))

142

In [169]:
# initialize list of Flesch-Kincaid Grade Level scores
fk_score = []
# initialize list of Coleman Liau Index scores
cl_score = []
# initialize list of TF-IDF scores for file among corpus top_n_words
corpus_score = []
# initialize list of TF-IDF scores for file top_n_words among corpus
doc_score = []

# iterate every file in directory
for file_index in range(len(files)):
    if len(multi_corpus[file_index].split(' ')) > 100:
        # append Flesch-Kincaid Grade Level Score
        fk_score.append( Readability(multi_corpus[file_index]).flesch_kincaid().score )
        # append Coleman Liau Index Score
        cl_score.append( Readability(multi_corpus[file_index]).coleman_liau().score )
    else:
        # future use np.NaN?
        fk_score.append(0)
        cl_score.append(0)
    # tokenize file from file_index document and remove stopWords
    target_tokens = [word.lower() for word in nltk.word_tokenize(''.join(multi_corpus[file_index])) if word.lower() not in stopWords and word.isalpha()]
    # append corpus score
    corpus_score.append(dfM.drop(columns = dropped_columns).iloc[file_index].mean())
    # append doc score
    fdDoc = nltk.FreqDist(target_tokens)
    target_doc_words = sorted(fdDoc, key = fdDoc.get, reverse = True)[:top_n_words]
    doc_score.append(dfM.drop(columns = list(set(feature_names).difference(target_doc_words))).iloc[file_index].mean())

In [170]:
#dfScore = pd.DataFrame(list(zip(fk_score, corpus_score, doc_score)), columns = ['Flesch-Kincaid', 'Corpus TF-IDF', 'Doc TF-IDF'])
#dfScore = pd.DataFrame(list(zip(fk_score, corpus_score, doc_score)), columns = ['Coleman Liau', 'Corpus TF-IDF', 'Doc TF-IDF'])
dfScore = pd.DataFrame(list(zip(cl_score, fk_score, corpus_score, doc_score)), columns = ['Coleman Liau', 'Flesch-Kincaid', 'Corpus TF-IDF', 'Doc TF-IDF'])
dfScore.rename(index=names, inplace = True)

In [171]:
dfScore

Unnamed: 0,Coleman Liau,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF
essay_208,11.250331,8.902967,0.082336,0.194872
essay_546,8.750643,7.404642,0.057760,0.114933
essay_220,8.578731,6.558218,0.019182,0.120310
essay_234,7.617592,5.971872,0.060328,0.118217
essay_552,8.816000,9.201566,0.063563,0.154025
...,...,...,...,...
essay_239,9.401918,9.577723,0.085195,0.189052
essay_577,7.608852,6.309697,0.044325,0.119465
essay_211,5.912421,6.449825,0.034288,0.129669
essay_205,8.121525,5.865054,0.032630,0.064345


In [174]:
dfScore['F-K%'] = dfScore['Flesch-Kincaid']/12
dfScore['CL%'] = dfScore['Coleman Liau']/12
dfScore['TF-IDF Mean'] = dfScore[['Corpus TF-IDF', 'Doc TF-IDF']].mean(axis = 1)

In [175]:
dfScore['F-K Grade'] = dfScore['F-K%'] + dfScore['TF-IDF Mean']
dfScore['CL Grade'] = dfScore['CL%'] + dfScore['TF-IDF Mean']

In [176]:
target_essay_fk = dfScore['F-K Grade'].idxmax()
target_essay_cl = dfScore['CL Grade'].idxmax()

In [178]:
dfScore.loc[ target_essay_fk ]

Coleman Liau       7.619775
Flesch-Kincaid    68.291573
Corpus TF-IDF      0.031492
Doc TF-IDF         0.195797
F-K%               5.690964
CL%                0.634981
TF-IDF Mean        0.113645
F-K Grade          5.804609
CL Grade           0.748626
Name: essay_125, dtype: float64

In [179]:
dfScore.loc[ target_essay_cl ]

Coleman Liau      13.782857
Flesch-Kincaid    10.171421
Corpus TF-IDF      0.042195
Doc TF-IDF         0.149523
F-K%               0.847618
CL%                1.148571
TF-IDF Mean        0.095859
F-K Grade          0.943478
CL Grade           1.244431
Name: essay_593, dtype: float64

In [189]:
dfScore.iloc[0][['F-K Grade', 'CL Grade']]

F-K Grade    0.880518
CL Grade     1.076131
Name: essay_208, dtype: float64

In [188]:
dfScore.iloc[0][['F-K Grade', 'CL Grade']].min()

0.8805176455950909

In [192]:
dfScore['Grade Min'] = dfScore[['F-K Grade', 'CL Grade']].min(axis = 1)

In [194]:
dfScore.describe()

Unnamed: 0,Coleman Liau,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF,F-K%,CL%,TF-IDF Mean,F-K Grade,CL Grade,Grade Min
count,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0
mean,8.140893,7.88639,0.04736,0.129365,0.657199,0.678408,0.088363,0.745562,0.76677,0.68971
std,1.882801,3.621266,0.015473,0.030796,0.301772,0.1569,0.017405,0.301924,0.154986,0.146983
min,0.0,0.0,0.0,0.053705,0.0,0.0,0.035484,0.074685,0.074685,0.074685
25%,7.207701,6.40144,0.036819,0.108819,0.533453,0.600642,0.077136,0.618598,0.688453,0.612357
50%,8.230464,7.497506,0.045894,0.126761,0.624792,0.685872,0.086944,0.712125,0.776736,0.692275
75%,9.234088,8.834058,0.057311,0.14438,0.736172,0.769507,0.098142,0.827372,0.855497,0.773451
max,13.782857,68.291573,0.106713,0.444259,5.690964,1.148571,0.222129,5.804609,1.244431,1.116506


In [243]:
import numpy as np
# Flesch-Kincaid Grade Level scores
fk_score = []
# Flesch Reading Ease scores
f_score = []
# Dale Chall scores
dc_score = []
# Automated Readability Index
ari_score = []
# Coleman Liau Index scores
cl_score = []
# Gunning Fog
gf_score = []
# Linsear Write
lw_score = []

# iterate every file in directory
for file_index in range(len(files)):
    if len(multi_corpus[file_index].split(' ')) > 100:
        r = Readability(multi_corpus[file_index])
        fk_score.append( r.flesch_kincaid().score )
        f_score.append( r.flesch().score )
        dc_score.append( r.dale_chall().score )
        ari_score.append( r.ari().score )
        cl_score.append( r.coleman_liau().score )
        gf_score.append( r.gunning_fog().score )
        lw_score.append( r.linsear_write().score )
    else:
        #value = 0
        value = np.NaN
        fk_score.append(value)
        f_score.append(value)
        dc_score.append(value)
        ari_score.append(value)
        cl_score.append(value)
        gf_score.append(value)
        lw_score.append(value)

In [255]:
frame_scores = list(
                zip(
                    fk_score, [i/12 for i in fk_score],
                    f_score, [(100-i)/40 for i in f_score],
                    dc_score, [i/9 for i in dc_score],
                    ari_score, [i/12 for i in ari_score],
                    cl_score, [i/12 for i in cl_score],
                    gf_score, [i/12 for i in gf_score],
                    lw_score, [i/12 for i in lw_score]
                ) )
dfReadability = pd.DataFrame(frame_scores,
                             columns = ['Flesch-Kincaid', 'F-K%',
                                        'Flesch Reading Ease', 'FR%',
                                        'Dale Chall', 'DC%',
                                        'Automated Readability Index', 'ARI%',
                                        'Coleman Liau', 'CL%',
                                        'Gunning Fog', 'GF%',
                                        'Linsear Write', 'LW%'])
dfReadability.rename(index=names, inplace = True)

In [245]:
dfReadability['Flesch-Kincaid'].idxmin(), dfReadability['Flesch Reading Ease'].idxmin(), dfReadability['Dale Chall'].idxmin(), dfReadability['Automated Readability Index'].idxmin(), dfReadability['Coleman Liau'].idxmin(), dfReadability['Gunning Fog'].idxmin(), dfReadability['Linsear Write'].idxmin()

('essay_392',
 'essay_125',
 'essay_212',
 'essay_572',
 'essay_373',
 'essay_417',
 'essay_189')

In [259]:
dfReadability.loc['essay_125']

Flesch-Kincaid                  68.291573
F-K%                             5.690964
Flesch Reading Ease            -90.278820
FR%                              4.756971
Dale Chall                      15.658783
DC%                              1.739865
Automated Readability Index     86.462921
ARI%                             7.205243
Coleman Liau                     7.619775
CL%                              0.634981
Gunning Fog                     75.244944
GF%                              6.270412
Linsear Write                  108.000000
LW%                              9.000000
Name: essay_125, dtype: float64

In [258]:
dfReadability.info()

<class 'pandas.core.frame.DataFrame'>
Index: 607 entries, essay_208 to essay_563
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Flesch-Kincaid               595 non-null    float64
 1   F-K%                         595 non-null    float64
 2   Flesch Reading Ease          595 non-null    float64
 3   FR%                          595 non-null    float64
 4   Dale Chall                   595 non-null    float64
 5   DC%                          595 non-null    float64
 6   Automated Readability Index  595 non-null    float64
 7   ARI%                         595 non-null    float64
 8   Coleman Liau                 595 non-null    float64
 9   CL%                          595 non-null    float64
 10  Gunning Fog                  595 non-null    float64
 11  GF%                          595 non-null    float64
 12  Linsear Write                595 non-null    float64
 13  LW%        

In [256]:
dfReadability.describe()

Unnamed: 0,Flesch-Kincaid,F-K%,Flesch Reading Ease,FR%,Dale Chall,DC%,Automated Readability Index,ARI%,Coleman Liau,CL%,Gunning Fog,GF%,Linsear Write,LW%
count,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0
mean,8.045444,0.670454,68.061562,0.798461,7.618273,0.846475,8.310236,0.69252,8.305079,0.69209,11.313466,0.942789,10.453526,0.871127
std,3.478028,0.289836,10.933735,0.273343,0.829069,0.092119,4.539881,0.378323,1.500226,0.125019,3.70825,0.309021,5.697677,0.474806
min,3.737992,0.311499,-90.27882,0.254301,5.653694,0.628188,2.699793,0.224983,4.385409,0.365451,5.82789,0.485657,3.475,0.289583
25%,6.490731,0.540894,63.762606,0.650153,7.094907,0.788323,6.341411,0.528451,7.316175,0.609681,9.613131,0.801094,7.582983,0.631915
50%,7.552738,0.629395,69.13243,0.771689,7.545713,0.838413,7.635324,0.636277,8.266526,0.688877,10.821344,0.901779,10.06,0.838333
75%,8.901713,0.741809,73.993862,0.905935,8.071669,0.896852,9.256969,0.771414,9.257492,0.771458,12.135134,1.011261,11.894444,0.991204
max,68.291573,5.690964,89.827946,4.756971,15.658783,1.739865,86.462921,7.205243,13.782857,1.148571,75.244944,6.270412,108.0,9.0


In [257]:
dfReadability

Unnamed: 0,Flesch-Kincaid,F-K%,Flesch Reading Ease,FR%,Dale Chall,DC%,Automated Readability Index,ARI%,Coleman Liau,CL%,Gunning Fog,GF%,Linsear Write,LW%
essay_208,8.902967,0.741914,57.044924,1.073877,8.045394,0.893933,9.358079,0.779840,11.250331,0.937528,12.927417,1.077285,10.250000,0.854167
essay_546,7.404642,0.617054,69.681742,0.757956,7.200639,0.800071,7.802573,0.650214,8.750643,0.729220,10.390788,0.865899,8.854167,0.737847
essay_220,6.558218,0.546518,69.157022,0.771074,9.293635,1.032626,6.193181,0.516098,8.578731,0.714894,9.404436,0.783703,6.635135,0.552928
essay_234,5.971872,0.497656,73.988859,0.650279,7.371036,0.819004,5.551537,0.462628,7.617592,0.634799,9.510104,0.792509,6.828125,0.569010
essay_552,9.201566,0.766797,66.429119,0.839272,7.535435,0.837271,10.294498,0.857875,8.816000,0.734667,13.170577,1.097548,13.500000,1.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
essay_239,9.577723,0.798144,62.048562,0.948786,6.617727,0.735303,10.322613,0.860218,9.401918,0.783493,13.000391,1.083366,13.000000,1.083333
essay_577,6.309697,0.525808,77.564318,0.560892,7.050925,0.783436,6.895737,0.574645,7.608852,0.634071,9.516119,0.793010,8.431818,0.702652
essay_211,6.449825,0.537485,75.886272,0.602843,6.831886,0.759098,5.376246,0.448020,5.912421,0.492702,10.754386,0.896199,8.750000,0.729167
essay_205,5.865054,0.488755,73.998865,0.650028,8.350272,0.927808,5.801261,0.483438,8.121525,0.676794,8.723463,0.726955,6.436170,0.536348


## Percentage of scores in processed PDF documents to be manually reviewed
* Scores are greater than 1 (i.e. 100%)
  * ~7% for Flesch-Kincaid Grade Level
  * ~5% for Coleman Liau Index

In [161]:
len(dfScore[dfScore.Grade > 1])/len(dfScore)

0.051070840197693576

In [162]:
dfScore[dfScore.Grade > 1].info()

<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, essay_208 to essay_416
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Coleman Liau   31 non-null     float64
 1   Corpus TF-IDF  31 non-null     float64
 2   Doc TF-IDF     31 non-null     float64
 3   CL%            31 non-null     float64
 4   TF-IDF Mean    31 non-null     float64
 5   Grade          31 non-null     float64
dtypes: float64(6)
memory usage: 1.7+ KB


* Scores are greater than 1 (i.e. 100%) and Grade Level % greater than 1 (i.e. above grade 12 level)
  * Flesch-Kincaide ~ 4.6%
  * Coleman Liau ~ 1.1%

In [163]:
len(dfScore[(dfScore.Grade > 1) & (dfScore['CL%'] > 1)]) / len(dfScore)

0.011532125205930808

In [164]:
dfScore[(dfScore.Grade > 1) & (dfScore['CL%'] > 1)].info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, essay_547 to essay_416
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Coleman Liau   7 non-null      float64
 1   Corpus TF-IDF  7 non-null      float64
 2   Doc TF-IDF     7 non-null      float64
 3   CL%            7 non-null      float64
 4   TF-IDF Mean    7 non-null      float64
 5   Grade          7 non-null      float64
dtypes: float64(6)
memory usage: 392.0+ bytes


* 46 papers with `Grade` above 1, 28 papers with `Grade` **and** `F-K%` above 1
* 31 papers with `Grade` above 1, 7 papers with `Grade` **and** `CL%` above 1

In [165]:
dfScore[(dfScore.Grade > 1) & (dfScore['CL%'] > 1)]

Unnamed: 0,Coleman Liau,Corpus TF-IDF,Doc TF-IDF,CL%,TF-IDF Mean,Grade
essay_547,12.180237,0.020084,0.116418,1.01502,0.068251,1.083271
essay_169,12.018667,0.036372,0.129567,1.001556,0.082969,1.084525
essay_593,13.782857,0.042195,0.149523,1.148571,0.095859,1.244431
essay_145,12.337221,0.029404,0.100773,1.028102,0.065088,1.09319
essay_56,12.600171,0.027651,0.105333,1.050014,0.066492,1.116506
essay_310,12.080546,0.026779,0.128824,1.006712,0.077802,1.084514
essay_416,12.42443,0.054061,0.116132,1.035369,0.085096,1.120466


* 18 papers with `Grade` above 1 **and** `F-K%` below 1
* 24 papers with `Grade` above 1 **and** `CL%` below 1
* ASSUMPTIONS
  * Student able to write at/above 12th grade level
  * Student vocabulary choice is unique and correct

In [168]:
dfScore[(dfScore.Grade > 1) & (dfScore['CL%'] < 1)].info()

<class 'pandas.core.frame.DataFrame'>
Index: 24 entries, essay_208 to essay_600
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Coleman Liau   24 non-null     float64
 1   Corpus TF-IDF  24 non-null     float64
 2   Doc TF-IDF     24 non-null     float64
 3   CL%            24 non-null     float64
 4   TF-IDF Mean    24 non-null     float64
 5   Grade          24 non-null     float64
dtypes: float64(6)
memory usage: 1.3+ KB


In [123]:
dfScore['F-K%'].max(), dfScore['F-K%'].idxmax()

(5.690964419475655, 'essay_125')

In [124]:
dfScore.loc[['essay_125']]

Unnamed: 0,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF,F-K%,TF-IDF Mean,Grade
essay_125,68.291573,0.031492,0.195797,5.690964,0.113645,5.804609


In [119]:
dfScore['Corpus TF-IDF'].max(), dfScore['Corpus TF-IDF'].idxmax()

(0.1067131182648862, 'essay_182')

In [122]:
dfScore.loc[['essay_182']]

Unnamed: 0,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF,F-K%,TF-IDF Mean,Grade
essay_182,7.229719,0.106713,0.164913,0.602477,0.135813,0.73829


In [118]:
dfScore['Doc TF-IDF'].max(), dfScore['Doc TF-IDF'].idxmax()

(0.44425870217767477, 'essay_357')

In [121]:
dfScore.loc[['essay_357']]

Unnamed: 0,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF,F-K%,TF-IDF Mean,Grade
essay_357,0.0,0.0,0.444259,0.0,0.222129,0.222129


In [125]:
dfScore['TF-IDF Mean'].max(), dfScore['TF-IDF Mean'].idxmax()

(0.22212935108883738, 'essay_357')

In [129]:
dfScore.sort_values(by = 'TF-IDF Mean', ascending = False)

Unnamed: 0,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF,F-K%,TF-IDF Mean,Grade
essay_357,0.000000,0.000000,0.444259,0.000000,0.222129,0.222129
essay_543,0.000000,0.016831,0.291302,0.000000,0.154067,0.154067
essay_41,0.000000,0.098512,0.203927,0.000000,0.151220,0.151220
essay_44,4.303225,0.079610,0.208025,0.358602,0.143818,0.502420
essay_208,8.902967,0.082336,0.194872,0.741914,0.138604,0.880518
...,...,...,...,...,...,...
essay_506,10.298412,0.029394,0.077075,0.858201,0.053234,0.911435
essay_467,8.038186,0.030233,0.068142,0.669849,0.049187,0.719036
essay_205,5.865054,0.032630,0.064345,0.488755,0.048488,0.537242
essay_601,6.964272,0.011812,0.082147,0.580356,0.046980,0.627335


In [100]:
dfScore[(dfScore.Grade > 1) & (dfScore['F-K%'] < 1)]

Unnamed: 0,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF,F-K%,TF-IDF Mean,Grade
essay_223,11.990227,0.09313,0.131184,0.999186,0.112157,1.111343
essay_394,11.057692,0.050471,0.159197,0.921474,0.104834,1.026309
essay_356,11.765833,0.059414,0.119704,0.980486,0.089559,1.070045
essay_233,11.825897,0.018848,0.148078,0.985491,0.083463,1.068954
essay_594,11.557843,0.048763,0.154194,0.963154,0.101478,1.064632
essay_437,11.15292,0.049995,0.146606,0.92941,0.098301,1.027711
essay_72,11.392311,0.039962,0.11559,0.949359,0.077776,1.027135
essay_481,11.628472,0.067407,0.122877,0.969039,0.095142,1.064181
essay_126,11.949286,0.042079,0.144649,0.995774,0.093364,1.089138
essay_90,10.877142,0.080524,0.13825,0.906429,0.109387,1.015815


## TO-DO
* Look at 3 scores for papers with **Teacher** grade attached.
* Look for equation to fit 3 scores to **Teacher** score.
* Scoring assumptions
  * student should be able to write at grade level
  * vocab choice of all students (TF-IDF of corpus) and individual (TF-IDF of doc) are important