In [1]:
import pdftotext
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
from pathlib import Path
from readability import Readability

***
***
## Process a directory of PDF

In [2]:
# adjust directory to point to location of files
directory = 'test_pdf/'

# create file list of pdf in directory
pdf_folder = Path(directory).rglob('*.pdf')

# create list of files and verify contents
# should be 5 if using supplied 'test_pdf' directory
files = [file for file in pdf_folder]
files

[PosixPath('test_pdf/MedvedevaEtAl2019.pdf'),
 PosixPath('test_pdf/KDD97-003.pdf'),
 PosixPath('test_pdf/P99-1001.pdf'),
 PosixPath('test_pdf/10.1007978-3-319-67056-018.pdf'),
 PosixPath('test_pdf/dummy_test.pdf')]

## Iterate through each file and
* Tokenize file text
* Create consistent case `.lower()` for each token
* Remove tokens from `nltk` library `english` stopwords
* Remove non-`.isalpha()` tokens

In [3]:
tokens = []
multi_corpus = []
stopWords = set(stopwords.words('english'))

# iterate every file in directory
for file in files:
    # open file
    with open(file, 'rb') as f:
        # conversion with pdftotext
        multi_pdf = pdftotext.PDF(f)
        multi_corpus.append(''.join(multi_pdf))
        # place current pdf text into list of tokens
        tokens += nltk.word_tokenize(''.join(multi_pdf))
        #corpus.append(tokens)

# update tokens by setting all to lowercase,
# removing stopwords,
# removing non-alphanumeric
tokens_removed = [word.lower() for word in tokens
                  if word.lower() not in stopWords
                  and word.isalpha()]

## based on `top_n_words` to search for of `tokens_removed` (no stopwords) create a frequency distribution `fd` and place that number of words in list `target_words`

In [4]:
top_n_words = 10
fd = nltk.FreqDist(tokens_removed)
target_words = sorted(fd, key = fd.get, reverse = True)[:top_n_words]

***
# Clustering
***
## TF-IDF
* take unique tokens from each pdf being fed as input
* store each token as a string in the corpus

In [5]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(multi_corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
dfM = pd.DataFrame(denselist, columns=feature_names)

### set properties will allow us to remove every column that is not a targeted word from the early NLTK selection


In [6]:
dropped_columns = list(set(feature_names).difference(target_words))
dfM.drop(columns = dropped_columns)

Unnamed: 0,articles,case,cases,data,information,learning,mining,results,text,used
0,0.023118,0.082652,0.142561,0.047975,0.024673,0.036324,0.002431,0.02947,0.017827,0.035639
1,0.0,0.007946,0.002361,0.055444,0.008401,0.00168,0.045688,0.016801,0.019864,0.018481
2,0.034813,0.005325,0.0,0.103584,0.103584,0.002252,0.098508,0.011259,0.15708,0.011259
3,0.143512,0.005426,0.001612,0.092931,0.076869,0.061954,0.184481,0.030977,0.23874,0.033272
4,0.0,0.0,0.0,0.027596,0.003942,0.01774,0.0,0.05125,0.0,0.023654


### create dictionary of k:v pair index : pdf filename to rename pandas rows for readability

In [7]:
names = {}
for x in range(len(files)):
    print(str(x)+' '+str(files[x])[9:])
    names[x] = str(files[x])[9:]

0 MedvedevaEtAl2019.pdf
1 KDD97-003.pdf
2 P99-1001.pdf
3 10.1007978-3-319-67056-018.pdf
4 dummy_test.pdf


In [8]:
dfM.rename(index=names, inplace = True)

In [9]:
dfM.drop(columns = dropped_columns)

Unnamed: 0,articles,case,cases,data,information,learning,mining,results,text,used
MedvedevaEtAl2019.pdf,0.023118,0.082652,0.142561,0.047975,0.024673,0.036324,0.002431,0.02947,0.017827,0.035639
KDD97-003.pdf,0.0,0.007946,0.002361,0.055444,0.008401,0.00168,0.045688,0.016801,0.019864,0.018481
P99-1001.pdf,0.034813,0.005325,0.0,0.103584,0.103584,0.002252,0.098508,0.011259,0.15708,0.011259
10.1007978-3-319-67056-018.pdf,0.143512,0.005426,0.001612,0.092931,0.076869,0.061954,0.184481,0.030977,0.23874,0.033272
dummy_test.pdf,0.0,0.0,0.0,0.027596,0.003942,0.01774,0.0,0.05125,0.0,0.023654


In [10]:
dfSummary = pd.DataFrame({'word': [], 'max TF-IDF value' : [], 'file' : []})

In [11]:
for word in target_words:
    dfSummary = dfSummary.append({'word': word, 'max TF-IDF value' : dfM[word].max(), 'file' : dfM[word].idxmax()}, ignore_index = True)

In [12]:
dfSummary.sort_values(by = 'max TF-IDF value', ascending = False)

Unnamed: 0,word,max TF-IDF value,file
0,text,0.23874,10.1007978-3-319-67056-018.pdf
2,mining,0.184481,10.1007978-3-319-67056-018.pdf
5,articles,0.143512,10.1007978-3-319-67056-018.pdf
4,cases,0.142561,MedvedevaEtAl2019.pdf
1,data,0.103584,P99-1001.pdf
3,information,0.103584,P99-1001.pdf
8,case,0.082652,MedvedevaEtAl2019.pdf
6,learning,0.061954,10.1007978-3-319-67056-018.pdf
7,results,0.05125,dummy_test.pdf
9,used,0.035639,MedvedevaEtAl2019.pdf


## KMeans
* Predict the cluster for `search_text`
* access `dfM` dataframe utilizing prediction to determine pdf file where target text is likely to be

In [13]:
num = len(multi_pdf) # set clusters to number of documents being scrubbed?
kmeans = KMeans(n_clusters = num, init = 'k-means++', max_iter = 500, n_init = 1)
kmeans.fit(vectors)
centroids = kmeans.cluster_centers_
print(centroids) #This will print cluster centroids as tf-idf vectors

[[0.00413669 0.         0.         ... 0.         0.         0.        ]
 [0.         0.00194255 0.         ... 0.00722322 0.00240774 0.00722322]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.0028766  ... 0.         0.         0.        ]
 [0.         0.01143798 0.         ... 0.         0.         0.        ]]


In [14]:
kmeans.predict(vectors)

array([3, 2, 4, 1, 0], dtype=int32)

In [15]:
search_text = "how to run on the water"
predicted = kmeans.predict(vectorizer.transform([search_text]))
dfM.loc[ names[ predicted[0] ] ].name

'10.1007978-3-319-67056-018.pdf'

## Scoring
* Find Flesch-Kincaid Grade Level.
* Find TF-IDF average of scored document from `top_n_words` `nltk` Frequency Distribution of corpus.
* Find TF-IDF average of scored document from `top_n_words` `nltk` Frequency Distribution of scored document.

In [22]:
# all files from original scan
files

[PosixPath('test_pdf/MedvedevaEtAl2019.pdf'),
 PosixPath('test_pdf/KDD97-003.pdf'),
 PosixPath('test_pdf/P99-1001.pdf'),
 PosixPath('test_pdf/10.1007978-3-319-67056-018.pdf'),
 PosixPath('test_pdf/dummy_test.pdf')]

In [25]:
# string conversion and indexing produces path
str(files[0])

'test_pdf/MedvedevaEtAl2019.pdf'

In [26]:
# set path to string conversion of desired file
path = str(files[0])
with open(path, "rb") as f:
    pdf = pdftotext.PDF(f)

In [27]:
# initialize text string, iterate through pdf, append text
text = ''
for page in pdf:
    text+=page

In [28]:
# score text via Flesch-Kincaid Grade Level metric
r = Readability(text)
fk = r.flesch_kincaid()
fk.score, fk.grade_level

(12.704919385560462, '13')

In [108]:
# mean score of top_n_words in corpus for doc in files index 0
corpus_score = dfM.drop(columns = dropped_columns).iloc[0].mean()
corpus_score

0.0442669256448735

In [101]:
# tokenize file from index 0 document and remove stopwords
target_tokens = nltk.word_tokenize(''.join(multi_corpus[0]))
target_tokens_removed = [word.lower() for word in target_tokens
                         if word.lower() not in stopWords and word.isalpha()]

In [102]:
# utilize previously defined top_n_words to find FreqDist for current document with tokens removed
fdDoc = nltk.FreqDist(target_tokens_removed)
target_doc_words = sorted(fdDoc, key = fdDoc.get, reverse = True)[:top_n_words]

In [104]:
target_doc_words

['cases',
 'case',
 'violation',
 'court',
 'law',
 'article',
 'data',
 'decisions',
 'legal',
 'using']

In [106]:
dfM.drop(columns = list(set(feature_names).difference(target_doc_words)))

Unnamed: 0,article,case,cases,court,data,decisions,law,legal,using,violation
MedvedevaEtAl2019.pdf,0.087031,0.082652,0.142561,0.117941,0.047975,0.08486,0.107873,0.056832,0.039065,0.2028
KDD97-003.pdf,0.0,0.007946,0.002361,0.0,0.055444,0.0,0.0,0.0,0.00336,0.0
P99-1001.pdf,0.0,0.005325,0.0,0.0,0.103584,0.0,0.0,0.0,0.013511,0.0
10.1007978-3-319-67056-018.pdf,0.003885,0.005426,0.001612,0.0,0.092931,0.0,0.0,0.001612,0.036714,0.0
dummy_test.pdf,0.0,0.0,0.0,0.0,0.027596,0.0,0.0,0.00277,0.009856,0.0


In [109]:
# mean score of top_n_words in doc in files index 0 to corpus
doc_score = dfM.drop(columns = list(set(feature_names).difference(target_doc_words))).iloc[0].mean()
doc_score

0.09695889184932609

In [115]:
fk.score, corpus_score, doc_score

(12.704919385560462, 0.0442669256448735, 0.09695889184932609)

## Batch processing

In [133]:
# initialize list of Flesch-Kincaid Grade Level scores
fk_score = []
# initialize list of TF-IDF scores for file among corpus top_n_words
corpus_score = []
# initialize list of TF-IDF scores for file top_n_words among corpus
doc_score = []

# iterate every file in directory
for file_index in range(len(files)):
    # append Flesch-Kincaid Grade Level Score
    fk_score.append( Readability(multi_corpus[file_index]).flesch_kincaid().score )
    # tokenize file from file_index document and remove stopWords
    target_tokens = [word.lower() for word in nltk.word_tokenize(''.join(multi_corpus[file_index])) if word.lower() not in stopWords and word.isalpha()]
    # append corpus score
    corpus_score.append(dfM.drop(columns = dropped_columns).iloc[file_index].mean())
    # append doc score
    fdDoc = nltk.FreqDist(target_tokens)
    target_doc_words = sorted(fdDoc, key = fdDoc.get, reverse = True)[:top_n_words]
    doc_score.append(dfM.drop(columns = list(set(feature_names).difference(target_doc_words))).iloc[file_index].mean())

In [134]:
for file_index in range(len(files)):
    print(fk_score[file_index], corpus_score[file_index], doc_score[file_index])

12.704919385560462 0.0442669256448735 0.09695889184932609
11.699757630092137 0.01766658069771749 0.09954108374763701
10.309591337877851 0.05276634587570308 0.08872207670385866
10.649642785973345 0.08697753413614841 0.10947173304751992
10.6788966015096 0.012418278795192379 0.15977827983685147


In [139]:
dfScore = pd.DataFrame(list(zip(fk_score, corpus_score, doc_score)), columns = ['Flesch-Kincaid', 'Corpus TF-IDF', 'Doc TF-IDF'])
dfScore.rename(index=names, inplace = True)

In [140]:
dfScore

Unnamed: 0,Flesch-Kincaid,Corpus TF-IDF,Doc TF-IDF
MedvedevaEtAl2019.pdf,12.704919,0.044267,0.096959
KDD97-003.pdf,11.699758,0.017667,0.099541
P99-1001.pdf,10.309591,0.052766,0.088722
10.1007978-3-319-67056-018.pdf,10.649643,0.086978,0.109472
dummy_test.pdf,10.678897,0.012418,0.159778


## TO-DO
* Look at 3 scores for papers with **Teacher** grade attached.
* Look for equation to fit 3 scores to **Teacher** score.
* Scoring assumptions
  * student should be able to write at grade level
  * vocab choice of all students (TF-IDF of corpus) and individual (TF-IDF of doc) are important