In [1]:
import pdftotext
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
from pathlib import Path

***
***
## Process a directory of PDF

In [2]:
# adjust directory to point to location of files
directory = 'test_pdf/'

# create file list of pdf in directory
pdf_folder = Path(directory).rglob('*.pdf')

# create list of files and verify contents
# should be 5 if using supplied 'test_pdf' directory
files = [file for file in pdf_folder]
files

[PosixPath('test_pdf/MedvedevaEtAl2019.pdf'),
 PosixPath('test_pdf/KDD97-003.pdf'),
 PosixPath('test_pdf/P99-1001.pdf'),
 PosixPath('test_pdf/10.1007978-3-319-67056-018.pdf'),
 PosixPath('test_pdf/dummy_test.pdf')]

## Iterate through each file and
* Tokenize file text
* Create consistent case `.lower()` for each token
* Remove tokens from `nltk` library `english` stopwords
* Remove non-`.isalpha()` tokens

In [3]:
tokens = []
multi_corpus = []
stopWords = set(stopwords.words('english'))

# iterate every file in directory
for file in files:
    # open file
    with open(file, 'rb') as f:
        # conversion with pdftotext
        multi_pdf = pdftotext.PDF(f)
        multi_corpus.append(''.join(multi_pdf))
        # place current pdf text into list of tokens
        tokens += nltk.word_tokenize(''.join(multi_pdf))
        #corpus.append(tokens)

# update tokens by setting all to lowercase,
# removing stopwords,
# removing non-alphanumeric
tokens_removed = [word.lower() for word in tokens
                  if word.lower() not in stopWords
                  and word.isalpha()]

## based on `top_n_words` to search for of `tokens_removed` (no stopwords) create a frequency distribution `fd` and place that number of words in list `target_words`

In [4]:
top_n_words = 10
fd = nltk.FreqDist(tokens_removed)
target_words = sorted(fd, key = fd.get, reverse = True)[:top_n_words]

***
# Clustering
***
## TF-IDF
* take unique tokens from each pdf being fed as input
* store each token as a string in the corpus

In [5]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(multi_corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
dfM = pd.DataFrame(denselist, columns=feature_names)

### set properties will allow us to remove every column that is not a targeted word from the early NLTK selection


In [6]:
dropped_columns = list(set(feature_names).difference(target_words))
dfM.drop(columns = dropped_columns)

Unnamed: 0,articles,case,cases,data,information,learning,mining,results,text,used
0,0.023118,0.082652,0.142561,0.047975,0.024673,0.036324,0.002431,0.02947,0.017827,0.035639
1,0.0,0.007946,0.002361,0.055444,0.008401,0.00168,0.045688,0.016801,0.019864,0.018481
2,0.034813,0.005325,0.0,0.103584,0.103584,0.002252,0.098508,0.011259,0.15708,0.011259
3,0.143512,0.005426,0.001612,0.092931,0.076869,0.061954,0.184481,0.030977,0.23874,0.033272
4,0.0,0.0,0.0,0.027596,0.003942,0.01774,0.0,0.05125,0.0,0.023654


### create dictionary of k:v pair index : pdf filename to rename pandas rows for readability

In [7]:
names = {}
for x in range(len(files)):
    print(str(x)+' '+str(files[x])[9:])
    names[x] = str(files[x])[9:]

0 MedvedevaEtAl2019.pdf
1 KDD97-003.pdf
2 P99-1001.pdf
3 10.1007978-3-319-67056-018.pdf
4 dummy_test.pdf


In [8]:
dfM.rename(index=names, inplace = True)

In [9]:
dfM.drop(columns = dropped_columns)

Unnamed: 0,articles,case,cases,data,information,learning,mining,results,text,used
MedvedevaEtAl2019.pdf,0.023118,0.082652,0.142561,0.047975,0.024673,0.036324,0.002431,0.02947,0.017827,0.035639
KDD97-003.pdf,0.0,0.007946,0.002361,0.055444,0.008401,0.00168,0.045688,0.016801,0.019864,0.018481
P99-1001.pdf,0.034813,0.005325,0.0,0.103584,0.103584,0.002252,0.098508,0.011259,0.15708,0.011259
10.1007978-3-319-67056-018.pdf,0.143512,0.005426,0.001612,0.092931,0.076869,0.061954,0.184481,0.030977,0.23874,0.033272
dummy_test.pdf,0.0,0.0,0.0,0.027596,0.003942,0.01774,0.0,0.05125,0.0,0.023654


In [10]:
dfSummary = pd.DataFrame({'word': [], 'max TF-IDF value' : [], 'file' : []})

In [11]:
for word in target_words:
    dfSummary = dfSummary.append({'word': word, 'max TF-IDF value' : dfM[word].max(), 'file' : dfM[word].idxmax()}, ignore_index = True)

In [12]:
dfSummary.sort_values(by = 'max TF-IDF value', ascending = False)

Unnamed: 0,word,max TF-IDF value,file
0,text,0.23874,10.1007978-3-319-67056-018.pdf
2,mining,0.184481,10.1007978-3-319-67056-018.pdf
5,articles,0.143512,10.1007978-3-319-67056-018.pdf
4,cases,0.142561,MedvedevaEtAl2019.pdf
1,data,0.103584,P99-1001.pdf
3,information,0.103584,P99-1001.pdf
8,case,0.082652,MedvedevaEtAl2019.pdf
6,learning,0.061954,10.1007978-3-319-67056-018.pdf
7,results,0.05125,dummy_test.pdf
9,used,0.035639,MedvedevaEtAl2019.pdf


## KMeans
* Predict the cluster for `search_text`
* access `dfM` dataframe utilizing prediction to determine pdf file where target text is likely to be

In [13]:
num = len(multi_pdf) # set clusters to number of documents being scrubbed?
kmeans = KMeans(n_clusters = num, init = 'k-means++', max_iter = 500, n_init = 1)
kmeans.fit(vectors)
centroids = kmeans.cluster_centers_
print(centroids) #This will print cluster centroids as tf-idf vectors

[[0.         0.01143798 0.         ... 0.         0.         0.        ]
 [0.         0.         0.0028766  ... 0.         0.         0.        ]
 [0.00413669 0.         0.         ... 0.         0.         0.        ]
 [0.         0.00194255 0.         ... 0.00722322 0.00240774 0.00722322]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [14]:
kmeans.predict(vectors)

array([1, 4, 0, 3, 2], dtype=int32)

In [15]:
search_text = "how to run on the water"
predicted = kmeans.predict(vectorizer.transform([search_text]))
dfM.loc[ names[ predicted[0] ] ].name

'KDD97-003.pdf'