In [2]:
import pdftotext
import nltk
from nltk.corpus import stopwords
import pandas as pd
from pathlib import Path
from readability import Readability

In [3]:
# tokenizer data for the nltk.word_tokenize calls later in the notebook
nltk.download('punkt')

# stopword lists; the English list is loaded via stopwords.words('english') further down
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ryanrien/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanrien/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## load pdf

In [4]:
# Open the sample PDF in binary mode and hand the stream to pdftotext.
# The resulting `pdf` object is used below like a sequence of page strings
# (len(pdf), pdf[0], iteration, ''.join(pdf)).
path = "test_pdf/dummy_test.pdf"
with open(path, "rb") as f:
    pdf = pdftotext.PDF(f)

## page count

In [5]:
print(len(pdf))

5


## Iterate over all the pages

In [12]:
# Concatenate every page into one string for the readability analysis.
# str.join assembles the result in one pass instead of re-copying the growing
# string on each `+=`, and it is the same idiom the tokenizer cells use.
text = ''.join(pdf)

In [17]:
# Flesch-Kincaid readability of the full document text assembled above.
r = Readability(text)
fk = r.flesch_kincaid()
# Show the numeric score next to the reported grade level (a string in the output below).
fk.score, fk.grade_level

(10.6788966015096, '11')

In [6]:
for page in pdf:
    print(page)

     Analysis of Race and Gender Bias in Deep Age
                   Estimation Models
                                          Andraž Puc, Vitomir Štruc, Klemen Grm
                                  University of Ljubljana, Faculty of Electrical Engineering
                                       Tržaška cesta 25, SI-1000 Ljubljana, Slovenia
                                                    klemen.grm@fe.uni-lj.si


   Abstract—Due to advances in deep learning and convolutional
neural networks (CNNs) there has been significant progress in
the field of visual age estimation from face images over recent
years. While today’s models are able to achieve considerable age
estimation accuracy, their behaviour, especially with respect to
specific demographic groups is still not well understood. In this
paper, we take a deeper look at CNN-based age estimation models
and analyze their performance across different race and gender
groups. We use two publicly available off-the-shelf age estim

## print specific page

In [7]:
print(pdf[0])

     Analysis of Race and Gender Bias in Deep Age
                   Estimation Models
                                          Andraž Puc, Vitomir Štruc, Klemen Grm
                                  University of Ljubljana, Faculty of Electrical Engineering
                                       Tržaška cesta 25, SI-1000 Ljubljana, Slovenia
                                                    klemen.grm@fe.uni-lj.si


   Abstract—Due to advances in deep learning and convolutional
neural networks (CNNs) there has been significant progress in
the field of visual age estimation from face images over recent
years. While today’s models are able to achieve considerable age
estimation accuracy, their behaviour, especially with respect to
specific demographic groups is still not well understood. In this
paper, we take a deeper look at CNN-based age estimation models
and analyze their performance across different race and gender
groups. We use two publicly available off-the-shelf age estim

## subscript within page

In [8]:
pdf[0][0:20]

'     Analysis of Rac'

***

## tokenize text

In [None]:
nltk.download('punkt')

In [None]:
tokens = nltk.word_tokenize(''.join(pdf))

In [None]:
# tokens can be indexed
tokens[0]

## find word in token list using nltk

In [None]:
word = 'experiment'
nltk.Text(tokens).count(word)

## find five words in pdf, place word & count in list

In [None]:
# text parsing issue
# not all tokens are consistent in case
tokens[23].lower(), tokens[23]

In [None]:
# convert all tokens to lowercase
low_tokens = [token.lower() for token in tokens]

In [None]:
# no more conflict of case
low_tokens[23].lower(), low_tokens[23]

In [None]:
# initialize lists
words = ["accuracy", "findings", "experimental", "estimation", "doctor"]
word_summary = []
low_word_summary = []

In [None]:
# Count how often each of the search words appears in the original-case tokens.
# The summary is rebuilt from scratch on every run, so re-executing this cell
# cannot append duplicate rows, and the nltk.Text wrapper is created once
# rather than once per word.
token_text = nltk.Text(tokens)
word_summary = [[word, token_text.count(word)] for word in words]
word_summary

In [None]:
# Same count, but against the lowercased tokens so matching ignores case.
# The totals can differ from the case-sensitive ones, and the difference matters:
# e.g. 'experimental' is counted 5 times in the original tokens and 7 times
# once every token is lowercase.
# Rebuilt from scratch on every run so re-executing the cell never duplicates rows.
low_token_text = nltk.Text(low_tokens)
low_word_summary = [[word, low_token_text.count(word)] for word in words]
low_word_summary

## create pandas dataframe for visualization

In [None]:
df = pd.DataFrame(word_summary, columns = ["word", "count"])
df

In [None]:
df_low = pd.DataFrame(low_word_summary, columns = ["word", "count"])
# demo start
#df_low = df_low.set_index('word')
# demo end
df_low

In [None]:
# Bar chart of the case-insensitive counts, one bar per search word.
# A title and axis labels let the figure stand alone; the legend is dropped
# because there is only one series, and the trailing `;` hides the Axes repr.
ax = df_low.plot.bar(x='word', y='count', legend=False, rot=0,
                     title='Search-word counts in the sample PDF (lowercased tokens)')
ax.set(xlabel='word', ylabel='count');

***
***
## Directory of PDF

In [None]:
# Recursively collect every PDF under test_pdf/ (subfolders included).
# rglob yields paths in no guaranteed order, so sort them: the processing order
# determines the token order downstream, and a stable order keeps re-runs
# reproducible across machines and filesystems.
pdf_folder = Path('test_pdf/').rglob('*.pdf')
files = sorted(pdf_folder)
files

In [None]:
# Accumulate the tokens of every PDF found above into one flat list.
tokens = []
stopWords = set(stopwords.words('english'))

for file in files:
    with open(file, 'rb') as f:
        # conversion with pdftotext
        multi_pdf = pdftotext.PDF(f)
        # join the pages into one string, tokenize it, and add the tokens to the list
        tokens.extend(nltk.word_tokenize(''.join(multi_pdf)))

# Build the cleaned token list:
#   1. keep only purely alphabetic tokens (drops punctuation and numbers),
#   2. lowercase what is left,
#   3. discard English stopwords.
alphabetic_lowered = (token.lower() for token in tokens if token.isalpha())
tokens_removed = [token for token in alphabetic_lowered if token not in stopWords]

In [None]:
# Build the user summary: one [word, count] entry per search word, counted
# across the tokens of all PDFs.
# Matching is done on lowercased tokens so a word is found regardless of how it
# is capitalized in the source text; counting against the raw tokens is
# case-sensitive and under-counts (see the 'experimental' 5-vs-7 example in the
# single-PDF section). The nltk.Text wrapper is built once, not once per word.
lowered_text = nltk.Text([token.lower() for token in tokens])
word_summary = [[word, lowered_text.count(word.lower())] for word in words]

In [None]:
len(tokens), len(tokens_removed)

In [None]:
word_summary

## user specifies 5 words

In [None]:
df_multi = pd.DataFrame(word_summary, columns = ["word", "count"])
df_multi

In [None]:
# Bar chart of the user-specified word counts over every PDF in the folder.
# A title and axis labels let the figure stand alone; the legend is dropped
# because there is only one series, and the trailing `;` hides the Axes repr.
ax = df_multi.plot.bar(x='word', y='count', legend=False, rot=0,
                       title='Search-word counts across all PDFs')
ax.set(xlabel='word', ylabel='count');

## NLTK provides top 5
* This allows the data to drive further work by looking solely at the top N words in the files processed. The tokens are cleaned beforehand by lowercasing them, removing entries in the NLTK stopwords list, and dropping any token that is not purely alphabetic (punctuation, numbers).
* Using this rather than user input opens the door to a supervised follow-up that combines terms surfaced from the tokens with additional user-supplied terms to narrow in on "hits" in the dataset.
* Because `most_common()` called without an argument returns every entry, the result can be indexed or sliced like a regular list to look anywhere in the ranking.

In [None]:
# create a frequency distribution from the cleaned tokens
fd = nltk.FreqDist(tokens_removed)

In [None]:
# most_common() is called without n here, so `data` holds every (word, count) entry
# (most_common(n) would return only n of them)
# create a dataframe from the first 5 entries, i.e. the 5 most common words in the cleaned token list
data = fd.most_common()
df_fd = pd.DataFrame(data[:5], columns = ["word", "count"])
df_fd

In [None]:
# Bar chart of the five most frequent cleaned tokens.
# A title and axis labels let the figure stand alone; the legend is dropped
# because there is only one series, and the trailing `;` hides the Axes repr.
ax = df_fd.plot.bar(x='word', y='count', legend=False, rot=0,
                    title='Top 5 most common words (stopwords removed)')
ax.set(xlabel='word', ylabel='count');

---
---
# TO-DO
### immediate

### long-term
* Look into API for digital commons
* Adjust visualizations