In [None]:
# https://www.nltk.org/
import nltk

In [None]:
# pip install pypdf2

from PyPDF2 import PdfFileReader
from pathlib import Path
import os

In [None]:
# PATH is the input directory; it is searched recursively for PDF files.
# OUTPUT_PATH receives the extracted text files, mirroring PATH's directory
# structure. Output locations are derived by string-replacing PATH with
# OUTPUT_PATH, so keep the trailing separators on both consistent.

PATH = r"/home/andy/"  # raw string so Windows backslash separators need no escaping
OUTPUT_PATH = r"/home/andy/tmp/"  # raw string for the same reason


In [None]:
def replace_bad_characters(filepath):
    """Return ``filepath`` with every problematic character turned into ``_``.

    The characters replaced are ``;``, ``:``, ``(``, ``)``, ``,``, space,
    single quote and double quote. All other characters, including path
    separators, are left untouched.

    Args:
        filepath: The path, as a string.

    Returns:
        str: The sanitised path, the same length as the input.
    """
    unsafe = ";:(), '\""
    return "".join("_" if ch in unsafe else ch for ch in filepath)

In [None]:
def extract_information(pdf_path):
    """Extract the text of one PDF and save it as a ``.txt`` file.

    The output location is ``pdf_path`` with its extension replaced by
    ``.txt``, ``PATH`` replaced by ``OUTPUT_PATH``, and the result passed
    through ``replace_bad_characters``. Missing output directories are
    created. The text of all pages is written back to back, with no
    separator added between pages.

    Every failure is reported with ``print`` and the file is skipped, so one
    bad PDF does not abort a batch run.

    Args:
        pdf_path: Path (``str`` or ``pathlib.Path``) of the PDF to read.

    Returns:
        None.
    """
    pages = []
    try:
        pdf_file = open(pdf_path, 'rb')
    except OSError:
        # Unreadable input (permissions, dangling symlink, ...): skip it.
        print(f"error opening {pdf_path}; can't read")
        return
    with pdf_file:
        # `except Exception` rather than a bare `except` so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long batch run.
        try:
            pdf = PdfFileReader(pdf_file)
        except Exception:
            print(f"error opening {pdf_path}; can't read")
            return
        try:
            num_pages = pdf.getNumPages()
        except Exception:
            print(f"Can't read {pdf_path}: possibly encrypted")
            return
        for page_number in range(num_pages):
            try:
                pages.append(pdf.getPage(page_number).extractText())
            except Exception:
                print(
                    f"error reading pdf: {pdf_path} name too long or file malformed")
                return

    # Build the output file path first and derive the directory from it, so
    # the directory that gets created is always the one the file is written to.
    txt_path = os.path.splitext(str(pdf_path))[0] + ".txt"
    txt_path = replace_bad_characters(txt_path.replace(PATH, OUTPUT_PATH))
    out_dir = os.path.dirname(txt_path)
    try:
        if out_dir:
            # os.makedirs instead of shelling out to `mkdir -p`: portable and
            # not subject to shell quoting or injection through file names.
            os.makedirs(out_dir, exist_ok=True)
        # Explicit UTF-8 so extracted text is writable whatever the
        # platform's default encoding is.
        with open(txt_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(pages)
    except OSError as err:
        print(f"can't write {txt_path}: {err}")

In [None]:
def getfiles():
    """Return every ``*.pdf`` path found anywhere below ``PATH``.

    Returns:
        list of ``pathlib.Path``: The matches, in the order ``rglob`` yields
        them.
    """
    return list(Path(PATH).rglob('*.pdf'))

In [None]:
# Collect every PDF below PATH, then extract the text of each one in turn.
paths = getfiles()
for filepath in paths:
    extract_information(filepath)

To use NLTK on your PC, first install it by running in a terminal:
```
pip install nltk
```
Then, in an interactive Python shell, download the NLTK data by running:

```
import nltk
nltk.download()
```

In [None]:
# Load the English Punkt tokenizer from the NLTK data resources.
# It is used further down to split the sample text into sentences.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [None]:
# Here the text is hardcoded; it could easily be read from a text file instead.
# When dealing with large volumes of text (say more than 500,000 words)
# we would use a generator to do the reading.

text = """When we work with text, we can work with text units on different
scales: we can work at the level of the document itself, such as a newspaper
article; the paragraph, the sentence, or the word. Sentences are the main
unit of processing in many NLP tasks. In this section, I will show you how
to divide text into sentences."""

# Note the special characters above (newlines and punctuation); each one is
# replaced with a space.
# NOTE(review): "." is among the characters removed, so `text` no longer
# contains sentence-ending periods when it is later passed to the sentence
# tokenizer -- confirm this is intended.
special_chars = "\n.,:;"
for char in special_chars:
    text = text.replace(char, " ")

In [None]:
# Split the text into sentences with the tokenizer loaded earlier and
# show the resulting list.
sentences = tokenizer.tokenize(text)
print(sentences)

In [None]:
# Split every sentence into word tokens and flatten them into one list.
# Flattening, rather than taking element [0] of the per-sentence lists,
# keeps the words of all sentences and also works when `sentences` is empty.
tokens = [
    token
    for sentence in sentences
    for token in nltk.word_tokenize(sentence)
]
print(tokens)

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
import pandas as pd

# Reduce each token to its dictionary form (lemma), then count how often
# every lemma occurs.

lemmatize = WordNetLemmatizer()
lemmatized_words = [lemmatize.lemmatize(word) for word in tokens]
counts_lemmatized_words = Counter(lemmatized_words)

# One row per lemma (columns "index" and 0), most frequent first.
df_tokenized_lemmatized_words = (
    pd.DataFrame.from_dict(counts_lemmatized_words, orient="index")
    .reset_index()
    .sort_values(by=0, ascending=False)
)
df_tokenized_lemmatized_words

In [None]:
# Frequencies: count the occurrences of each token and list them
# as "token:count", one per line.
freq = nltk.FreqDist(tokens)

for token, count in freq.items():
    print(f"{token}:{count}")

In [None]:
# Plot the counts of the 20 most frequent tokens (per-token counts,
# not a cumulative curve).
freq.plot(20, cumulative=False)

In [None]:
# Let's do some part-of-speech tagging. Notice the classification that is
# paired with every token.
# `tokens` is tagged once, directly: looping over `sentences` only repeated
# the same call and left `tagged` undefined when there were no sentences.
tagged = nltk.pos_tag(tokens)
print(tagged)

In [None]:
# pip install svgling
import svgling

In [None]:
# Run NLTK's named-entity chunker on a slice of the tagged tokens
# (positions 20 to 34); the slice keeps the resulting tree small.
entities = nltk.chunk.ne_chunk(tagged[20:35])

In [None]:
# Show the chunking result as the cell output.
entities