## Pre-processing PDF Text

In [5]:
import pandas as pd
import numpy as np
import nltk
import string
#import fasttext
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
# Rotate x-axis tick labels by 70 degrees on the current pyplot axes.
# NOTE(review): no plot is drawn in this cell, so this call may not affect
# any figure created later -- confirm whether it is still needed.
plt.xticks(rotation=70)
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Show up to 100 characters per cell when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 100)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [13]:
# Load the extracted PDF text; the first CSV column becomes the DataFrame index.
full_df = pd.read_csv(
    filepath_or_buffer='Full-Text.csv',
    index_col=0,
)

In [19]:
# Token-cleaning pipeline. Each stage is stored in its own column so the
# intermediate results can be inspected side by side.

# 1. Tokenize the raw text. A missing Text value (NaN) is treated as an empty
#    document, because word_tokenize raises when handed a non-string.
full_df['tokenized'] = full_df['Text'].fillna('').apply(word_tokenize)

# 2. Lower-case every token.
full_df['lower'] = full_df['tokenized'].apply(lambda tokens: [token.lower() for token in tokens])

# 3. Drop tokens that consist solely of punctuation characters.
#    `token not in punc` against the *string* string.punctuation is a substring
#    test, so multi-character punctuation tokens such as "``", "''", "..." or
#    "--" slipped through. Checking every character against a set removes all
#    pure-punctuation tokens while keeping mixed ones like "covid-19" or "1.0".
punc = string.punctuation
punc_chars = set(punc)
full_df['no_punc'] = full_df['lower'].apply(
    lambda tokens: [token for token in tokens if not all(ch in punc_chars for ch in token)]
)

# 4. Remove English stop words (set lookup keeps the per-token check cheap).
stop_words = set(stopwords.words('english'))
full_df['stopwords_removed'] = full_df['no_punc'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

full_df.head()

Unnamed: 0,Title,Text,tokenized,lower,no_punc,stopwords_removed,pos_tags
0,3002.pdf,"\n \nImmunization Program/Puerto Rico Department of Health\n \nOCTOBER 16, 2020\n| \nV\nERSION ...","[Immunization, Program/Puerto, Rico, Department, of, Health, OCTOBER, 16, ,, 2020, |, V, ERSION,...","[immunization, program/puerto, rico, department, of, health, october, 16, ,, 2020, |, v, ersion,...","[immunization, program/puerto, rico, department, of, health, october, 16, 2020, v, ersion, 1.0, ...","[immunization, program/puerto, rico, department, health, october, 16, 2020, v, ersion, 1.0, covi...","[(immunization, NN), (program/puerto, NN), (rico, VBP), (department, NN), (health, NN), (october..."
1,ARCEMP-1Sept2021-Final_ ARKANSAS.pdf,"Arkansas \nComprehensive \nEmergency \nManagement Plan \n (ARCEMP) \n September 1, 2021\n I ...","[Arkansas, Comprehensive, Emergency, Management, Plan, (, ARCEMP, ), September, 1, ,, 2021, I, S...","[arkansas, comprehensive, emergency, management, plan, (, arcemp, ), september, 1, ,, 2021, i, s...","[arkansas, comprehensive, emergency, management, plan, arcemp, september, 1, 2021, i, state, of,...","[arkansas, comprehensive, emergency, management, plan, arcemp, september, 1, 2021, state, arkans...","[(arkansas, NNS), (comprehensive, JJ), (emergency, NN), (management, NN), (plan, NN), (arcemp, V..."
2,covid-19-vaccination-plan-maine-interim-draft.pdf,"[Maine Center for Disease Control and Prevention\n][October \n16, 2020\n] | \n[V 1.0] COVID-19 \...","[[, Maine, Center, for, Disease, Control, and, Prevention, ], [, October, 16, ,, 2020, ], |, [, ...","[[, maine, center, for, disease, control, and, prevention, ], [, october, 16, ,, 2020, ], |, [, ...","[maine, center, for, disease, control, and, prevention, october, 16, 2020, v, 1.0, covid-19, vac...","[maine, center, disease, control, prevention, october, 16, 2020, v, 1.0, covid-19, vaccination, ...","[(maine, NN), (center, NN), (disease, NN), (control, NN), (prevention, NN), (october, IN), (16, ..."
3,covid-19-vaccination-plan-massachusetts-10-19-2020.pdf,Massachusetts Department of Public Health\n 16 OCTOBER 2020\n| \nV1.0\n COVID\n-19 \nVaccinatio...,"[Massachusetts, Department, of, Public, Health, 16, OCTOBER, 2020, |, V1.0, COVID, -19, Vaccinat...","[massachusetts, department, of, public, health, 16, october, 2020, |, v1.0, covid, -19, vaccinat...","[massachusetts, department, of, public, health, 16, october, 2020, v1.0, covid, -19, vaccination...","[massachusetts, department, public, health, 16, october, 2020, v1.0, covid, -19, vaccination, pl...","[(massachusetts, JJ), (department, NN), (public, JJ), (health, NN), (16, CD), (october, NN), (20..."
4,covid-icf-iid-response-plan_TX.pdf,Page | \ni COVID\n-19 RESPONSE \nFOR INTERMEDIATE \nCARE FACILITIES FOR \nINDIVIDUALS WITH A...,"[Page, |, i, COVID, -19, RESPONSE, FOR, INTERMEDIATE, CARE, FACILITIES, FOR, INDIVIDUALS, WITH, ...","[page, |, i, covid, -19, response, for, intermediate, care, facilities, for, individuals, with, ...","[page, i, covid, -19, response, for, intermediate, care, facilities, for, individuals, with, an,...","[page, covid, -19, response, intermediate, care, facilities, individuals, intellectual, disabili...","[(page, NN), (covid, NN), (-19, NNP), (response, NN), (intermediate, NN), (care, NN), (facilitie..."


In [22]:
# Attach a part-of-speech tag to every remaining token, producing one list of
# (token, tag) pairs per document.
# NOTE(review): tagging runs on lower-cased, stopword-stripped tokens rather
# than on the original sentences -- confirm this ordering is intentional.
full_df['pos_tags'] = full_df['stopwords_removed'].apply(
    lambda tokens: nltk.tag.pos_tag(tokens)
)

def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.

    Parameters
    ----------
    tag : str
        POS tag as found in the second element of the pos_tags pairs.

    Returns
    -------
    One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_constant in prefix_table:
        if tag.startswith(prefix):
            return wordnet_constant
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
# Re-express each (token, tag) pair with the WordNet POS constant that the
# lemmatizer expects.
full_df['wordnet_pos'] = full_df['pos_tags'].apply(
    lambda tagged_tokens: [
        (token, get_wordnet_pos(treebank_tag))
        for token, treebank_tag in tagged_tokens
    ]
)

# Lemmatize every token using its WordNet POS.
wnl = WordNetLemmatizer()
full_df['lemmatized'] = full_df['wordnet_pos'].apply(
    lambda token_pos_pairs: [
        wnl.lemmatize(token, wordnet_tag)
        for token, wordnet_tag in token_pos_pairs
    ]
)
full_df.head()

Unnamed: 0,Title,Text,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,3002.pdf,"\n \nImmunization Program/Puerto Rico Department of Health\n \nOCTOBER 16, 2020\n| \nV\nERSION ...","[Immunization, Program/Puerto, Rico, Department, of, Health, OCTOBER, 16, ,, 2020, |, V, ERSION,...","[immunization, program/puerto, rico, department, of, health, october, 16, ,, 2020, |, v, ersion,...","[immunization, program/puerto, rico, department, of, health, october, 16, 2020, v, ersion, 1.0, ...","[immunization, program/puerto, rico, department, health, october, 16, 2020, v, ersion, 1.0, covi...","[(immunization, NN), (program/puerto, NN), (rico, VBP), (department, NN), (health, NN), (october...","[(immunization, n), (program/puerto, n), (rico, v), (department, n), (health, n), (october, n), ...","[immunization, program/puerto, rico, department, health, october, 16, 2020, v, ersion, 1.0, covi..."
1,ARCEMP-1Sept2021-Final_ ARKANSAS.pdf,"Arkansas \nComprehensive \nEmergency \nManagement Plan \n (ARCEMP) \n September 1, 2021\n I ...","[Arkansas, Comprehensive, Emergency, Management, Plan, (, ARCEMP, ), September, 1, ,, 2021, I, S...","[arkansas, comprehensive, emergency, management, plan, (, arcemp, ), september, 1, ,, 2021, i, s...","[arkansas, comprehensive, emergency, management, plan, arcemp, september, 1, 2021, i, state, of,...","[arkansas, comprehensive, emergency, management, plan, arcemp, september, 1, 2021, state, arkans...","[(arkansas, NNS), (comprehensive, JJ), (emergency, NN), (management, NN), (plan, NN), (arcemp, V...","[(arkansas, n), (comprehensive, a), (emergency, n), (management, n), (plan, n), (arcemp, v), (se...","[arkansas, comprehensive, emergency, management, plan, arcemp, september, 1, 2021, state, arkans..."
2,covid-19-vaccination-plan-maine-interim-draft.pdf,"[Maine Center for Disease Control and Prevention\n][October \n16, 2020\n] | \n[V 1.0] COVID-19 \...","[[, Maine, Center, for, Disease, Control, and, Prevention, ], [, October, 16, ,, 2020, ], |, [, ...","[[, maine, center, for, disease, control, and, prevention, ], [, october, 16, ,, 2020, ], |, [, ...","[maine, center, for, disease, control, and, prevention, october, 16, 2020, v, 1.0, covid-19, vac...","[maine, center, disease, control, prevention, october, 16, 2020, v, 1.0, covid-19, vaccination, ...","[(maine, NN), (center, NN), (disease, NN), (control, NN), (prevention, NN), (october, IN), (16, ...","[(maine, n), (center, n), (disease, n), (control, n), (prevention, n), (october, n), (16, n), (2...","[maine, center, disease, control, prevention, october, 16, 2020, v, 1.0, covid-19, vaccination, ..."
3,covid-19-vaccination-plan-massachusetts-10-19-2020.pdf,Massachusetts Department of Public Health\n 16 OCTOBER 2020\n| \nV1.0\n COVID\n-19 \nVaccinatio...,"[Massachusetts, Department, of, Public, Health, 16, OCTOBER, 2020, |, V1.0, COVID, -19, Vaccinat...","[massachusetts, department, of, public, health, 16, october, 2020, |, v1.0, covid, -19, vaccinat...","[massachusetts, department, of, public, health, 16, october, 2020, v1.0, covid, -19, vaccination...","[massachusetts, department, public, health, 16, october, 2020, v1.0, covid, -19, vaccination, pl...","[(massachusetts, JJ), (department, NN), (public, JJ), (health, NN), (16, CD), (october, NN), (20...","[(massachusetts, a), (department, n), (public, a), (health, n), (16, n), (october, n), (2020, n)...","[massachusetts, department, public, health, 16, october, 2020, v1.0, covid, -19, vaccination, pl..."
4,covid-icf-iid-response-plan_TX.pdf,Page | \ni COVID\n-19 RESPONSE \nFOR INTERMEDIATE \nCARE FACILITIES FOR \nINDIVIDUALS WITH A...,"[Page, |, i, COVID, -19, RESPONSE, FOR, INTERMEDIATE, CARE, FACILITIES, FOR, INDIVIDUALS, WITH, ...","[page, |, i, covid, -19, response, for, intermediate, care, facilities, for, individuals, with, ...","[page, i, covid, -19, response, for, intermediate, care, facilities, for, individuals, with, an,...","[page, covid, -19, response, intermediate, care, facilities, individuals, intellectual, disabili...","[(page, NN), (covid, NN), (-19, NNP), (response, NN), (intermediate, NN), (care, NN), (facilitie...","[(page, n), (covid, n), (-19, n), (response, n), (intermediate, n), (care, n), (facilities, n), ...","[page, covid, -19, response, intermediate, care, facility, individual, intellectual, disability,..."


In [23]:
# Persist the frame, including all intermediate token columns, to CSV.
# NOTE(review): list-valued columns are written as their string form, so a
# later reader has to parse them back into lists -- confirm downstream usage.
full_df.to_csv(
    path_or_buf='Full_text_TOKEN_clean.csv',
)