In [92]:
# import packages
import nltk
import pathlib
import re
import pandas as pd

In [93]:
# load main list of 113 words/phrases from CDC guidelines ("problem cases" are handled separately)
with open('./CDC_main.txt','r') as f:
    difficult_main = f.readlines()
    difficult_main = [x.strip() for x in difficult_main]

In [94]:
# words and phrases requiring special handling because of duplicates
problem_words = ['risk', 'disease', 'assessment', 'registry']
problem_phrases = ['risk assessment', 'disease registry', 'chronic disease', 'risk factor']

In [95]:
# total number of owrds and phrases considered
len(difficult_main+problem_words+problem_phrases)

121

In [96]:
# count how many words/phrases in list appear at least once in text
def howManyPhrases (phrases, text):
    count = 0
    for phrase in phrases:
        if re.search(r"\b" + re.escape(phrase.lower()) + r"\b", text.lower()):
            count += 1
    return count

In [97]:
# count problem words after removing problem phrases
def correctionCount(text):
    for word in problem_phrases:
        text = text.replace(word, "")
    correction = howManyPhrases(problem_words, text)
    return correction

In [107]:
# count for entire list 
def totalCount(text):
    # main list
    count1 = howManyPhrases(difficult_main, text)
    # problem words
    count2 = correctionCount(text)
    # problem phrases
    count3 = howManyPhrases(problem_phrases,text)
    difficult_count = count1 + count2 + count3
    return difficult_count

In [None]:
# analyze CDC files
path = pathlib.Path("./CDC_txt_all")

filename = []
token_count = []
count = []
frac = []

# loop through each file
for entry in path.iterdir():
    if entry.is_file(): 
        with io.open(entry, 'r', encoding='windows-1252') as f:
        #with open(entry,'r') as f:
            
            raw = f.read()
            tokens = nltk.word_tokenize(raw)
            
            # count of difficult words and phrases that appear at least once 
            difficult_count = howManyWords(difficult_all, raw)
            # fraction of difficult words and phrases appearing
            difficult_frac = difficult_count / len(difficult_all)
            
            # total instances of difficult words and phrases
            difficult_total = cumulativeCount(difficult_all, raw)
            # per-token frequency of difficult words and phrases
            difficult_freq = difficult_total / len(tokens)
            
            filename.append(entry)
            token_count.append(len(tokens))
            
            count.append(difficult_count)
            frac.append(difficult_frac)
            total.append(difficult_total)
            freq.append(difficult_freq) 

In [98]:
# generate final counts tokens and difficult words and phrases
def finalValues(text):
    
    #token count
    tokens = nltk.word_tokenize(text)
    
    # count of difficult words and phrases that appear at least once
    # main list
    count1 = howManyPhrases(difficult_main, text)
    # problem words
    count2 = correctionCount(text)
    # problem phrases
    count3 = howManyPhrases(problem_phrases,text)
    # total count
    difficult_count = count1 + count2 + count3
    
    # fraction of difficult words and phrases appearing
    difficult_frac = difficult_count / 121
    
    return filename, token_count, count, fraction

In [104]:
# analyze CDC files
path = pathlib.Path("./CDC_txt_all")

filename_CDC = []
token_count = []
count_CDC = []
fraction_CDC = []

for entry in path.iterdir():
    if entry.is_file(): 
        with open(entry,'r', errors='ignore') as f:
            raw = f.read()
            finalValues(raw)
            
# export CDC results to csv
df_filename_CDC = pd.DataFrame({'file': filename_CDC})
df_token_count_CDC = pd.DataFrame({'tokens': token_count_CDC})
df_count_CDC = pd.DataFrame({'raw count': count_CDC})
df_frac_CDC = pd.DataFrame({'fraction': fraction_CDC})

df_combined = pd.concat([df_filename_CDC, df_token_count_CDC, df_count_CDC, df_frac_CDC], axis=1)
df_combined_final = df_combined.round(3).sort_values(by='file')
df_combined_final.to_csv('./CDCpages_difficult_words_analysis.csv', sep=',',index=False)

In [106]:
token_count

[]

In [100]:
# analyze US state files
path = pathlib.Path("./50_states")

filename_states = []
token_count_states = []
count_states = []
fraction_states = []

for entry in path.iterdir():
    if entry.is_file(): 
        with open(entry,'r', errors='ignore') as f:
            raw = f.read()
            finalValues(raw)
            
# export state results to csv
df_filename_states = pd.DataFrame({'file': filename_states})
df_token_count_states = pd.DataFrame({'tokens': token_count_states})
df_count_states = pd.DataFrame({'raw count': count_states})
df_frac_states = pd.DataFrame({'fraction': fraction_states})

df_combined = pd.concat([df_filename_states, df_token_count_states, df_count_states, df_frac_states], axis=1)
df_combined_final = df_combined.round(3).sort_values(by='file')
df_combined_final.to_csv('./states_difficult_words_analysis.csv', sep=',',index=False)