In [1]:
# Python notebook for counting the number of difficult words and phrases that appear in a document
# List of words and phrases taken from CDC's "Everyday Words for Public Health Communication"
# https://www.cdc.gov/other/pdf/EverydayWordsForPublicHealthCommunication.pdf

In [None]:
# import packages
import nltk
import pathlib
import re
import pandas as pd

In [2]:
# load main list of 113 words/phrases from CDC guidelines ("problem cases" are handled separately)
with open('./CDC_main.txt','r') as f:
    # One entry per line. Surrounding whitespace (including the newline) is
    # stripped, and blank lines are dropped: an empty entry would turn into the
    # pattern r"\b\b" downstream, which matches practically every document and
    # would inflate both the list size and every per-file count.
    difficult_main = [line.strip() for line in f if line.strip()]

In [3]:
# words and phrases requiring special handling because of duplicates
# Single words that also occur inside one of the multi-word phrases below, so
# they are only counted when they appear outside those phrases.
problem_words = [
    'risk',
    'disease',
    'assessment',
    'registry',
]
# Multi-word phrases that contain at least one of the problem words above.
problem_phrases = [
    'risk assessment',
    'disease registry',
    'chronic disease',
    'risk factor',
]

In [4]:
# total number of words and phrases considered
len(difficult_main) + len(problem_words) + len(problem_phrases)

121

In [5]:
# count how many words/phrases in list appear at least once in text
def howManyPhrases(phrases, text):
    """Count how many entries of ``phrases`` occur at least once in ``text``.

    Matching is case-insensitive (both the phrase and the text are lower-cased)
    and anchored on regex word boundaries, so 'risk' is not found inside
    'risky'. Each entry contributes at most 1 to the result, no matter how many
    times it occurs in the text.

    Args:
        phrases: iterable of words/phrases (strings) to look for.
        text: document text to search.

    Returns:
        int: number of entries of ``phrases`` that appear in ``text``.
    """
    # Lower-case the document once rather than once per phrase.
    lowered_text = text.lower()
    count = 0
    for phrase in phrases:
        # Skip empty / whitespace-only entries (e.g. from a blank line in the
        # word list): r"\b\b" or r"\b \b" would match ordinary text and be
        # counted as a "found" phrase.
        if not phrase.strip():
            continue
        if re.search(r"\b" + re.escape(phrase.lower()) + r"\b", lowered_text):
            count += 1
    return count

In [6]:
# count problem words after removing problem phrases
def correctionCount(text):
    """Count the problem words that occur in ``text`` outside any problem phrase.

    Every occurrence of each entry in ``problem_phrases`` is removed from the
    text first; the remaining text is then searched for the entries of
    ``problem_words`` with ``howManyPhrases``.

    The removal uses the same matching rule as ``howManyPhrases``
    (case-insensitive, whole-phrase on word boundaries). A plain
    ``str.replace`` is case-sensitive, so a capitalised "Risk assessment" would
    survive the removal and be counted both as the phrase and as the word
    "risk". It also ignores word boundaries, so "risk factors" would lose its
    "risk" even though the phrase 'risk factor' is not counted for that text.

    Args:
        text: document text to analyse.

    Returns:
        int: number of entries of ``problem_words`` found at least once in the
        text after the problem phrases have been stripped out.
    """
    for phrase in problem_phrases:
        phrase_pattern = r"\b" + re.escape(phrase) + r"\b"
        # Both ends sit on a word boundary, so deleting the match cannot glue
        # two neighbouring words together.
        text = re.sub(phrase_pattern, "", text, flags=re.IGNORECASE)
    return howManyPhrases(problem_words, text)

In [7]:
# count for entire list 
def totalCount(text):
    """Return the number of listed words/phrases found in ``text``.

    The result is the sum of three partial counts: entries of the main list,
    problem words (counted by ``correctionCount``), and problem phrases.
    """
    return (
        howManyPhrases(difficult_main, text)       # main list
        + correctionCount(text)                     # problem words
        + howManyPhrases(problem_phrases, text)     # problem phrases
    )

In [12]:
# analyze CDC files
path = pathlib.Path("./CDC_txt_all")

# Denominator for the fraction column: the size of the complete word/phrase
# list, derived from the lists themselves so it stays correct if they change
# (instead of a hard-coded 121).
total_phrases = len(difficult_main) + len(problem_words) + len(problem_phrases)

# Per-file results, kept as parallel lists (one entry per analysed file).
filename = []
token_count = []
count = []
frac = []

# loop through each file (sub-directories are skipped)
for entry in path.iterdir():
    if not entry.is_file():
        continue

    # errors='ignore' drops bytes that cannot be decoded instead of raising
    with open(entry,'r', errors='ignore') as f:
        raw = f.read()

    # token count
    tokens = nltk.word_tokenize(raw)
    # count of difficult words and phrases
    difficult_count = totalCount(raw)
    # fraction of difficult words and phrases appearing
    difficult_frac = difficult_count / total_phrases

    filename.append(entry)
    token_count.append(len(tokens))
    count.append(difficult_count)
    frac.append(difficult_frac)

In [13]:
# export CDC results to csv
# Column label -> per-file values, in the order the columns appear in the CSV.
column_data = {
    'file': filename,
    'tokens': token_count,
    'raw count': count,
    'fraction': frac,
}
# One single-column frame per result list, then glued together side by side.
single_column_frames = [
    pd.DataFrame({label: values}) for label, values in column_data.items()
]
df_filename, df_token_count, df_count, df_frac = single_column_frames

df_combined = pd.concat(single_column_frames, axis=1)
# Round numeric columns to 3 decimals and order the rows by file.
df_combined_final = df_combined.round(3).sort_values(by='file')
df_combined_final.to_csv('./CDC_difficult_words_analysis.csv', sep=',', index=False)

In [14]:
# analyze state files
path = pathlib.Path("./50_states")

# Denominator for the fraction column: the size of the complete word/phrase
# list, derived from the lists themselves so it stays correct if they change
# (instead of a hard-coded 121).
total_phrases = len(difficult_main) + len(problem_words) + len(problem_phrases)

# Per-file results, kept as parallel lists (one entry per analysed file).
filename = []
token_count = []
count = []
frac = []

# loop through each file (sub-directories are skipped)
for entry in path.iterdir():
    if not entry.is_file():
        continue

    # errors='ignore' drops bytes that cannot be decoded instead of raising
    with open(entry,'r', errors='ignore') as f:
        raw = f.read()

    # token count
    tokens = nltk.word_tokenize(raw)
    # count of difficult words and phrases
    difficult_count = totalCount(raw)
    # fraction of difficult words and phrases appearing
    difficult_frac = difficult_count / total_phrases

    filename.append(entry)
    token_count.append(len(tokens))
    count.append(difficult_count)
    frac.append(difficult_frac)

In [15]:
# export state results to csv
# Column label -> per-file values, in the order the columns appear in the CSV.
column_data = {
    'file': filename,
    'tokens': token_count,
    'raw count': count,
    'fraction': frac,
}
# One single-column frame per result list, then glued together side by side.
single_column_frames = [
    pd.DataFrame({label: values}) for label, values in column_data.items()
]
df_filename, df_token_count, df_count, df_frac = single_column_frames

df_combined = pd.concat(single_column_frames, axis=1)
# Round numeric columns to 3 decimals and order the rows by file.
df_combined_final = df_combined.round(3).sort_values(by='file')
df_combined_final.to_csv('./states_difficult_words_analysis.csv', sep=',', index=False)