In [1]:
# import packages
import nltk
import pathlib
import io
import re
import pandas as pd

In [2]:
# load full list of 121 words/phrases from CDC guidelines
with open('./CDC_difficult_all.txt','r') as f:
    difficult_all = f.readlines()
    difficult_all = [x.strip() for x in difficult_all]
# total number of words
len(difficult_all)

121

In [3]:
# count how many words in list appear at least once in text
def howManyWords (words, text):
    tokens = nltk.word_tokenize(text.lower())
    count = 0
    for word in words:
        if word.lower() in tokens:
            count += 1
    return count

In [4]:
# count aggregate instances of list of words/phrases in a text
def cumulativeCount(words, text):
    counts = []
    for word in words:
        count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(word), text.lower()))
        counts.append(count)
    return sum(counts)

In [5]:
# directory of txt files to be analyzed
path = pathlib.Path("./CDC_txt_all")

filename = []
token_count = []

count = []
frac = []
total = []
freq =[]

# loop through each file
for entry in path.iterdir():
    if entry.is_file(): 
        with io.open(entry, 'r', encoding='windows-1252') as f:
        #with open(entry,'r') as f:
            
            raw = f.read()
            tokens = nltk.word_tokenize(raw)
            
            # count of difficult words and phrases that appear at least once 
            difficult_count = howManyWords(difficult_all, raw)
            # fraction of difficult words and phrases appearing
            difficult_frac = difficult_count / len(difficult_all)
            
            # total instances of difficult words and phrases
            difficult_total = cumulativeCount(difficult_all, raw)
            # per-token frequency of difficult words and phrases
            difficult_freq = difficult_total / len(tokens)
            
            filename.append(entry)
            token_count.append(len(tokens))
            
            count.append(difficult_count)
            frac.append(difficult_frac)
            total.append(difficult_total)
            freq.append(difficult_freq) 
            

In [11]:
# print results
print(count)
print(frac)
print(total)
print(freq)

[16, 11, 18, 12, 13, 9, 5, 31, 8, 10, 0, 27, 4, 4, 13, 3, 7, 22, 16, 0, 23, 26, 12, 8, 10, 7, 28, 48, 6, 3, 8, 5, 6, 11, 3, 20, 10, 4, 9, 11, 7, 3, 15, 6, 4, 5, 15, 5, 12, 23, 15, 5, 11, 19, 11, 10, 7, 22, 11, 24, 11, 13, 13, 4, 8, 20, 11, 6, 3]
[0.1322314049586777, 0.09090909090909091, 0.1487603305785124, 0.09917355371900827, 0.10743801652892562, 0.0743801652892562, 0.04132231404958678, 0.256198347107438, 0.06611570247933884, 0.08264462809917356, 0.0, 0.2231404958677686, 0.03305785123966942, 0.03305785123966942, 0.10743801652892562, 0.024793388429752067, 0.05785123966942149, 0.18181818181818182, 0.1322314049586777, 0.0, 0.19008264462809918, 0.21487603305785125, 0.09917355371900827, 0.06611570247933884, 0.08264462809917356, 0.05785123966942149, 0.23140495867768596, 0.39669421487603307, 0.049586776859504134, 0.024793388429752067, 0.06611570247933884, 0.04132231404958678, 0.049586776859504134, 0.09090909090909091, 0.024793388429752067, 0.1652892561983471, 0.08264462809917356, 0.033057851

In [12]:
# export results to csv
df_filename = pd.DataFrame({'file': filename})
df_token_count = pd.DataFrame({'tokens': token_count})
df_count = pd.DataFrame({'raw count all': count})
df_frac = pd.DataFrame({'fraction all': frac})
df_total = pd.DataFrame({'raw total count all': total})
df_freq = pd.DataFrame({'frequency all': freq})

df_combined = pd.concat([df_filename, df_token_count, df_count, df_frac, df_total, df_freq], axis=1)
df_combined_final = df_combined.round(3).sort_values(by='file')
df_combined_final.to_csv('./CDC_difficult_words_analysis.csv', sep=',',index=False)