## Goal: to construct a means to count how many times words appear in a given document

## Toy Example: 

Go to this link and copy the example text: https://en.wikipedia.org/wiki/Lorem_ipsum

In [2]:
#Sample text (Lorem ipsum). Each trailing backslash continues the string
#literal, so the leading spaces of the following line are part of the string.
s = "Lorem ipsum dolor sit sit amet, consectetur adipiscing elit, \
     sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
     Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris \
     nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in \
     reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \
     pariatur. Excepteur sint occaecat cupidatat non proident, sunt in \
     culpa qui officia deserunt mollit anim id est laborum."
#Cut the text at every single space character; runs of consecutive spaces
#therefore leave empty strings ('') in the resulting list.
s_parsed = s.split(sep=" ")
print("Total number of words: ", len(s_parsed), sep="")
print(s_parsed)
#Last expression of the cell: how often "sit" occurs in the parsed list
s_parsed.count("sit")

Total number of words: 100
['Lorem', 'ipsum', 'dolor', 'sit', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit,', '', '', '', '', '', 'sed', 'do', 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua.', '', '', '', '', '', 'Ut', 'enim', 'ad', 'minim', 'veniam,', 'quis', 'nostrud', 'exercitation', 'ullamco', 'laboris', '', '', '', '', '', 'nisi', 'ut', 'aliquip', 'ex', 'ea', 'commodo', 'consequat.', 'Duis', 'aute', 'irure', 'dolor', 'in', '', '', '', '', '', 'reprehenderit', 'in', 'voluptate', 'velit', 'esse', 'cillum', 'dolore', 'eu', 'fugiat', 'nulla', '', '', '', '', '', 'pariatur.', 'Excepteur', 'sint', 'occaecat', 'cupidatat', 'non', 'proident,', 'sunt', 'in', '', '', '', '', '', 'culpa', 'qui', 'officia', 'deserunt', 'mollit', 'anim', 'id', 'est', 'laborum.']


2

In [4]:
#Count the occurrences of one word inside a list of words.
def num_times_appear(x, entire_list):
    """Return how many elements of `entire_list` are equal to `x`.

    x:           the word to look for
    entire_list: the full list of words (repeats included) that is scanned
    """
    #Emit a 1 for every matching element, then add the ones up
    matches = (1 for candidate in entire_list if candidate == x)
    return sum(matches)

#Reduce the parsed list to its distinct entries
set_strings = {entry for entry in s_parsed}
print(set_strings)

{'', 'magna', 'qui', 'pariatur.', 'Ut', 'cupidatat', 'id', 'ad', 'cillum', 'proident,', 'irure', 'elit,', 'dolore', 'deserunt', 'minim', 'labore', 'officia', 'amet,', 'sit', 'esse', 'occaecat', 'do', 'ipsum', 'Excepteur', 'tempor', 'consectetur', 'nostrud', 'consequat.', 'aliqua.', 'enim', 'nulla', 'quis', 'anim', 'dolor', 'laborum.', 'Duis', 'Lorem', 'sed', 'aute', 'exercitation', 'non', 'sint', 'mollit', 'laboris', 'sunt', 'adipiscing', 'culpa', 'ex', 'ullamco', 'ut', 'incididunt', 'commodo', 'est', 'et', 'eu', 'fugiat', 'in', 'velit', 'eiusmod', 'veniam,', 'reprehenderit', 'voluptate', 'nisi', 'aliquip', 'ea'}


In [5]:
#For every distinct word, scan the whole parsed list to count its occurrences
dict_words = dict((x, num_times_appear(x, s_parsed)) for x in set_strings)

print("Total number of unique words: ", len(dict_words), sep="")
print(dict_words)

Total number of unique words: 65
{'': 30, 'magna': 1, 'qui': 1, 'pariatur.': 1, 'Ut': 1, 'cupidatat': 1, 'id': 1, 'ad': 1, 'cillum': 1, 'proident,': 1, 'irure': 1, 'elit,': 1, 'dolore': 2, 'deserunt': 1, 'minim': 1, 'labore': 1, 'officia': 1, 'amet,': 1, 'sit': 2, 'esse': 1, 'occaecat': 1, 'do': 1, 'ipsum': 1, 'Excepteur': 1, 'tempor': 1, 'consectetur': 1, 'nostrud': 1, 'consequat.': 1, 'aliqua.': 1, 'enim': 1, 'nulla': 1, 'quis': 1, 'anim': 1, 'dolor': 2, 'laborum.': 1, 'Duis': 1, 'Lorem': 1, 'sed': 1, 'aute': 1, 'exercitation': 1, 'non': 1, 'sint': 1, 'mollit': 1, 'laboris': 1, 'sunt': 1, 'adipiscing': 1, 'culpa': 1, 'ex': 1, 'ullamco': 1, 'ut': 2, 'incididunt': 1, 'commodo': 1, 'est': 1, 'et': 1, 'eu': 1, 'fugiat': 1, 'in': 3, 'velit': 1, 'eiusmod': 1, 'veniam,': 1, 'reprehenderit': 1, 'voluptate': 1, 'nisi': 1, 'aliquip': 1, 'ea': 1}


A more efficient algorithm is again to step through each word in the list, but rather than count the number of appearances of each word, increment a counter for the word each time it is encountered. 

**Why is that?**

The function `num_times_appear` written above requires that you iterate through the entire list every time you call the function. You call the function for each word (which is a `key` in the dictionary we constructed). So, for 65 unique words, we call the function 65 times. But for each function call, we have to iterate through the entire list, which consists of **100** total words. So, effectively, we have to run through 65 times 100 total iterations (**6,500** in total!). This isn't very efficient!

We take advantage of Python's dictionary type and instead iterate through the entire list of words once (100 words total), but for each word, we look up the word in the dictionary and add one to the count that is associated with that key/word. This means instead of 6,500 iterations, we need only 100.

In [6]:
#Start every distinct word with a count of zero
dict_words2 = dict.fromkeys(set_strings, 0)
#One pass over the parsed list: bump the count of each word as it is met
for word in s_parsed:
    dict_words2[word] += 1

#Sanity check: both approaches should produce the same dictionary
print(dict_words == dict_words2)
#Prints True


True


## Less Trivial Case...

In [8]:
#Read the whole text file into one string. The `with` block closes the file
#handle as soon as reading is done, even if read() raises.
#NOTE: the absolute path is specific to the author's machine; point it at
#wherever the text file lives before running this cell.
with open('/Users/MelodyHuang/Desktop/PythonWorkshop/Week 2/\
AgathaChristie_TheMysteriousAffair.txt', 'r') as f:
    print(f)  #Show the file object (name, mode, encoding)
    myst_affair_text = f.read()

<_io.TextIOWrapper name='/Users/MelodyHuang/Desktop/PythonWorkshop/Week 2/AgathaChristie_TheMysteriousAffair.txt' mode='r' encoding='UTF-8'>


In [9]:
#Cut the text at every single space character. Only ' ' acts as a separator
#here, so any line breaks in the file stay inside the resulting pieces.
text_parsed = myst_affair_text.split(sep=" ")
print('Total number of words: ', len(text_parsed), sep="")

#Distinct pieces found in the text
set_text = {piece for piece in text_parsed}
print('Total number of unique words: ', len(set_text), sep="")

Total number of words: 59578
Total number of unique words: 10594


In [10]:
#Slow approach: one full scan of text_parsed for every distinct word
dict_text = dict((x, num_times_appear(x, text_parsed)) for x in set_text)

In [11]:
#Fast approach: zero-initialise every count, then walk the text only once
dict_text2 = dict.fromkeys(set_text, 0)
for word in text_parsed:
    dict_text2[word] += 1

In [12]:
#Show the complete word -> count dictionary for the loaded text
print(dict_text2)

