In [5]:
import os
import re, csv, string, pandas as pd
from collections import defaultdict
from numpy import loadtxt
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.probability import FreqDist
import nltk
nltk.download('punkt')
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sharonku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [69]:
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")
fdist = FreqDist()

dictionary = set()
papers = {}
path = "HW1_TXT_files"
stop_words = []
all_words = []

for file_name in os.listdir(path):
    with open(path + os.sep + file_name, 'r', encoding = "utf8") as f:
        papers[file_name.lower()] = tokenizer.tokenize(f.read())
        for word in papers[file_name.lower()]:
            fdist[word] += 1
            all_words.append(word)
        dictionary = dictionary.union(set(papers[file_name.lower()]))

amount = sum(fdist.values())
print("Amount of words in corpus is ", amount)    
print("Dictionary size is ", len(dictionary))

# generate language model
for key, value in fdist.items():
    fdist[key] = value / amount

with open("stop_words_english.txt", "r", encoding = "utf8") as stop_words_file:
    stop_words = stop_words_file.read().split("\n")

dictionary_stopwords = set()
dictionary_casefold = set()
dictionary_stemming = set()


#stop words removal
stopwords_filter = [w for w in all_words if not w.lower() in stop_words]
dictionary_stopwords = dictionary_stopwords.union(set(stopwords_filter))
# generate lang model

fdist_stopwords = FreqDist(stopwords_filter)
stopwords_amount = sum(fdist_stopwords.values())
for key, value in fdist_stopwords.items():
    fdist_stopwords[key] = value / stopwords_amount
    
# case folding
casefold = [w.lower() for w in stopwords_filter]
dictionary_casefold = dictionary_casefold.union(set(casefold))

#generate lang model
fdist_casefold = FreqDist(casefold)
casefold_amount = sum(fdist_casefold.values())
for key, value in fdist_casefold.items():
    fdist_casefold[key] = value / casefold_amount

# stemming
stemmer = PorterStemmer()
stemming = [stemmer.stem(w) for w in casefold]
dictionary_stemming = dictionary_stemming.union(set(stemming))

# generate lang model
fdist_stemming = FreqDist(stemming)
stemming_amount = sum(fdist_stemming.values())
for key, value in fdist_stemming.items():
    fdist_stemming[key] = value / stemming_amount

for title, paper in papers.items():
    stopwords_filter = [w for w in paper if not w.lower() in stop_words]
    casefold = [w.lower() for w in stopwords_filter]
    stemmer = PorterStemmer()
    stemming = [stemmer.stem(w) for w in casefold]
    papers[title] = stemming

Amount of words in corpus is  536960
Dictionary size is  26089


In [117]:
print("initial Dictionary size is ", len(dictionary))

print("Dictionary size after stop words removal is ", len(dictionary_stopwords))

print("Dictionary size after case folding is ", len(dictionary_casefold))

print("Dictionary size after stemming is ", len(dictionary_stemming))

initial Dictionary size is  26089
Dictionary size after stop words removal is  24872
Dictionary size after case folding is  20064
Dictionary size after stemming is  14363


In [116]:
print("initial language model")
fdist.pprint()

print("\nlanguage model after stop words removal")
fdist_stopwords.pprint()

print("\nlanguage model after case folding")
fdist_casefold.pprint()

print("\nlanguage model after stemming")
fdist_stemming.pprint()

initial language model
FreqDist({'the': 0.062270932657926104, 'of': 0.03338609952324195, 'and': 0.028834550059594757, 'to': 0.021897348033373062, 'in': 0.018723927294398094, 'a': 0.017740613825983312, 'is': 0.009320992252681765, 'for': 0.009266984505363528, 'that': 0.009108685935637665, 'The': 0.007769666269368296, ...})

language model after stop words removal
FreqDist({'museum': 0.010523861221119589, 'mobile': 0.00907506230982125, 'guide': 0.008341106133121301, 'user': 0.008077340632119756, 'visitors': 0.007675958347986973, 'visitor': 0.004690438691723115, 'al': 0.003990886710805976, 'Museum': 0.003922078319240355, 'time': 0.003902964877138794, 'guides': 0.0037194758329638067, ...})

language model after case folding
FreqDist({'museum': 0.014656187403477117, 'mobile': 0.01151776021040077, 'guide': 0.009690515145491522, 'user': 0.009575834492882154, 'visitors': 0.008417559901527547, 'visitor': 0.005114757106377773, 'guides': 0.004384623618098136, 'context': 0.0042814110307497055, 'al'

In [107]:
df = pd.DataFrame(data = {"fdist_initial": fdist, "fdist_stopwords":fdist_stopwords, "fdist_casefold":fdist_casefold, "fdist_stemming": fdist_stemming}).sort_values(by = "fdist_stemming", ascending = False)
df

Unnamed: 0,fdist_initial,fdist_stopwords,fdist_casefold,fdist_stemming
museum,0.005127,0.010524,0.014656,0.017363
guid,,,,0.014656
visitor,0.002285,0.004690,0.005115,0.013532
user,0.003935,0.008077,0.009576,0.013104
mobil,,,,0.011644
...,...,...,...,...
sensibility,,,0.000004,
shoogle,,,0.000004,
excitatory,,,0.000004,
feasability,,,0.000004,


In [26]:
df = pd.DataFrame(papers.items())

In [4]:
len(papers)

50

In [15]:
len(dictionary)

26089

In [41]:
fdist

FreqDist({'the': 0.062270932657926104, 'of': 0.03338609952324195, 'and': 0.028834550059594757, 'to': 0.021897348033373062, 'in': 0.018723927294398094, 'a': 0.017740613825983312, 'is': 0.009320992252681765, 'for': 0.009266984505363528, 'that': 0.009108685935637665, 'The': 0.007769666269368296, ...})

In [40]:
# for initial language model
fdist["mobile"]+fdist["visitors"]+fdist["guide"]

#for language model after stop words removal

# for language model after case folding

# for language model after stemming

0.012224374255065555

In [46]:
sum(fdist_stopwords.values())

FreqDist({'museum': 2753, 'mobile': 2374, 'guide': 2182, 'user': 2113, 'visitors': 2008, 'visitor': 1227, 'al': 1044, 'Museum': 1026, 'time': 1021, 'guides': 973, ...})