In [1]:
# Imports used by the cells below.
# NOTE(review): several of these names (re, csv, string, defaultdict, loadtxt,
# WordNetLemmatizer, Counter) are not referenced in the cells visible here --
# confirm against the rest of the notebook before pruning.
import os
import re, csv, string, pandas as pd
from collections import defaultdict
from numpy import loadtxt
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
# Wildcard import: the cells below rely on it to bring PorterStemmer into scope.
from nltk.stem.porter import *
from nltk.probability import FreqDist
import nltk
# Side effect at import time: makes sure the 'punkt' package is present in the
# local nltk_data directory (the recorded output reports "already up-to-date").
nltk.download('punkt')
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sharonku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
# --- Load the corpus and collect the raw (unprocessed) vocabulary -----------
# A token is a maximal run of ASCII letters; everything else (digits,
# punctuation, non-ASCII characters) acts as a separator and is dropped.
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")
fdist = FreqDist()      # raw token -> occurrence count over the whole corpus

dictionary = set()      # distinct raw tokens (case-sensitive)
papers = {}             # lower-cased file name -> token list of that paper
path = "HW1_TXT_files"
stop_words = []         # placeholder; populated from the stop-word file later
all_words = []          # every token of the corpus, in reading order

for file_name in os.listdir(path):
    file_path = os.path.join(path, file_name)
    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
    # open() would raise on those, so only regular files are read.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding = "utf8") as f:
        tokens = tokenizer.tokenize(f.read())
    papers[file_name.lower()] = tokens
    fdist.update(tokens)        # count every occurrence
    all_words.extend(tokens)
    dictionary.update(tokens)   # in-place; avoids rebuilding the set per file

amount = sum(fdist.values())
print("Amount of words in corpus is ", amount)
print("Dictionary size is ", len(dictionary))

# generate language model
# Replace each raw count with its relative frequency (count / total tokens).
# Iterating over a snapshot of the keys; only values of existing keys change.
for token in list(fdist):
    fdist[token] = fdist[token] / amount

# Stop-word list: the file is split on newlines, one entry per line.
with open("stop_words_english.txt", "r", encoding = "utf8") as stop_words_file:
    stop_words = stop_words_file.read().split("\n")

# Vocabularies after each preprocessing stage (filled in stage by stage).
dictionary_stopwords = set()
dictionary_casefold = set()
dictionary_stemming = set()


#stop words removal
# The membership test runs once per corpus token, so it is done against a set
# (constant-time lookup) instead of scanning the stop-word list every time.
# `stop_words` itself stays a list for any code that expects one.
stop_word_lookup = frozenset(stop_words)
stopwords_filter = [w for w in all_words if w.lower() not in stop_word_lookup]
dictionary_stopwords = dictionary_stopwords.union(set(stopwords_filter))

# generate lang model
# Counts first, then each count is replaced by count / total kept tokens.
fdist_stopwords = FreqDist(stopwords_filter)
stopwords_amount = sum(fdist_stopwords.values())
for key, value in fdist_stopwords.items():
    fdist_stopwords[key] = value / stopwords_amount
    
# case folding
# Lower-case every token that survived stop-word removal.
casefold = list(map(str.lower, stopwords_filter))
dictionary_casefold = dictionary_casefold | set(casefold)

#generate lang model
# Relative frequency of each lower-cased token.
fdist_casefold = FreqDist(casefold)
casefold_amount = len(casefold)   # equals the sum of all counts
for term in list(fdist_casefold):
    fdist_casefold[term] = fdist_casefold[term] / casefold_amount

# stemming
# Reduce every lower-cased token to its Porter stem.
stemmer = PorterStemmer()
stemming = list(map(stemmer.stem, casefold))
dictionary_stemming = dictionary_stemming | set(stemming)

# generate lang model
# Relative frequency of each stem.
fdist_stemming = FreqDist(stemming)
stemming_amount = len(stemming)   # equals the sum of all counts
for stem in list(fdist_stemming):
    fdist_stemming[stem] = fdist_stemming[stem] / stemming_amount

# Replace each paper's raw token list with its fully preprocessed form:
# stop words removed -> lower-cased -> Porter-stemmed.
#
# The loop deliberately uses its own local names so that the corpus-level
# `stopwords_filter`, `casefold` and `stemming` lists built earlier in this
# cell are not overwritten with the tokens of whichever paper comes last.
stemmer = PorterStemmer()                  # one instance serves every paper
stop_word_lookup = frozenset(stop_words)   # constant-time membership test

for title, paper in papers.items():
    # Only values of existing keys are reassigned, so iterating is safe.
    papers[title] = [
        stemmer.stem(w.lower())
        for w in paper
        if w.lower() not in stop_word_lookup
    ]

Amount of words in corpus is  536960
Dictionary size is  26089


In [117]:
# Report how the vocabulary shrinks after each preprocessing stage.
for label, vocabulary in (
    ("initial Dictionary size is ", dictionary),
    ("Dictionary size after stop words removal is ", dictionary_stopwords),
    ("Dictionary size after case folding is ", dictionary_casefold),
    ("Dictionary size after stemming is ", dictionary_stemming),
):
    print(label, len(vocabulary))

initial Dictionary size is  26089
Dictionary size after stop words removal is  24872
Dictionary size after case folding is  20064
Dictionary size after stemming is  14363


In [116]:
# Show the most frequent entries of the language model after each stage.
# Every heading except the first starts with a newline to separate sections.
for heading, model in (
    ("initial language model", fdist),
    ("\nlanguage model after stop words removal", fdist_stopwords),
    ("\nlanguage model after case folding", fdist_casefold),
    ("\nlanguage model after stemming", fdist_stemming),
):
    print(heading)
    model.pprint()

initial language model
FreqDist({'the': 0.062270932657926104, 'of': 0.03338609952324195, 'and': 0.028834550059594757, 'to': 0.021897348033373062, 'in': 0.018723927294398094, 'a': 0.017740613825983312, 'is': 0.009320992252681765, 'for': 0.009266984505363528, 'that': 0.009108685935637665, 'The': 0.007769666269368296, ...})

language model after stop words removal
FreqDist({'museum': 0.010523861221119589, 'mobile': 0.00907506230982125, 'guide': 0.008341106133121301, 'user': 0.008077340632119756, 'visitors': 0.007675958347986973, 'visitor': 0.004690438691723115, 'al': 0.003990886710805976, 'Museum': 0.003922078319240355, 'time': 0.003902964877138794, 'guides': 0.0037194758329638067, ...})

language model after case folding
FreqDist({'museum': 0.014656187403477117, 'mobile': 0.01151776021040077, 'guide': 0.009690515145491522, 'user': 0.009575834492882154, 'visitors': 0.008417559901527547, 'visitor': 0.005114757106377773, 'guides': 0.004384623618098136, 'context': 0.0042814110307497055, 'al'

In [107]:
# Side-by-side table of the four language models, one column per stage.
# Rows are aligned on the token string, so a token missing from a stage's
# model shows NaN in that column. Sorted by the stemmed-model probability.
language_models = {
    "fdist_initial": fdist,
    "fdist_stopwords": fdist_stopwords,
    "fdist_casefold": fdist_casefold,
    "fdist_stemming": fdist_stemming,
}
df = pd.DataFrame(language_models).sort_values("fdist_stemming", ascending=False)
df

Unnamed: 0,fdist_initial,fdist_stopwords,fdist_casefold,fdist_stemming
museum,0.005127,0.010524,0.014656,0.017363
guid,,,,0.014656
visitor,0.002285,0.004690,0.005115,0.013532
user,0.003935,0.008077,0.009576,0.013104
mobil,,,,0.011644
...,...,...,...,...
sensibility,,,0.000004,
shoogle,,,0.000004,
excitatory,,,0.000004,
feasability,,,0.000004,


In [24]:
# --- Load the random papers -------------------------------------------------
# Same tokenisation as for the corpus: maximal runs of ASCII letters.
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

rand_papers = {}        # lower-cased file name -> raw token list
path = "HW1_rand_txt"   # note: rebinds the `path` name used for the corpus

for file_name in os.listdir(path):
    file_path = os.path.join(path, file_name)
    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
    # open() would raise on those, so only regular files are read.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding = "utf8") as f:
        rand_papers[file_name.lower()] = tokenizer.tokenize(f.read())
        

# Build one relative-frequency language model per random paper, using the
# same pipeline as the corpus: stop words removed -> lower-cased -> stemmed.
# Each raw token list in `rand_papers` is replaced by its FreqDist model.
#
# Loop-local names are used so the corpus-level `stopwords_filter`, `casefold`
# and `stemming` lists from the earlier cell are not overwritten here.
stemmer = PorterStemmer()                  # one instance serves every paper
stop_word_lookup = frozenset(stop_words)   # constant-time membership test

for title, paper in rand_papers.items():
    paper_stems = [
        stemmer.stem(w.lower())
        for w in paper
        if w.lower() not in stop_word_lookup
    ]
    paper_model = FreqDist(paper_stems)
    words_amount = sum(paper_model.values())

    # Turn counts into relative frequencies (count / tokens kept in the paper).
    for key, value in paper_model.items():
        paper_model[key] = value / words_amount
    rand_papers[title] = paper_model

    # pprint() writes to stdout itself and returns None; wrapping it in
    # print() emitted a stray "None" line after every model.
    paper_model.pprint()
    

FreqDist({'imag': 0.024328859060402684, 'object': 0.022651006711409395, 'museum': 0.01971476510067114, 'scan': 0.01552013422818792, 'colour': 0.013842281879194632, 'record': 0.012164429530201342, 'laser': 0.010486577181208054, 'conserv': 0.010067114093959731, 'technolog': 0.010067114093959731, 'digit': 0.00964765100671141, ...})
None
FreqDist({'user': 0.028890959925442685, 'sound': 0.023299161230195712, 'audio': 0.02050326188257223, 'interact': 0.01537744641192917, 'headphon': 0.010717614165890028, 'orient': 0.0097856477166822, 'posit': 0.009319664492078284, 'head': 0.008387698042870456, 'realiti': 0.007921714818266543, 'figur': 0.007921714818266543, ...})
None
FreqDist({'presenc': 0.035202086049543675, 'cultur': 0.033246414602346806, 'heritag': 0.027162103433289873, 'virtual': 0.024989135158626684, 'environ': 0.018035636679704477, 'interact': 0.012385919165580182, 'learn': 0.011516731855714906, 'evalu': 0.010212950890916993, 'social': 0.00847457627118644, 'understand': 0.0065189048239

FreqDist({'predict': 0.025506555423122766, 'user': 0.020500595947556616, 'target': 0.017878426698450536, 'condit': 0.0166865315852205, 'environ': 0.01597139451728248, 'predictor': 0.014779499404052444, 'eye': 0.014541120381406437, 'track': 0.014302741358760428, 'base': 0.013110846245530394, 'path': 0.010727056019070322, ...})
None
FreqDist({'user': 0.027923211169284468, 'site': 0.019197207678883072, 'augment': 0.013263525305410123, 'imag': 0.012216404886561954, 'present': 0.01012216404886562, 'base': 0.009075043630017453, 'archeoguid': 0.008726003490401396, 'mu': 0.008376963350785341, 'tour': 0.008027923211169284, 'render': 0.007678883071553229, ...})
None
FreqDist({'museum': 0.036720751494449186, 'access': 0.03131226871619698, 'visitor': 0.014232849416453175, 'servic': 0.013663535439795047, 'guid': 0.011955593509820665, 'univers': 0.008539709649871904, 'technolog': 0.007970395673213778, 'panel': 0.007685738684884714, 'text': 0.007685738684884714, 'project': 0.00654711073156846, ...})


In [54]:
# Select the random papers in which each of the three query stems occurs with
# a relative frequency above the threshold.
# (An earlier variant thresholded the summed frequency instead: > 0.036.)
QUERY_STEMS = ("mobil", "visitor", "guid")
MIN_STEM_FREQ = 0.0015   # every stem must exceed this on its own

sum_all_freq = 0
relevant = []
for title, freq in rand_papers.items():
    stem_freqs = [freq[stem] for stem in QUERY_STEMS]
    sum_of_freq = sum(stem_freqs)
    if all(stem_freq > MIN_STEM_FREQ for stem_freq in stem_freqs):
        print(title + ": ", *stem_freqs)
        relevant.append(title)
    sum_all_freq += sum_of_freq
len(relevant)

a multi-sensory approach to cultural heritage the battle of pavia exhibition.txt:  0.00196174595389897 0.020107896027464444 0.0024521824423737125
a novel image based positioning technique using mobile eye tracker for a museum visit.txt:  0.017277486910994764 0.021465968586387434 0.004712041884816754
a visitors guide in an active museum presentation.txt:  0.004746152739824536 0.0326477779375809 0.014669926650366748
analyzing visitor perceptions of personalization in art museum interactive technology.txt:  0.0018890606216726772 0.06766271681263954 0.0018890606216726772
augmented reality for visitors of cultural heritage sites.txt:  0.0033422459893048127 0.004010695187165776 0.002005347593582888
augmented reality to reconstruct.txt:  0.008614976805831677 0.0033134526176275677 0.0033134526176275677
first results of an augmented reality.txt:  0.006980802792321117 0.004188481675392671 0.0017452006980802793
full access to cultural spaces (facs) mapping and evaluating museum access services us

12

In [43]:
# Relative frequencies of the three query stems in the stemmed corpus model.
print(*(fdist_stemming[stem] for stem in ("mobil", "visitor", "guid")))

0.011643908928271074 0.013532317007905319 0.014656187403477117


In [27]:
# Print, for every random paper, the summed relative frequency of the three
# query stems, then display the mean of that sum over all papers.
sum_all_freq = 0
for title, freq in rand_papers.items():
    sum_of_freq = freq["mobil"] + freq["visitor"] + freq["guid"]
    print(title + ": ", sum_of_freq)
    sum_all_freq += sum_of_freq
# Divide by the number of papers actually loaded instead of a hard-coded 50,
# so the mean stays correct if the folder contents change. An empty
# collection yields 0.0, as the fixed divisor did.
sum_all_freq / len(rand_papers) if rand_papers else 0.0

[('3d colour imaging for cultural heritage artefacts.txt',
  FreqDist({'imag': 0.024328859060402684, 'object': 0.022651006711409395, 'museum': 0.01971476510067114, 'scan': 0.01552013422818792, 'colour': 0.013842281879194632, 'record': 0.012164429530201342, 'laser': 0.010486577181208054, 'conserv': 0.010067114093959731, 'technolog': 0.010067114093959731, 'digit': 0.00964765100671141, ...})),
 ('a 3d audio augmented reality system for a cultural heritage management and fruition.txt',
  FreqDist({'user': 0.028890959925442685, 'sound': 0.023299161230195712, 'audio': 0.02050326188257223, 'interact': 0.01537744641192917, 'headphon': 0.010717614165890028, 'orient': 0.0097856477166822, 'posit': 0.009319664492078284, 'head': 0.008387698042870456, 'realiti': 0.007921714818266543, 'figur': 0.007921714818266543, ...})),
 ('a critical examination of presence applied to cultural heritage.txt',
  FreqDist({'presenc': 0.035202086049543675, 'cultur': 0.033246414602346806, 'heritag': 0.02716210343328987

In [40]:
# for initial language model
# Combined probability of the three query words in the unprocessed model.
sum(fdist[word] for word in ("mobile", "visitors", "guide"))

# TODO: the same figure for the remaining models has not been written yet:
#for language model after stop words removal

# for language model after case folding

# for language model after stemming

0.012224374255065555

In [46]:
# Sanity check: the values were divided by their total when the model was
# built, so they should add up to approximately 1.0.
total_probability = sum(fdist_stopwords.values())
total_probability

FreqDist({'museum': 2753, 'mobile': 2374, 'guide': 2182, 'user': 2113, 'visitors': 2008, 'visitor': 1227, 'al': 1044, 'Museum': 1026, 'time': 1021, 'guides': 973, ...})

Hello GitHub
