In [2]:
import os
import re, csv, string, pandas as pd
from collections import defaultdict
from numpy import loadtxt
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.probability import FreqDist
import nltk
nltk.download('punkt')
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sharonku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")
fdist = FreqDist()

dictionary = set()
papers = {}
path = "HW1_TXT_files"
stop_words = []
all_words = []

In [4]:
for file_name in os.listdir(path):
    with open(path + os.sep + file_name, 'r', encoding = "utf8") as f:
        papers[file_name.lower()] = tokenizer.tokenize(f.read())
        for word in papers[file_name.lower()]:
            fdist[word] += 1
            all_words.append(word)
        dictionary = dictionary.union(set(papers[file_name.lower()]))

amount = sum(fdist.values())
print("Amount of words in corpus is ", amount)    
print("Initial dictionary size is ", len(dictionary))

# generate language model
for key, value in fdist.items():
    fdist[key] = value / amount

with open("stop_words_english.txt", "r", encoding = "utf8") as stop_words_file:
    stop_words = stop_words_file.read().split("\n")

Amount of words in corpus is  536960
Initial dictionary size is  26089


In [4]:
dictionary_stopwords = set()
dictionary_casefold = set()
dictionary_stemming = set()

In [5]:
#stop words removal
stopwords_filtered_list = [w for w in all_words if not w.lower() in stop_words]
dictionary_stopwords = set(stopwords_filtered_list)
# generate lang model

fdist_stopwords = FreqDist(stopwords_filtered_list)
stopwords_amount = sum(fdist_stopwords.values())
for key, value in fdist_stopwords.items():
    fdist_stopwords[key] = value / stopwords_amount

In [6]:
# case folding
casefold = [w.lower() for w in stopwords_filtered_list]
dictionary_casefold = set(casefold)

#generate lang model
fdist_casefold = FreqDist(casefold)
casefold_amount = sum(fdist_casefold.values())
for key, value in fdist_casefold.items():
    fdist_casefold[key] = value / casefold_amount

In [7]:
# stemming
stemmer = PorterStemmer()
stemming = [stemmer.stem(w) for w in casefold]
dictionary_stemming = set(stemming)

# generate stemmed lang model
fdist_stemming = FreqDist(stemming)
stemming_amount = sum(fdist_stemming.values())
for key, value in fdist_stemming.items():
    fdist_stemming[key] = value / stemming_amount

In [8]:
for title, paper in papers.items():
    stopwords_filtered_list = [w for w in paper if not w.lower() in stop_words]
    casefold = [w.lower() for w in stopwords_filtered_list]
    stemmer = PorterStemmer()
    stemming = [stemmer.stem(w) for w in casefold]
    papers[title] = stemming

In [9]:
print("initial Dictionary size is ", len(dictionary))

print("Dictionary size after stop words removal is ", len(dictionary_stopwords))

print("Dictionary size after case folding is ", len(dictionary_casefold))

print("Dictionary size after stemming is ", len(dictionary_stemming))

initial Dictionary size is  26089
Dictionary size after stop words removal is  24872
Dictionary size after case folding is  20064
Dictionary size after stemming is  14363


In [10]:
print("initial language model")
fdist.pprint()

print("\nlanguage model after stop words removal")
fdist_stopwords.pprint()

print("\nlanguage model after case folding")
fdist_casefold.pprint()

print("\nlanguage model after stemming")
fdist_stemming.pprint()

initial language model
FreqDist({'the': 0.062270932657926104, 'of': 0.03338609952324195, 'and': 0.028834550059594757, 'to': 0.021897348033373062, 'in': 0.018723927294398094, 'a': 0.017740613825983312, 'is': 0.009320992252681765, 'for': 0.009266984505363528, 'that': 0.009108685935637665, 'The': 0.007769666269368296, ...})

language model after stop words removal
FreqDist({'museum': 0.010523861221119589, 'mobile': 0.00907506230982125, 'guide': 0.008341106133121301, 'user': 0.008077340632119756, 'visitors': 0.007675958347986973, 'visitor': 0.004690438691723115, 'al': 0.003990886710805976, 'Museum': 0.003922078319240355, 'time': 0.003902964877138794, 'guides': 0.0037194758329638067, ...})

language model after case folding
FreqDist({'museum': 0.014656187403477117, 'mobile': 0.01151776021040077, 'guide': 0.009690515145491522, 'user': 0.009575834492882154, 'visitors': 0.008417559901527547, 'visitor': 0.005114757106377773, 'guides': 0.004384623618098136, 'context': 0.0042814110307497055, 'al'

In [11]:
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

rand_papers = {}
path = "HW1_rand_txt"

for file_name in os.listdir(path):
    with open(path + os.sep + file_name, 'r', encoding = "utf8") as f:
        rand_papers[file_name.lower()] = tokenizer.tokenize(f.read())
        

for title, paper in rand_papers.items():
    stopwords_filtered_list = [w for w in paper if not w.lower() in stop_words]
    casefold = [w.lower() for w in stopwords_filtered_list]
    stemmer = PorterStemmer()
    stemming = [stemmer.stem(w) for w in casefold]
    rand_papers[title] = FreqDist(stemming)
    words_amount = sum(rand_papers[title].values())
    
    for key, value in rand_papers[title].items():
        rand_papers[title][key] = value / words_amount
    

In [22]:
rel_list = ['A Framework for Guiding the Museum Tours Personalization',
            'A Multi-Sensory Approach to Cultural Heritage The Battle of Pavia Exhibition',
            'A Novel Image Based Positioning Technique Using Mobile Eye Tracker For A Museum Visit',
            'A Point-Of-Interest Directory For Mobile Tourists In Abuja, Nigeria',
            'A Survey of Map-based Mobile Guides',
            'A visitors guide in an active museum Presentation',
            'Adoption and Use of Emerging Cultural Technologies in China\'s Museums',
            'Analyzing Visitor Perceptions of Personalization in Art Museum Interactive Technology',
            'Augmented reality for visitors of cultural heritage sites',
            'Design and development of Taeneb City Guide - From Paper Maps and Guidebooks to Electronic Guides',
            'Full Access to Cultural Spaces (FACS) Mapping and evaluating museum access services using mobile eye-tracking technology',
            'In-Sights into Mobile Learning  An Exploration of Mobile Eye Tracking Methodology for  Learning in Museums',
            'Mobile augmented reality for cultural heritage Following the footsteps of Ovid among different locations in Europe',
            'Museum Guide 2.0 – An Eye-Tracking based Personal Assistant for Museums and Exhibits',
            'Potentials and Limitations of Mobile Eye Tracking in Visitor Studies_ Evidence From Field Research at Two Museum Exhibitions in Germany',
            ]

rel_list = [name.lower() + '.txt' for name in rel_list]
len(rel_list)

15

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

rand_papers_clas = {}
path = "HW1_rand_txt"

for file_name in os.listdir(path):
    with open(path + os.sep + file_name, 'r', encoding = "utf8") as f:
        if file_name.lower() in rel_list:
            rand_papers_clas[file_name.lower()] = (f.read(), 1)
        else:
            rand_papers_clas[file_name.lower()] = (f.read(), 0)

In [21]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(rand_papers_clas.values())
X_train_counts.shape

AttributeError: 'list' object has no attribute 'lower'

In [40]:
# for initial language model
fdist["mobile"]+fdist["visitors"]+fdist["guide"]

#for language model after stop words removal

# for language model after case folding

# for language model after stemming

0.012224374255065555

In [18]:
rel_test = rel_list[-2:]
rel_test

['Museum Guide 2.0 – An Eye-Tracking based Personal Assistant for Museums and Exhibits.txt',
 'Potentials and Limitations of Mobile Eye Tracking in Visitor Studies_ Evidence From Field Research at Two Museum Exhibitions in Germany.txt']

In [None]:
rel_list