In [1]:
import nltk
# Download the NLTK 'stopwords' package; clean_doc() in the next cell reads
# the English list from it via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tianw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [70]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
 
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error); a manual close() would be skipped.
    with open(filename, 'r') as file:
        return file.read()
 
# turn a doc into clean tokens
def clean_doc(doc, stop_words=None):
    """Turn raw document text into a list of cleaned word tokens.

    The text is split on whitespace and every character listed in
    ``string.punctuation`` is removed from each token. A token is kept only
    if it is purely alphabetic, is not a stop word, and is longer than one
    character. No lower-casing is applied, so stop-word matching is
    case-sensitive.

    Args:
        doc: Text of the document.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is loaded. Callers that clean many
            documents can build the set once and pass it in rather than
            re-reading the corpus on every call.

    Returns:
        The surviving tokens, in their original order.
    """
    # ``is None`` (not truthiness) so an explicitly empty collection is
    # honoured as "filter nothing".
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', punctuation)
    stripped = (word.translate(table) for word in doc.split())
    # One pass applies the same three filters, in the same order, that were
    # previously three separate list rebuilds.
    return [
        word for word in stripped
        if word.isalpha() and word not in stop_words and len(word) > 1
    ]
 
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    """Tokenize one file and fold its tokens into ``vocab``.

    Args:
        filename: Path of the document to read.
        vocab: Object whose ``update()`` accepts the token list (the
            notebook passes ``collections.Counter`` instances). It is
            modified in place.
    """
    vocab.update(clean_doc(load_doc(filename)))

# load all docs in a directory
def process_docs(directory, vocab):
    """Add every ``.txt`` file found in ``directory`` to ``vocab``.

    Args:
        directory: Folder to scan; entries without a ``.txt`` suffix are
            ignored.
        vocab: Counter-like object passed through to ``add_doc_to_vocab``,
            which updates it in place.
    """
    text_files = (name for name in listdir(directory) if name.endswith(".txt"))
    for name in text_files:
        # full path of the file to open
        add_doc_to_vocab(directory + '/' + name, vocab)
            

# save list to file
def save_list(lines, filename):
    """Write the given strings to ``filename``, one per line.

    Args:
        lines: Iterable of strings. Items are joined with a newline; no
            trailing newline is written. If a dict is passed, only its keys
            are written, because iterating a dict yields its keys.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager closes (and flushes) the file even if write()
    # raises; a manual close() would be skipped in that case.
    with open(filename, 'w') as file:
        file.write(data)
 
# define vocab
vocab_pos = Counter()
vocab_neg = Counter()
# count the tokens of each class once
process_docs('neg', vocab_neg)
process_docs('pos', vocab_pos)
# The combined vocabulary is the sum of the two per-class counters. This
# yields the same totals and the same key order (negative-review tokens
# first, then tokens seen only in positive reviews) as reading both
# directories a second time, without loading and tokenizing every file twice.
vocab = vocab_neg + vocab_pos
# print the size of the vocab
print(len(vocab_neg))
print(len(vocab_pos))
print(len(vocab))

# print the top words in the vocab
print(vocab_neg.most_common(50))
print(vocab_pos.most_common(50))
# keep tokens whose count is at least min_occurane; with the value 0 every
# token passes the filter
min_occurane = 0
tokensn = {k: str(c) for k,c in vocab_neg.items() if c >= min_occurane}
tokensp = {k: str(c) for k,c in vocab_pos.items() if c >= min_occurane}
tokens = {k: str(c) for k,c in vocab.items() if c >= min_occurane}
print(len(tokensn))
print(len(tokensp))
print(len(tokens))
# save tokens to vocabulary files; save_list joins the dict keys, so each
# file holds one token per line and the counts themselves are not written
save_list(tokensn, 'neg_occurance.txt')
save_list(tokensp, 'pos_occurance.txt')
save_list(tokens, 'vocab.txt')

30412
32487
44276
[('film', 3600), ('movie', 2717), ('one', 2331), ('like', 1644), ('even', 1219), ('good', 1008), ('time', 994), ('would', 947), ('get', 930), ('bad', 923), ('much', 905), ('films', 828), ('characters', 810), ('story', 805), ('character', 799), ('two', 787), ('plot', 779), ('make', 731), ('first', 723), ('really', 715), ('see', 701), ('could', 694), ('way', 688), ('also', 685), ('little', 662), ('well', 646), ('dont', 602), ('movies', 602), ('scene', 593), ('people', 589), ('doesnt', 588), ('know', 586), ('never', 561), ('scenes', 558), ('action', 558), ('theres', 529), ('hes', 521), ('director', 515), ('new', 498), ('man', 494), ('another', 491), ('made', 488), ('end', 485), ('better', 475), ('something', 473), ('go', 468), ('big', 458), ('seems', 448), ('best', 447), ('isnt', 444)]
[('film', 4383), ('one', 2615), ('movie', 2109), ('like', 1557), ('story', 1102), ('good', 1072), ('also', 1072), ('time', 1047), ('films', 1045), ('even', 1043), ('characters', 925), ('mu

In [10]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords


# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Load a document and return its in-vocabulary tokens as one line.

    Args:
        filename: Path of the document to read.
        vocab: Container of allowed tokens; tokens not in it are dropped.

    Returns:
        The remaining tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    return ' '.join(token for token in cleaned if token in vocab)
 
# load all docs in a directory
def process_docs(directory, vocab):
    """Convert every ``.txt`` file in ``directory`` to a filtered token line.

    Note: this definition rebinds the name ``process_docs``, replacing the
    earlier vocabulary-counting function of the same name in this notebook.

    Args:
        directory: Folder to scan; entries without a ``.txt`` suffix are
            ignored.
        vocab: Container of allowed tokens, passed to ``doc_to_line``.

    Returns:
        A list with one space-joined token string per document, in the
        order ``listdir`` reports the files.
    """
    return [
        doc_to_line(directory + '/' + filename, vocab)
        for filename in listdir(directory)
        if filename.endswith(".txt")
    ]
 
# load vocabulary
vocab_filename = 'vocab.txt'
# split the file on whitespace and de-duplicate into a set for fast lookups
vocab = set(load_doc(vocab_filename).split())
print(len(vocab))
# write the vocabulary-filtered negative reviews, one review per line
negative_lines = process_docs('neg', vocab)
save_list(negative_lines, 'negative.txt')
# write the vocabulary-filtered positive reviews, one review per line
positive_lines = process_docs('pos', vocab)
save_list(positive_lines, 'positive.txt')

13893


In [73]:
# Per-token document counts: how many positive / negative reviews contain
# each token. Every token starts at 1 instead of 0 so that no count is ever
# zero (avoids the special case of a zero count later on).
pos_freq_list = dict.fromkeys(tokens, 1)
neg_freq_list = dict.fromkeys(tokens, 1)
    
#This function will tokenize each review in the directory and add one to the counter if it appears in the review
def count_freq(directory, freq_list):
    """Add per-document token occurrences from ``directory`` to ``freq_list``.

    Every ``.txt`` file in ``directory`` is loaded and tokenized with
    ``clean_doc``. Each distinct token in a file increases its entry by one,
    so an entry counts the files containing the token rather than the total
    number of occurrences.

    Args:
        directory: Folder holding the review files.
        freq_list: Dict mapping token -> count. It is updated in place. Only
            tokens that are already keys are counted.

    Returns:
        The same ``freq_list`` object that was passed in.
    """
    for filename in listdir(directory):
        if not filename.endswith(".txt"):
            continue
        doc = load_doc(directory + '/' + filename)
        # set(): a token counts once per document however often it repeats.
        for token in set(clean_doc(doc)):
            # Tokens that are not tracked (e.g. filtered out of the
            # vocabulary) are skipped; indexing them directly would raise
            # KeyError and abort the whole pass.
            if token in freq_list:
                freq_list[token] += 1
    return freq_list

# Update the document counts for both classes. count_freq modifies the dict
# it receives and returns that same dict, so each assignment rebinds the
# name to the object it already referred to.
neg_freq_list = count_freq('neg', neg_freq_list)
pos_freq_list = count_freq('pos', pos_freq_list)



In [76]:
from math import log2
  
# Document totals used in the scores below: N reviews overall, of which
# pos are positive and neg are negative.
# NOTE(review): these are hard-coded, while count_freq counts every .txt
# file in each folder -- confirm they match the actual folder sizes.
N = 1800
pos = 900
neg = 900

# Score of each token against a class:
#   log2( count_in_class * N / ((count_neg + count_pos) * class_total) )

# Mutual information for all possible tokens in negative reviews
neg_I = {
    token: log2((neg_freq_list[token]*N)/((neg_freq_list[token]+pos_freq_list[token])*neg))
    for token in tokens
}

# Mutual information for all possible tokens in positive reviews
pos_I = {
    token: log2((pos_freq_list[token]*N)/((neg_freq_list[token]+pos_freq_list[token])*pos))
    for token in tokens
}

print(neg_I)
print(pos_I)

