In [221]:
import nltk
# Download the NLTK 'stopwords' package; clean_doc reads the English
# stop-word list from it via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thelu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [222]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# load doc into memory
def load_doc(filename):
	"""Read a whole text file into memory and return it as one string.

	filename: path of the file to read; opened read-only in text mode
	with the default encoding.
	Returns the complete contents of the file.
	Raises OSError when the file cannot be opened.
	"""
	# the context manager closes the handle even if read() raises
	with open(filename, 'r') as file:
		return file.read()
 
# turn a doc into clean tokens
def clean_doc(doc):
	"""Turn raw document text into a list of cleaned word tokens.

	Steps applied to every whitespace-separated piece, in order:
	delete all punctuation characters, discard pieces that are not purely
	alphabetic, discard English stop words (exact, case-sensitive match),
	discard one-character words. Token order is preserved.
	"""
	pieces = doc.split()
	strip_punct = str.maketrans('', '', punctuation)
	english_stops = set(stopwords.words('english'))
	cleaned = []
	for piece in pieces:
		word = piece.translate(strip_punct)
		# isalpha() is False for '', so pieces emptied by the
		# punctuation strip are discarded here as well
		if not word.isalpha():
			continue
		if word in english_stops:
			continue
		if len(word) <= 1:
			continue
		cleaned.append(word)
	return cleaned
 
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	"""Fold the cleaned tokens of one file into ``vocab``.

	filename: path of the document to load.
	vocab: object whose ``update`` method receives the token list; it is
	modified in place and nothing is returned.
	"""
	vocab.update(clean_doc(load_doc(filename)))

# load num amount of docs in a directory
def process_docs(directory, vocab, start, end):
	"""Add the documents at listing positions ``start:end`` of a folder to vocab.

	The slice is taken over the raw ``listdir`` result, before the
	extension filter, so non-.txt entries that fall inside the slice are
	skipped rather than replaced by later files.
	"""
	selected = listdir(directory)[start:end]
	for name in selected:
		if name.endswith(".txt"):
			# full path = folder + '/' + file name
			add_doc_to_vocab(directory + '/' + name, vocab)
            

# save list to file
def save_list(lines, filename):
	"""Write ``lines`` to ``filename``, one item per line.

	lines: iterable of strings; they are joined with a newline between
	items and no trailing newline is added.
	filename: destination path, opened in write mode (created or truncated).
	Raises OSError when the file cannot be opened or written.
	"""
	data = '\n'.join(lines)
	# the context manager closes the handle even if write() raises
	with open(filename, 'w') as file:
		file.write(data)
 
# vocabulary counter shared by both sentiment classes
vocab = Counter()
# 70/30 train/test split, sized per class from the directory listing
neg_size, pos_size = len(listdir('neg')), len(listdir('pos'))
neg_train_size, pos_train_size = int(neg_size * 0.7), int(pos_size * 0.7)
# build the vocabulary from the training slice of each class only
for directory, train_size in (('neg', neg_train_size), ('pos', pos_train_size)):
	process_docs(directory, vocab, 0, train_size)

In [223]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	"""Load a document and return its in-vocabulary tokens as one line.

	filename: path of the document to load and clean.
	vocab: container tested with ``in``; tokens not found in it are dropped.
	Returns the surviving tokens joined by single spaces.
	"""
	kept = (word for word in clean_doc(load_doc(filename)) if word in vocab)
	return ' '.join(kept)

# Updated process_docs for pos and neg lines of words
def process_docs(directory, vocab, start, end):
	"""Return one cleaned token line per .txt file in a slice of a folder.

	The ``start:end`` slice is applied to the raw ``listdir`` result before
	the extension filter, so non-.txt entries inside the slice simply
	produce no line. Lines keep the listing order.
	"""
	return [
		doc_to_line(directory + '/' + name, vocab)
		for name in listdir(directory)[start:end]
		if name.endswith(".txt")
	]

In [224]:
import numpy as np
#Ref: https://github.com/Vakhshoori101/TwitterSentimentAnalysis/blob/main/Twitter%20Classification%20Template.ipynb
# 70% Training: 630 out of 900, 30% Testing: 270 out of 900
neg_train_lines = process_docs('neg', vocab, 0, neg_train_size)
neg_test_lines = process_docs('neg', vocab, neg_train_size, neg_size)
pos_train_lines = process_docs('pos', vocab, 0, pos_train_size)
pos_test_lines = process_docs('pos', vocab, pos_train_size, pos_size)

# features: negative reviews first, then positive reviews
train_x = neg_train_lines + pos_train_lines
test_x = neg_test_lines + pos_test_lines

# labels: 0 = negative, 1 = positive. They must follow the same
# negatives-then-positives order as train_x / test_x; putting the ones
# first would pair every review with the opposite class.
train_y = np.append(np.zeros(len(neg_train_lines)), np.ones(len(pos_train_lines)))
test_y = np.append(np.zeros(len(neg_test_lines)), np.ones(len(pos_test_lines)))

print(len(train_x))
print(len(test_x))
print(len(train_y))
print(len(test_y))

1260
540
1260
540


In [225]:
# Per-class counters, keyed by every vocabulary token: the number of
# positive / negative reviews a token appears in.
# Every count starts at 4 instead of 0 so that no token can have a zero
# count in either class.
pos_freq_list = dict.fromkeys(vocab, 4)
neg_freq_list = dict.fromkeys(vocab, 4)

#This function will tokenize each review in the directory and add one to the counter if it appears in the review
def count_freq(directory, freq_list, num):
	"""Count, per token, how many of the first ``num`` listed files contain it.

	directory: folder to scan.
	freq_list: dict mapping token -> count; incremented in place and also
	returned.
	num: integer number of leading ``listdir`` entries to consider. Non-.txt
	entries inside that window are skipped; values <= 0 select nothing.
	Raises KeyError when a document contains a token that is not already
	a key of ``freq_list``.
	"""
	# Slice the listing once (as process_docs does) instead of walking the
	# whole folder and comparing the index on every entry.
	for filename in listdir(directory)[:max(num, 0)]:
		if not filename.endswith(".txt"):
			continue
		doc = load_doc(directory + '/' + filename)
		# a set keeps each token once per document, so this counts
		# documents containing the token, not occurrences
		for w in set(clean_doc(doc)):
			freq_list[w] += 1
	return freq_list

# Count document frequencies over the training slice of each class.
# count_freq updates the dict it is given and returns that same dict.
neg_freq_list, pos_freq_list = (
	count_freq('neg', neg_freq_list, neg_train_size),
	count_freq('pos', pos_freq_list, pos_train_size),
)



In [226]:
from math import log2

# Training-set sizes: `pos` / `neg` reviews per class and `N` in total
pos = pos_train_size
neg = neg_train_size
N = pos + neg

neg_I = {}
pos_I = {}

# only score tokens whose total count in vocab reaches this threshold
min_occur = 60
tokens = [word for word, count in vocab.items() if count >= min_occur]

# Mutual-information score of every kept token with each class:
#   log2( class_count * N / ((neg_count + pos_count) * class_size) )
for token in tokens:
    in_neg = neg_freq_list[token]
    in_pos = pos_freq_list[token]
    in_any = in_neg + in_pos
    neg_I[token] = log2((in_neg*N)/(in_any*neg))
    pos_I[token] = log2((in_pos*N)/(in_any*pos))


In [227]:
# rank tokens by mutual-information score, highest first, and keep the top 5
top_pos = sorted(pos_I.items(), key=lambda item: -item[1])[:5]
top_neg = sorted(neg_I.items(), key=lambda item: -item[1])[:5]

# Iterate over what was actually selected: with fewer than 5 scored tokens
# this prints a shorter list instead of raising IndexError.
# NOTE: the value printed after "freq:" is the mutual-information score.
for rank, (word, score) in enumerate(top_pos, start=1):
    print(f"P{rank}: Word: {word}\nfreq: {score}")


for rank, (word, score) in enumerate(top_neg, start=1):
    print(f"N{rank}: Word: {word}\nfreq: {score}")

P1: Word: terrific
freq: 0.6256044852185021
P2: Word: subtle
freq: 0.6175303631775868
P3: Word: excellent
freq: 0.6141088463806726
P4: Word: memorable
freq: 0.5908873346782616
P5: Word: period
freq: 0.5767885692754561
N1: Word: wasted
freq: 0.7476128383657147
N2: Word: waste
freq: 0.7104933828050153
N3: Word: worst
freq: 0.7094848577440992
N4: Word: awful
freq: 0.7030182622428686
N5: Word: lame
freq: 0.7004397181410922
