In [42]:
import nltk
import pickle
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thelu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word for word in tokens if len(word) > 1]
	return tokens
 
# load doc and add to vocab
def add_doc_to_vocab(filename):
	# load doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# update counts
	vocab.update(tokens)
	return tokens

# loads all docs in the directory
def process_docs1(directory):
	all_reviews = {}
	# walk through all files in the folder
	for filename in listdir(directory):
		# skip files that do not have the right extension
		if not filename.endswith(".txt"):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# add doc to list
		all_reviews[filename] = add_doc_to_vocab(path)

	return all_reviews

def subset_docs(directory, start, end):
	subset = []
	for filename in listdir(directory)[start:end]:
		# skip files that do not have the right extension
		if not filename.endswith(".txt"):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# add doc to list
		subset.append(add_doc_to_vocab(path))
	return subset

# # save list to file
# def save_list(lines, filename):
# 	data = '\n'.join(lines)
# 	file = open(filename, 'w')
# 	file.write(data)
# 	file.close()

In [44]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	# load the doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# filter by vocab
	tokens = [w for w in tokens if w in vocab]
	return ' '.join(tokens)

# load num amount of docs in a directory
def process_docs(directory, vocab, start, end):
	# walk through all files in the folder
	for filename in listdir(directory)[start:end]:
		# skip files that do not have the right extension
		if not filename.endswith(".txt"):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# add doc to vocab
		add_doc_to_vocab(path, vocab)

vocab = Counter()

# Updated process_docs for pos and neg lines of words
def process_docs2(directory, vocab, start, end):
	lines = list()
	# walk through all files in the folder
	for filename in listdir(directory)[start:end]:
		# skip files that do not have the right extension
		if not filename.endswith(".txt"):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# load and clean the doc
		line = doc_to_line(path, vocab)
		# add to list
		lines.append(line)
	return lines

In [45]:
import numpy as np
import random
#Ref: https://github.com/Vakhshoori101/TwitterSentimentAnalysis/blob/main/Twitter%20Classification%20Template.ipynb
# 70% Training: 630 out of 900, 30% Testing: 270 out of 900
neg_size = len(listdir('neg'))
neg_train_size = int(neg_size * 0.7)
pos_size = len(listdir('pos'))
pos_train_size = int(pos_size * 0.7)

neg_train_lines = subset_docs('neg', 0, neg_train_size)
neg_test_lines = subset_docs('neg', neg_train_size, neg_size)
pos_train_lines = subset_docs('pos', 0, pos_train_size)
pos_test_lines = subset_docs('pos', pos_train_size, pos_size)

train_x = neg_train_lines + pos_train_lines
test_x = neg_test_lines + pos_test_lines

train_y = np.append(np.ones(len(pos_train_lines)), np.zeros(len(neg_train_lines)))
test_y = np.append(np.ones(len(pos_test_lines)), np.zeros(len(neg_test_lines)))

mapIndexPos = list(zip(train_x, train_y))
random.shuffle(mapIndexPos)
train_x_shuffled, train_y_shuffled = list(zip(*mapIndexPos))

print(len(train_x_shuffled))
print(len(test_x))
print(len(train_y_shuffled))
print(len(test_y))

1260
540
1260
540


In [46]:
# create a dictionary for the frequency of words appears in number of positive reviews
pos_freq_list = {}
# create a dictionary for the frequency of words appears in number of negative reviews
neg_freq_list = {}

#set the default appreance for each word to be 15 to prevent special case 0
for token in vocab:
    pos_freq_list[token] = 4
    neg_freq_list[token] = 4

#This function will tokenize each review in the directory up to num and add one to the counter if it appears in the review
def count_freq(directory, freq_list, num):
	for i, filename in enumerate(listdir(directory)):
		if i < num:
			if not filename.endswith(".txt"):
				continue
			path = directory + '/' + filename
			doc = load_doc(path)
			temp_token = Counter()
			tok = clean_doc(doc)
			temp_token.update(tok)
			for w in temp_token:
				freq_list[w] += 1
	return freq_list

#update the list for both pos and neg frequency
neg_freq_list = count_freq('neg', neg_freq_list, neg_train_size)
pos_freq_list = count_freq('pos', pos_freq_list, pos_train_size)


In [47]:
from math import log2

# Default number for total training review is 1260
# positive and negative training size are both 630
pos = pos_train_size
neg = neg_train_size
N = pos + neg

neg_I = {}
pos_I = {}

min_occur = 200
tokens = [k for k,c in vocab.items() if c >= min_occur]

# Mutual information for all possible tokens in negative reviews
for token in tokens:
    neg_I[token] = log2((neg_freq_list[token]*N)/((neg_freq_list[token]+pos_freq_list[token])*neg))


# Mutual information for all possible tokens in positive reviews
for token in tokens:
    pos_I[token] = log2((pos_freq_list[token]*N)/((neg_freq_list[token]+pos_freq_list[token])*pos))


In [48]:
top_num = 25

# sorted the positive mutal information and get the top 5
top_pos = sorted(pos_I.items(), key=lambda x:-x[1])[:top_num]
# sorted the negative mutal information and get the top 5
top_neg = sorted(neg_I.items(), key=lambda x:-x[1])[:top_num]

features = top_pos + top_neg
feature_words = []
for word, p in features:
	feature_words.append(word)

# for x in range(top_num):
#     print(f"P{str(x+1)}: Word: {top_pos[x][0]}\nfreq: {top_pos[x][1]}")
#
#
# for x in range(top_num):
#         print(f"N{str(x+1)}: Word: {top_neg[x][0]}\nfreq: {top_neg[x][1]}")

In [49]:
# takes in a single review and returns frequency dictionary
def create_frequency(reviews, ys):
	freq_d = {}

	# create freq dictionary
	for review, y in zip(reviews, ys):
		for word in review:
			pair = (word, y)
			# if already in dictionary, add 1
			if pair in freq_d:
				freq_d[pair] += 1
			# add entry to dictionary if not present
			else:
				freq_d[pair] = freq_d.get(pair, 1)

	return freq_d

In [50]:
# has been saved into pickle file
# freqs = create_frequency(train_x_shuffled, train_y_shuffled)

def train_naive_bayes(freq, train_x, train_y):

	loglikelihood = {}
	logprior = 0
	# calculate number of unique words in vocab
	unique_words = set([pair[0] for pair in freq.keys()])
	V = len(unique_words)
	# calculate N_pos and N_neg
	N_pos = N_neg = 0
	for pair in freq.keys():
		if pair[1] > 0:
			N_pos += freq[(pair)]
		else:
			N_neg += freq[(pair)]

	# number of reviews
	D = len(train_y_shuffled)
	# number of pos reviews
	D_pos = sum(train_y_shuffled)
	# number of neg reviews
	D_neg = D - D_pos

	logprior = np.log(D_pos) - np.log(D_neg)

	# for each selected feature
	for word in feature_words:
		# get the pos and neg freq of the word
		pos_freq = freq.get((word, 1), 0)
		neg_freq = freq.get((word, 0), 0)
		# calculate probability that word is pos and neg
		p_word_pos = (pos_freq + 1) / (N_pos + V)
		p_word_neg = (neg_freq + 1) / (N_neg + V)
		# calculate the log likelihood of the word
		loglikelihood[word] = np.log(p_word_pos / p_word_neg)

	return logprior, loglikelihood
# has been saved into pickle file
# logp, logl = train_naive_bayes(freqs, train_x_shuffled, train_y_shuffled)

In [51]:
def naive_bayes_predict(review, logprior, loglikelihood):
	# process review to get list of words

	# initialize probability to zero
	p = 0
	# add logprior
	p += logprior

	for word in review:
		if word in loglikelihood:
			p += loglikelihood[word]

	return p

In [52]:
# save logprior and likelihood into pickle file
# save_logprior = open("logprior.pickle", "wb")
# pickle.dump(logp, save_logprior)
# save_logprior.close()
#
# save_loglikelihood = open("loglikelihood.pickle", "wb")
# pickle.dump(logl, save_loglikelihood)
# save_loglikelihood.close()

0.7738095238095238
0.7648148148148148


In [53]:
# returns filename and whether or not review is classified as pos (1)
def run_bayes(x_set, y_set, logp, logl):
	correct = 0
	# review is path
	for review, y in zip(x_set, y_set):
		predicted = 1
		p = naive_bayes_predict(review, logp, logl)
		# print(f'p:{p}')
		if p < 0:
			predicted = 0
		if predicted == y:
			correct += 1
		# print(f'predicted: {predicted} actual:{y}')
	print(correct/len(y_set))

with open("logprior.pickle", "rb") as logp_f:
	logp = pickle.load(logp_f)

with open("loglikelihood.pickle", "rb") as logl_f:
	logl = pickle.load(logl_f)

run_bayes(train_x, train_y, logp, logl)
run_bayes(test_x, test_y, logp, logl)

In [54]:
def new_bayes(directory, logp, logl):
	correct = 0
	predict_list = []
	# review is path
	reviews = process_docs1(directory)
	for key in reviews.keys():
		pos = 1
		p = naive_bayes_predict(reviews[key], logp, logl)
		if p < 0:
			pos = 0
		predict_list.append(f'{key},{pos}')

	return predict_list

In [28]:
import csv

dir = input("Enter directory name: ")
write_me = new_bayes(dir, logp, logl)
# open file to write
def write_to_csv(prediction_list):
	with open('results.csv', 'w', newline='') as f:
		for prediction in prediction_list:
			f.write(prediction + '\n')

write_to_csv(write_me)

