In [23]:
import nltk
# Fetch the NLTK 'stopwords' corpus, which clean_doc reads through
# nltk.corpus.stopwords; nltk reports the package as up to date when it
# is already installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tianw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
 
# load doc into memory
def load_doc(filename):
    """Read a text file and return its whole content as a single string.

    Args:
        filename: Path of the file to read (opened in text mode, read only).

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()
 
# turn a doc into clean tokens
def clean_doc(doc):
    """Tokenise raw text into a filtered list of words.

    The text is split on whitespace and punctuation characters are removed
    from every piece. A piece survives only if what is left is purely
    alphabetic, is not in NLTK's English stop-word list, and is longer than
    one character. Letter case is not changed, so the stop-word comparison
    is case-sensitive.

    Args:
        doc: The raw document text.

    Returns:
        The surviving tokens, in document order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    english_stops = set(stopwords.words('english'))
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # drop anything that is not purely letters (also drops empty strings)
        if not word.isalpha():
            continue
        # drop stop words
        if word in english_stops:
            continue
        # drop single-character leftovers
        if len(word) <= 1:
            continue
        kept.append(word)
    return kept
 
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    """Count the cleaned tokens of one document into ``vocab``.

    Args:
        filename: Path of the document to read.
        vocab: Counter-like object; its ``update`` method receives the
            token list, so it is modified in place.
    """
    vocab.update(clean_doc(load_doc(filename)))

# load all docs in a directory
def process_docs(directory, vocab):
    """Add every ``.txt`` file found in ``directory`` to ``vocab``.

    Directory entries without the ``.txt`` suffix are ignored; each of the
    others is passed to add_doc_to_vocab as ``directory + '/' + name``.

    NOTE: a later cell defines another ``process_docs`` with the same
    signature that returns a list of lines; this version is the one in
    effect only until that cell runs.

    Args:
        directory: Folder holding the review files.
        vocab: Counter that accumulates the token counts (updated in place).
    """
    text_files = (name for name in listdir(directory) if name.endswith(".txt"))
    for name in text_files:
        add_doc_to_vocab(directory + '/' + name, vocab)
            

# save list to file
def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    The items are joined with a newline character; no trailing newline is
    added. An existing file is overwritten.

    Args:
        lines: Iterable of strings to store.
        filename: Path of the file to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # the context manager flushes and closes the file even if write() raises
    with open(filename, 'w') as file:
        file.write(data)
 
# define vocab
vocab = Counter()
# add all docs to vocab: the negative reviews first, then the positive ones
for review_dir in ('neg', 'pos'):
    process_docs(review_dir, vocab)

## Split data into training and testing sets

In [2]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Return one document as a single line of in-vocabulary tokens.

    The file is loaded and cleaned, tokens that are not contained in
    ``vocab`` are discarded, and the rest are joined with single spaces.

    Args:
        filename: Path of the document to read.
        vocab: Container of accepted tokens (tested with ``in``).

    Returns:
        A space-separated string of the kept tokens, in document order.
    """
    cleaned = clean_doc(load_doc(filename))
    return ' '.join(tok for tok in cleaned if tok in vocab)

# Updated process_docs for pos and neg lines of words
def process_docs(directory, vocab):
    """Turn every ``.txt`` file in ``directory`` into a line of vocab tokens.

    Args:
        directory: Folder holding the review files.
        vocab: Container of accepted tokens, forwarded to doc_to_line.

    Returns:
        A list with one space-separated token string per ``.txt`` file,
        ordered by file name.
    """
    lines = []
    # listdir() yields names in arbitrary order; sorting makes the result
    # deterministic, which matters because the train/test split below
    # slices these lists by position
    for filename in sorted(listdir(directory)):
        # skip files that do not have the right extension
        if not filename.endswith(".txt"):
            continue
        # load and clean the doc, then collect its line
        lines.append(doc_to_line(directory + '/' + filename, vocab))
    return lines


# One cleaned, vocab-filtered line per review: negative folder first, then positive
negative_lines, positive_lines = [
    process_docs(review_dir, vocab) for review_dir in ('neg', 'pos')
]

In [3]:
import numpy as np
#Ref: https://github.com/Vakhshoori101/TwitterSentimentAnalysis/blob/main/Twitter%20Classification%20Template.ipynb
# 70% Training: 630 out of 900, 30% Testing: 270 out of 900
# Share of each class that goes into the training set, in percent.
TRAIN_PERCENT = 70

# Derive the cut position from the actual number of reviews per class rather
# than a fixed index, so the split stays 70/30 if the corpus size changes
# (900 reviews per class -> 630 for training, 270 for testing).
pos_cut = len(positive_lines) * TRAIN_PERCENT // 100
neg_cut = len(negative_lines) * TRAIN_PERCENT // 100

train_pos, test_pos = positive_lines[:pos_cut], positive_lines[pos_cut:]
train_neg, test_neg = negative_lines[:neg_cut], negative_lines[neg_cut:]

# positives come first in both feature lists
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# labels in the same order as the feature lists: 1 = positive, 0 = negative
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

print(len(train_x))
print(len(test_x))
print(len(train_y))
print(len(test_y))

1260
540
1260
540


## Output the top 5 positive and top 5 negative evidence words

In [4]:
# Per-word document frequencies: the number of positive / negative reviews
# in which each vocabulary word appears.
# Every word starts at 4 rather than 0 so that no count can ever be zero.
pos_freq_list = dict.fromkeys(vocab, 4)
neg_freq_list = dict.fromkeys(vocab, 4)
    
# Count, for every word, how many reviews in a directory contain it
def count_freq(directory, freq_list):
    """Add per-review occurrence counts for one directory to ``freq_list``.

    Each ``.txt`` file in ``directory`` is loaded and cleaned; every distinct
    token of that file increases its entry in ``freq_list`` by one, no matter
    how often it occurs inside the file.

    Args:
        directory: Folder holding the review files.
        freq_list: Dict mapping word -> count. It must already contain every
            token that can occur (a missing key raises KeyError). Updated in
            place.

    Returns:
        The same ``freq_list`` object that was passed in.
    """
    for filename in listdir(directory):
        if filename.endswith(".txt"):
            words = clean_doc(load_doc(directory + '/' + filename))
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # so each word is counted once per review
            for w in dict.fromkeys(words):
                freq_list[w] += 1
    return freq_list

# Tally the document frequencies for both classes (negative first, then positive).
# count_freq updates the dictionary it receives and returns that same object.
neg_freq_list, pos_freq_list = (
    count_freq('neg', neg_freq_list),
    count_freq('pos', pos_freq_list),
)



In [5]:
from math import log2
  
# Corpus size is fixed here: 1800 reviews in total, 900 positive and 900 negative
N = 1800
pos = 900
neg = 900

# only words counted at least this many times in vocab are scored
min_occurane = 5
tokens = [word for word, count in vocab.items() if count >= min_occurane]

neg_I = {}
pos_I = {}

# Mutual-information score of every candidate word with each class, in one pass:
#   I(word, class) = log2( df_class(word) * N / (df_both(word) * n_class) )
# where df_* are the per-review frequencies collected in neg/pos_freq_list.
for token in tokens:
    in_neg = neg_freq_list[token]
    in_pos = pos_freq_list[token]
    in_either = in_neg + in_pos
    neg_I[token] = log2((in_neg * N) / (in_either * neg))
    pos_I[token] = log2((in_pos * N) / (in_either * pos))
    

In [6]:
# Rank the words of each class by mutual-information score, highest first,
# and keep the best five (ties keep their original dictionary order).
top_pos = sorted(pos_I.items(), key=lambda item: item[1], reverse=True)[:5]
top_neg = sorted(neg_I.items(), key=lambda item: item[1], reverse=True)[:5]

# Iterate over what was actually selected instead of indexing 0..4, so a
# corpus with fewer than five scored words prints what it has rather than
# raising IndexError.
for rank, (word, _score) in enumerate(top_pos, start=1):
    print('P' + str(rank) + ': Word: ' + word)

for rank, (word, _score) in enumerate(top_neg, start=1):
    print('N' + str(rank) + ': Word: ' + word)

P1: Word: outstanding
P2: Word: religion
P3: Word: wonderfully
P4: Word: offbeat
P5: Word: finest
N1: Word: ludicrous
N2: Word: idiotic
N3: Word: wasted
N4: Word: stupidity
N5: Word: chuckle
