In [3]:
import numpy as np
import math
import random
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
import sys


#hyper_parameter
bigram_thresh = 3
punct_list = [",",".","/","\"","'"]

In [113]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/saket/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [114]:
#initializing stemmer
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(stopwords.words('english'))
p_stemmer = PorterStemmer()

In [115]:
def remove_punct(documents):
	new_documents = []
	for document in documents:
		for i in range(len(punct_list)):
			document = document.lower()
			document = document.replace(punct_list[i]," ")
		new_documents.append(document)
	return new_documents

In [165]:
def getStemmedDocument(input_raw_data):
	docs = input_raw_data
	new_doc = []
	for doc in docs:
		raw = doc.lower()
		raw.decode('utf-8','ignore')
		raw = raw.replace("<br /><br />", " ")
		tokens = tokenizer.tokenize(raw)
		stopped_tokens = [token for token in tokens if token not in en_stop]
		stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
		print stemmed_tokens
		documentWords = ' '.join(stemmed_tokens)
		new_doc.append(documentWords)
	return new_doc

In [166]:
def read_data(file_name):
	file = open("data/" + file_name)
	all_text = file.readlines()
	if len(all_text)==0:
		print "empty document"
	return all_text

In [167]:
def make_dict(training_data,bigram):
	vocab_dict = {}
	bigram_dict = {}
	count = 0   
	for document in training_data:
		words = document.split()
		i = 0
		for word in words:
			if i > 0 and bigram:
				bigram_word = words[i-1] + " " + word
				if not bigram_word in bigram_dict:
					bigram_dict[bigram_word] = 1
				else:
					bigram_dict[bigram_word] +=1
			if not word in vocab_dict:
				vocab_dict[word] = count
				count += 1
			i +=1
	if bigram:
		for key in bigram_dict.keys():
			if bigram_dict[key] >= bigram_thresh:
				vocab_dict[key] = count
				count += 1
	return vocab_dict

In [168]:
def predict(test_document,bigram):
	global naive_matrix,label_freq,number_classes,vocab_dict
	max_sum = 0
	predicted_class = -1
	for class_ in range(number_classes):
		sums = math.log(label_freq[class_])
		words = test_document.split()
		j = 0
		for word in words:
			if j>0 and bigram:
				bigram_word = words[j-1] + " " + word
				if bigram_word in vocab_dict:
					word_index = vocab_dict[bigram_word]
					sums += naive_matrix[word_index][class_]
			if word in vocab_dict:
				word_index = vocab_dict[word]
				sums += naive_matrix[word_index][class_]
			j += 1
		if sums > max_sum or class_==0:
			max_sum = sums
			predicted_class = class_
	return predicted_class

In [169]:
def inv_label(label_val):
    for key in label_dict.keys():
        if label_dict[key] == label_val:
            return key
def indices(label_key):
    label_key = int(label_key)
    if label_key >= 7:
        label_key -= 2
    return label_key - 1

In [170]:
def accuracy(test_documents,labels,bigram):
	correct = 0.0
	for i in range(len(test_documents)):
		predicted_class = predict(test_documents[i],bigram)
		if labels[i].split()[0] in label_dict.keys():
			expected_class = label_dict[labels[i].split()[0]]
			if predicted_class==expected_class:
				correct+=1
	return correct/len(test_documents)

def majority_accuracy(test_documents,labels):
	correct = 0.0
	for i in range(len(test_documents)):
		predicted_class = np.argmax(label_freq)
		if labels[i].split()[0] in label_dict.keys():
			expected_class = label_dict[labels[i].split()[0]]
			if predicted_class==expected_class:
				correct+=1
	return correct/len(test_documents)

def random_accuracy(test_documents,labels):
	correct = 0.0
	for i in range(len(test_documents)):
		predicted_class = random.randint(0,len(label_dict))
		if labels[i].split()[0] in label_dict:
			expected_class = label_dict[labels[i].split()[0]]
			if predicted_class==expected_class:
				correct+=1
	return correct/len(test_documents)

def confusion_matrix(test_documents,labels,bigram):
	correct = np.zeros((len(label_dict),len(label_dict)))
	for i in range(len(test_documents)):
		predicted_class = predict(test_documents[i],bigram)
		predicted_class_key = inv_label(predicted_class)
		if labels[i].split()[0] in label_dict:
			expected_class_key = labels[i].split()[0]
			correct[indices(expected_class_key)][indices(predicted_class_key)] += 1
	return (correct)

In [171]:
training_data = read_data("imdb_train_text.txt")
# training_data = remove_punct(training_data)
training_data = getStemmedDocument(training_data)
training_labels = read_data("imdb_train_labels.txt")
test_data = read_data("imdb_test_text.txt")
# test_data =remove_punct(test_data)
test_data = getStemmedDocument(test_data)
test_labels = read_data("imdb_test_labels.txt")

[u'love', u'movi', u'sinc', '7', 'saw', u'open', 'day', u'touch', u'beauti', u'strongli', 'recommend', u'see', u'movi', 'watch', u'famili', 'far', 'mpaa', u'rate', 'pg', '13', u'themat', u'element', u'prolong', u'scene', 'disastor', u'nuditi', u'sexual', u'languag']
['first', u'thing', 'first', 'edison', 'chen', u'fantast', u'believ', 'job', 'cambodian', 'hit', 'man', 'born', 'bred', u'dump', u'gladiatori', 'ring', u'hone', 'craft', u'savag', u'batteri', 'order', u'surviv', u'live', 'mantra', 'kill', u'kill', 'role', u'littl', u'dialogu', 'least', u'line', 'cambodian', 'thai', u'perform', u'compel', u'probabl', 'jet', 'li', u'vehicl', u'danni', 'dog', 'man', 'bred', 'sole', u'purpos', u'fight', u'someon', u'els', 'leash', 'like', u'danni', 'dog', 'much', u'talk', 'bare', u'knuckl', 'fight', u'sequenc', u'choreograph', u'stylist', 'rather', u'design', 'normal', 'brutal', u'fisticuff', u'everyth', u'goe', u'probabl', 'brought', u'sens', 'realism', 'grit', 'see', u'charact', 'slug', u'thr

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)

In [None]:
label_dict = make_dict(training_labels,0)
vocab_dict = make_dict(training_data,0)
print len(vocab_dict)

In [76]:
def make_matrix(bigram):
	naive_matrix = np.ones((len(vocab_dict),number_classes))
	num_words_in_class = np.full((1,number_classes),len(vocab_dict))
	label_freq = np.zeros(number_classes)
	for i in range(len(training_data)):
		label = label_dict[training_labels[i].split()[0]]
		label_freq[label] +=1
		words = training_data[i].split()
		j = 0
		for word in words:
			if j>0 and bigram:
				bigram_word = words[j-1] + " " + word
				if bigram_word in vocab_dict:
					word_index = vocab_dict[bigram_word]
					naive_matrix[word_index][label] +=1
			word_index = vocab_dict[word]
			naive_matrix[word_index][label] +=1
			num_words_in_class[0][label] +=1
			j +=1
	return (np.log(naive_matrix) - np.log(num_words_in_class)),label_freq

In [77]:
naive_matrix,label_freq = make_matrix(0)

In [78]:
training_accuracy = accuracy(training_data,training_labels,0)
print training_accuracy*100

68.844


In [79]:
test_accuracy = accuracy(test_data,test_labels,0)
print test_accuracy*100

37.388


In [81]:
test_random_accuracy = random_accuracy(test_data,test_labels)
print test_random_accuracy*100

11.188


In [82]:
test_majority_accuracy = majority_accuracy(test_data,test_labels)
print test_majority_accuracy*100

20.088


In [95]:
confuse = confusion_matrix(test_data,test_labels,0)

[[  4.72600000e+03   0.00000000e+00   4.00000000e+00   3.40000000e+01
    9.00000000e+00   3.30000000e+01   0.00000000e+00   2.16000000e+02]
 [  2.02800000e+03   2.00000000e+00   4.00000000e+00   5.60000000e+01
    1.30000000e+01   4.40000000e+01   0.00000000e+00   1.55000000e+02]
 [  1.95800000e+03   1.00000000e+00   2.80000000e+01   1.40000000e+02
    2.40000000e+01   1.20000000e+02   0.00000000e+00   2.70000000e+02]
 [  1.67400000e+03   2.00000000e+00   4.00000000e+00   2.41000000e+02
    6.20000000e+01   2.18000000e+02   1.00000000e+00   4.33000000e+02]
 [  6.56000000e+02   0.00000000e+00   0.00000000e+00   5.50000000e+01
    9.10000000e+01   4.36000000e+02   0.00000000e+00   1.06900000e+03]
 [  6.16000000e+02   0.00000000e+00   2.00000000e+00   3.30000000e+01
    3.40000000e+01   4.69000000e+02   2.00000000e+00   1.69400000e+03]
 [  4.35000000e+02   0.00000000e+00   0.00000000e+00   9.00000000e+00
    1.60000000e+01   2.10000000e+02   0.00000000e+00   1.67400000e+03]
 [  9.7500000

In [108]:
for i in range(len(confuse)):
    for j in range(len(confuse[0])):
        print (int)(confuse[i][j]),
    print

4726 0 4 34 9 33 0 216
2028 2 4 56 13 44 0 155
1958 1 28 140 24 120 0 270
1674 2 4 241 62 218 1 433
656 0 0 55 91 436 0 1069
616 0 2 33 34 469 2 1694
435 0 0 9 16 210 0 1674
975 0 2 8 13 210 1 3790


In [84]:
print label_dict

{'10': 0, '1': 7, '3': 4, '2': 6, '4': 5, '7': 2, '9': 3, '8': 1}
