In [65]:
import os
import sys
import pickle
import sklearn
from hypopt import GridSearch
from pprint import pprint
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
vect_unigram = CountVectorizer(analyzer='word', ngram_range = (1, 1))
vect_bigram = CountVectorizer(analyzer='word', ngram_range = (2, 2))
vect_unibigram = CountVectorizer(analyzer='word', ngram_range = (1, 2))

In [3]:
def load_data(file_path):
    pos_train = pd.read_csv(os.path.join(file_path + "\\pos_train.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    neg_train = pd.read_csv(os.path.join(file_path + "\\neg_train.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    pos_test = pd.read_csv(os.path.join(file_path + "\\pos_test.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    neg_test = pd.read_csv(os.path.join(file_path + "\\neg_test.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    pos_vali = pd.read_csv(os.path.join(file_path + "\\pos_vali.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    neg_vali = pd.read_csv(os.path.join(file_path + "\\neg_vali.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    pos_train_ns = pd.read_csv(os.path.join(file_path + "\\pos_train_ns.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    neg_train_ns = pd.read_csv(os.path.join(file_path + "\\neg_train_ns.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    pos_test_ns = pd.read_csv(os.path.join(file_path + "\\pos_test_ns.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    neg_test_ns = pd.read_csv(os.path.join(file_path + "\\neg_test_ns.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    pos_vali_ns = pd.read_csv(os.path.join(file_path + "\\pos_vali_ns.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    neg_vali_ns = pd.read_csv(os.path.join(file_path + "\\neg_vali_ns.csv"), sep = 'delimiter', names = ['Review'], engine = 'python')
    return pos_train,neg_train,pos_test,neg_test,pos_vali,neg_vali,pos_train_ns,neg_train_ns,pos_test_ns,neg_test_ns,pos_vali_ns,neg_vali_ns

In [18]:
def train_data(x, y, count_vect, a):
    x_count = count_vect.fit_transform(x)
    tfidf = TfidfTransformer()
    x_tfidf = tfidf.fit_transform(x_count)
    clf = MultinomialNB(alpha = a).fit(x_tfidf, y)
    return clf, count_vect, tfidf

In [5]:
def evaluate(x, y, clf, count_vect, tfidf):
    x_count = count_vect.transform(x)
    x_tfidf = tfidf.transform(x_count)
    preds = clf.predict(x_tfidf)
    return {
        'accuracy': accuracy_score(y, preds),
        'precision': precision_score(y, preds),
        'recall': recall_score(y, preds),
        'f1': f1_score(y, preds),
        }

In [58]:
def main(file_path):
	test_scores = {}
	test_scores_optimized = {}
	# Load Data
	# file_path = sys.argv[1]
	pos_train,neg_train,pos_test,neg_test,pos_vali,neg_vali,\
	pos_train_ns,neg_train_ns,pos_test_ns,neg_test_ns,pos_vali_ns,neg_vali_ns = load_data(file_path) 

	# Concat positive and negative data with labels
	# 1 for positive, 0 for negative
	pos_train['target'] = 1
	neg_train['target'] = 0
	train = pd.concat([pos_train,neg_train])
	train = shuffle(train, random_state=0).reset_index(drop=True)

	pos_test['target'] = 1
	neg_test['target'] = 0
	test = pd.concat([pos_test,neg_test])
	test = shuffle(test, random_state=0).reset_index(drop=True)

	pos_vali['target'] = 1
	neg_vali['target'] = 0
	vali = pd.concat([pos_vali,neg_vali])
	vali = shuffle(vali, random_state=0).reset_index(drop=True)

	pos_train_ns['target'] = 1
	neg_train_ns['target'] = 0
	train_ns = pd.concat([pos_train_ns,neg_train_ns])
	train_ns = shuffle(train_ns, random_state=0).reset_index(drop=True)

	pos_test_ns['target'] = 1
	neg_test_ns['target'] = 0
	test_ns = pd.concat([pos_test_ns,neg_test_ns])
	test_ns = shuffle(test_ns, random_state=0).reset_index(drop=True)

	pos_vali_ns['target'] = 1
	neg_vali_ns['target'] = 0
	vali_ns = pd.concat([pos_vali_ns,neg_vali_ns])
	vali_ns = shuffle(vali_ns, random_state=0).reset_index(drop=True)

	# Prepare x and y
	x_train = train['Review']
	y_train = train['target']
	x_test = test['Review']
	y_test = test['target']
	x_vali = vali['Review']
	y_vali = vali['target']

	x_train_ns = train_ns['Review']
	y_train_ns = train_ns['target']
	x_test_ns = test_ns['Review']
	y_test_ns = test_ns['target']
	x_vali_ns = vali_ns['Review']
	y_vali_ns = vali_ns['target']

	# 1. unigram
	clf, count_vect, tfidf = train_data(x_train, y_train, vect_unigram, 1)
	test_scores['test_unigram'] = evaluate(x_test, y_test, clf, count_vect, tfidf)

	best_alpha = alpha_tuning(x_train,y_train,x_vali,y_vali,vect_unigram,alpha_list)
	print("The selected alpha:{}\n".format(best_alpha))
	clf, count_vect, tfidf = train_data(x_train, y_train, vect_unigram, best_alpha)
	test_scores_optimized['test_unigram'] = evaluate(x_test, y_test, clf, count_vect, tfidf)

	# 2. bigram
	clf, count_vect, tfidf = train_data(x_train, y_train, vect_bigram, 1)
	test_scores['test_bigram'] = evaluate(x_test, y_test, clf, count_vect, tfidf)

	best_alpha = alpha_tuning(x_train,y_train,x_vali,y_vali,vect_bigram,alpha_list)
	print("The selected alpha:{}\n".format(best_alpha))
	clf, count_vect, tfidf = train_data(x_train, y_train, vect_bigram, best_alpha)
	test_scores_optimized['test_bigram'] = evaluate(x_test, y_test, clf, count_vect, tfidf)

	# 3. unigram + bigram
	clf, count_vect, tfidf = train_data(x_train, y_train, vect_unibigram, 1)
	test_scores['test_unibigram'] = evaluate(x_test, y_test, clf, count_vect, tfidf)

	best_alpha = alpha_tuning(x_train,y_train,x_vali,y_vali,vect_unibigram,alpha_list)
	print("The selected alpha:{}\n".format(best_alpha))
	clf, count_vect, tfidf = train_data(x_train, y_train, vect_unibigram, best_alpha)
	test_scores_optimized['test_unibigram'] = evaluate(x_test, y_test, clf, count_vect, tfidf)

	# 4. unigram(no stopword)
	clf, count_vect, tfidf = train_data(x_train_ns, y_train_ns, vect_unigram, 1)
	test_scores['test_ns_uigram'] = evaluate(x_test_ns, y_test_ns, clf, count_vect, tfidf)

	best_alpha = alpha_tuning(x_train,y_train,x_vali_ns,y_vali_ns,vect_unigram,alpha_list)
	print("The selected alpha:{}\n".format(best_alpha))
	clf, count_vect, tfidf = train_data(x_train_ns, y_train_ns, vect_unigram, best_alpha)
	test_scores_optimized['test_ns_unigram'] = evaluate(x_test_ns, y_test_ns, clf, count_vect, tfidf)

	# 5. bigram(no stopword)
	clf, count_vect, tfidf = train_data(x_train_ns, y_train_ns, vect_bigram, 1)
	test_scores['test_ns_bigram'] = evaluate(x_test_ns, y_test_ns, clf, count_vect, tfidf)

	best_alpha = alpha_tuning(x_train,y_train,x_vali_ns,y_vali_ns,vect_bigram,alpha_list)
	print("The selected alpha:{}\n".format(best_alpha))
	clf, count_vect, tfidf = train_data(x_train_ns, y_train_ns, vect_bigram, best_alpha)
	test_scores_optimized['test_ns_bigram'] = evaluate(x_test_ns, y_test_ns, clf, count_vect, tfidf)

	# 6. unigram + bigram(no stopword)
	clf, count_vect, tfidf = train_data(x_train_ns, y_train_ns, vect_unibigram, 1)
	test_scores['test_ns_unibigram'] = evaluate(x_test_ns, y_test_ns, clf, count_vect, tfidf)

	best_alpha = alpha_tuning(x_train,y_train,x_vali_ns,y_vali_ns,vect_unibigram,alpha_list)
	print("The selected alpha:{}\n".format(best_alpha))
	clf, count_vect, tfidf = train_data(x_train_ns, y_train_ns, vect_unibigram, best_alpha)
	test_scores_optimized['test_ns_unibigram'] = evaluate(x_test_ns, y_test_ns, clf, count_vect, tfidf)

	return test_scores, test_scores_optimized


In [76]:
%run main "C:\\Users\\andre\\msci-text-analytics-s20\\Assignment 1\\data"

The selected alpha:1

The selected alpha:0.1

The selected alpha:0.1

The selected alpha:0.001

The selected alpha:0.1

The selected alpha:0.01

({'test_bigram': {'accuracy': 0.82195,
                  'f1': 0.8279917884313488,
                  'precision': 0.8008175659892548,
                  'recall': 0.857075},
  'test_ns_bigram': {'accuracy': 0.7869723371542144,
                     'f1': 0.7971914792336071,
                     'precision': 0.7607086077674313,
                     'recall': 0.83735},
  'test_ns_uigram': {'accuracy': 0.8061100763759547,
                     'f1': 0.8108576097162438,
                     'precision': 0.7914871330968648,
                     'recall': 0.8312},
  'test_ns_unibigram': {'accuracy': 0.8224727809097614,
                        'f1': 0.8291180363373841,
                        'precision': 0.7992113198793783,
                        'recall': 0.86135},
  'test_unibigram': {'accuracy': 0.83315,
                     'f1': 0.837678762525537