In [144]:
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from matplotlib import pylab as plt
from nltk.stem import WordNetLemmatizer

# Shared NLTK helpers used by process_line: a WordNet lemmatizer and the
# English stop-word list (held in a set so membership tests are cheap).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def get_data(filename, train_per_class=400):
    """Load a labelled-sentence file and split it into train and test sets.

    Every non-blank line is expected to end with one separator character
    followed by a one-character label ('1' for positive, '0' for negative).
    The text before the separator is cleaned with ``process_line``.  For
    each label, the first ``train_per_class`` examples encountered go to the
    training set and the remainder go to the test set.

    Args:
        filename: path of the file to read.
        train_per_class: how many examples of each label are placed in the
            training set before further examples are sent to the test set.

    Returns:
        A tuple ``(data_train, labels_train, data_test, labels_test)`` of
        numpy arrays; labels are kept as the strings '0' / '1'.

    Raises:
        Exception: if a line's label is neither '0' nor '1'.
    """
    data_train, labels_train = [], []
    data_test, labels_test = [], []
    seen = {'1': 0, '0': 0}  # running count of examples per label

    # ``with`` closes the file even when a bad label aborts the loop.
    with open(filename) as handle:
        for line in handle:
            # Strip the line ending explicitly so a final line without a
            # newline (or a CRLF file) is parsed like every other line.
            record = line.rstrip('\r\n')
            if not record.strip():
                continue  # tolerate blank lines, e.g. at the end of the file

            label = record[-1]
            if label not in seen:
                raise Exception('Bad label found')

            # Drop the label and the single separator character before it.
            text = process_line(record[:-2])

            seen[label] += 1
            if seen[label] <= train_per_class:
                data_train.append(text)
                labels_train.append(label)
            else:
                data_test.append(text)
                labels_test.append(label)

    print('Positive labels: {0}'.format(seen['1']))
    print('Negative labels: {0}'.format(seen['0']))
    return np.array(data_train), np.array(labels_train),\
           np.array(data_test), np.array(labels_test)

In [145]:
def process_line(line):
    """Normalise one raw sentence into a space-separated string of lemmas.

    The text is lower-cased, every character other than a-z, 0-9 and space
    is deleted, the remainder is tokenised with NLTK, English stop words
    are dropped and each surviving token is lemmatized.
    """
    cleaned = re.sub("[^a-z0-9 ]", "", line.lower())
    kept = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stop_words
    ]
    # Joining with single spaces leaves no trailing separator to trim off.
    return ' '.join(kept)

In [146]:
# Load the three review corpora; each call yields
# (train data, train labels, test data, test labels).
print('Loading data...')
am_trd, am_trl, am_ted, am_tld = get_data('./sentiment labelled sentences/amazon_cells_labelled.txt')
im_trd, im_trl, im_ted, im_tld = get_data('./sentiment labelled sentences/imdb_labelled.txt')
ye_trd, ye_trl, ye_ted, ye_tld = get_data('./sentiment labelled sentences/yelp_labelled.txt')

# Combined sizes across the three corpora.
train_total = sum(len(part) for part in (am_trd, im_trd, ye_trd))
test_total = sum(len(part) for part in (am_ted, im_ted, ye_ted))

print('')
print('Training data: {0}'.format(train_total))
print('Testing data: {0}'.format(test_total))

Loading data...
Positive labels: 500
Negative labels: 500
Positive labels: 500
Negative labels: 500
Positive labels: 500
Negative labels: 500

Training data: 2400
Testing data: 600


In [147]:
# Vocabulary shared by the whole notebook: token -> column index.
features = {}
def gather_features(d1, d2, d3):
    """Register every token of the three datasets in the global ``features``.

    Documents are split on single spaces.  A token seen for the first time
    receives the next free index (the current size of ``features``); tokens
    that are already registered keep their index.
    """
    for dataset in (d1, d2, d3):
        for document in dataset:
            for token in document.split(' '):
                # len(features) is evaluated before the insert, so a new
                # token gets the index equal to the vocabulary size so far.
                features.setdefault(token, len(features))
                
# Build the vocabulary from the three training splits only.
gather_features(am_trd, im_trd, ye_trd)
print('Number of features: {0}'.format(len(features)))

Number of features: 4248


In [148]:
def convert_bag_of_words(data):
    """Turn processed sentences into a matrix of word counts.

    Each document is split on single spaces; every token present in the
    global ``features`` mapping increments that token's column.  Tokens
    missing from ``features`` are ignored.

    Args:
        data: iterable of space-separated token strings.

    Returns:
        A float numpy array of shape ``(number of documents, len(features))``.
    """
    docs = list(data)
    # Pre-allocating keeps the result two-dimensional even when ``data`` is
    # empty, so it can still be concatenated with the other splits.
    bows = np.zeros((len(docs), len(features)))
    for row, doc in enumerate(docs):
        for word in doc.split(' '):
            if word in features:
                bows[row, features[word]] += 1
    return bows

# Vectorise every train/test split against the shared vocabulary.
amc_trd, amc_ted = convert_bag_of_words(am_trd), convert_bag_of_words(am_ted)
imc_trd, imc_ted = convert_bag_of_words(im_trd), convert_bag_of_words(im_ted)
yec_trd, yec_ted = convert_bag_of_words(ye_trd), convert_bag_of_words(ye_ted)

In [149]:
"""
Part 1.E
"""
def postprocess(train1, train2, train3, test1, test2, test3):
    """Standardise all six splits using statistics of the training data.

    Every row is shifted by the per-feature mean and divided by the
    per-feature variance of the three training sets combined.  A feature
    whose training variance is zero is divided by 1 instead, so constant
    columns come out finite rather than as NaN/inf.

    Args:
        train1, train2, train3: training matrices (rows are examples).
        test1, test2, test3: test matrices with the same feature columns.

    Returns:
        A tuple of six numpy arrays, in the same order as the arguments.
    """
    mean = get_mean(train1, train2, train3)
    var = get_variance(train1, train2, train3)
    # Guard against division by zero for features that never vary.
    scale = np.where(var == 0, 1.0, var)

    return tuple(
        np.array([(row - mean) / scale for row in dataset])
        for dataset in (train1, train2, train3, test1, test2, test3)
    )

def get_mean(d1, d2, d3):
    """Return the per-column mean of the three datasets stacked row-wise."""
    all_data = np.concatenate((d1, d2, d3))
    return np.mean(all_data, axis=0)

def get_variance(d1, d2, d3):
    """Return the per-column variance of the three datasets stacked row-wise."""
    all_data = np.concatenate((d1, d2, d3))
    return np.var(all_data, axis=0)

# Standardise the bag-of-words matrices (training splits first, then test splits).
amp_trd, imp_trd, yep_trd, amp_ted, imp_ted, yep_ted = postprocess(amc_trd, imc_trd, yec_trd, amc_ted, imc_ted, yec_ted)

In [150]:
"""
Part 1.F
"""
from sklearn.linear_model import LogisticRegression
def classify_logistic_regression(training_set, training_labels, testing_set, testing_labels):
    """Fit a logistic-regression model and print its confusion counts.

    The model is trained on the training data, used to predict the test
    data, and each prediction is compared with the test label at the same
    position.  A prediction of '1' is counted as positive; any other
    prediction is counted as negative.

    Args:
        training_set: training feature matrix.
        training_labels: labels for ``training_set``.
        testing_set: test feature matrix.
        testing_labels: labels for ``testing_set``, indexable by position.

    Prints the true/false positive/negative counts and the accuracy;
    nothing is returned.
    """
    # Floats keep the printed counts and the accuracy division unchanged.
    tp = 0.0
    tn = 0.0
    fp = 0.0
    fn = 0.0

    lr = LogisticRegression().fit(training_set, training_labels)
    guesses = lr.predict(testing_set)
    # enumerate (rather than the Python-2-only xrange) runs on Python 2 and 3.
    for ind, label in enumerate(guesses):
        correct = testing_labels[ind] == label
        if label == '1':
            if correct:
                tp += 1.0
            else:
                fp += 1.0
        else:
            if correct:
                tn += 1.0
            else:
                fn += 1.0

    print("True Positive: {0}".format(tp))
    print("False Positive: {0}".format(fp))
    print("True Negative: {0}".format(tn))
    print("False Negative: {0}".format(fn))
    print("Accuracy: {0}".format((tp + tn)/(tp + tn + fp + fn)))

# Train and evaluate on the three corpora pooled together.
classify_logistic_regression(np.concatenate((amp_trd, imp_trd, yep_trd)), 
                             np.concatenate((am_trl, im_trl, ye_trl)),
                             np.concatenate((amp_ted, imp_ted, yep_ted)),
                             np.concatenate((am_tld, im_tld, ye_tld)))

True Positive: 188.0
False Positive: 73.0
True Negative: 227.0
False Negative: 112.0
Accuracy: 0.691666666667


In [151]:
"""
Part 1.F
"""
from sklearn.naive_bayes import GaussianNB
def classify_naive_bayes(training_set, training_labels, testing_set, testing_labels):
    """Fit a Gaussian naive-Bayes model and print its confusion counts.

    The model is trained on the training data, used to predict the test
    data, and each prediction is compared with the test label at the same
    position.  A prediction of '1' is counted as positive; any other
    prediction is counted as negative.

    Args:
        training_set: training feature matrix.
        training_labels: labels for ``training_set``.
        testing_set: test feature matrix.
        testing_labels: labels for ``testing_set``, indexable by position.

    Prints the true/false positive/negative counts and the accuracy;
    nothing is returned.
    """
    # Floats keep the printed counts and the accuracy division unchanged.
    tp = 0.0
    tn = 0.0
    fp = 0.0
    fn = 0.0

    nb = GaussianNB().fit(training_set, training_labels)
    guesses = nb.predict(testing_set)
    # enumerate (rather than the Python-2-only xrange) runs on Python 2 and 3.
    for ind, label in enumerate(guesses):
        correct = testing_labels[ind] == label
        if label == '1':
            if correct:
                tp += 1.0
            else:
                fp += 1.0
        else:
            if correct:
                tn += 1.0
            else:
                fn += 1.0

    print("True Positive: {0}".format(tp))
    print("False Positive: {0}".format(fp))
    print("True Negative: {0}".format(tn))
    print("False Negative: {0}".format(fn))
    print("Accuracy: {0}".format((tp + tn)/(tp + tn + fp + fn)))

# Train and evaluate on the three corpora pooled together.
classify_naive_bayes(np.concatenate((amp_trd, imp_trd, yep_trd)), 
                             np.concatenate((am_trl, im_trl, ye_trl)),
                             np.concatenate((amp_ted, imp_ted, yep_ted)),
                             np.concatenate((am_tld, im_tld, ye_tld)))

True Positive: 140.0
False Positive: 57.0
True Negative: 243.0
False Negative: 160.0
Accuracy: 0.638333333333
