In [47]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import preprocessing
import string
import scipy
import re
from copy import deepcopy

stopwords = map(unicode, set(set(stopwords.words('english')) - set([unicode('not'), unicode('no')])))
lematzr = WordNetLemmatizer()

In [2]:
# http://stackoverflow.com/questions/771918/how-do-i-do-word-stemming-or-lemmatization
#  Lematizing, removing punctuation and removing caps. Also removing stopwords but keeping negatives like no and not
# Not adding words that are causing unicode errors: fiancé, café, crêpe, puréed, québec, 
# clichés, clichés, aurvåg, problemsthe, clichés, seeing      
def remove_punc(s):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)

def decap(s):
    return s.lower()

def lemmatize_word(lmtr, s):
    return lmtr.lemmatize(s)

def prepro(s):
    string = remove_punc(decap(s)).split()
    formated = []
    for word in string:

            try:
                if unicode(word) not in stopwords:
                    formated.append(str(lemmatize_word(lematzr, unicode(word))))
            except (UnicodeDecodeError, UnicodeEncodeError):

                print "unicode error for: " + word
    return formated



In [3]:
# Importing the data

amazon = pd.read_csv("amazon_cells_labelled.txt", sep="\t", header=None).dropna().reset_index().drop("index", axis=1)
yelp = pd.read_csv("yelp_labelled.txt", sep="\t", header=None, encoding="utf-8").dropna().reset_index().drop("index", axis=1)
imdb_file = open("imdb_labelled.txt", "r")

splits = []
for line in imdb_file:
    line = line.replace("\n","")
    splits.append(line.split("\t"))
    
imdb = pd.DataFrame(splits)

In [4]:
amazon[0] = amazon[0].map(prepro)
yelp[0] = yelp[0].map(prepro)
imdb[0] = imdb[0].map(prepro)
imdb[1] = imdb[1].map(int)

unicode error for: fiancé
unicode error for: café
unicode error for: crêpe
unicode error for: puréed
unicode error for: 
unicode error for: québec
unicode error for: iswas
unicode error for: 
unicode error for: clichés
unicode error for: clichés
unicode error for: aurvåg
unicode error for: 
unicode error for: problemsthe
unicode error for: 
unicode error for: clichés
unicode error for: 
unicode error for: seeing


In [5]:
amazon_train_true = amazon[amazon[1] > 0].head(400)
yelp_train_true = yelp[yelp[1] > 0].head(400)
imdb_train_true = imdb[imdb[1] > 0].head(400)

amazon_test_true = amazon[amazon[1] > 0].tail(100)
yelp_test_true = yelp[yelp[1] > 0].tail(100)
imdb_test_true = imdb[imdb[1] > 0].tail(100)

amazon_train_false = amazon[amazon[1] == 0].head(400)
yelp_train_false = yelp[yelp[1] == 0].head(400)
imdb_train_false = imdb[imdb[1] == 0].head(400)

amazon_test_false = amazon[amazon[1] == 0].tail(100)
yelp_test_false = yelp[yelp[1] == 0].tail(100)
imdb_test_false = imdb[imdb[1] == 0].tail(100)

train_frames = [amazon_train_true, amazon_train_false, yelp_train_true, yelp_train_false, imdb_train_true, imdb_train_false]
test_frames = [amazon_test_true, amazon_test_false, yelp_test_true, yelp_test_false, imdb_test_true, imdb_test_false]

train = pd.concat(train_frames).reset_index().drop("index", axis=1)
train.columns = ["words","score"]
test = pd.concat(test_frames).reset_index().drop("index", axis=1)
test.columns = ["words", "score"]

In [6]:
bag = {}
def create_bag(word_lst):
    for word in word_lst:
        bag[word] = 0.0
    return bag

def index_bag(bag):
    indx = 0
    for key in bag.keys():
        bag[key] = indx
        indx += 1
    return bag

def count(word_lst):
    zeros = [0 for x in range(0,len(bag))]
    
    for word in word_lst:
        if word in bag.keys():    
            zeros[int(bag[word])] += 1.0

    return zeros



In [7]:
# Creating the bag of words from TRAINING DATA ONLY!!!
train.words.map(create_bag);
index_bag(bag);

In [19]:
# Normalize the dataset with l1. See page 611 of HTF
vects_train = pd.DataFrame(map(count, train.words.tolist()), columns=bag.keys())
vects_test = pd.DataFrame(map(count, test.words.tolist()), columns=bag.keys())

train_normalized = pd.DataFrame(preprocessing.normalize(vects_train, norm='l1'))
train_mean = train_normalized.mean()
test_normalized = pd.DataFrame(preprocessing.normalize(vects_test, norm='l1'))
test_mean = test_normalized.mean()

In [20]:
train_centered = train_normalized.apply(lambda x: x - train_mean, axis=1)
test_centered = test_normalized.apply(lambda x: x - test_mean, axis=1)

In [70]:
U, s, V = scipy.linalg.svd(train_centered, full_matrices=False)
U_test, s_test, V_test = scipy.linalg.svd(test_centered, full_matrices=False)

In [71]:
s_train_50 = np.concatenate((s[0:50], [0]*(len(s)-50)), axis=1)
s_train_100 = np.concatenate((s[0:100], [0]*(len(s)-100)), axis=1)
s_train_150 = np.concatenate((s[0:150], [0]*(len(s)-150)), axis=1)

S_train_50 = np.diag(s_50)
S_train_100 = np.diag(s_100)
S_train_150 = np.diag(s_150)


s_test_50 = np.concatenate((s_test[0:50], [0]*(len(s_test)-50)), axis=1)
s_test_100 = np.concatenate((s_test[0:100], [0]*(len(s_test)-100)), axis=1)
s_test_150 = np.concatenate((s_test[0:150], [0]*(len(s_test)-150)), axis=1)

S_test_50 = np.diag(s_test_50)
S_test_100 = np.diag(s_test_100)
S_test_150 = np.diag(s_test_150)

In [73]:
train_50 = pd.DataFrame(np.dot(U, np.dot(S_50, V)))
train_100 = pd.DataFrame(np.dot(U, np.dot(S_100, V)))
train_150 = pd.DataFrame(np.dot(U, np.dot(S_150, V)))

test_50 = pd.DataFrame(np.dot(U_test, np.dot(S_test_50, V_test)))
test_100 = pd.DataFrame(np.dot(U_test, np.dot(S_test_100, V_test)))
test_150 = pd.DataFrame(np.dot(U_test, np.dot(S_test_150, V_test)))

In [77]:
# 50

k = 2
train_normalized = train_50
test_normalized = test_50

vectorized_train = pd.concat([train, train_normalized], axis=1);
vectorized_test = pd.concat([test, test_normalized], axis=1);

indexes = range(0,len(train_normalized))

def first_means():
    import random
    means = []
    indexes = range(0,len(train_normalized))
    mean_idxs = random.sample(indexes, k)
    for i in mean_idxs:
        means.append(train_normalized.iloc[i].values)
    return means

def assign_to_clusters(means):
    clusters = []
    for i in range(0,k):
        clusters.append([])

    for i in indexes:
        distances = []
        for j in means:
            distances.append(np.linalg.norm(j - train_normalized.iloc[i]))
        cluster_choice = np.argmin(distances)
        clusters[cluster_choice].append(i) 
    return clusters

def new_means(clusters):
    means = []
    for cluster in clusters:
        means.append(np.mean(train_normalized.iloc[cluster]))
    return means

def k_means(k, train_normalized):
    means = first_means()
    iterations = 0
    original_clusters = assign_to_clusters(means)
    means = new_means(original_clusters)
    new_clusters = assign_to_clusters(means)
    means = new_means(new_clusters)
    
    while original_clusters != new_clusters:
        original_clusters = assign_to_clusters(means)
        means = new_means(original_clusters)
        new_clusters = assign_to_clusters(means)
        means = new_means(new_clusters)
        iterations += 1
    
    print "iterations: " + str(iterations)
    
    return new_clusters

clustered = k_means(2, train_normalized)

for cluster in clustered:
    the_cluster = cluster

    cluster_score = str(int(round(train.score.iloc[the_cluster].mean())))
    print "Length of cluster: " + str(len(the_cluster))
    print "number of ones in cluster "+ cluster_score + ": " + str(train.score.iloc[the_cluster].mean()*len(the_cluster))
    print "percent of 1s in " + cluster_score  + " cluster: " + str(train.score.iloc[the_cluster].mean() * 100)    

from sklearn import linear_model
from sklearn.metrics import confusion_matrix
md = linear_model.LogisticRegression()
est = md.fit(train_normalized, vectorized_train.score)

score = est.score(test_normalized, vectorized_test.score)
print "Regression score: " + str(score)

pred = est.predict(test_normalized)
confusion_matrix(pred, vectorized_test.score)

iterations: 3
Length of cluster: 2373
number of ones in cluster 0: 1175.0
percent of 1s in 0 cluster: 49.5153813738
Length of cluster: 27
number of ones in cluster 1: 25.0
percent of 1s in 1 cluster: 92.5925925926
Regression score: 0.645


array([[243, 156],
       [ 57, 144]])

In [78]:
# 100

k = 2
train_normalized = train_100
test_normalized = test_100

vectorized_train = pd.concat([train, train_normalized], axis=1);
vectorized_test = pd.concat([test, test_normalized], axis=1);

indexes = range(0,len(train_normalized))

def first_means():
    import random
    means = []
    indexes = range(0,len(train_normalized))
    mean_idxs = random.sample(indexes, k)
    for i in mean_idxs:
        means.append(train_normalized.iloc[i].values)
    return means

def assign_to_clusters(means):
    clusters = []
    for i in range(0,k):
        clusters.append([])

    for i in indexes:
        distances = []
        for j in means:
            distances.append(np.linalg.norm(j - train_normalized.iloc[i]))
        cluster_choice = np.argmin(distances)
        clusters[cluster_choice].append(i) 
    return clusters

def new_means(clusters):
    means = []
    for cluster in clusters:
        means.append(np.mean(train_normalized.iloc[cluster]))
    return means

def k_means(k, train_normalized):
    means = first_means()
    iterations = 0
    original_clusters = assign_to_clusters(means)
    means = new_means(original_clusters)
    new_clusters = assign_to_clusters(means)
    means = new_means(new_clusters)
    
    while original_clusters != new_clusters:
        original_clusters = assign_to_clusters(means)
        means = new_means(original_clusters)
        new_clusters = assign_to_clusters(means)
        means = new_means(new_clusters)
        iterations += 1
    
    print "iterations: " + str(iterations)
    
    return new_clusters

clustered = k_means(2, train_normalized)

for cluster in clustered:
    the_cluster = cluster

    cluster_score = str(int(round(train.score.iloc[the_cluster].mean())))
    print "Length of cluster: " + str(len(the_cluster))
    print "number of ones in cluster "+ cluster_score + ": " + str(train.score.iloc[the_cluster].mean()*len(the_cluster))
    print "percent of 1s in " + cluster_score  + " cluster: " + str(train.score.iloc[the_cluster].mean() * 100)    

from sklearn import linear_model
from sklearn.metrics import confusion_matrix
md = linear_model.LogisticRegression()
est = md.fit(train_normalized, vectorized_train.score)

score = est.score(test_normalized, vectorized_test.score)
print score

pred = est.predict(test_normalized)
confusion_matrix(pred, vectorized_test.score)

iterations: 2
Length of cluster: 98
number of ones in cluster 1: 96.0
percent of 1s in 1 cluster: 97.9591836735
Length of cluster: 2302
number of ones in cluster 0: 1104.0
percent of 1s in 0 cluster: 47.9582971329
0.691666666667


array([[240, 125],
       [ 60, 175]])

In [79]:
# 150

k = 2
train_normalized = train_150
test_normalized = test_150

vectorized_train = pd.concat([train, train_normalized], axis=1);
vectorized_test = pd.concat([test, test_normalized], axis=1);

indexes = range(0,len(train_normalized))

def first_means():
    import random
    means = []
    indexes = range(0,len(train_normalized))
    mean_idxs = random.sample(indexes, k)
    for i in mean_idxs:
        means.append(train_normalized.iloc[i].values)
    return means

def assign_to_clusters(means):
    clusters = []
    for i in range(0,k):
        clusters.append([])

    for i in indexes:
        distances = []
        for j in means:
            distances.append(np.linalg.norm(j - train_normalized.iloc[i]))
        cluster_choice = np.argmin(distances)
        clusters[cluster_choice].append(i) 
    return clusters

def new_means(clusters):
    means = []
    for cluster in clusters:
        means.append(np.mean(train_normalized.iloc[cluster]))
    return means

def k_means(k, train_normalized):
    means = first_means()
    iterations = 0
    original_clusters = assign_to_clusters(means)
    means = new_means(original_clusters)
    new_clusters = assign_to_clusters(means)
    means = new_means(new_clusters)
    
    while original_clusters != new_clusters:
        original_clusters = assign_to_clusters(means)
        means = new_means(original_clusters)
        new_clusters = assign_to_clusters(means)
        means = new_means(new_clusters)
        iterations += 1
    
    print "iterations: " + str(iterations)
    
    return new_clusters

clustered = k_means(2, train_normalized)

for cluster in clustered:
    the_cluster = cluster

    cluster_score = str(int(round(train.score.iloc[the_cluster].mean())))
    print "Length of cluster: " + str(len(the_cluster))
    print "number of ones in cluster "+ cluster_score + ": " + str(train.score.iloc[the_cluster].mean()*len(the_cluster))
    print "percent of 1s in " + cluster_score  + " cluster: " + str(train.score.iloc[the_cluster].mean() * 100)    

from sklearn import linear_model
from sklearn.metrics import confusion_matrix
md = linear_model.LogisticRegression()
est = md.fit(train_normalized, vectorized_train.score)

score = est.score(test_normalized, vectorized_test.score)
print score

pred = est.predict(test_normalized)
confusion_matrix(pred, vectorized_test.score)

iterations: 0
Length of cluster: 10
number of ones in cluster 0: 0.0
percent of 1s in 0 cluster: 0.0
Length of cluster: 2390
number of ones in cluster 1: 1200.0
percent of 1s in 1 cluster: 50.2092050209
0.746666666667


array([[251, 103],
       [ 49, 197]])