In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import preprocessing
import string
import re
from copy import deepcopy

stopwords = map(unicode, set(set(stopwords.words('english')) - set([unicode('not'), unicode('no')])))
lematzr = WordNetLemmatizer()

In [3]:
# http://stackoverflow.com/questions/771918/how-do-i-do-word-stemming-or-lemmatization
#  Lematizing, removing punctuation and removing caps. Also removing stopwords but keeping negatives like no and not
# Not adding words that are causing unicode errors: fiancé, café, crêpe, puréed, québec, 
# clichés, clichés, aurvåg, problemsthe, clichés, seeing      
def remove_punc(s):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)

def decap(s):
    return s.lower()

def lemmatize_word(lmtr, s):
    return lmtr.lemmatize(s)

def two_gram(input_list):
    return zip(input_list, input_list[1:])

def prepro(s):
    string = remove_punc(decap(s)).split()
    formated = []
    for word in string:
            try:
                if unicode(word) not in stopwords:
                    formated.append(str(lemmatize_word(lematzr, unicode(word))))
            except (UnicodeDecodeError, UnicodeEncodeError):

                print "unicode error for: " + word
    return two_gram(formated)



In [4]:
# Importing the data

amazon = pd.read_csv("amazon_cells_labelled.txt", sep="\t", header=None).dropna().reset_index().drop("index", axis=1)
yelp = pd.read_csv("yelp_labelled.txt", sep="\t", header=None, encoding="utf-8").dropna().reset_index().drop("index", axis=1)
imdb_file = open("imdb_labelled.txt", "r")

splits = []
for line in imdb_file:
    line = line.replace("\n","")
    splits.append(line.split("\t"))
    
imdb = pd.DataFrame(splits)

In [5]:
amazon[0] = amazon[0].map(prepro)
yelp[0] = yelp[0].map(prepro)
imdb[0] = imdb[0].map(prepro)
imdb[1] = imdb[1].map(int)

unicode error for: fiancé
unicode error for: café
unicode error for: crêpe
unicode error for: puréed
unicode error for: 
unicode error for: québec
unicode error for: iswas
unicode error for: 
unicode error for: clichés
unicode error for: clichés
unicode error for: aurvåg
unicode error for: 
unicode error for: problemsthe
unicode error for: 
unicode error for: clichés
unicode error for: 
unicode error for: seeing


In [6]:
amazon_train_true = amazon[amazon[1] > 0].head(400)
yelp_train_true = yelp[yelp[1] > 0].head(400)
imdb_train_true = imdb[imdb[1] > 0].head(400)

amazon_test_true = amazon[amazon[1] > 0].tail(100)
yelp_test_true = yelp[yelp[1] > 0].tail(100)
imdb_test_true = imdb[imdb[1] > 0].tail(100)

amazon_train_false = amazon[amazon[1] == 0].head(400)
yelp_train_false = yelp[yelp[1] == 0].head(400)
imdb_train_false = imdb[imdb[1] == 0].head(400)

amazon_test_false = amazon[amazon[1] == 0].tail(100)
yelp_test_false = yelp[yelp[1] == 0].tail(100)
imdb_test_false = imdb[imdb[1] == 0].tail(100)

train_frames = [amazon_train_true, amazon_train_false, yelp_train_true, yelp_train_false, imdb_train_true, imdb_train_false]
test_frames = [amazon_test_true, amazon_test_false, yelp_test_true, yelp_test_false, imdb_test_true, imdb_test_false]

train = pd.concat(train_frames).reset_index().drop("index", axis=1)
train.columns = ["words","score"]
test = pd.concat(test_frames).reset_index().drop("index", axis=1)
test.columns = ["words", "score"]

In [7]:
bag = {}
def create_bag(word_lst):
    for word in word_lst:
        bag[word] = 0.0
    return bag

def index_bag(bag):
    indx = 0
    for key in bag.keys():
        bag[key] = indx
        indx += 1
    return bag

def count(word_lst):
    zeros = [0 for x in range(0,len(bag))]
    
    for word in word_lst:
        if word in bag.keys():    
            zeros[int(bag[word])] += 1.0

    return zeros



In [8]:
# Creating the bag of words from TRAINING DATA ONLY!!!
train.words.map(create_bag);
index_bag(bag);

In [9]:
# Normalize the dataset with l1. See page 611 of HTF
vects_train = pd.DataFrame(map(count, train.words.tolist()), columns=bag.keys())
vects_test = pd.DataFrame(map(count, test.words.tolist()), columns=bag.keys())

train_normalized = pd.DataFrame(preprocessing.normalize(vects_train, norm='l1'))
test_normalized = pd.DataFrame(preprocessing.normalize(vects_test, norm='l1'))

In [10]:
vectorized_train = pd.concat([train, train_normalized], axis=1);
vectorized_test = pd.concat([test, test_normalized], axis=1);

In [11]:
def first_means():
    import random
    means = []
    indexes = range(0,len(train_normalized))
    mean_idxs = random.sample(indexes, k)
    for i in mean_idxs:
        means.append(train_normalized.iloc[i].values)
    return means

def assign_to_clusters(means):
    clusters = []
    for i in range(0,k):
        clusters.append([])

    for i in indexes:
        distances = []
        for j in means:
            distances.append(np.linalg.norm(j - train_normalized.iloc[i]))
        cluster_choice = np.argmin(distances)
        clusters[cluster_choice].append(i) 
    return clusters

def new_means(clusters):
    means = []
    for cluster in clusters:
        means.append(np.mean(train_normalized.iloc[cluster]))
    return means

In [12]:
indexes = range(0,len(train_normalized))

def k_means(k, train_normalized):
    
    means = first_means()
    iterations = 0
    original_clusters = assign_to_clusters(means)
    means = new_means(original_clusters)
    new_clusters = assign_to_clusters(means)
    means = new_means(new_clusters)
    
    while original_clusters != new_clusters:
        original_clusters = assign_to_clusters(means)
        means = new_means(original_clusters)
        new_clusters = assign_to_clusters(means)
        means = new_means(new_clusters)
        iterations += 1
    
    print "iterations: " + str(iterations)
    
    return new_clusters
    

In [13]:
k=2
clustered = k_means(k, train_normalized)

iterations: 0


In [14]:
for cluster in clustered:
    the_cluster = cluster

    cluster_score = str(int(round(train.score.iloc[the_cluster].mean())))
    print "Length of cluster: " + str(len(the_cluster))
    print "number of ones in cluster "+ cluster_score + ": " + str(train.score.iloc[the_cluster].mean()*len(the_cluster))
    print "percent of 1s in " + cluster_score  + " cluster: " + str(train.score.iloc[the_cluster].mean() * 100)    
    

Length of cluster: 2399
number of ones in cluster 0: 1199.0
percent of 1s in 0 cluster: 49.9791579825
Length of cluster: 1
number of ones in cluster 1: 1.0
percent of 1s in 1 cluster: 100.0


In [15]:
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
md = linear_model.LogisticRegression()
est = md.fit(train_normalized, vectorized_train.score)

In [16]:
score = est.score(test_normalized, vectorized_test.score)
print score

pred = est.predict(test_normalized)
confusion_matrix(pred, vectorized_test.score)

0.636666666667


array([[269, 187],
       [ 31, 113]])

IndexError: index 1893 is out of bounds for axis 0 with size 1

In [19]:
bag[('work', 'great')]

1893