In [1]:
import cPickle as pickle
import os
import sys
import tarfile
import urllib
from os.path import isfile, isdir
from tqdm import tqdm
import glob
import csv

import math
import numpy as np
import pandas as pd
from scipy import misc
import random

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import nltk.data
from gensim.models import word2vec

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

# Load data and model

In [2]:
# Working directories: `data_root` holds the pickled datasets/features,
# `model_root` holds the pickled models.
data_root = 'data/'
model_root = 'model/'

for _root in (data_root, model_root):
    try:
        os.makedirs(_root)
    except OSError:
        # The only failure we expect is "directory already exists".
        # Anything else (permission denied, a regular file occupying the
        # path, ...) must surface now instead of failing later on open().
        if not os.path.isdir(_root):
            raise

In [3]:
def _read_pickle(filename):
    """Unpickle and return the object stored as ``filename`` under ``data_root``."""
    with open(os.path.join(data_root, filename), 'rb') as handle:
        return pickle.load(handle)


# Train / validation / test splits. The x items are consumed further down
# as iterables of words; the y items are used as classifier labels.
train_x_clean = _read_pickle('train_x_clean.pkl')
train_y_clean = _read_pickle('train_y_clean.pkl')

validation_x_clean = _read_pickle('validation_x_clean.pkl')
validation_y_clean = _read_pickle('validation_y_clean.pkl')

test_x_clean = _read_pickle('test_x_clean.pkl')
test_y_clean = _read_pickle('test_y_clean.pkl')

In [4]:
# Reload the pickled embedding model; it is only accessed through its
# `.wv` attribute when the k-means model is trained.
_w2v_path = os.path.join(model_root, 'model_w2v1.pkl')
with open(_w2v_path, 'rb') as model_file:
    model_w2v1 = pickle.load(model_file)

# Train k-means clustering model

In [13]:
from sklearn.cluster import KMeans
import time

def train_k_means1(model_w2v, words_per_cluster=5, random_state=None):
    """Cluster the vocabulary's word vectors with k-means.

    Parameters
    ----------
    model_w2v : object
        Must expose ``wv.syn0`` (2-D array, one row per vocabulary word) and
        ``wv.index2word`` (the words, in the same order as the rows).
        NOTE(review): these are legacy gensim attribute names; newer gensim
        releases appear to rename them -- confirm against the installed
        version before upgrading.
    words_per_cluster : int, optional
        Target average number of words per cluster. "k" is the vocabulary
        size divided by this value (default 5, i.e. k = vocab / 5).
    random_state : int or None, optional
        Forwarded to ``KMeans`` so a run can be made repeatable. The default
        ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    (model, word_centroid_map) : tuple
        The fitted ``KMeans`` object and a dict mapping each vocabulary word
        to the index of the cluster it was assigned to.

    Raises
    ------
    ValueError
        If ``words_per_cluster`` is smaller than 1.
    """
    if words_per_cluster < 1:
        raise ValueError("words_per_cluster must be >= 1")

    start = time.time()  # Start time

    word_vectors = model_w2v.wv.syn0

    # Floor division keeps "k" an integer under both Python 2 and Python 3
    # (true division would hand KMeans a float). Clamp to 1 so a vocabulary
    # smaller than `words_per_cluster` does not request zero clusters.
    num_clusters = max(1, word_vectors.shape[0] // words_per_cluster)

    # Initialize a k-means object and use it to assign every word vector
    # to a cluster.
    model = KMeans(n_clusters=num_clusters, random_state=random_state)
    idx = model.fit_predict(word_vectors)

    # Only the clustering itself is timed, not the dictionary build below.
    elapsed = time.time() - start

    # Create a word -> cluster index dictionary; row i of the vector matrix
    # corresponds to index2word[i].
    word_centroid_map = dict(zip(model_w2v.wv.index2word, idx))

    # Same text as the former multi-argument print statement, but valid
    # syntax on both Python 2 and Python 3.
    print(" ".join(["Time taken for K Means clustering: ", str(elapsed), "seconds."]))
    return model, word_centroid_map

In [14]:
# Fit k-means on the loaded embedding model. The result is a
# (fitted model, word -> cluster index dict) pair; the recorded run below
# took roughly ten minutes.
train_k_means = train_k_means1(model_w2v=model_w2v1)

Time taken for K Means clustering:  623.678175926 seconds.


In [15]:
# Split the (fitted model, word -> cluster map) pair into named variables.
model_kmeans1, word_centroid_map1 = train_k_means

In [16]:
# Persist both k-means artifacts under model_root, map first, model second.
_kmeans_artifacts = (
    ('word_centroid_map1.pkl', word_centroid_map1),
    ('model_kmeans1.pkl', model_kmeans1),
)
for _filename, _artifact in _kmeans_artifacts:
    with open(os.path.join(model_root, _filename), 'wb') as out_file:
        pickle.dump(_artifact, out_file)

In [17]:
# Number of vocabulary words that received a cluster assignment.
print(len(word_centroid_map1))

16490


# Represent reviews with clusters

In [5]:
# testing case
review_cleaned = train_x_clean[1000]

# Reload the word -> cluster map from disk. The file name must match the one
# used when the map was dumped ('word_centroid_map1.pkl'); without the
# extension the open() fails on a fresh run.
with open(os.path.join(model_root, 'word_centroid_map1.pkl'), 'rb') as f:
    word_centroid_map1 = pickle.load(f)

In [6]:
def review_centroids(wordlist, word_centroid_map):
    """Turn one review into a "bag of centroids" count vector.

    Parameters
    ----------
    wordlist : iterable
        The words of a single review.
    word_centroid_map : dict
        Maps a vocabulary word to its (zero-based) cluster index.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``max(cluster index) + 1``; element ``i``
        is how many words of the review belong to cluster ``i``. Words that
        are missing from the map are skipped.
    """
    # Cluster indices start at zero, so the largest index plus one is the
    # number of slots the vector needs.
    vector_length = max(word_centroid_map.values()) + 1
    counts = np.zeros(vector_length, dtype="float32")

    # Lazily translate every in-vocabulary word into its cluster index.
    cluster_ids = (
        word_centroid_map[token]
        for token in wordlist
        if token in word_centroid_map
    )
    for cluster_id in cluster_ids:
        counts[cluster_id] += 1

    return counts

In [8]:
# Sanity check on one review: the vector has one slot per cluster index.
_sample_vector = review_centroids(review_cleaned, word_centroid_map1)
print(_sample_vector.shape)

(3298,)


In [9]:
def dataset_centroids(dataset, word_centroid_map):
    """Convert every review in ``dataset`` to its bag-of-centroids vector.

    Parameters
    ----------
    dataset : iterable
        Reviews, each one in the form ``review_centroids`` accepts.
    word_centroid_map : dict
        Word -> cluster index mapping, passed through unchanged.

    Returns
    -------
    list
        One vector per review, in the same order as ``dataset``.
    """
    return [review_centroids(tokens, word_centroid_map) for tokens in dataset]

In [10]:
# Build the bag-of-centroids features for all three splits with the same
# word -> cluster map (train first, then validation, then test).
train_x_clusters, validation_x_clusters, test_x_clusters = [
    dataset_centroids(_split, word_centroid_map1)
    for _split in (train_x_clean, validation_x_clean, test_x_clean)
]

In [29]:
# Write each split's feature list to its own pickle under data_root.
_cluster_dumps = (
    ('train_x_clusters.pkl', train_x_clusters),
    ('validation_x_clusters.pkl', validation_x_clusters),
    ('test_x_clusters.pkl', test_x_clusters),
)
for _filename, _features in _cluster_dumps:
    with open(os.path.join(data_root, _filename), 'wb') as out_file:
        pickle.dump(_features, out_file)

In [11]:
# Sanity check: dataset_centroids emits one feature vector per training review.
len(train_x_clusters)

25000

# Train classification models

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 1: a 100-tree random forest (seeded) on the bag-of-centroids
# features, scored on the validation split.
rd_clf = RandomForestClassifier(n_estimators=100, random_state=3).fit(
    train_x_clusters, train_y_clean
)
accuracy_validation = rd_clf.score(validation_x_clusters, validation_y_clean)
print(accuracy_validation)

0.84728


In [13]:
from sklearn.linear_model import LogisticRegression

# Baseline 2: logistic regression with library defaults, scored on the
# validation split.
lr_clf = LogisticRegression().fit(train_x_clusters, train_y_clean)
accuracy_validation = lr_clf.score(validation_x_clusters, validation_y_clean)
print(accuracy_validation)

0.8564


In [14]:
from sklearn.svm import LinearSVC

# Baseline 3: linear SVM with library defaults, scored on the validation
# split.
svm_clf = LinearSVC().fit(train_x_clusters, train_y_clean)
accuracy_validation = svm_clf.score(validation_x_clusters, validation_y_clean)
print(accuracy_validation)

0.84624


In [16]:
# performance on test set
_test_report = (
    ("random forest: ", rd_clf),
    ("logistic regression: ", lr_clf),
    ("svm: ", svm_clf),
)

print("Accuracy")
for _label, _clf in _test_report:
    # "<label> <score>" reproduces the spacing of a two-argument print.
    print("%s %s" % (_label, _clf.score(test_x_clusters, test_y_clean)))

 Accuracy
random forest:  0.8472
logistic regression:  0.86008
svm:  0.85296
