In [33]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Word2Vec
from gensim.models import Doc2Vec

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# random
import random

In [34]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [35]:

sources = {'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS', 'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}

sentences = LabeledLineSentence(sources)

In [36]:
model = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())



In [37]:
model.train(sentences, total_examples=model.corpus_count, epochs=10)



In [38]:
model.wv.most_similar('gandalf')

[('aragorn', 0.7731785774230957),
 ('syllabic', 0.7168655395507812),
 ('ohara', 0.7154890894889832),
 ('eowyn', 0.7096154093742371),
 ('middleton', 0.6969672441482544),
 ('commishioner', 0.6917276382446289),
 ('basso', 0.6900217533111572),
 ('untutored', 0.68355393409729),
 ('mamodo', 0.6806982159614563),
 ('boromir', 0.6781603097915649)]

In [39]:
model['TRAIN_NEG_0']

array([-0.13913268,  0.15021321, -0.03531564, -0.28943199, -0.03138167,
       -0.22017106,  0.32938755,  0.15985407, -0.12201687,  0.06483268,
        0.09612474,  0.03355727,  0.03962829,  0.2830447 , -0.17131598,
       -0.00868482, -0.104504  , -0.1697655 , -0.03821738, -0.14742668,
       -0.0926893 ,  0.15509811,  0.06425036,  0.0541451 , -0.14582181,
       -0.22006164,  0.10749359, -0.07207451,  0.12976101,  0.21326768,
       -0.06379854, -0.04967392, -0.11519434, -0.16500533,  0.03110348,
        0.06954801,  0.24743034,  0.0374753 , -0.01505366,  0.18005267,
       -0.05945515,  0.03950237,  0.06424104,  0.01931917, -0.03325284,
        0.06662767,  0.07513139,  0.14157304,  0.24926299, -0.02202017,
       -0.07397141,  0.12651265,  0.01655136, -0.05342555,  0.12718171,
       -0.12258115, -0.1112871 ,  0.06722842, -0.07999547,  0.23850127,
        0.07700711,  0.15600057,  0.07784322,  0.22602975,  0.10866074,
       -0.18004186,  0.13781495, -0.05266292, -0.04609859,  0.15

In [40]:
#model.save('./imdb.d2v')
model = Doc2Vec.load('./imdb.d2v')

In [41]:
train_arrays = np.zeros((25000, 100))
train_labels = np.zeros(25000)

for i in range(12500):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model[prefix_train_pos]
    train_arrays[12500 + i] = model[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0

In [42]:

print (train_arrays)

[[ 0.10814173  0.02058459  0.02001575 ...,  0.00272425  0.07071726
  -0.23649846]
 [ 0.49629509 -0.17324495 -0.06872885 ...,  0.5641169   0.16677722
  -0.36179718]
 [ 0.01649117 -0.29606307  0.02630273 ...,  0.1075986   0.23505121
  -0.26756456]
 ..., 
 [-0.22587335 -0.20128927 -0.08237034 ..., -0.19372711 -0.20316158
  -0.37590554]
 [ 0.15669258 -0.28902137  0.52362788 ...,  0.01294029  0.03904394
  -0.15000959]
 [ 0.15157355 -0.13107835 -0.04001979 ..., -0.03206924 -0.04616274
  -0.13328114]]


In [43]:
print (train_labels)

[ 1.  1.  1. ...,  0.  0.  0.]


In [44]:
test_arrays = np.zeros((25000, 100))
test_labels = np.zeros(25000)

for i in range(12500):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i] = model[prefix_test_pos]
    test_arrays[12500 + i] = model[prefix_test_neg]
    test_labels[i] = 1
    test_labels[12500 + i] = 0

In [45]:
LogisticRegression_classifier = LogisticRegression()
LogisticRegression_classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
LogisticRegression_classifier.score(test_arrays, test_labels)

0.61992000000000003

In [57]:
DecisionTree_classifier = DecisionTreeClassifier(random_state=1)
DecisionTree_classifier.fit(train_arrays, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [58]:
DecisionTree_classifier.score(test_arrays, test_labels)

0.54203999999999997

In [49]:
SVM_classifier = SVC()
SVM_classifier.fit(train_arrays, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
SVM_classifier.score(test_arrays, test_labels)

0.63100000000000001

In [51]:
MLP_Classifier = MLPClassifier()
MLP_Classifier.fit(train_arrays, train_labels)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [52]:
MLP_Classifier.score(test_arrays, test_labels)

0.65515999999999996

In [53]:
GaussianNB_Classifier = GaussianNB()
GaussianNB_Classifier.fit(train_arrays, train_labels)

GaussianNB(priors=None)

In [54]:
GaussianNB_Classifier.score(test_arrays, test_labels)

0.5242