In [9]:
# -*- coding: utf-8 -*-

In [10]:
#gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

In [128]:
import numpy as np
from random import shuffle 
from sklearn.linear_model import LogisticRegression

In [14]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [16]:
with open('D:/aclImdb_v1/aclImdb/train/pos/pos.txt','r') as infile:
    pos_reviews = infile.readlines()

In [71]:
testpospath='D:/aclImdb_v1/aclImdb/test/pos'
test_file_names = os.listdir(testpospath)
print test_file_names[0:5]

['0_10.txt', '10000_7.txt', '10001_9.txt', '10002_8.txt', '10003_8.txt']


In [72]:
len(test_file_names)

12500

In [74]:
testposfile=open('D:/aclImdb_v1/aclImdb/test/pos/test_pos.txt','a')
for test_file_name in test_file_names:
    if not test_file_name.endswith('.txt'):
        continue
    test_file = os.path.join(testpospath,test_file_name)
    with open (test_file,'r') as f:
        lines = f.read().splitlines()
        testposfile.writelines(lines)
        testposfile.write('\n')

testposfile.close()

In [77]:
num_lines = sum(1 for line in open('D:/aclImdb_v1/aclImdb/test/pos/test_pos.txt'))#count the lines just write in
print num_lines

12500


In [80]:
testnegpath='D:/aclImdb_v1/aclImdb/test/neg'
test_file_names = os.listdir(testnegpath)
print test_file_names[0:5]

['0_2.txt', '10000_4.txt', '10001_1.txt', '10002_3.txt', '10003_3.txt']


In [81]:
testnegfile=open('D:/aclImdb_v1/aclImdb/test/neg/test_neg.txt','a')
for test_file_name in test_file_names:
    if not test_file_name.endswith('.txt'):
        continue
    test_file = os.path.join(testnegpath,test_file_name)
    with open (test_file,'r') as f:
        lines = f.read().splitlines()
        testnegfile.writelines(lines)
        testnegfile.write('\n')

testnegfile.close()

In [82]:
num_lines = sum(1 for line in open('D:/aclImdb_v1/aclImdb/test/neg/test_neg.txt'))#count the lines just write in
print num_lines

12501


In [83]:
sources = {'D:/aclImdb_v1/aclImdb/test/neg/test_neg.txt':'TEST_NEG', 'D:/aclImdb_v1/aclImdb/test/pos/test_pos.txt':'TEST_POS', 'D:/aclImdb_v1/aclImdb/train/neg/train_neg.txt':'TRAIN_NEG', 'D:/aclImdb_v1/aclImdb/train/pos/train_pos.txt':'TRAIN_POS', 'D:/aclImdb_v1/aclImdb/train/unsup/train_unsup.txt':'TRAIN_UNS'}

In [84]:
sentences = LabeledLineSentence(sources)

In [134]:
sentences

<__main__.LabeledLineSentence at 0x87217f0>

In [85]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)

In [86]:
model.build_vocab(sentences.to_array())

In [87]:
for epoch in range(10):
    model.train(sentences.sentences_perm())

In [88]:
model.save('D:/aclImdb_v1/imdb.d2v')

In [90]:
model = Doc2Vec.load('D:/aclImdb_v1/imdb.d2v')

In [144]:
model.most_similar('bad','awful')

[(u'ex-kickboxer', 0.4178465008735657),
 (u"Horton's", 0.4147828221321106),
 (u'Cossell.', 0.4004952311515808),
 (u'shepherds', 0.3970067501068115),
 (u'Eckhart)', 0.3943425416946411),
 (u'Y+', 0.39374929666519165),
 (u'Darvi)', 0.39355382323265076),
 (u'Whitaker),', 0.3837607204914093),
 (u'Love"\x97and', 0.38014885783195496),
 (u'church;', 0.378433495759964)]

In [115]:
model.doesnt_match("actor director actress money".split())

'money'

In [130]:
np.shape(model.syn1[:])

(703379L, 100L)

In [131]:
model['great']

array([-0.03295228,  0.04245356, -0.12551388, -0.09444155,  0.03872843,
        0.06632298,  0.06923104,  0.00354602, -0.06555402, -0.16283469,
        0.10392242,  0.02682297,  0.09891751, -0.0025965 ,  0.08402909,
        0.07368149,  0.07677175, -0.03968524, -0.05909212,  0.12077152,
        0.00842909, -0.13830726,  0.00277739, -0.08820997, -0.01967817,
       -0.01383139,  0.00404541, -0.01583151,  0.08651619, -0.04554591,
       -0.05224117,  0.00765246,  0.07300655,  0.14639845, -0.01991197,
        0.03479124, -0.074931  ,  0.07947178, -0.06496388, -0.02345342,
        0.02506319,  0.02241754,  0.03872867, -0.05917371,  0.05777435,
       -0.10313001,  0.08805922,  0.1074123 ,  0.06927706, -0.02230474,
        0.0031256 ,  0.04181514, -0.07128429, -0.06932658, -0.01724239,
       -0.00226501, -0.08209578, -0.00394776, -0.08980807,  0.01733783,
       -0.02694049, -0.04900165,  0.07052645, -0.0089487 , -0.033897  ,
        0.00856984, -0.06952512,  0.13287283, -0.01410092, -0.01

In [156]:
model.docvecs['TRAIN_NEG_0']

array([ 0.34349704, -0.28114563,  0.15553074, -0.0742671 ,  0.16231376,
       -0.0577793 , -0.32738313,  0.06603613, -0.0177818 ,  0.03237845,
        0.08283368,  0.45181251,  0.15142146,  0.2468181 , -0.12983988,
        0.04126877, -0.07602935,  0.02741039,  0.17937183, -0.13999611,
        0.32161313, -0.56360668,  0.23081957, -0.19916241,  0.12782499,
       -0.05858454, -0.42471901,  0.03952511,  0.23915942, -0.07357813,
       -0.03383054, -0.05575309, -0.2468773 ,  0.06058374,  0.44233057,
       -0.0094618 ,  0.15928185,  0.08374003, -0.25648499,  0.16620168,
        0.09682708,  0.23457398,  0.14770512,  0.04763613, -0.12721123,
        0.01681   , -0.06182304,  0.05169237,  0.44910318, -0.35420635,
       -0.06949909, -0.50830024, -0.22384013,  0.07454668, -0.4178369 ,
       -0.03971678,  0.31576133,  0.01509237, -0.02283465,  0.10483131,
       -0.54380614,  0.03327699, -0.1676164 , -0.14590305, -0.2550818 ,
       -0.00423515,  0.01254161,  0.15923743, -0.16025175, -0.21

In [157]:
train_arrays = numpy.zeros((25000, 100))
train_labels = numpy.zeros(25000)

for i in range(12500):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays[12500 + i] = model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0

In [158]:
print train_arrays

[[ 0.22747336 -0.29849795 -0.14208162 ..., -0.07592782 -0.07239597
   0.34631407]
 [ 0.09214678 -0.08534613  0.61773217 ...,  0.04337816 -0.10439413
  -0.10903627]
 [ 0.19412082 -0.20745908  0.33502981 ..., -0.33127564 -0.27579853
   0.18635908]
 ..., 
 [ 0.23705916 -0.22731981  0.1457182  ..., -0.11989334 -0.06326018
  -0.17819016]
 [ 0.39514577 -0.65651453  0.40622211 ..., -0.06577274  0.10656547
  -0.09263761]
 [ 0.41178915 -0.31219524  0.27829415 ..., -0.28650829 -0.37529829
   0.15988718]]


In [159]:
print train_labels

[ 1.  1.  1. ...,  0.  0.  0.]


In [161]:
test_arrays = numpy.zeros((25000, 100))
test_labels = numpy.zeros(25000)

for i in range(12500):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test_pos]
    test_arrays[12500 + i] = model.docvecs[prefix_test_neg]
    test_labels[i] = 1
    test_labels[12500 + i] = 0

In [162]:
classifier = LogisticRegression()

In [163]:
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [164]:
classifier.score(test_arrays, test_labels)

0.87343999999999999