In [4]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
import numpy as np
from gensim.models import doc2vec
LabelDocument = namedtuple('LabeledDocument', 'words tags label')

alldocs = []  # Will hold all docs in original order
with open('../janeausten/ja_COMBINED_treated.txt', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
    tokenArray = np.array_split(tokens,10000)
    for tokenList_no, tokens in enumerate(tokenArray):
        words = tokens[1:]
        tags = [tokenList_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        label = ["janeausten"][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(LabelDocument(words, tags, label))

doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs' % (len(doc_list)))
model_loaded = doc2vec.Doc2Vec.load('/tmp/marktwain_model.doc2vec')

10000 docs


In [5]:
alldocs = []
with open('../marktwain/combined.txt', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
    tokenArray = np.array_split(tokens,10000)
    for tokenList_no, tokens in enumerate(tokenArray):
        words = tokens[1:]
        tags = [10000+tokenList_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        label = ["marktwain"][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(LabelDocument(words, tags, label))

mt_doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs' % (len(mt_doc_list)))
doc_list += mt_doc_list

10000 docs


In [6]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/ average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# Speed up setup by sharing results of the 1st model's vocabulary scan
simple_models[0].build_vocab(doc_list)  # PV-DM w/ concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)


In [7]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

In [8]:
import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc labels, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.label, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.label for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

  from pandas.core import datetools


In [90]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # Shuffling gets best results
    for name, train_model in models_by_name.items():
        # Train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()
            
        # Evaluate
        eval_duration = ''
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '

    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2017-11-06 19:49:30.099107
Completed pass 1 at alpha 0.025000
Completed pass 2 at alpha 0.023800
Completed pass 3 at alpha 0.022600
Completed pass 4 at alpha 0.021400
Completed pass 5 at alpha 0.020200
Completed pass 6 at alpha 0.019000
Completed pass 7 at alpha 0.017800
Completed pass 8 at alpha 0.016600
Completed pass 9 at alpha 0.015400
Completed pass 10 at alpha 0.014200
Completed pass 11 at alpha 0.013000
Completed pass 12 at alpha 0.011800
Completed pass 13 at alpha 0.010600
Completed pass 14 at alpha 0.009400
Completed pass 15 at alpha 0.008200
Completed pass 16 at alpha 0.007000
Completed pass 17 at alpha 0.005800
Completed pass 18 at alpha 0.004600
Completed pass 19 at alpha 0.003400
Completed pass 20 at alpha 0.002200
END 2017-11-06 19:51:05.315325


In [9]:
for name, train_model in models_by_name.items():
    print(name + " number of docvecs: " + str(len(train_model.docvecs)))

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) number of docvecs: 20000
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) number of docvecs: 20000
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) number of docvecs: 20000


TypeError: object of type 'ConcatenatedDocvecs' has no len()

In [10]:
model1 = Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores)
model1.build_vocab(doc_list)
len(doc_list)

20000

In [93]:
for epoch in range(10):
    shuffle(doc_list)  # Shuffling gets best results
    model1.alpha, model1.min_alpha = alpha, alpha
    with elapsed_timer() as elapsed:
        model1.train(doc_list, total_examples=len(doc_list), epochs=1)
        duration = '%.1f' % elapsed()
    # Evaluate
    eval_duration = ''
    eval_duration = '%.1f' % eval_elapsed()
    best_indicator = ' '

In [94]:
%store models_by_name

Stored 'models_by_name' (OrderedDict)


In [33]:
import pandas as pd
X = []
yzero = np.zeros(10000)
yone = np.ones(10000)
yzero = pd.DataFrame(yzero)
yone = pd.DataFrame(yone)
Y = pd.concat([yzero,yone])

In [158]:
%store model1

Stored 'model1' (Doc2Vec)


In [159]:
for dv in model1.docvecs:
    X.append(dv)
trainX = pd.DataFrame(X)
trainX.columns.astype(str)
Y.columns.astype(str)

Index(['0'], dtype='object')

In [160]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainX, Y, test_size=0.25)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(15000, 100) (15000, 1)
(5000, 100) (5000, 1)


In [None]:
y_train

In [164]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
#xgb model with parameters
gbm = xgb.XGBClassifier(num_class=2, max_depth=3, n_estimators=300, learning_rate=0.05)

In [165]:
gbm.fit(X_train, y_train)
predictions = gbm.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


TypeError: 'int' object is not iterable

In [142]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
rbf1 = SVC(C=1.0, kernel='poly',degree=6)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
rbf1.fit(X_train,y_train)
rbfpredictions = rbf1.predict(X_test)
accuracy = accuracy_score(y_test, rbfpredictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

  y = column_or_1d(y, warn=True)


Accuracy: 49.38%


In [3]:
len(alldocs)

10000

In [14]:
type(doc_list)

list

In [22]:
newdoclist = []
for doc in doc_list:
    iv = model_loaded.infer_vector(doc.words)
    newdoclist.append(iv)
len(newdoclist)

20000

In [23]:
newdoclist

[array([ -7.29919896e-02,   2.42309108e-01,   8.49781632e-02,
          1.61040112e-01,  -5.11419810e-02,   1.86680809e-01,
          2.21968442e-02,  -5.64095303e-02,   1.87121076e-03,
         -3.43533814e-01,  -5.05306385e-02,  -6.24195114e-03,
          1.22774221e-01,   1.42234797e-02,   3.28382390e-04,
          3.84849757e-02,   1.51644470e-02,  -1.13705203e-01,
          1.30913593e-02,  -2.42756866e-02,   2.29236394e-01,
         -7.72359669e-02,  -5.48899211e-02,  -8.52855071e-02,
          7.02492222e-02,  -2.06083104e-01,  -6.15537539e-03,
         -1.86100397e-02,  -7.31617510e-02,  -3.44897136e-02,
         -3.22837681e-02,   6.15299642e-02,   3.20253856e-02,
         -1.38777673e-01,   2.35115930e-01,   1.91380251e-02,
         -5.25550311e-03,  -8.42514560e-02,   1.00851603e-01,
         -4.68665622e-02,  -1.54775530e-01,  -5.69636561e-03,
         -1.07132308e-01,  -3.76958586e-02,  -5.10629592e-03,
         -6.25296533e-02,   3.76143418e-02,  -3.58162681e-03,
        

In [30]:
import pandas as pd
trainX2 = pd.DataFrame(newdoclist)

In [31]:
trainX2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.072992,0.242309,0.084978,0.161040,-0.051142,0.186681,0.022197,-0.056410,0.001871,-0.343534,...,-0.058699,0.051120,0.039901,-0.087114,0.155676,0.008147,-0.180893,-0.098259,-0.153104,0.090400
1,-0.235177,-0.026196,-0.128242,0.081152,-0.170408,0.183241,0.102298,0.182747,0.190017,0.020357,...,-0.062930,0.137756,0.113242,-0.149854,-0.106206,0.111281,-0.164544,0.022787,-0.097328,0.116233
2,-0.031718,0.189221,0.128569,-0.036340,0.011388,0.167755,0.006302,-0.058952,0.055337,-0.152316,...,0.035732,0.187765,0.070771,-0.150368,0.152318,0.161612,-0.355016,-0.143640,0.175042,0.205354
3,-0.102142,-0.030177,0.043103,-0.124312,-0.101125,0.120365,0.109692,0.113894,0.071132,-0.100546,...,-0.044994,0.036373,-0.025270,-0.126847,-0.074274,0.101442,0.077063,-0.221891,-0.156176,-0.101552
4,-0.003746,0.216489,0.135816,0.172621,-0.024125,0.078810,0.131609,-0.105895,0.154944,-0.179064,...,-0.058360,0.070368,0.081555,-0.093762,0.140260,0.073017,-0.277266,-0.290011,-0.044508,-0.005460
5,-0.070883,0.136839,-0.107019,0.138488,-0.050011,0.009649,-0.000811,0.055732,0.037941,-0.237982,...,-0.087438,-0.019289,0.013119,0.059823,-0.013332,0.123128,-0.155470,-0.100152,-0.120539,-0.085955
6,-0.152731,-0.197187,0.316664,0.240871,0.091317,0.263382,0.141488,0.087596,-0.162206,-0.219532,...,0.046099,0.028784,-0.032427,-0.007013,0.048064,0.080653,-0.082743,-0.167059,0.115532,0.021133
7,-0.016600,0.042699,-0.112553,-0.089345,-0.067912,0.213720,0.098729,-0.005292,-0.052328,-0.301222,...,-0.060628,0.199933,0.006417,-0.150100,0.139007,-0.097627,-0.132842,-0.041122,0.033128,0.160748
8,-0.158607,0.280632,0.039270,-0.019077,-0.066704,0.169796,-0.013151,-0.087869,0.054799,-0.198929,...,0.054915,0.091261,-0.092474,-0.049141,0.035477,0.011471,-0.142539,0.081263,-0.025351,0.032637
9,0.007694,0.082520,0.205605,-0.009245,-0.115227,0.004934,0.080968,-0.145590,-0.017780,-0.177736,...,0.113452,-0.029939,-0.000816,-0.221358,0.076932,-0.117759,-0.125910,-0.008756,-0.155806,-0.162951


In [36]:
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train, y_test = train_test_split(trainX2, Y, test_size=0.25)
print (X_train2.shape, y_train.shape)
print (X_test2.shape, y_test.shape)

(15000, 100) (15000, 1)
(5000, 100) (5000, 1)


In [40]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
rbf2 = SVC(C=1.0, kernel='rbf')
rbf2.fit(X_train2,y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [42]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
rbfpredictions = rbf2.predict(X_test2)
accuracy = accuracy_score(y_test, rbfpredictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 93.28%
