In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import Perceptron 

from sklearn.neighbors import NearestCentroid

In [3]:
# Load the pre-cleaned tweets; the first CSV column becomes the row index.
csv = 'clean_tweet.csv'
my_df = pd.read_csv(
    filepath_or_buffer=csv,
    index_col=0,
)
my_df.head()

Unnamed: 0,text,sentiment
0,two places invest all my money if could printi...,5
1,awesome google driverless cars will help the b...,5
2,if google maps can not keep up with road const...,2
3,autonomous cars seem way overhyped given the t...,2
4,just saw google self driving car on it was pai...,3


In [4]:
# Drop rows with missing values and renumber the index from 0.
my_df = my_df.dropna().reset_index(drop=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
text         100 non-null object
sentiment    100 non-null object
dtypes: object(2)
memory usage: 1.6+ KB


In [5]:
# Features are the tweet text; targets are the sentiment column.
x, y = my_df['text'], my_df['sentiment']

In [6]:
from sklearn.model_selection import train_test_split
SEED = 2000

# Step 1: hold out 2% of the rows; step 2: halve that hold-out into
# validation and test.
# NOTE(review): my_df.info() above reports 100 rows, so this leaves a single
# validation row and a single test row -- accuracy measured on them is not
# informative. Confirm whether a larger hold-out was intended.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x,
    y,
    test_size=.02,
    random_state=SEED,
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test,
    y_validation_and_test,
    test_size=.5,
    random_state=SEED,
)

In [7]:
#print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive",format(len(x_train),(len(x_train[y_train == 0]) / (len(x_train)*1.))*100,(len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
#print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive", format(len(x_validation),(len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,(len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
#print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive",format(len(x_test),(len(x_test[y_test == 0]) / (len(x_test)*1.))*100,(len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

In [8]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
import multiprocessing
from sklearn import utils
from sklearn.linear_model import LogisticRegression

In [9]:
def get_vectors(model, corpus, size):
    """Collect one trained document vector per row of ``corpus``.

    Parameters
    ----------
    model : object exposing ``docvecs``, indexable by tag string.
    corpus : object with ``len()`` and ``.index``; each index label ``i`` is
        looked up under the tag ``'all_<i>'``.
    size : int
        Number of columns in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)``; row ``k`` holds the vector of
        the ``k``-th label in ``corpus.index``.
    """
    vecs = np.zeros((len(corpus), size))
    for row, label in enumerate(corpus.index):
        vecs[row] = model.docvecs['all_' + str(label)]
    return vecs

In [10]:
def get_concat_vectors(model1,model2, corpus, size):
    """Build per-document features by joining the vectors of two models.

    Parameters
    ----------
    model1, model2 : objects exposing ``docvecs``, indexable by tag string.
    corpus : object with ``len()`` and ``.index``; each index label ``i`` is
        looked up under the tag ``'all_<i>'`` in both models.
    size : int
        Number of columns in the returned matrix; it has to equal the
        combined length of the two vectors being joined.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)``; each row is the ``model1``
        vector followed by the ``model2`` vector.
    """
    vecs = np.zeros((len(corpus), size))
    for row, label in enumerate(corpus.index):
        tag = 'all_' + str(label)
        first = np.ravel(model1.docvecs[tag])
        second = np.ravel(model2.docvecs[tag])
        vecs[row] = np.concatenate((first, second))
    return vecs


In [11]:
#phrase modeling

In [12]:
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser

In [13]:
# Whitespace-tokenize every training tweet.
tokenized_train = list(map(str.split, x_train))


In [14]:
# Learn phrase statistics from the training tweets only, then wrap them in a
# Phraser for applying the detected bigrams to token lists.
phrases = Phrases(sentences=tokenized_train)
bigram = Phraser(phrases_model=phrases)

In [15]:
# Quick check of the bigram transformer on a hand-written sentence.
sent = [
    'the', 'mayor', 'of', 'new', 'york', 'was', 'there',
]
print(bigram[sent])

['the', 'mayor', 'of', 'new', 'york', 'was', 'there']


In [16]:
#x_train[5]

In [17]:
# Apply the bigram transformer to the training tweet whose index label is 5.
bigram[x_train.loc[5].split()]

['will',
 'driverless_cars',
 'eventually',
 'replace',
 'taxi',
 'drivers',
 'in',
 'cities']

In [18]:
def labelize_tweets_bg(tweets,label):
    """Wrap each tweet as a tagged, bigram-merged document for Doc2Vec.

    Parameters
    ----------
    tweets : object with ``.index`` that iterates over tweet strings
        (the notebook passes a pandas Series).
    label : str
        Tag prefix; the tweet at index ``i`` gets the single tag
        ``'<label>_<i>'``.

    Returns
    -------
    list of TaggedDocument
        One document per tweet, with tokens obtained by whitespace-splitting
        the tweet and passing them through the module-level ``bigram``
        transformer.
    """
    # TaggedDocument is used instead of LabeledSentence, which gensim marks
    # as deprecated.
    return [
        TaggedDocument(bigram[t.split()], [label + '_%s' % i])
        for i, t in zip(tweets.index, tweets)
    ]

In [19]:
# Document vectors are trained on the text of all three splits (no labels
# are involved), so stack them before tagging.
all_x = pd.concat(objs=[x_train, x_validation, x_test])
all_x_w2v_bg = labelize_tweets_bg(tweets=all_x, label='all')

  """


In [20]:
#DBOW Bigram

In [21]:
cores = multiprocessing.cpu_count()

# dm=0 selects the DBOW training mode. alpha and min_alpha start out equal;
# the training cell below lowers them by hand after every pass.
model_bg_dbow = Doc2Vec(
    dm=0,
    size=100,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_bg_dbow.build_vocab(list(tqdm(all_x_w2v_bg)))

100%|██████████| 100/100 [00:00<00:00, 217885.92it/s]


In [22]:
# 30 single-epoch passes over freshly shuffled documents. After each pass the
# learning rate is lowered by 0.002 and min_alpha is set to the same value.
for epoch in range(30):
    model_bg_dbow.train(
        utils.shuffle(list(tqdm(all_x_w2v_bg))),
        total_examples=len(all_x_w2v_bg),
        epochs=1,
    )
    model_bg_dbow.alpha -= 0.002
    model_bg_dbow.min_alpha = model_bg_dbow.alpha

100%|██████████| 100/100 [00:00<00:00, 442437.13it/s]
100%|██████████| 100/100 [00:00<00:00, 316312.52it/s]
100%|██████████| 100/100 [00:00<00:00, 521031.55it/s]
100%|██████████| 100/100 [00:00<00:00, 352166.58it/s]
100%|██████████| 100/100 [00:00<00:00, 167170.35it/s]
100%|██████████| 100/100 [00:00<00:00, 515270.76it/s]
100%|██████████| 100/100 [00:00<00:00, 333145.67it/s]
100%|██████████| 100/100 [00:00<00:00, 283207.56it/s]
100%|██████████| 100/100 [00:00<00:00, 425385.80it/s]
100%|██████████| 100/100 [00:00<00:00, 328965.02it/s]
100%|██████████| 100/100 [00:00<00:00, 413231.92it/s]
100%|██████████| 100/100 [00:00<00:00, 84087.89it/s]
100%|██████████| 100/100 [00:00<00:00, 411609.81it/s]
100%|██████████| 100/100 [00:00<00:00, 34069.56it/s]
100%|██████████| 100/100 [00:00<00:00, 476625.45it/s]
100%|██████████| 100/100 [00:00<00:00, 294337.12it/s]
100%|██████████| 100/100 [00:00<00:00, 321402.61it/s]
100%|██████████| 100/100 [00:00<00:00, 393831.36it/s]
100%|██████████| 100/100 [00:0

In [23]:
# Look up the bigram DBOW document vectors for the train and validation rows.
train_vecs_dbow_bg = get_vectors(model=model_bg_dbow, corpus=x_train, size=100)
validation_vecs_dbow_bg = get_vectors(model=model_bg_dbow, corpus=x_validation, size=100)

In [24]:
# Default logistic regression fitted on the bigram DBOW vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dbow_bg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dbow_bg, y=y_validation)

1.0

In [26]:
from sklearn.neighbors import NearestCentroid
# Nearest-centroid classifier fitted on the same bigram DBOW vectors.
clf1 = NearestCentroid()
clf1.fit(X=train_vecs_dbow_bg, y=y_train)

NearestCentroid(metric='euclidean', shrink_threshold=None)

In [27]:
# Mean accuracy of the nearest-centroid classifier on the validation split.
clf1.score(X=validation_vecs_dbow_bg, y=y_validation)

1.0

In [28]:
# Persist the trained model, reload it from disk, then discard training-only
# state while keeping the document vectors and the ability to infer vectors.
model_bg_dbow.save('d2v_model_bg_dbow.doc2vec')
model_bg_dbow = Doc2Vec.load('d2v_model_bg_dbow.doc2vec')
model_bg_dbow.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True,
)

In [29]:
#DMC Bigram

In [30]:
cores = multiprocessing.cpu_count()

# dm=1 with dm_concat=1: distributed-memory mode with concatenated context
# vectors (DMC). alpha and min_alpha start out equal; the training cell below
# lowers them by hand after every pass.
model_bg_dmc = Doc2Vec(
    dm=1,
    dm_concat=1,
    size=100,
    window=2,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_bg_dmc.build_vocab(list(tqdm(all_x_w2v_bg)))

100%|██████████| 100/100 [00:00<00:00, 241607.37it/s]


In [31]:
# 30 single-epoch passes over freshly shuffled documents. After each pass the
# learning rate is lowered by 0.002 and min_alpha is set to the same value.
for epoch in range(30):
    model_bg_dmc.train(
        utils.shuffle(list(tqdm(all_x_w2v_bg))),
        total_examples=len(all_x_w2v_bg),
        epochs=1,
    )
    model_bg_dmc.alpha -= 0.002
    model_bg_dmc.min_alpha = model_bg_dmc.alpha

100%|██████████| 100/100 [00:00<00:00, 293103.00it/s]
100%|██████████| 100/100 [00:00<00:00, 304818.60it/s]
100%|██████████| 100/100 [00:00<00:00, 444311.86it/s]
100%|██████████| 100/100 [00:00<00:00, 432848.71it/s]
100%|██████████| 100/100 [00:00<00:00, 336891.89it/s]
100%|██████████| 100/100 [00:00<00:00, 288268.32it/s]
100%|██████████| 100/100 [00:00<00:00, 223220.01it/s]
100%|██████████| 100/100 [00:00<00:00, 320420.47it/s]
100%|██████████| 100/100 [00:00<00:00, 259067.57it/s]
100%|██████████| 100/100 [00:00<00:00, 253738.90it/s]
100%|██████████| 100/100 [00:00<00:00, 173032.34it/s]
100%|██████████| 100/100 [00:00<00:00, 221335.30it/s]
100%|██████████| 100/100 [00:00<00:00, 161257.36it/s]
100%|██████████| 100/100 [00:00<00:00, 146193.94it/s]
100%|██████████| 100/100 [00:00<00:00, 115354.90it/s]
100%|██████████| 100/100 [00:00<00:00, 167772.16it/s]
100%|██████████| 100/100 [00:00<00:00, 90336.08it/s]
100%|██████████| 100/100 [00:00<00:00, 216424.36it/s]
100%|██████████| 100/100 [00:

In [32]:
# Words closest to 'autonomous' in the bigram DMC model. The query goes
# through the word vectors (`.wv`); calling most_similar directly on the
# model is deprecated in gensim.
model_bg_dmc.wv.most_similar('autonomous')

  """Entry point for launching an IPython kernel.


[('of', 0.997575044631958),
 ('car', 0.9975479245185852),
 ('it', 0.997412383556366),
 ('and', 0.9966800212860107),
 ('the', 0.9966778755187988),
 ('if', 0.9964960813522339),
 ('be', 0.9963785409927368),
 ('ready', 0.9962443113327026),
 ('autonomous_car', 0.9962088465690613),
 ('just', 0.9960346221923828)]

In [33]:
# Analogy-style query: words near 'uber' + 'audi' - 'car', top 50 matches.
# The query goes through the word vectors (`.wv`); calling most_similar
# directly on the model is deprecated in gensim.
model_bg_dmc.wv.most_similar(
    positive=['uber', 'audi'],
    negative=['car'],
    topn=50,
)

  """Entry point for launching an IPython kernel.


[('gets', 0.9906270503997803),
 ('the', 0.990483283996582),
 ('model', 0.9902991056442261),
 ('road', 0.9895973801612854),
 ('is', 0.9895446300506592),
 ('autonomous', 0.9888527989387512),
 ('autonomous_car', 0.9887654781341553),
 ('are', 0.9887272119522095),
 ('go', 0.988717794418335),
 ('product', 0.9886081218719482),
 ('be', 0.9885661602020264),
 ('been', 0.9885568022727966),
 ('your', 0.9885543584823608),
 ('california', 0.9884355068206787),
 ('and', 0.9884284734725952),
 ('of', 0.9884078502655029),
 ('with', 0.9883712530136108),
 ('future', 0.9883430004119873),
 ('up', 0.9883389472961426),
 ('need', 0.9882744550704956),
 ('audi_gets', 0.9882411360740662),
 ('technology', 0.9881616830825806),
 ('it', 0.9881564378738403),
 ('in', 0.9881138801574707),
 ('just', 0.9881059527397156),
 ('vs', 0.9879671931266785),
 ('year', 0.9879598617553711),
 ('america', 0.9879104495048523),
 ('but', 0.9877904653549194),
 ('ready', 0.987625002861023),
 ('automated_vehicle', 0.987592339515686),
 ('goog

In [34]:
# Look up the bigram DMC document vectors for the train and validation rows.
train_vecs_dmc_bg = get_vectors(model=model_bg_dmc, corpus=x_train, size=100)
validation_vecs_dmc_bg = get_vectors(model=model_bg_dmc, corpus=x_validation, size=100)

In [35]:
# Default logistic regression fitted on the bigram DMC vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dmc_bg, y=y_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [36]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dmc_bg, y=y_validation)

1.0

In [37]:
# Persist the trained model, reload it from disk, then discard training-only
# state while keeping the document vectors and the ability to infer vectors.
model_bg_dmc.save('d2v_model_bg_dmc.doc2vec')
model_bg_dmc = Doc2Vec.load('d2v_model_bg_dmc.doc2vec')
model_bg_dmc.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True,
)

In [38]:
#DMM Bigram

In [39]:
cores = multiprocessing.cpu_count()

# dm=1 with dm_mean=1: distributed-memory mode with averaged context vectors
# (DMM). alpha and min_alpha start out equal; the training cell below lowers
# them by hand after every pass.
model_bg_dmm = Doc2Vec(
    dm=1,
    dm_mean=1,
    size=100,
    window=4,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_bg_dmm.build_vocab(list(tqdm(all_x_w2v_bg)))

100%|██████████| 100/100 [00:00<00:00, 171897.70it/s]


In [40]:
# 30 single-epoch passes over freshly shuffled documents. After each pass the
# learning rate is lowered by 0.002 and min_alpha is set to the same value.
for epoch in range(30):
    model_bg_dmm.train(
        utils.shuffle(list(tqdm(all_x_w2v_bg))),
        total_examples=len(all_x_w2v_bg),
        epochs=1,
    )
    model_bg_dmm.alpha -= 0.002
    model_bg_dmm.min_alpha = model_bg_dmm.alpha

100%|██████████| 100/100 [00:00<00:00, 177349.01it/s]
100%|██████████| 100/100 [00:00<00:00, 261816.73it/s]
100%|██████████| 100/100 [00:00<00:00, 313475.64it/s]
100%|██████████| 100/100 [00:00<00:00, 315598.50it/s]
100%|██████████| 100/100 [00:00<00:00, 337705.64it/s]
100%|██████████| 100/100 [00:00<00:00, 314651.46it/s]
100%|██████████| 100/100 [00:00<00:00, 225258.00it/s]
100%|██████████| 100/100 [00:00<00:00, 245137.58it/s]
100%|██████████| 100/100 [00:00<00:00, 480447.19it/s]
100%|██████████| 100/100 [00:00<00:00, 364722.09it/s]
100%|██████████| 100/100 [00:00<00:00, 483214.75it/s]
100%|██████████| 100/100 [00:00<00:00, 485451.85it/s]
100%|██████████| 100/100 [00:00<00:00, 264624.86it/s]
100%|██████████| 100/100 [00:00<00:00, 546845.37it/s]
100%|██████████| 100/100 [00:00<00:00, 286496.17it/s]
100%|██████████| 100/100 [00:00<00:00, 219367.36it/s]
100%|██████████| 100/100 [00:00<00:00, 331303.63it/s]
100%|██████████| 100/100 [00:00<00:00, 323634.57it/s]
100%|██████████| 100/100 [00

In [41]:
# Look up the bigram DMM document vectors for the train and validation rows.
train_vecs_dmm_bg = get_vectors(model=model_bg_dmm, corpus=x_train, size=100)
validation_vecs_dmm_bg = get_vectors(model=model_bg_dmm, corpus=x_validation, size=100)

In [42]:
# Default logistic regression fitted on the bigram DMM vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dmm_bg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dmm_bg, y=y_validation)

1.0

In [44]:
# Persist the trained model, reload it from disk, then discard training-only
# state while keeping the document vectors and the ability to infer vectors.
model_bg_dmm.save('d2v_model_bg_dmm.doc2vec')
model_bg_dmm = Doc2Vec.load('d2v_model_bg_dmm.doc2vec')
model_bg_dmm.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True,
)

In [45]:
# Bigram DBOW + DMC vectors joined per document (100 + 100 = 200 columns).
train_vecs_dbow_dmc_bg = get_concat_vectors(
    model1=model_bg_dbow, model2=model_bg_dmc, corpus=x_train, size=200
)
validation_vecs_dbow_dmc_bg = get_concat_vectors(
    model1=model_bg_dbow, model2=model_bg_dmc, corpus=x_validation, size=200
)

In [46]:
# Default logistic regression fitted on the joined bigram DBOW + DMC vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dbow_dmc_bg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [47]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dbow_dmc_bg, y=y_validation)

1.0

In [48]:
# Bigram DBOW + DMM vectors joined per document (100 + 100 = 200 columns).
train_vecs_dbow_dmm_bg = get_concat_vectors(
    model1=model_bg_dbow, model2=model_bg_dmm, corpus=x_train, size=200
)
validation_vecs_dbow_dmm_bg = get_concat_vectors(
    model1=model_bg_dbow, model2=model_bg_dmm, corpus=x_validation, size=200
)

In [49]:
# Default logistic regression fitted on the joined bigram DBOW + DMM vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dbow_dmm_bg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [50]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dbow_dmm_bg, y=y_validation)

1.0

In [51]:
#trigram

In [52]:
# Run phrase detection again on the bigram-merged training tweets so that a
# detected bigram can be joined with a further token.
tg_phrases = Phrases(sentences=bigram[tokenized_train])
trigram = Phraser(phrases_model=tg_phrases)

In [53]:
# Apply bigram then trigram merging to the training tweet with index label 5.
trigram[bigram[x_train.loc[5].split()]]

['will',
 'driverless_cars',
 'eventually',
 'replace',
 'taxi',
 'drivers',
 'in',
 'cities']

In [54]:
def labelize_tweets_tg(tweets,label):
    """Wrap each tweet as a tagged, trigram-merged document for Doc2Vec.

    Parameters
    ----------
    tweets : object with ``.index`` that iterates over tweet strings
        (the notebook passes a pandas Series).
    label : str
        Tag prefix; the tweet at index ``i`` gets the single tag
        ``'<label>_<i>'``.

    Returns
    -------
    list of TaggedDocument
        One document per tweet, with tokens obtained by whitespace-splitting
        the tweet and passing them through the module-level ``bigram`` and
        then ``trigram`` transformers.
    """
    # TaggedDocument is used instead of LabeledSentence, which gensim marks
    # as deprecated.
    return [
        TaggedDocument(trigram[bigram[t.split()]], [label + '_%s' % i])
        for i, t in zip(tweets.index, tweets)
    ]

In [55]:
# Stack the text of all three splits and tag it with trigram-merged tokens.
all_x = pd.concat(objs=[x_train, x_validation, x_test])
all_x_w2v_tg = labelize_tweets_tg(tweets=all_x, label='all')

  """


In [56]:
# Trigram DBOW model (dm=0). `cores` is reused from an earlier cell. alpha
# and min_alpha start out equal; the training cell below lowers them by hand.
model_tg_dbow = Doc2Vec(
    dm=0,
    size=100,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_tg_dbow.build_vocab(list(tqdm(all_x_w2v_tg)))

100%|██████████| 100/100 [00:00<00:00, 300236.51it/s]


In [57]:
# 30 single-epoch passes over freshly shuffled documents. After each pass the
# learning rate is lowered by 0.002 and min_alpha is set to the same value.
for epoch in range(30):
    model_tg_dbow.train(
        utils.shuffle(list(tqdm(all_x_w2v_tg))),
        total_examples=len(all_x_w2v_tg),
        epochs=1,
    )
    model_tg_dbow.alpha -= 0.002
    model_tg_dbow.min_alpha = model_tg_dbow.alpha

100%|██████████| 100/100 [00:00<00:00, 108464.03it/s]
100%|██████████| 100/100 [00:00<00:00, 227827.49it/s]
100%|██████████| 100/100 [00:00<00:00, 157977.55it/s]
100%|██████████| 100/100 [00:00<00:00, 249958.52it/s]
100%|██████████| 100/100 [00:00<00:00, 273244.56it/s]
100%|██████████| 100/100 [00:00<00:00, 280180.63it/s]
100%|██████████| 100/100 [00:00<00:00, 330000.31it/s]
100%|██████████| 100/100 [00:00<00:00, 229950.88it/s]
100%|██████████| 100/100 [00:00<00:00, 120803.69it/s]
100%|██████████| 100/100 [00:00<00:00, 225258.00it/s]
100%|██████████| 100/100 [00:00<00:00, 263958.72it/s]
100%|██████████| 100/100 [00:00<00:00, 302183.29it/s]
100%|██████████| 100/100 [00:00<00:00, 79167.69it/s]
100%|██████████| 100/100 [00:00<00:00, 81904.00it/s]
100%|██████████| 100/100 [00:00<00:00, 166705.25it/s]
100%|██████████| 100/100 [00:00<00:00, 101655.45it/s]
100%|██████████| 100/100 [00:00<00:00, 159783.01it/s]
100%|██████████| 100/100 [00:00<00:00, 298739.60it/s]
100%|██████████| 100/100 [00:0

In [58]:
# Look up the trigram DBOW document vectors for the train and validation rows.
train_vecs_dbow_tg = get_vectors(model=model_tg_dbow, corpus=x_train, size=100)
validation_vecs_dbow_tg = get_vectors(model=model_tg_dbow, corpus=x_validation, size=100)

In [59]:
# Default logistic regression fitted on the trigram DBOW vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dbow_tg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [60]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dbow_tg, y=y_validation)

1.0

In [61]:
# Persist the trained model, reload it from disk, then discard training-only
# state while keeping the document vectors and the ability to infer vectors.
model_tg_dbow.save('d2v_model_tg_dbow.doc2vec')
model_tg_dbow = Doc2Vec.load('d2v_model_tg_dbow.doc2vec')
model_tg_dbow.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True,
)

In [62]:
#DMC Trigram

In [63]:
cores = multiprocessing.cpu_count()

# dm=1 with dm_concat=1: distributed-memory mode with concatenated context
# vectors (DMC), trained on the trigram-merged documents. alpha and min_alpha
# start out equal; the training cell below lowers them by hand.
model_tg_dmc = Doc2Vec(
    dm=1,
    dm_concat=1,
    size=100,
    window=2,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_tg_dmc.build_vocab(list(tqdm(all_x_w2v_tg)))


100%|██████████| 100/100 [00:00<00:00, 186911.94it/s]


In [64]:
# 30 single-epoch passes over freshly shuffled documents. After each pass the
# learning rate is lowered by 0.002 and min_alpha is set to the same value.
for epoch in range(30):
    model_tg_dmc.train(
        utils.shuffle(list(tqdm(all_x_w2v_tg))),
        total_examples=len(all_x_w2v_tg),
        epochs=1,
    )
    model_tg_dmc.alpha -= 0.002
    model_tg_dmc.min_alpha = model_tg_dmc.alpha

100%|██████████| 100/100 [00:00<00:00, 332353.72it/s]
100%|██████████| 100/100 [00:00<00:00, 425385.80it/s]
100%|██████████| 100/100 [00:00<00:00, 72779.87it/s]
100%|██████████| 100/100 [00:00<00:00, 135125.77it/s]
100%|██████████| 100/100 [00:00<00:00, 297679.49it/s]
100%|██████████| 100/100 [00:00<00:00, 327936.20it/s]
100%|██████████| 100/100 [00:00<00:00, 309542.73it/s]
100%|██████████| 100/100 [00:00<00:00, 268006.65it/s]
100%|██████████| 100/100 [00:00<00:00, 240361.26it/s]
100%|██████████| 100/100 [00:00<00:00, 303714.99it/s]
100%|██████████| 100/100 [00:00<00:00, 243148.06it/s]
100%|██████████| 100/100 [00:00<00:00, 57440.48it/s]
100%|██████████| 100/100 [00:00<00:00, 302183.29it/s]
100%|██████████| 100/100 [00:00<00:00, 276304.61it/s]
100%|██████████| 100/100 [00:00<00:00, 231985.84it/s]
100%|██████████| 100/100 [00:00<00:00, 290665.56it/s]
100%|██████████| 100/100 [00:00<00:00, 247451.56it/s]
100%|██████████| 100/100 [00:00<00:00, 265294.37it/s]
100%|██████████| 100/100 [00:0

In [65]:
# Look up the trigram DMC document vectors for the train and validation rows.
train_vecs_dmc_tg = get_vectors(model=model_tg_dmc, corpus=x_train, size=100)
validation_vecs_dmc_tg = get_vectors(model=model_tg_dmc, corpus=x_validation, size=100)

In [66]:
# Default logistic regression fitted on the trigram DMC vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dmc_tg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [67]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dmc_tg, y=y_validation)


1.0

In [68]:
# Persist the trained model, reload it from disk, then discard training-only
# state while keeping the document vectors and the ability to infer vectors.
model_tg_dmc.save('d2v_model_tg_dmc.doc2vec')
model_tg_dmc = Doc2Vec.load('d2v_model_tg_dmc.doc2vec')
model_tg_dmc.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True,
)

In [69]:
#DMM trigram

In [70]:
cores = multiprocessing.cpu_count()

# dm=1 with dm_mean=1: distributed-memory mode with averaged context vectors
# (DMM), trained on the trigram-merged documents. alpha and min_alpha start
# out equal; the training cell below lowers the learning rate by hand.
model_tg_dmm = Doc2Vec(
    dm=1,
    dm_mean=1,
    size=100,
    window=4,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_tg_dmm.build_vocab(list(tqdm(all_x_w2v_tg)))

100%|██████████| 100/100 [00:00<00:00, 274676.10it/s]


In [71]:
# 30 single-epoch passes over freshly shuffled documents, lowering the
# learning rate by 0.002 after each pass (same schedule as the other models).
for epoch in range(30):
    model_tg_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v_tg)]), total_examples=len(all_x_w2v_tg), epochs=1)
    model_tg_dmm.alpha -= 0.002
    # Pin min_alpha on the model being trained (model_tg_dmm), not on
    # model_tg_dmc; otherwise this model's min_alpha stays at its initial
    # 0.065 while alpha drops below it.
    model_tg_dmm.min_alpha = model_tg_dmm.alpha

100%|██████████| 100/100 [00:00<00:00, 122211.66it/s]
100%|██████████| 100/100 [00:00<00:00, 252668.92it/s]
100%|██████████| 100/100 [00:00<00:00, 268865.64it/s]
100%|██████████| 100/100 [00:00<00:00, 287478.00it/s]
100%|██████████| 100/100 [00:00<00:00, 221218.57it/s]
100%|██████████| 100/100 [00:00<00:00, 192664.40it/s]
100%|██████████| 100/100 [00:00<00:00, 316312.52it/s]
100%|██████████| 100/100 [00:00<00:00, 105916.77it/s]
100%|██████████| 100/100 [00:00<00:00, 87728.59it/s]
100%|██████████| 100/100 [00:00<00:00, 317269.59it/s]
100%|██████████| 100/100 [00:00<00:00, 201940.49it/s]
100%|██████████| 100/100 [00:00<00:00, 89221.53it/s]
100%|██████████| 100/100 [00:00<00:00, 326659.19it/s]
100%|██████████| 100/100 [00:00<00:00, 308631.64it/s]
100%|██████████| 100/100 [00:00<00:00, 273422.69it/s]
100%|██████████| 100/100 [00:00<00:00, 303935.07it/s]
100%|██████████| 100/100 [00:00<00:00, 106158.04it/s]
100%|██████████| 100/100 [00:00<00:00, 102450.02it/s]
100%|██████████| 100/100 [00:0

In [72]:
# Look up the trigram DMM document vectors for the train and validation rows.
train_vecs_dmm_tg = get_vectors(model=model_tg_dmm, corpus=x_train, size=100)
validation_vecs_dmm_tg = get_vectors(model=model_tg_dmm, corpus=x_validation, size=100)

In [73]:
# Default logistic regression fitted on the trigram DMM vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dmm_tg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [74]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dmm_tg, y=y_validation)


1.0

In [75]:
# Persist the trained model, reload it from disk, then discard training-only
# state while keeping the document vectors and the ability to infer vectors.
model_tg_dmm.save('d2v_model_tg_dmm.doc2vec')
model_tg_dmm = Doc2Vec.load('d2v_model_tg_dmm.doc2vec')
model_tg_dmm.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True,
)

In [76]:
# Trigram DBOW + DMC vectors joined per document (100 + 100 = 200 columns).
train_vecs_dbow_dmc_tg = get_concat_vectors(
    model1=model_tg_dbow, model2=model_tg_dmc, corpus=x_train, size=200
)
validation_vecs_dbow_dmc_tg = get_concat_vectors(
    model1=model_tg_dbow, model2=model_tg_dmc, corpus=x_validation, size=200
)

In [77]:
# Default logistic regression fitted on the joined trigram DBOW + DMC vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dbow_dmc_tg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [78]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dbow_dmc_tg, y=y_validation)

1.0

In [79]:
# Trigram DBOW + DMM vectors joined per document (100 + 100 = 200 columns).
train_vecs_dbow_dmm_tg = get_concat_vectors(
    model1=model_tg_dbow, model2=model_tg_dmm, corpus=x_train, size=200
)
validation_vecs_dbow_dmm_tg = get_concat_vectors(
    model1=model_tg_dbow, model2=model_tg_dmm, corpus=x_validation, size=200
)

In [80]:
# Default logistic regression fitted on the joined trigram DBOW + DMM vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_dbow_dmm_tg, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [81]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_dbow_dmm_tg, y=y_validation)

1.0

In [82]:
# Load the unigram DBOW model from disk. It is not trained anywhere in this
# notebook, so 'd2v_model_ug_dbow.doc2vec' has to exist already.
# NOTE(review): presumably it was saved by an earlier unigram notebook using
# the same 'all_<index>' document tags -- confirm.
# Do NOT save model_tg_dmm under this filename first: that overwrites the
# unigram model on disk and turns model_ug_dbow into a copy of the trigram
# DMM model, so the "ug DBOW + tg DMM" features below would just be the
# trigram DMM vectors repeated twice.
model_ug_dbow = Doc2Vec.load('d2v_model_ug_dbow.doc2vec')
model_tg_dmm = Doc2Vec.load('d2v_model_tg_dmm.doc2vec')
model_ug_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_tg_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [83]:
# Unigram DBOW + trigram DMM vectors joined per document (200 columns).
train_vecs_ugdbow_tgdmm = get_concat_vectors(
    model1=model_ug_dbow, model2=model_tg_dmm, corpus=x_train, size=200
)
validation_vecs_ugdbow_tgdmm = get_concat_vectors(
    model1=model_ug_dbow, model2=model_tg_dmm, corpus=x_validation, size=200
)

In [84]:
# Default logistic regression fitted on the joined unigram DBOW + trigram
# DMM vectors.
clf = LogisticRegression()
clf.fit(X=train_vecs_ugdbow_tgdmm, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [85]:
# Mean accuracy of the fitted classifier on the validation split.
clf.score(X=validation_vecs_ugdbow_tgdmm, y=y_validation)

1.0