In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')
SEED = 2000

In [12]:
data = pd.read_csv("../data/clean_tweet.csv", index_col=0)
data.head()

Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [13]:
x = data.text
y = data.target

In [14]:
x_train, x_validation_test, y_train, y_validation_test = train_test_split(x,y, test_size=0.02, random_state=SEED)

In [15]:
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_test, y_validation_test, test_size=0.5, random_state=SEED)

In [16]:
print(f"Train set has total {len(x_train)} with {len(x_train[y_train==0])*100/(len(x_train)*1.0)}% negative and {len(x_train[y_train==4])*100/(len(x_train)*1.0)}% positive")

Train set has total 1564120 with 50.020139119760636% negative and 0.0% positive


In [17]:
print(f"Validation set has total {len(x_validation)} with {len(x_validation[y_validation==0])*100/(len(x_validation)*1.0)}% negative and {len(x_validation[y_validation==4])*100/(len(x_validation)*1.0)}% positive")

Validation set has total 15960 with 49.454887218045116% negative and 0.0% positive


In [18]:
print(f"Test set has total {len(x_test)} with {len(x_test[y_test==0])*100/(len(x_test)*1.0)}% negative and {len(x_test[y_test==4])*100/(len(x_test)*1.0)}% positive")

Test set has total 15961 with 49.67733851262452% negative and 0.0% positive


### Doc2Vec model

In [21]:
def label_tweets_ug(tweets, label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [24]:
all_data = pd.concat([x_train, x_validation, x_test])
all_data_w2v = label_tweets_ug(all_data, 'doc')

In [26]:
all_data_w2v[:10]

[TaggedDocument(words=['your', 'not', 'pregnant', 'oh', 'no', 'what', 'shame'], tags=['doc_288048']),
 TaggedDocument(words=['cleaning', 'the', 'bathroom'], tags=['doc_357753']),
 TaggedDocument(words=['feeling', 'left', 'out', 'you', 'never', 'recommend', 'anything', 'to', 'me'], tags=['doc_420123']),
 TaggedDocument(words=['home', 'sick', 'what', 'the', 'hell', 'wonder', 'if', 'it', 'll', 'mutate', 'into', 'swine', 'flu'], tags=['doc_348643']),
 TaggedDocument(words=['your', 'tweet', 'reminded', 'me', 'that', 'game', 'was', 'the', 'shit'], tags=['doc_1195630']),
 TaggedDocument(words=['grumpy', 'cause', 'can', 'not', 'go', 'to', 'move', 'marathong', 'and', 'have', 'to', 'baby', 'sit', 'cleo', 'and', 'aleisha', 'ing', 'hell'], tags=['doc_424869']),
 TaggedDocument(words=['its', 'some', 'special', 'performance', 'so', 'guess', 'we', 'will', 'not', 'be', 'able', 'to', 'see', 'them', 'perform', 'beatfreaks'], tags=['doc_675535']),
 TaggedDocument(words=['back', 'at', 'work', 'have', 'not

In [27]:
from sklearn.linear_model import LogisticRegression

In [29]:
cores = multiprocessing.cpu_count()

In [34]:
model_ug_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_data_w2v)])

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 2448648.27it/s]


In [37]:
for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_data_w2v)]), total_examples=len(all_data_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3922507.74it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3925903.10it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3856719.25it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3975301.88it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3911693.83it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 4023467.33it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3997728.99it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3951447.26it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3932928.67it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3854463.08it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3927285.01it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3880443.51it/s]
100%|███████████████████████

In [40]:
def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'doc_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

In [41]:
train_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)

In [43]:
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)
clf.score(validation_vecs_dbow, y_validation)

0.7425438596491228

In [44]:
model_ug_dmc = Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=2, negative=5, min_count=2, workers=cores,
                      alpha=0.065, min_alpha=0.065)
model_ug_dmc.build_vocab([x for x in tqdm(all_data_w2v)])

100%|█████████████████████████████| 1596041/1596041 [00:03<00:00, 428314.59it/s]


In [51]:
for epoch in range(30):
    model_ug_dmc.train(utils.shuffle([x for x in tqdm(all_data_w2v)]), total_examples=len(all_data_w2v), epochs=1)
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3748590.22it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 2570420.04it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3463299.65it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3723728.71it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3782222.91it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3777487.00it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3642390.97it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3556890.40it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3622820.95it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3633088.23it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3464278.21it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3308955.23it/s]
100%|███████████████████████

In [52]:
train_vecs_dmc = get_vectors(model_ug_dmc, x_train, 100)
validation_vecs_dmc = get_vectors(model_ug_dmc, x_validation, 100)

In [53]:
clf = LogisticRegression()
clf.fit(train_vecs_dmc, y_train)
clf.score(validation_vecs_dmc, y_validation)

0.6671679197994987

In [58]:
model_ug_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=100, window=4, negative=5, min_count=2, workers=cores,
                      alpha=0.065, min_alpha=0.065)
model_ug_dmm.build_vocab([x for x in tqdm(all_data_w2v)])

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3532870.65it/s]


In [66]:
for epoch in range(30):
    starttime = time.time()
    model_ug_dmm.train(utils.shuffle([x for x in tqdm(all_data_w2v)]), total_examples=len(all_data_w2v), epochs=1)
    model_ug_dmm.alpha -= 0.002
    model_ug_dmm.min_alpha = model_ug_dmm.alpha
    print(f"Total time taken for epoch={epoch} is {time.time()-starttime}")

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 1871262.64it/s]


Total time taken for epoch=0 is 129.57846403121948


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3936274.98it/s]


Total time taken for epoch=1 is 124.09920716285706


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3910254.35it/s]


Total time taken for epoch=2 is 124.15073609352112


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3864076.76it/s]


Total time taken for epoch=3 is 122.7156081199646


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3759902.69it/s]


Total time taken for epoch=4 is 123.7919671535492


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3871977.44it/s]


Total time taken for epoch=5 is 124.1875


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3709326.65it/s]


Total time taken for epoch=6 is 130.74987506866455


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3353492.09it/s]


Total time taken for epoch=7 is 137.87564396858215


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3585618.25it/s]


Total time taken for epoch=8 is 136.68061089515686


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3608699.08it/s]


Total time taken for epoch=9 is 136.7154939174652


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3586682.55it/s]


Total time taken for epoch=10 is 136.90621399879456


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3215399.24it/s]


Total time taken for epoch=11 is 159.64188885688782


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 2777854.10it/s]


Total time taken for epoch=12 is 159.18491005897522


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3182730.28it/s]


Total time taken for epoch=13 is 167.84329199790955


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3640396.34it/s]


Total time taken for epoch=14 is 125.53525400161743


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3962777.94it/s]


Total time taken for epoch=15 is 122.76637077331543


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3829180.83it/s]


Total time taken for epoch=16 is 123.44796895980835


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 2545900.98it/s]


Total time taken for epoch=17 is 123.48661279678345


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3961579.59it/s]


Total time taken for epoch=18 is 121.94313502311707


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3912039.01it/s]


Total time taken for epoch=19 is 122.63728523254395


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3941990.95it/s]


Total time taken for epoch=20 is 122.76999998092651


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3963585.07it/s]


Total time taken for epoch=21 is 121.97288608551025


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3891589.51it/s]


Total time taken for epoch=22 is 121.68202805519104


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3763652.73it/s]


Total time taken for epoch=23 is 121.571368932724


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3812389.32it/s]


Total time taken for epoch=24 is 121.98173594474792


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3845771.81it/s]


Total time taken for epoch=25 is 122.5416009426117


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3894369.60it/s]


Total time taken for epoch=26 is 122.02499318122864


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3864833.03it/s]


Total time taken for epoch=27 is 124.39642691612244


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3686637.49it/s]


Total time taken for epoch=28 is 122.31435203552246


100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3876540.30it/s]


Total time taken for epoch=29 is 122.42763805389404


In [67]:
train_vecs_dmm = get_vectors(model_ug_dmm, x_train, 100)
validation_vecs_dmm = get_vectors(model_ug_dmm, x_validation, 100)

In [68]:
clf = LogisticRegression()
clf.fit(train_vecs_dmm, y_train)
clf.score(validation_vecs_dmm, y_validation)

0.6287593984962406

In [65]:
def get_concat_vectors(model1,model2, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'doc_' + str(i)
        vecs[n] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])
        n += 1
    return vecs

train_vecs_dbow_dmc = get_concat_vectors(model_ug_dbow,model_ug_dmc, x_train, 200)
validation_vecs_dbow_dmc = get_concat_vectors(model_ug_dbow,model_ug_dmc, x_validation, 200)

clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmc, y_train)
clf.score(validation_vecs_dbow_dmc, y_validation)

0.7515037593984962

In [69]:
train_vecs_dbow_dmm = get_concat_vectors(model_ug_dbow,model_ug_dmm, x_train, 200)
validation_vecs_dbow_dmm = get_concat_vectors(model_ug_dbow,model_ug_dmm, x_validation, 200)

clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmm, y_train)
clf.score(validation_vecs_dbow_dmm, y_validation)

0.7432957393483709

#### Bigrams

In [72]:
def label_tweets_bg(tweets, label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(bigram[t.split()], [prefix + '_%s' % i]))
    return result

In [71]:
tokenized_train = [t.split() for t in x_train]
phrases = Phrases(tokenized_train)
bigram = Phraser(phrases)

In [73]:
all_data_w2v_bg = label_tweets_bg(all_data, 'doc')

In [None]:
cores = multiprocessing.cpu_count()
model_bg_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_bg_dbow.build_vocab([x for x in tqdm(all_data_w2v_bg)])

for epoch in range(30):
    model_bg_dbow.train(utils.shuffle([x for x in tqdm(all_data_w2v_bg)]), total_examples=len(all_data_w2v_bg), epochs=1)
    model_bg_dbow.alpha -= 0.002
    model_bg_dbow.min_alpha = model_bg_dbow.alpha
    
train_vecs_dbow_bg = get_vectors(model_bg_dbow, x_train, 100)
validation_vecs_dbow_bg = get_vectors(model_bg_dbow, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dbow_bg, y_train)
clf.score(validation_vecs_dbow_bg, y_validation)

In [None]:
cores = multiprocessing.cpu_count()
model_bg_dmc = Doc2Vec(dm=1, dm_concat=1, vector_size==100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_bg_dmc.build_vocab([x for x in tqdm(all_data_w2v_bg)])

for epoch in range(30):
    model_bg_dmc.train(utils.shuffle([x for x in tqdm(all_data_w2v_bgb)]), total_examples=len(all_data_w2v_bg), epochs=1)
    model_bg_dmc.alpha -= 0.002
    model_bg_dmc.min_alpha = model_bg_dmc.alpha
    
train_vecs_dmc_bg = get_vectors(model_bg_dmc, x_train, 100)
validation_vecs_dmc_bg = get_vectors(model_bg_dmc, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dmc_bg, y_train)
clf.score(validation_vecs_dmc_bg, y_validation)

In [None]:
cores = multiprocessing.cpu_count()
model_bg_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=100, window=4, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_bg_dmm.build_vocab([x for x in tqdm(all_data_w2v_bg)])

for epoch in range(30):
    model_bg_dmm.train(utils.shuffle([x for x in tqdm(all_data_w2v_bg)]), total_examples=len(all_data_w2v_bgb), epochs=1)
    model_bg_dmm.alpha -= 0.002
    model_bg_dmm.min_alpha = model_bg_dms.alpha
    
train_vecs_dmm_bg = get_vectors(model_bg_dmm, x_train, 100)
validation_vecs_dmm_bg = get_vectors(model_bg_dmm, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dmm_bg, y_train)
clf.score(validation_vecs_dmm_bg, y_validation)

Do the combined modelling and see the result (dbow+dmm and dbow+dmc)

In [74]:
tg_phrases = Phrases(bigram[tokenized_train])
trigram = Phraser(tg_phrases)

Do the same as we did above and see the result.