In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')
SEED = 2000

In [12]:
data = pd.read_csv("../data/clean_tweet.csv", index_col=0)
data.head()

Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [13]:
x = data.text
y = data.target

In [14]:
x_train, x_validation_test, y_train, y_validation_test = train_test_split(x,y, test_size=0.02, random_state=SEED)

In [15]:
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_test, y_validation_test, test_size=0.5, random_state=SEED)

In [16]:
print(f"Train set has total {len(x_train)} with {len(x_train[y_train==0])*100/(len(x_train)*1.0)}% negative and {len(x_train[y_train==4])*100/(len(x_train)*1.0)}% positive")

Train set has total 1564120 with 50.020139119760636% negative and 0.0% positive


In [17]:
print(f"Validation set has total {len(x_validation)} with {len(x_validation[y_validation==0])*100/(len(x_validation)*1.0)}% negative and {len(x_validation[y_validation==4])*100/(len(x_validation)*1.0)}% positive")

Validation set has total 15960 with 49.454887218045116% negative and 0.0% positive


In [18]:
print(f"Test set has total {len(x_test)} with {len(x_test[y_test==0])*100/(len(x_test)*1.0)}% negative and {len(x_test[y_test==4])*100/(len(x_test)*1.0)}% positive")

Test set has total 15961 with 49.67733851262452% negative and 0.0% positive


### Doc2Vec model

In [21]:
def label_tweets_ug(tweets, label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [24]:
all_data = pd.concat([x_train, x_validation, x_test])
all_data_w2v = label_tweets_ug(all_data, 'doc')

In [26]:
all_data_w2v[:10]

[TaggedDocument(words=['your', 'not', 'pregnant', 'oh', 'no', 'what', 'shame'], tags=['doc_288048']),
 TaggedDocument(words=['cleaning', 'the', 'bathroom'], tags=['doc_357753']),
 TaggedDocument(words=['feeling', 'left', 'out', 'you', 'never', 'recommend', 'anything', 'to', 'me'], tags=['doc_420123']),
 TaggedDocument(words=['home', 'sick', 'what', 'the', 'hell', 'wonder', 'if', 'it', 'll', 'mutate', 'into', 'swine', 'flu'], tags=['doc_348643']),
 TaggedDocument(words=['your', 'tweet', 'reminded', 'me', 'that', 'game', 'was', 'the', 'shit'], tags=['doc_1195630']),
 TaggedDocument(words=['grumpy', 'cause', 'can', 'not', 'go', 'to', 'move', 'marathong', 'and', 'have', 'to', 'baby', 'sit', 'cleo', 'and', 'aleisha', 'ing', 'hell'], tags=['doc_424869']),
 TaggedDocument(words=['its', 'some', 'special', 'performance', 'so', 'guess', 'we', 'will', 'not', 'be', 'able', 'to', 'see', 'them', 'perform', 'beatfreaks'], tags=['doc_675535']),
 TaggedDocument(words=['back', 'at', 'work', 'have', 'not

In [27]:
from sklearn.linear_model import LogisticRegression

In [29]:
cores = multiprocessing.cpu_count()

In [34]:
model_ug_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_data_w2v)])

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 2448648.27it/s]


In [37]:
for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_data_w2v)]), total_examples=len(all_data_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3922507.74it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3925903.10it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3856719.25it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3975301.88it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3911693.83it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 4023467.33it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3997728.99it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3951447.26it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3932928.67it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3854463.08it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3927285.01it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3880443.51it/s]
100%|███████████████████████

In [40]:
def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'doc_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

In [41]:
train_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)

In [43]:
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)
clf.score(validation_vecs_dbow, y_validation)

0.7425438596491228

In [44]:
model_ug_dmc = Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=2, negative=5, min_count=2, workers=cores,
                      alpha=0.065, min_alpha=0.065)
model_ug_dmc.build_vocab([x for x in tqdm(all_data_w2v)])

100%|█████████████████████████████| 1596041/1596041 [00:03<00:00, 428314.59it/s]


In [45]:
for epoch in range(30):
    model_ug_dmc.train(utils.shuffle([x for x in tqdm(all_data_w2v)]), total_examples=len(all_data_w2v), epochs=1)
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha

100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3773209.51it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3831166.29it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3845864.61it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3894956.46it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3907269.09it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3782966.71it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3871977.44it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3830328.91it/s]
100%|████████████████████████████| 1596041/1596041 [00:00<00:00, 3811785.83it/s]


KeyboardInterrupt: 