In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format = 'retina'
# Use the FiveThirtyEight style for every figure drawn after this point.
plt.style.use('fivethirtyeight')

In [46]:
# Load the tweet data; path is relative to the notebook's working directory.
df = pd.read_csv('clean_tweets.csv')

In [47]:
# Drop the leftover index column written by an earlier to_csv();
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')
# Remap the positive label 4 -> 1 (0 stays 0). replace() is idempotent, whereas
# re-running map({0: 0, 4: 1}) would turn the already-remapped 1s into NaN.
df['sentiment'] = df['sentiment'].replace({4: 1})

In [48]:
# Inspect column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [49]:
# Discard rows with any missing value; the original index labels are kept.
df = df.dropna()

In [50]:
# Confirm that no null entries remain after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1596714 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1596714 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 36.5+ MB


In [51]:
from sklearn.model_selection import train_test_split

In [52]:
x = df['text']       # feature: the tweet text
y = df['sentiment']  # target: sentiment label (0 or 1 after the earlier remap)

# Stage 1: keep 98% for training and hold out the remaining 2%.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=0.02, random_state=42
)

# Stage 2: cut the held-out 2% in half -> 1% validation, 1% test.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test,
    y_validation_and_test,
    test_size=0.5,
    random_state=42,
)

## Doc2Vec Model

Here, I am going to implement a Doc2Vec model using Gensim.
I will use the following approaches:
* DBOW (Distributed Bag Of Words) 
* DMC (Distributed Memory Concatenated)
* DMM (Distributed Memory Mean)
* DBOW + DMC
* DBOW + DMM

I will then train a simple logistic regression model on each of the above document-vector representations.

In [53]:
from tqdm import tqdm
# Register tqdm's pandas integration; 'progress-bar' is the label shown on those bars.
tqdm.pandas(desc='progress-bar')

In [63]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

For Doc2Vec training, I use the whole dataset (train, validation, and test combined): the training is completely unsupervised and never sees the sentiment labels, so there is no need to hold out any data.

In [64]:
def labelize_tweets(tweets, label):
    """Wrap every tweet in a ``TaggedDocument`` carrying a unique tag.

    Parameters
    ----------
    tweets : pandas.Series
        Tweet texts. Each value must be a string (it is split on whitespace
        into tokens); the object must expose ``.index`` and iterate over values.
    label : str
        Prefix used when building each document tag.

    Returns
    -------
    list of TaggedDocument
        One document per tweet, in input order. Each has the tweet's tokens
        as its words and a single tag of the form ``'<label>_<index label>'``.
    """
    return [
        TaggedDocument(text.split(), [label + '_%s' % idx])
        for idx, text in zip(tweets.index, tweets)
    ]

In [65]:
# Stack the three splits back together so Doc2Vec trains on every tweet.
all_x = pd.concat(
    [x_train, x_validation, x_test]
)
# Tag each tweet as 'all_<original index label>'.
all_x_w2v = labelize_tweets(all_x, 'all')

In [66]:
# Number of tagged documents in the Doc2Vec corpus.
len(all_x_w2v)

1596714

## DBOW

In [71]:
# Use one worker thread per CPU core reported by the OS.
cores = multiprocessing.cpu_count()

model_ug_dbow = Doc2Vec(
    dm=0,             # dm=0 selects the DBOW training algorithm
    vector_size=100,  # dimensionality of each document vector
    negative=5,
    workers=cores,
    min_count=2,
    alpha=0.065,
    min_alpha=0.065,  # equal to alpha: the rate is lowered by hand in the training loop
)

# Build the vocabulary from the full tagged corpus (tqdm shows the pass over it).
model_ug_dbow.build_vocab([doc for doc in tqdm(all_x_w2v)])

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1766088.75it/s]


One caveat of the way this algorithm runs is that, since the learning rate decreases over the course of iterating over the data, labels which are only seen in a single TaggedDocument during training will only be trained with a fixed learning rate. This frequently produces less-than-optimal results.

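The loop below implements an explicit multi-pass, alpha-reduction approach with added shuffling: each pass trains on a reshuffled copy of the corpus and then lowers the learning rate (alpha) by a fixed step.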

In [72]:
%%time
for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1839536.74it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3244647.01it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3186444.77it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3410778.20it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2175252.37it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3218330.27it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3367365.43it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3253804.43it/s]
100%|███████████████████████████████████

Wall time: 34min 15s
