In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('fivethirtyeight')

In [2]:
df = pd.read_csv('clean_tweets.csv')

In [3]:
# Drop the unnamed leftover column that came in with the CSV. Reassigning
# (instead of inplace=True) and ignoring an already-missing column keeps this
# cell safe to run more than once.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Recode sentiment label 4 as 1 (0 stays 0). replace() leaves values that are
# already recoded untouched, whereas re-running map({0:0,4:1}) would turn every
# 1 into NaN. Any label other than 0/4 would be kept as-is rather than become NaN.
df['sentiment'] = df['sentiment'].replace({4: 1})

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [5]:
# Remove the rows whose `text` is missing (df.info() above reports fewer
# non-null `text` values than rows). The surviving rows keep their original
# index labels; the index is not renumbered.
df.dropna(inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1596714 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1596714 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 36.5+ MB


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Feature: the tweet text column only. Target: the binary sentiment label.
x = df['text']
y = df['sentiment']

# Step 1: hold out 2% of the rows; the remaining 98% is the training set.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=0.02, random_state=42
)

# Step 2: cut the held-out 2% in half -> 1% validation, 1% test.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=0.5, random_state=42
)

## Doc2Vec Model

Here, I am going to train Doc2Vec models using Gensim.
I will use the following approaches:
* DBOW (Distributed Bag Of Words) 
* DMC (Distributed Memory Concatenated)
* DMM (Distributed Memory Mean)
* DBOW + DMC
* DBOW + DMM

I will then fit a simple logistic regression model on the document vectors produced by each approach.

In [9]:
from tqdm import tqdm
tqdm.pandas(desc='progress-bar')

In [10]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

For Doc2Vec training, I use the whole dataset (train, validation and test tweets): the training is completely unsupervised and never sees the sentiment labels, so there is no need to hold out any data.

In [11]:
def labelize_tweets(tweets, label):
    """Wrap every tweet in a gensim ``TaggedDocument``.

    Each tweet is split on whitespace into tokens and receives a single tag of
    the form ``'<label>_<index label>'``.

    Parameters
    ----------
    tweets
        Object exposing ``.index`` and iterating over string values
        (a pandas Series of tweet texts in this notebook).
    label : str
        Prefix placed in front of the index label in every tag.

    Returns
    -------
    list
        One ``TaggedDocument`` per tweet, in the same order as ``tweets``.
    """
    return [
        TaggedDocument(tweet.split(), [label + '_%s' % idx])
        for idx, tweet in zip(tweets.index, tweets)
    ]

In [12]:
# Stack the three splits back together: every tweet (train, validation and
# test) is used to fit the Doc2Vec models.
all_x = pd.concat([x_train, x_validation, x_test])
# Wrap each tweet as a TaggedDocument tagged 'all_<index label>'; the same tag
# format is used later to look the document vectors up again.
all_x_w2v = labelize_tweets(all_x, 'all')

In [13]:
len(all_x_w2v)

1596714

## DBOW

In [14]:
# Use one training worker per CPU core.
cores = multiprocessing.cpu_count()

# dm=0 selects the DBOW training algorithm. alpha and min_alpha start out
# equal; the training loop below lowers both by hand after every pass.
model_ug_dbow = Doc2Vec(
    dm=0,
    vector_size=100,
    negative=5,
    workers=cores,
    min_count=2,
    alpha=0.065,
    min_alpha=0.065,
)

# Scan the whole tagged corpus once (with a progress bar) to build the vocabulary.
model_ug_dbow.build_vocab(list(tqdm(all_x_w2v)))

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1819169.66it/s]


One caveat of the way this algorithm runs is that, since the learning rate decreases over the course of iterating over the data, labels which are only seen in a single TaggedDocument during training will only be trained with a fixed learning rate. This frequently produces less than optimal results.

The loop below implements an explicit multiple-pass, alpha-reduction approach with added shuffling.

In [15]:
%%time
# 30 manual passes over the corpus; the documents are reshuffled before each pass.
# NOTE(review): gensim's documentation recommends a single train() call with
# epochs=N and library-managed alpha decay over a hand-rolled loop like this one
# — consider switching, and confirm against the installed gensim version.
for epoch in range(30):
    # epochs=1: each train() call makes exactly one pass over the shuffled documents.
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    # Lower the learning rate by 0.002 after every pass and pin min_alpha to the
    # same value. Starting from 0.065, the passes run at 0.065, 0.063, ..., 0.007.
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1730960.61it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1737110.75it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1768577.88it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1791194.88it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1762919.54it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2156919.64it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1930585.31it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:01<00:00, 1574401.83it/s]
100%|███████████████████████████████████

Wall time: 47min 50s


In [18]:
def get_vectors(model, corpus, size):
    """Collect the trained document vector of every entry in ``corpus``.

    Parameters
    ----------
    model
        Trained Doc2Vec model; vectors are read from ``model.docvecs`` under
        the tag ``'all_<index label>'``.
    corpus
        Object with ``len()`` and an ``.index`` of labels (a pandas Series in
        this notebook). Only the index labels are used, not the values.
    size : int
        Length of each document vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)``; row ``k`` holds the vector of
        the ``k``-th index label.
    """
    vecs = np.zeros((len(corpus), size))
    for row, idx in enumerate(corpus.index):
        vecs[row] = model.docvecs['all_' + str(idx)]
    return vecs

In [19]:
# Look up the trained 100-dimensional DBOW document vectors for the training
# and validation tweets (the test split is not used here).
train_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Logistic regression with scikit-learn's default settings, fit on the DBOW
# training vectors.
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
clf.score(validation_vecs_dbow, y_validation)

0.7360806663743972

In [54]:
# Persist the trained DBOW model so it does not have to be retrained.
model_ug_dbow.save('d2v_model_ug_dbow.doc2vec')
# To reload it later: model_ug_dbow = Doc2Vec.load('d2v_model_ug_dbow.doc2vec')

## DMC

In [24]:
# Use one training worker per CPU core.
cores = multiprocessing.cpu_count()

# dm=1 with dm_concat=1 selects distributed memory with concatenated context
# vectors, using a context window of 2. alpha and min_alpha start out equal;
# the training loop below lowers both by hand after every pass.
model_ug_dmc = Doc2Vec(
    dm=1,
    dm_concat=1,
    vector_size=100,
    negative=5,
    window=2,
    workers=cores,
    min_count=2,
    alpha=0.065,
    min_alpha=0.065,
)

# Scan the whole tagged corpus once (with a progress bar) to build the vocabulary.
model_ug_dmc.build_vocab(list(tqdm(all_x_w2v)))

100%|████████████████████████████████████████████████████████████████████| 1596714/1596714 [00:01<00:00, 969955.30it/s]


In [25]:
%%time
# 30 manual passes over the corpus; the documents are reshuffled before each pass.
# NOTE(review): gensim's documentation recommends a single train() call with
# epochs=N and library-managed alpha decay over a hand-rolled loop like this one
# — consider switching, and confirm against the installed gensim version.
for epoch in range(30):
    # epochs=1: each train() call makes exactly one pass over the shuffled documents.
    model_ug_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    # Lower the learning rate by 0.002 after every pass and pin min_alpha to the
    # same value. Starting from 0.065, the passes run at 0.065, 0.063, ..., 0.007.
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2827588.56it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3324451.35it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3475112.06it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2953095.55it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3213650.93it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2175248.84it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2844120.22it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2975999.92it/s]
100%|███████████████████████████████████

Wall time: 33min 30s


With the DM model, we can also see the semantic relationships between words.

In [28]:
model_ug_dmc.wv.most_similar("nice")

[('lovely', 0.7492602467536926),
 ('beautiful', 0.7370055317878723),
 ('great', 0.7369823455810547),
 ('bootiful', 0.732749879360199),
 ('wonderful', 0.7100207805633545),
 ('geat', 0.7081895470619202),
 ('fabulous', 0.6990154981613159),
 ('fantastic', 0.6940918564796448),
 ('baaaaddd', 0.6930461525917053),
 ('fab', 0.6819604635238647)]

In [29]:
model_ug_dmc.wv.most_similar("happy")

[('hapy', 0.7775416374206543),
 ('hapi', 0.6970286965370178),
 ('happyy', 0.6885430812835693),
 ('pleased', 0.6862632036209106),
 ('happpy', 0.6779634952545166),
 ('thrilled', 0.6471766233444214),
 ('maytes', 0.6368981599807739),
 ('haaaappy', 0.6358321309089661),
 ('happpppy', 0.6321025490760803),
 ('happppy', 0.6223570108413696)]

In [30]:
model_ug_dmc.wv.most_similar("sad")

[('depressed', 0.7377227544784546),
 ('bummed', 0.7218616604804993),
 ('frustated', 0.7171342372894287),
 ('upset', 0.7114315032958984),
 ('sadddd', 0.7111239433288574),
 ('frustating', 0.692534327507019),
 ('guted', 0.6885437369346619),
 ('gutted', 0.6817755699157715),
 ('exciteeeeed', 0.6740083694458008),
 ('happey', 0.670941174030304)]

In [31]:
model_ug_dmc.wv.most_similar("facebook")

[('myspace', 0.8465545177459717),
 ('youtube', 0.7960541248321533),
 ('fb', 0.7958188056945801),
 ('msn', 0.7688832879066467),
 ('flickr', 0.7637163400650024),
 ('bebo', 0.7617554068565369),
 ('skype', 0.7578939199447632),
 ('yahoo', 0.7534440755844116),
 ('linkedin', 0.7525861263275146),
 ('aim', 0.7207794189453125)]

In [32]:
# Look up the trained 100-dimensional DMC document vectors for the training
# and validation tweets.
train_vecs_dmc = get_vectors(model_ug_dmc, x_train, 100)
validation_vecs_dmc = get_vectors(model_ug_dmc, x_validation, 100)

In [33]:
# Fresh default-settings logistic regression, fit on the DMC training vectors
# (this rebinds `clf`, replacing the previously fitted classifier).
clf = LogisticRegression()
clf.fit(train_vecs_dmc, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
clf.score(validation_vecs_dmc, y_validation)

0.6664996555395504

In [55]:
model_ug_dmc.save('d2v_model_ug_dmc.doc2vec')

## DMM

In [35]:
# Use one training worker per CPU core.
cores = multiprocessing.cpu_count()

# dm=1 with dm_mean=1 selects distributed memory with averaged context
# vectors, using a context window of 4. alpha and min_alpha start out equal;
# the training loop below lowers both by hand after every pass.
model_ug_dmm = Doc2Vec(
    dm=1,
    dm_mean=1,
    vector_size=100,
    negative=5,
    window=4,
    workers=cores,
    min_count=2,
    alpha=0.065,
    min_alpha=0.065,
)

# Scan the whole tagged corpus once (with a progress bar) to build the vocabulary.
model_ug_dmm.build_vocab(list(tqdm(all_x_w2v)))

100%|████████████████████████████████████████████████████████████████████| 1596714/1596714 [00:06<00:00, 233577.57it/s]


In [36]:
%%time
# 30 manual passes over the corpus; the documents are reshuffled before each pass.
# NOTE(review): gensim's documentation recommends a single train() call with
# epochs=N and library-managed alpha decay over a hand-rolled loop like this one
# — consider switching, and confirm against the installed gensim version.
for epoch in range(30):
    # epochs=1: each train() call makes exactly one pass over the shuffled documents.
    model_ug_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    # Lower the learning rate by 0.002 after every pass and pin min_alpha to the
    # same value. Starting from 0.065, the passes run at 0.065, 0.063, ..., 0.007.
    model_ug_dmm.alpha -= 0.002
    model_ug_dmm.min_alpha = model_ug_dmm.alpha

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1599358.43it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1791986.65it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1879953.73it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2141675.85it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1911209.59it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1730177.58it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1980183.08it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:01<00:00, 1429262.51it/s]
100%|███████████████████████████████████

Wall time: 1h 18min 44s


In [37]:
model_ug_dmm.wv.most_similar("nice")

[('great', 0.8799686431884766),
 ('good', 0.8777741193771362),
 ('beautiful', 0.8707109689712524),
 ('wonderful', 0.8631876707077026),
 ('lovely', 0.8624066114425659),
 ('weird', 0.8522484302520752),
 ('fantastic', 0.8506590127944946),
 ('cool', 0.8493940830230713),
 ('fun', 0.8412531614303589),
 ('busy', 0.8368297815322876)]

In [38]:
model_ug_dmm.wv.most_similar("happy")

[('sad', 0.867780864238739),
 ('excited', 0.8203477263450623),
 ('bummed', 0.8083328008651733),
 ('busy', 0.8071290254592896),
 ('upset', 0.8014763593673706),
 ('sure', 0.7936733365058899),
 ('lame', 0.7905880212783813),
 ('good', 0.7902517914772034),
 ('depressed', 0.7866224646568298),
 ('gutted', 0.7771831154823303)]

In [39]:
model_ug_dmm.wv.most_similar("sad")

[('bummed', 0.8702001571655273),
 ('happy', 0.867780864238739),
 ('lame', 0.8556241989135742),
 ('busy', 0.851775050163269),
 ('excited', 0.8425253629684448),
 ('sure', 0.840003490447998),
 ('upset', 0.8397306203842163),
 ('cool', 0.8391295075416565),
 ('gutted', 0.8303613662719727),
 ('tired', 0.8293877243995667)]

In [40]:
# Look up the trained 100-dimensional DMM document vectors for the training
# and validation tweets.
train_vecs_dmm = get_vectors(model_ug_dmm, x_train, 100)
validation_vecs_dmm = get_vectors(model_ug_dmm, x_validation, 100)

In [41]:
# Fresh default-settings logistic regression, fit on the DMM training vectors
# (this rebinds `clf`, replacing the previously fitted classifier).
clf = LogisticRegression()
clf.fit(train_vecs_dmm, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [42]:
clf.score(validation_vecs_dmm, y_validation)

0.7301935241435461

In [56]:
model_ug_dmm.save('d2v_model_ug_dmm.doc2vec')

## DBOW + DMC

In [43]:
def get_concat_vectors(model1, model2, corpus, size):
    """Build feature rows by joining the document vectors of two models.

    Parameters
    ----------
    model1, model2
        Trained Doc2Vec models; vectors are read from each model's
        ``docvecs`` under the tag ``'all_<index label>'``.
    corpus
        Object with ``len()`` and an ``.index`` of labels (a pandas Series in
        this notebook). Only the index labels are used, not the values.
    size : int
        Length of one joined row, i.e. the two vector lengths added together.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)``; each row is the ``model1``
        vector followed by the ``model2`` vector for the same tag.
    """
    vecs = np.zeros((len(corpus), size))
    for row, idx in enumerate(corpus.index):
        tag = 'all_' + str(idx)
        vecs[row] = np.append(model1.docvecs[tag], model2.docvecs[tag])
    return vecs

In [46]:
# Join the DBOW and DMC vectors of each tweet: 100 + 100 = 200 features per row.
train_vecs_dbow_dmc = get_concat_vectors(model_ug_dbow, model_ug_dmc, x_train, 200)
validation_vecs_dbow_dmc = get_concat_vectors(model_ug_dbow, model_ug_dmc, x_validation, 200)

In [47]:
# Fresh default-settings logistic regression, fit on the joined DBOW + DMC
# training vectors (this rebinds `clf`, replacing the previously fitted classifier).
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmc, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [49]:
clf.score(validation_vecs_dbow_dmc, y_validation)

0.7482307258721113

## DBOW + DMM

In [50]:
# Join the DBOW and DMM vectors of each tweet: 100 + 100 = 200 features per row.
train_vecs_dbow_dmm = get_concat_vectors(model_ug_dbow, model_ug_dmm, x_train, 200)
validation_vecs_dbow_dmm = get_concat_vectors(model_ug_dbow, model_ug_dmm, x_validation, 200)

In [51]:
# Fresh default-settings logistic regression, fit on the joined DBOW + DMM
# training vectors (this rebinds `clf`, replacing the previously fitted classifier).
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmm, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [52]:
clf.score(validation_vecs_dbow_dmm, y_validation)

0.7534915763762761

Hence, the validation accuracies of the logistic regression models are as follows:

* DBOW - 73.61%

* DMC - 66.65%

* DMM - 73.02%

* DBOW + DMC - 74.82%

* DBOW + DMM - 75.35%