In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('fivethirtyeight')

In [3]:
df = pd.read_csv('clean_tweets.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df['sentiment'] = df['sentiment'].map({0:0,4:1})

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [5]:
df.dropna(inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1596714 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1596714 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 36.5+ MB


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x = df['text'] #define all other columns except the target variable
y = df['sentiment'] #define the target variable

x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size = 0.02, random_state = 42)

x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, 
                                                              test_size = 0.5, random_state = 42)

In [9]:
from tqdm import tqdm
tqdm.pandas(desc='progress-bar')

In [10]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

In [13]:
def get_concat_vectors(model1, model2, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = np.append(model1.docvecs[prefix], model2.docvecs[prefix])
        n += 1
    return vecs

# Phrase Modelling

It automatically detects common phrases – multi-word expressions/ word n-grams – from a stream of sentences.

In [1]:
from gensim.models.phrases import Phrases, Phraser

In [14]:
tokenised_train = [t.split() for t in x_train]

In [15]:
%%time
phrases = Phrases(tokenised_train)
bigram = Phraser(phrases)

Wall time: 2min 8s


In [17]:
bigram[['last', 'time', 'with', 'nutella', 'and', 'vanilla', 'ice', 'cream', 'sadface']]

['last', 'time', 'with', 'nutella', 'and', 'vanilla_ice', 'cream', 'sadface']

In [21]:
def labelize_tweets_bg(tweets, label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(bigram[t.split()], [prefix + '_%s' % i]))
    return result

In [22]:
all_x = pd.concat([x_train, x_validation, x_test])
all_x_w2v_bg = labelize_tweets_bg(all_x, 'all')

## Bigram DBOW

In [31]:
model_bg = []

In [24]:
cores = multiprocessing.cpu_count()
model_bg_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, workers=cores, min_count=2, alpha = 0.065, min_alpha=0.065)
model_bg_dbow.build_vocab([x for x in tqdm(all_x_w2v_bg)])

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1846340.62it/s]


In [25]:
%%time
for epoch in range(30):
    model_bg_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dbow.alpha -= 0.002
    model_bg_dbow.min_alpha = model_bg_dbow.alpha

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1830836.28it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1902673.37it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1764103.70it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1945521.00it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1766863.14it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1823399.02it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1798397.80it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1775698.55it/s]
100%|███████████████████████████████████

Wall time: 55min 27s


In [26]:
train_vecs_dbow_bg = get_vectors(model_bg_dbow, x_train, 100)
validation_vecs_dbow_bg = get_vectors(model_bg_dbow, x_validation, 100)

In [27]:
clf = LogisticRegression()
clf.fit(train_vecs_dbow_bg, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [28]:
clf.score(validation_vecs_dbow_bg, y_validation)

0.744222458821319

In [32]:
model_bg.append(clf.score(validation_vecs_dbow_bg, y_validation))

In [29]:
model_bg_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## Bigram DMC

In [30]:
cores = multiprocessing.cpu_count()
model_bg_dmc = Doc2Vec(dm=1, dm_concat=1, vector_size=100, negative=5, window=2,
                       workers=cores, min_count=2, alpha = 0.065, min_alpha=0.065)
model_bg_dmc.build_vocab([x for x in tqdm(all_x_w2v_bg)])

100%|████████████████████████████████████████████████████████████████████| 1596714/1596714 [00:04<00:00, 398963.84it/s]


In [33]:
%%time
for epoch in range(30):
    model_bg_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dmc.alpha -= 0.002
    model_bg_dmc.min_alpha = model_bg_dmc.alpha

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1796547.67it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1807192.26it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1825734.83it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1869919.59it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:01<00:00, 1454694.56it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1973767.58it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2229107.53it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 1888799.18it/s]
100%|███████████████████████████████████

Wall time: 51min 57s


In [34]:
train_vecs_dmc_bg = get_vectors(model_bg_dmc, x_train, 100)
validation_vecs_dmc_bg = get_vectors(model_bg_dmc, x_validation, 100)

In [35]:
clf = LogisticRegression()
clf.fit(train_vecs_dmc_bg, y_train)



In [36]:
clf.score(validation_vecs_dmc_bg, y_validation)

0.6703826642450054

In [37]:
model_bg.append(clf.score(validation_vecs_dmc_bg, y_validation))

In [38]:
model_bg_dmc.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## Bigram DMM

In [40]:
cores = multiprocessing.cpu_count()
model_bg_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=100, negative=5, window=4, workers=cores,
                        min_count=2, alpha = 0.065, min_alpha=0.065)
model_bg_dmm.build_vocab([x for x in tqdm(all_x_w2v_bg)])

100%|████████████████████████████████████████████████████████████████████| 1596714/1596714 [00:02<00:00, 691864.80it/s]


In [41]:
%%time
for epoch in range(30):
    model_bg_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dmm.alpha -= 0.002
    model_bg_dmm.min_alpha = model_bg_dmm.alpha

100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2854573.69it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3331565.97it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3097564.08it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3146255.59it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3130984.54it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3105096.84it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 2904911.04it/s]
100%|███████████████████████████████████████████████████████████████████| 1596714/1596714 [00:00<00:00, 3341079.63it/s]
100%|███████████████████████████████████

Wall time: 53min 4s


In [42]:
train_vecs_dmm_bg = get_vectors(model_bg_dmm, x_train, 100)
validation_vecs_dmm_bg = get_vectors(model_bg_dmm, x_validation, 100)

In [43]:
clf = LogisticRegression()
clf.fit(train_vecs_dmm_bg, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [44]:
clf.score(validation_vecs_dmm_bg, y_validation)

0.740840483497213

In [45]:
model_bg.append(clf.score(validation_vecs_dmm_bg, y_validation))

In [46]:
model_bg_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## Bigram DBOW + DMC

In [47]:
train_vecs_bg_dbow_dmc = get_concat_vectors(model_bg_dbow, model_bg_dmc, x_train, 200)
validation_vecs_bg_dbow_dmc = get_concat_vectors(model_bg_dbow, model_bg_dmc, x_validation, 200)

In [48]:
clf = LogisticRegression()
clf.fit(train_vecs_bg_dbow_dmc, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [49]:
clf.score(validation_vecs_bg_dbow_dmc, y_validation)

0.7519258470595603

In [50]:
model_bg.append(clf.score(validation_vecs_bg_dbow_dmc, y_validation))

## Bigram DBOW + DMM

In [51]:
train_vecs_bg_dbow_dmm = get_concat_vectors(model_bg_dbow, model_bg_dmm, x_train, 200)
validation_vecs_bg_dbow_dmm = get_concat_vectors(model_bg_dbow, model_bg_dmm, x_validation, 200)

In [52]:
clf = LogisticRegression()
clf.fit(train_vecs_bg_dbow_dmm, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [53]:
clf.score(validation_vecs_bg_dbow_dmm, y_validation)

0.7617586271685352

In [54]:
model_bg.append(clf.score(validation_vecs_bg_dbow_dmm, y_validation))

In [55]:
model_bg

[0.744222458821319,
 0.6703826642450054,
 0.740840483497213,
 0.7519258470595603,
 0.7617586271685352]