In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook, at 'retina' resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Use the 'fivethirtyeight' matplotlib style for every plot in this notebook.
plt.style.use('fivethirtyeight')

In [3]:
# Load the cleaned tweets and discard the 'Unnamed: 0' column that comes with
# the CSV (presumably a saved index -- confirm against how the file was written).
df = pd.read_csv('clean_tweets.csv').drop(['Unnamed: 0'], axis=1)

# Recode the sentiment labels from {0, 4} to {0, 1}.
label_recoding = {0: 0, 4: 1}
df['sentiment'] = df['sentiment'].map(label_recoding)

In [4]:
# Inspect column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [5]:
# Remove every row that contains a missing value. The original index labels
# are kept (no reset), so row labels are no longer contiguous.
df = df.dropna()

In [6]:
# Re-check the frame after dropping nulls: both columns should now report
# the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1596714 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1596714 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 36.5+ MB


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x = df['text']       # feature: the tweet text column
y = df['sentiment']  # target: the sentiment label column

# Hold out 2% of the rows, then split that hold-out in half, giving roughly
# 98% train / 1% validation / 1% test. Both splits use random_state 42.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=0.02, random_state=42
)

x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=0.5, random_state=42
)

In [9]:
from tqdm import tqdm
# Register tqdm with pandas; 'progress-bar' is the description shown on the bar.
tqdm.pandas(desc='progress-bar')

In [10]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
def get_vectors(model, corpus, size):
    """Stack the document vector of every entry in ``corpus`` into one matrix.

    Each index label ``i`` of ``corpus`` is looked up in ``model.docvecs``
    under the tag ``'all_' + str(i)``.

    Parameters
    ----------
    model : object
        Anything exposing ``docvecs``, a tag -> vector lookup.
    corpus : object
        Sized container with an ``index`` attribute (e.g. a pandas Series).
    size : int
        Length of each document vector, i.e. the number of columns returned.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)`` whose rows follow the order of
        ``corpus.index``.
    """
    doc_matrix = np.zeros((len(corpus), size))
    for row, idx in enumerate(corpus.index):
        doc_matrix[row] = model.docvecs['all_' + str(idx)]
    return doc_matrix

In [13]:
def get_concat_vectors(model1, model2, corpus, size):
    """Build a matrix of document vectors concatenated from two models.

    For each index label ``i`` of ``corpus``, the vectors stored under the tag
    ``'all_' + str(i)`` in ``model1.docvecs`` and ``model2.docvecs`` are joined
    end to end (``model1`` first) to form one row.

    Parameters
    ----------
    model1, model2 : object
        Anything exposing ``docvecs``, a tag -> vector lookup.
    corpus : object
        Sized container with an ``index`` attribute (e.g. a pandas Series).
    size : int
        Length of one concatenated row, i.e. the combined length of the two
        vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)`` whose rows follow the order of
        ``corpus.index``.
    """
    combined = np.zeros((len(corpus), size))
    for row, idx in enumerate(corpus.index):
        tag = 'all_' + str(idx)
        first_half = model1.docvecs[tag]
        second_half = model2.docvecs[tag]
        combined[row] = np.append(first_half, second_half)
    return combined

# Phrase Modelling

Phrase modelling automatically detects common phrases (multi-word expressions, i.e. word n-grams) from a stream of sentences.

In [1]:
from gensim.models.phrases import Phrases, Phraser

In [14]:
# Whitespace-tokenise every training tweet into a list of tokens.
tokenised_train = list(map(str.split, x_train))

In [15]:
%%time
# Learn phrase statistics from the tokenised training tweets only.
phrases = Phrases(tokenised_train)
# Wrap the learned statistics in a Phraser, which is what gets applied to
# token lists below (bigram[tokens]).
bigram = Phraser(phrases)

Wall time: 2min 8s


In [17]:
# Sanity check on a sample token list: 'vanilla' and 'ice' are merged into the
# single token 'vanilla_ice'; the remaining tokens are left unchanged.
bigram[['last', 'time', 'with', 'nutella', 'and', 'vanilla', 'ice', 'cream', 'sadface']]

['last', 'time', 'with', 'nutella', 'and', 'vanilla_ice', 'cream', 'sadface']

In [21]:
def labelize_tweets_bg(tweets, label):
    """Turn tweets into ``TaggedDocument`` objects with bigram phrases applied.

    Each tweet is split on whitespace, passed through the notebook-level
    ``bigram`` phraser, and tagged with ``'<label>_<index>'``, where the index
    is the tweet's label in ``tweets.index``.

    Parameters
    ----------
    tweets : object
        Iterable of strings with a matching ``index`` attribute (e.g. a
        pandas Series).
    label : str
        Prefix for every document tag.

    Returns
    -------
    list
        One ``TaggedDocument`` per tweet, in iteration order.
    """
    return [
        TaggedDocument(bigram[tweet.split()], [label + '_%s' % idx])
        for idx, tweet in zip(tweets.index, tweets)
    ]

In [22]:
# Combine the train, validation and test tweets into one Series; the original
# index labels are kept.
all_x = pd.concat([x_train, x_validation, x_test])
# Tag every tweet as 'all_<index>' -- the same tag format that get_vectors and
# get_concat_vectors use when looking up document vectors.
all_x_w2v_bg = labelize_tweets_bg(all_x, 'all')