## Load the Tweets Dataset

In [9]:
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 200)

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
# Load the tweets DataFrame from a local pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
tweets = pd.read_pickle("cleaned_tweets_v1.pkl")
tweets.head()

Unnamed: 0,label,tweet,cleaned_tweets,cleaned_tweets_without_stopwords
0,1,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,1,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally transparant silicon case thanks uncle yay sony xperia sonyexperias,finally transparant silicon case thanks uncle yay sony xperia sonyexperias
2,1,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,love this would you talk makememories unplug relax iphone smartphone wifi connect,love talk makememories unplug relax iphone smartphone wifi connect
3,1,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,wired know george wa made that way iphone cute daventry home,wired know george way iphone cute daventry home
4,0,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk about question have unless pay them for their stupid support,amazing service apple talk question unless pay stupid support


In [3]:
tweets.shape

(7920, 4)

In [5]:
# Model input is the cleaned tweet text; the target is the 'label' column.
X, y = tweets['cleaned_tweets'], tweets['label']

# Word Embeddings

In [14]:
tweets['cleaned_tweets'][0]  # 1st tweet

'fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone'

In [12]:
# Split every cleaned tweet on whitespace. The result is a list of tweets,
# where each tweet is itself a list of string tokens.
tweets_list = [tweet.split() for tweet in tweets['cleaned_tweets']]
tweets_list[0]  # tokens of the first tweet

['fingerprint',
 'pregnancy',
 'test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [16]:
# tweets_list

In [18]:
# Creating our own Word2Vec Model
from gensim.models import Word2Vec

# Train a CBOW model (sg=0) on the tokenised tweets: 300-dimensional vectors,
# a context window of 3 tokens, and words occurring fewer than 10 times are
# left out of the vocabulary.
# An explicit seed together with workers=1 makes training repeatable: gensim
# documents that multi-threaded training introduces ordering jitter, and that
# reproducibility across interpreter launches additionally requires the
# PYTHONHASHSEED environment variable to be fixed.
cbow_model = Word2Vec(tweets_list, vector_size=300, window=3, min_count=10,
                      sg=0, seed=42, workers=1)

In [19]:
# summarize the loaded model
print(cbow_model)

Word2Vec<vocab=1237, vector_size=300, alpha=0.025>


In [21]:
print(cbow_model.wv.index_to_key[:20] )   # printing the 1st 20 vocab words!

['iphone', 'apple', 'the', 'samsung', 'and', 'you', 'new', 'twitter', 'for', 'com', 'phone', 'sony', 'not', 'follow', 'this', 'pic', 'with', 'have', 'like', 'ipad']


In [23]:
len(cbow_model.wv.index_to_key)  # total number of words in the Vocab

1237

In [24]:
# access word vector for one word
cbow_model.wv.get_vector('ipad')

array([ 8.02080035e-02,  1.40748844e-01,  3.83791447e-01,  1.46812379e-01,
        1.39446929e-01, -2.13295743e-01,  2.98809707e-01,  5.30503154e-01,
        6.64356351e-02,  8.08329880e-03, -1.05925212e-02, -2.45266661e-01,
       -2.03678295e-01, -1.52947858e-01, -1.40868686e-02, -2.69171149e-01,
        1.34432241e-01,  1.98538508e-02, -2.02673916e-02, -1.78270295e-01,
       -2.68751413e-01, -2.41427228e-01,  1.13485806e-01,  8.46207421e-03,
        1.20829768e-01, -2.74891347e-01, -2.08213683e-02,  2.32238322e-01,
       -1.41006663e-01,  1.00656059e-02,  2.56013535e-02, -1.78655926e-02,
       -1.63230654e-02,  3.57875675e-02, -3.00654918e-02,  1.18028782e-01,
        4.35441405e-01, -4.54382718e-01, -5.61433993e-02, -5.43568805e-02,
       -3.20539926e-04, -2.46322285e-02,  4.69611548e-02, -1.39195547e-01,
        2.27178648e-01,  1.28573880e-01,  1.32822886e-01, -5.45659922e-02,
        1.79582715e-01,  5.02636671e-01,  1.30560517e-01,  8.25996250e-02,
       -1.11747146e-01, -

In [25]:
def document_vector(doc, wv=None):
    """Represent a document as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    doc : str
        One document; it is split on whitespace into tokens.
    wv : optional
        Word-vector store exposing ``key_to_index``, ``get_vector`` and
        ``vector_size`` (for example gensim ``KeyedVectors``). When omitted,
        the vectors of the notebook-level ``cbow_model`` are used.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``. If none of the tokens is in
        the vocabulary, the array is filled with NaN so such rows can be
        detected and dropped downstream.
    """
    if wv is None:
        wv = cbow_model.wv

    # key_to_index is a dict, so each membership test is O(1); scanning the
    # index_to_key list would cost O(vocabulary size) per token.
    vocab = wv.key_to_index
    known_vectors = [wv.get_vector(word) for word in doc.split() if word in vocab]

    if not known_vectors:
        # Avoid taking the mean of an empty array (RuntimeWarning + scalar NaN).
        return np.full(wv.vector_size, np.nan)

    return np.array(known_vectors).mean(axis=0)

# np.mean(model[doc], axis=0)

In [27]:
X.shape

(7920,)

In [28]:
# Apply document_vector to every cleaned tweet; the result holds one entry per tweet.
tweets_temp = X.apply(document_vector)

In [30]:
tweets_temp.shape   

(7920,)

In [29]:
tweets_temp[:5]  # displaying the 1st 5 tweets, as document vectors

0    [0.026826207, 0.16647096, 0.14817807, 0.06845369, 0.12645465, -0.20806031, 0.19661863, 0.43319473, 0.089440934, 0.111117825, 0.006400846, -0.11084225, -0.079257414, -0.10315515, -0.15831566, -0.23...
1    [0.026638785, 0.19544089, 0.06592562, 0.19374664, 0.074396804, -0.23043604, 0.11404914, 0.33971593, 0.055556074, -0.020716509, -0.024682024, -0.047152627, -0.03719196, -0.07130316, -0.08024793, -0...
2    [-0.027298588, 0.17520703, -0.055500813, 0.098311655, 0.031758986, -0.19882078, 0.10240402, 0.32828465, 0.037724294, -0.07731754, -0.035604108, -0.12081506, -0.082958855, -0.05177357, -0.06717392,...
3    [-0.014918212, 0.19131161, -0.011942185, 0.1214936, 0.057457495, -0.2309837, 0.13210379, 0.3831135, 0.04729805, -0.05329325, -0.03377345, -0.11867264, -0.081632465, -0.067539744, -0.080679215, -0....
4    [-0.037641782, 0.18621095, -0.06971123, 0.1496355, 0.035168614, -0.21614198, 0.09731512, 0.3332366, 0.02101336, -0.13267139, -0.0468873, -0.11370128, -0.08639497, -0.053567152

In [31]:
tweets_temp[0].shape  # each document vector is 300-dimensional !

(300,)

In [32]:
type(tweets_temp)

pandas.core.series.Series

In [33]:
# Combining all the document vectors into a single numpy array (tweets_vec)
# The width is read from the trained model rather than repeating the literal
# 300, so the matrix stays in sync with the vector_size used for training.
embedding_size = cbow_model.wv.vector_size
# Start from an all-NaN matrix; any tweet whose document vector is NaN keeps a
# NaN row, which makes such rows easy to find and drop afterwards.
tweets_vec = np.full((len(tweets_temp), embedding_size), np.nan)
for i, doc_vec in enumerate(tweets_temp):
    tweets_vec[i, :] = doc_vec

tweets_vec.shape # this itself is your final FEATURE MATRIX

(7920, 300)

In [35]:
# Create a new DF to store these new document features:
# one column per embedding dimension plus the label column 'y'.
# Rows that contain any NaN value are discarded.
df = (
    pd.DataFrame(tweets_vec)
    .assign(y=tweets['label'])
    .dropna(how='any', axis=0)
)

In [36]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,0.026826,0.166471,0.148178,0.068454,0.126455,-0.20806,0.196619,0.433195,0.089441,0.111118,...,0.308817,0.122169,0.002031,0.192916,0.364714,0.032367,-0.056186,0.169601,-0.110197,1
1,0.026639,0.195441,0.065926,0.193747,0.074397,-0.230436,0.114049,0.339716,0.055556,-0.020717,...,0.233975,0.103947,0.048142,0.208316,0.201495,0.016743,-0.063562,0.167798,-0.096593,1
2,-0.027299,0.175207,-0.055501,0.098312,0.031759,-0.198821,0.102404,0.328285,0.037724,-0.077318,...,0.232205,0.130381,-0.005101,0.172866,0.158324,-0.039869,-0.082533,0.130919,-0.099018,1
3,-0.014918,0.191312,-0.011942,0.121494,0.057457,-0.230984,0.132104,0.383114,0.047298,-0.053293,...,0.26917,0.137099,0.000961,0.201074,0.215511,-0.033859,-0.08247,0.155982,-0.108283,1
4,-0.037642,0.186211,-0.069711,0.149635,0.035169,-0.216142,0.097315,0.333237,0.021013,-0.132671,...,0.217421,0.125933,0.008542,0.192938,0.113935,-0.071039,-0.099378,0.142263,-0.093069,0


In [37]:
df.shape

(7920, 301)

In [38]:
# Separate the target column from the embedding columns.
y = df['y']
X_word_emb = df.drop(columns='y')
X_word_emb.shape   # final feature matrix

(7920, 300)

In [41]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Stratified 5-fold cross-validation; shuffling with a fixed random_state
# keeps the fold assignment the same on every run.
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [42]:
# Class-weighted, L2-penalised logistic regression evaluated on the
# Word2Vec document vectors.
LR = LogisticRegression(penalty='l2', C=0.5, class_weight='balanced', solver='liblinear')

results = cross_validate(
    LR, X_word_emb, y,
    cv=kfold, scoring='accuracy',
    return_train_score=True, return_estimator=True,
)
# Mean and standard deviation of the per-fold accuracy: train first, then test.
for split in ('train_score', 'test_score'):
    fold_scores = results[split]
    print(fold_scores.mean().round(4), fold_scores.std().round(4))

0.8336 0.0025
0.8326 0.0059


In [None]:
# Homework: try putting the word-embedding step into a pipeline as well

# Word Embeddings from GloVe Model

In [43]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [44]:
# Load pre-trained word vectors stored in word2vec text format (binary=False).
# NOTE(review): 'word2vec.txt' is presumably the output of glove2word2vec run on
# a GloVe file; the conversion step is not shown in this notebook, so confirm.
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [45]:
model.get_vector('ipad')

array([-6.1104e-01, -4.7861e-01,  4.6234e-01, -5.8098e-02,  3.5714e-01,
       -7.4596e-02,  6.9281e-01, -3.9926e-01,  7.5479e-01,  4.4398e-01,
        3.4338e-01, -4.3246e-01,  7.0191e-02, -8.6011e-01,  2.3844e-01,
       -2.8153e-02, -5.6473e-01,  1.1724e+00,  6.9807e-01,  5.8976e-01,
        1.2442e-01, -1.2911e+00,  8.9142e-01,  1.0527e+00,  9.8682e-01,
        9.2627e-01,  2.9873e-01,  3.0862e-01, -7.4524e-01,  1.2628e-01,
       -2.3706e-01,  1.6102e+00,  4.7798e-03,  1.9434e-01,  1.0604e+00,
        7.8258e-01, -8.9174e-01,  1.6738e-01,  4.7110e-01, -7.6120e-01,
        3.0506e-01, -1.8910e-01,  6.6989e-02, -5.2704e-01, -4.1588e-01,
        2.2908e-01,  5.0584e-01, -4.6857e-01,  7.7799e-01,  6.3018e-01,
        3.6416e-01,  2.5758e-01,  3.0741e-01, -1.3649e-01, -7.5056e-01,
       -4.1256e-01, -3.2920e-01, -4.9191e-01,  4.0316e-01, -5.6319e-01,
        1.5338e-01,  6.2828e-01, -4.3093e-01,  4.9648e-01, -3.8445e-01,
        8.8047e-02,  8.1446e-02, -7.0781e-01,  2.2075e-01, -7.03

In [54]:
# model.index_to_key  # would display the entire vocabulary (very long list)

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'a',
 '"',
 "'s",
 'for',
 '-',
 'that',
 'on',
 'is',
 'was',
 'said',
 'with',
 'he',
 'as',
 'it',
 'by',
 'at',
 '(',
 ')',
 'from',
 'his',
 "''",
 '``',
 'an',
 'be',
 'has',
 'are',
 'have',
 'but',
 'were',
 'not',
 'this',
 'who',
 'they',
 'had',
 'i',
 'which',
 'will',
 'their',
 ':',
 'or',
 'its',
 'one',
 'after',
 'new',
 'been',
 'also',
 'we',
 'would',
 'two',
 'more',
 "'",
 'first',
 'about',
 'up',
 'when',
 'year',
 'there',
 'all',
 '--',
 'out',
 'she',
 'other',
 'people',
 "n't",
 'her',
 'percent',
 'than',
 'over',
 'into',
 'last',
 'some',
 'government',
 'time',
 '$',
 'you',
 'years',
 'if',
 'no',
 'world',
 'can',
 'three',
 'do',
 ';',
 'president',
 'only',
 'state',
 'million',
 'could',
 'us',
 'most',
 '_',
 'against',
 'u.s.',
 'so',
 'them',
 'what',
 'him',
 'united',
 'during',
 'before',
 'may',
 'since',
 'many',
 'while',
 'where',
 'states',
 'because',
 'now',
 'city',
 'made',
 'like',
 

In [63]:
def document_vector_GloVe(doc, wv=None):
    """Represent a document as the mean of its in-vocabulary GloVe word vectors.

    Parameters
    ----------
    doc : str
        One document; it is split on whitespace into tokens.
    wv : optional
        Word-vector store exposing ``key_to_index``, ``get_vector`` and
        ``vector_size`` (for example gensim ``KeyedVectors``). When omitted,
        the notebook-level ``model`` is used.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``. If none of the tokens is in
        the vocabulary, the array is filled with NaN so such rows can be
        detected and dropped downstream.
    """
    if wv is None:
        wv = model

    # key_to_index is a dict, so each membership test is O(1); scanning the
    # index_to_key list would cost O(vocabulary size) per token.
    vocab = wv.key_to_index
    known_vectors = [wv.get_vector(word) for word in doc.split() if word in vocab]

    if not known_vectors:
        # Avoid taking the mean of an empty array (RuntimeWarning + scalar NaN).
        return np.full(wv.vector_size, np.nan)

    return np.array(known_vectors).mean(axis=0)


In [64]:
# Apply document_vector_GloVe to every cleaned tweet; the result holds one entry per tweet.
tweets_temp1 = X.apply(document_vector_GloVe)

In [65]:
tweets_temp1

0       [-0.12796232, 0.004934112, 0.2997002, -0.1567011, -0.15583865, 0.09799757, 0.11052724, 0.035929985, 0.22409555, 0.35114682, 0.115047574, -0.15497755, 0.023922225, -0.14416587, 0.61092, 0.21855089,...
1       [0.18464375, -0.14675263, 0.44942373, 0.09611425, 0.10299387, -0.22091424, 0.09669463, -0.25706288, 0.0071493685, -0.15944123, 0.41342202, -0.015069997, -0.07561076, -0.042483754, -0.011530001, -0...
2       [-0.24398555, 0.050635442, 0.40788, -0.21215816, -0.09722363, 0.19359446, -0.23326969, 0.025276013, 0.38502225, -0.109271, 0.13588454, 0.10670471, 0.006712194, -0.24413653, 0.24706927, -0.20098938...
3       [-0.20001145, 0.072584935, 0.52819365, -0.3378445, -0.061915454, 0.043990027, -0.064606726, 0.0736255, 0.19891483, 0.06754082, 0.20419908, -0.10599995, 0.13987637, 0.1068021, 0.28936264, -0.173550...
4       [-0.0675545, 0.25669113, 0.31932634, -0.2808434, -0.2849553, 0.11903277, -0.22724922, 0.18872231, 0.06447208, -0.14311, 0.11120178, 0.12749399, 0.25818896, -0.1

In [66]:
# Combining all the document vectors into a single numpy array (tweets_vec1)
# The width is read from the loaded vectors rather than repeating the literal
# 100, so the matrix stays in sync with whichever vector file was loaded.
embedding_size = model.vector_size
# Start from an all-NaN matrix; any tweet whose document vector is NaN keeps a
# NaN row and is removed by the dropna below.
tweets_vec1 = np.full((len(tweets_temp1), embedding_size), np.nan)

for i, doc_vec in enumerate(tweets_temp1):
    tweets_vec1[i, :] = doc_vec

# tweets_vec1 is the feature matrix.
# Create a new DF to store these new document features, attach the labels
# and drop every row that contains a NaN.
df1 = pd.DataFrame(tweets_vec1)
df1['y'] = tweets['label']
df1 = df1.dropna(how='any', axis=0)

X_word_emb_Glove = df1.drop('y', axis=1)
y = df1['y']
X_word_emb_Glove.shape

(7920, 100)

In [67]:
# Same class-weighted, L2-penalised logistic regression, this time evaluated
# on the GloVe document vectors.
LR = LogisticRegression(penalty='l2', C=0.5, class_weight='balanced', solver='liblinear')

results = cross_validate(
    LR, X_word_emb_Glove, y,
    cv=kfold, scoring='accuracy',
    return_train_score=True, return_estimator=True,
)
# Mean and standard deviation of the per-fold accuracy: train first, then test.
for split in ('train_score', 'test_score'):
    fold_scores = results[split]
    print(fold_scores.mean().round(4), fold_scores.std().round(4))

0.8716 0.0015
0.8672 0.0079


In [None]:
# Homework: try re-building the model using the 300-d vectors from GoogleNews.bin