### Load The Tweets Dataset 

In [20]:
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 200)

In [21]:
data = pd.read_pickle("tweets_cleaned.pkl")
data.head()

Unnamed: 0,id,label,tweet,cleaned_tweets_w/o_SW,cleaned_tweets_with_SW
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias,finally transparant silicon case thanks uncle yay sony xperia sonyexperias
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,love talk makememories unplug relax iphone smartphone wifi connect
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i am wired i know i am george i wa made that way iphone cute daventry home,wired know george way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk to me about a question i have unless i pay them for their stupid support,amazing service apple talk question unless pay stupid support


In [22]:
# 0 refers to positive sentiment, 1 is negative sentiment

# 10. Word Embeddings

In [23]:
# path = r'D:\OneDrive\Google Drive Files\Training\1 MASTER\NLP Master\New Notebooks'
# filename = path + r'\word2vec.txt'
# model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [24]:
tweets_list = list(data['cleaned_tweets_w/o_SW'].apply(lambda x: x.split()))
tweets_list[0] # list of lists, where each tweet is a list of tokens, finally we have a list of tweets

['fingerprint',
 'pregnancy',
 'test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [25]:
# !pip install gensim

In [76]:
# Creating your own Word2Vec Model & Train
from gensim.models import Word2Vec
# train model
cbow_model = Word2Vec(tweets_list, vector_size = 300, window = 3, min_count=5, sg=0)

In [77]:
# summarize the loaded model
print(cbow_model)

Word2Vec<vocab=2420, vector_size=300, alpha=0.025>


In [78]:
cbow_model.wv.index_to_key[:20]  # this your vocab 

['iphone',
 'apple',
 'i',
 'my',
 'the',
 'to',
 'a',
 'is',
 'samsung',
 'it',
 'and',
 'you',
 'new',
 'twitter',
 'for',
 'com',
 'phone',
 'me',
 'sony',
 'not']

In [79]:
len(cbow_model.wv.index_to_key)

2420

In [80]:
# Each document vector will have dimension [1 x 300]

In [111]:
def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    
    # doc1 contains those words of the document which are included in the vocab
    doc1 = [word for word in doc.split() if word in cbow_model.wv.index_to_key]
    
    wv1 = []  # this will contain the WE of all the vocab words from the doc
    for word in doc1:
        wv1.append(cbow_model.wv.get_vector(word))
    wv1_ = np.array(wv1)
    wv1_mean = wv1_.mean(axis=0)
    return wv1_mean

# np.mean(model[doc], axis=0)

In [112]:
tweets_temp = data['cleaned_tweets_w/o_SW'].apply(document_vector)

In [113]:
tweets_temp[:5]  # displaying the 1st 5 tweets, as document vectors

0    [0.07997607, 0.293474, -0.021608636, 0.060005073, 0.06600141, -0.40608853, 0.13082457, 0.5751103, -0.09426117, 0.039150346, -0.045154102, -0.19582038, -0.07701456, -0.07941555, -0.09803471, -0.156...
1    [0.0059037693, 0.17382163, -0.031965863, 0.067778036, -0.031802125, -0.3008976, 0.20901914, 0.46240765, 0.049188204, -0.28368744, 0.008748335, -0.2849252, -0.014212379, 0.12226435, -0.22998452, -0...
2    [0.016498206, 0.14017253, 0.022675788, 0.16010089, -0.048329283, -0.1778317, 0.2172064, 0.47418606, 0.10024565, -0.18243241, 0.04738049, -0.22659369, -9.981295e-05, 0.08509191, -0.22022502, -0.156...
3    [0.056304973, 0.13165323, 0.043049376, 0.22645403, -0.07688501, -0.15711595, 0.24954543, 0.5206074, 0.15020257, -0.23757361, 0.09555426, -0.27512482, 0.040973946, 0.1435443, -0.26918602, -0.155831...
4    [0.021529809, 0.11566198, 0.009598537, 0.20508789, -0.086675145, -0.15104221, 0.24962968, 0.4927249, 0.15590686, -0.27230245, 0.08227494, -0.2675885, 0.010829366, 0.13826883, 

In [114]:
tweets_temp[0].shape  # each document vecotr is 300-dimensional !!

(300,)

In [115]:
type(tweets_temp)

pandas.core.series.Series

In [116]:
# Combining all the document vectors into a singl numpy array (tweets_vec)
embedding_size = 300
tweets_vec = np.ones((len(tweets_temp), embedding_size))*np.nan
for i in range(tweets_vec.shape[0]):
    tweets_vec[i,:] = tweets_temp.iloc[i]

tweets_vec.shape # this itself is your final FEATURE MATRIX

(7920, 300)

In [87]:
# Create a new DF to store these new documnent features
df = pd.DataFrame(tweets_vec)
df['y'] = data['label']
df.dropna(how='any', axis=0, inplace=True)

In [88]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,0.079976,0.293474,-0.021609,0.060005,0.066001,-0.406089,0.130825,0.57511,-0.094261,0.03915,...,0.256732,0.041669,-0.017489,0.247166,0.361355,0.017446,-0.196934,0.223411,-0.108381,0
1,0.005904,0.173822,-0.031966,0.067778,-0.031802,-0.300898,0.209019,0.462408,0.049188,-0.283687,...,0.255945,0.141843,0.013615,0.282416,0.293541,-0.035824,-0.173241,0.155709,-0.136512,0
2,0.016498,0.140173,0.022676,0.160101,-0.048329,-0.177832,0.217206,0.474186,0.100246,-0.182432,...,0.252025,0.156284,0.075745,0.271274,0.299471,0.004381,-0.052711,0.084117,-0.113255,0
3,0.056305,0.131653,0.043049,0.226454,-0.076885,-0.157116,0.249545,0.520607,0.150203,-0.237574,...,0.295913,0.211403,0.136778,0.362415,0.314652,0.018916,-0.003906,0.079329,-0.115577,0
4,0.02153,0.115662,0.009599,0.205088,-0.086675,-0.151042,0.24963,0.492725,0.155907,-0.272302,...,0.282295,0.205738,0.106733,0.327005,0.320674,0.01181,-0.031979,0.056954,-0.125649,1


In [89]:
df.shape

(7920, 301)

In [90]:
X_word_emb = df.drop('y', axis=1)
y = df['y']
X_word_emb.shape

(7920, 300)

In [91]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [92]:

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
WE_pipe = Pipeline([('SC', StandardScaler()), ('LR', LR1)] )

results = cross_validate(WE_pipe, X_word_emb, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2)) 


85.51 0.24
85.32 0.68


In [93]:
X = data['cleaned_tweets_w/o_SW']
y = data['label']

# we want to include only those words in the vocab which have min df of 5,
# means select only those words which occur ATLEAST in 5 documents!! 
# AND SELECT the TOP 300 FEATURES ONLY to build the model
CV = CountVectorizer(min_df=5, max_features=300)

LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4)
CV_pipe = Pipeline([('CV', CV) , ('LR', LR1)] )
results = cross_validate(CV_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2)) 

CV.fit_transform(X)
len(CV.vocabulary_)  # no. of features AFTER applying the stopwords

88.95 0.08
87.75 1.01


300

# 11. Word Embeddings from GloVe Model

In [94]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [95]:
# load the converted model
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [96]:
model.get_vector('analytics')

array([ 0.025135, -1.1037  , -0.014392,  0.175   ,  0.45659 , -0.86727 ,
       -0.057021, -0.66513 ,  0.35031 ,  0.46178 , -0.079201, -0.15928 ,
       -0.29051 , -0.37331 ,  0.58284 ,  0.47992 ,  0.47444 ,  0.018436,
        0.33742 ,  0.48474 , -1.0344  , -0.63262 , -0.043848,  0.33803 ,
       -0.27473 ,  0.46233 ,  0.92311 ,  1.6516  , -0.99585 , -0.41202 ,
       -0.22485 ,  0.17227 , -0.82582 ,  0.046938,  1.0012  , -0.22104 ,
       -0.81985 ,  0.072396,  0.67151 , -0.80752 ,  0.2998  , -0.20886 ,
       -1.3073  , -0.085651, -1.2405  , -0.59945 , -0.38276 , -0.014263,
        0.17119 ,  0.19705 , -0.17824 , -0.11378 ,  0.24159 ,  0.057804,
        0.044002, -1.1791  ,  0.48858 , -0.78541 ,  0.06117 ,  0.19021 ,
       -0.27743 , -0.9376  , -0.43884 ,  0.10984 , -0.59379 , -0.13567 ,
        0.050591, -0.062951,  1.2968  ,  0.35529 , -0.87356 ,  0.61764 ,
       -0.23356 , -0.74894 ,  0.35229 , -0.99631 ,  0.33625 , -0.027754,
       -0.85467 , -1.1996  ,  0.60355 ,  0.90339 , 

In [None]:
# model.index_to_key

In [97]:
def document_vector_GloVe(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    
    # doc1 contains those words of the document which are included in the vocab
    doc1 = [word for word in doc.split() if word in cbow_model.wv.index_to_key]
    
    wv1 = []  # this will contain the WE of all the vocab words from the doc
    for word in doc1:
        wv1.append(model.get_vector(word))
    wv1_ = np.array(wv1)
    wv1_mean = wv1_.mean(axis=0)
    return wv1_mean


In [None]:
tweets_temp = data['cleaned_tweets_w/o_SW'].apply(document_vector_GloVe)

In [None]:
tweets_temp[:5]  # displaying the 1st 5 tweets, as document vectors

In [109]:
# Combining all the document vectors into a singl numpy array (tweets_vec)
embedding_size = 100
tweets_vec = np.ones((len(tweets_temp), embedding_size))*np.nan
for i in range(tweets_vec.shape[0]):
    tweets_vec[i,:] = tweets_temp.iloc[i]

# tweets_vec.shape # this itself is your final FEATURE MATRIX
# Create a new DF to store these new documnent features
df1 = pd.DataFrame(tweets_vec)
df1['y'] = data['label']
df1.dropna(how='any', axis=0, inplace=True)

X_word_emb = df1.drop('y', axis=1)
y = df1['y']
X_word_emb.shape

(7920, 100)

In [110]:

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
WE_pipe = Pipeline([('SC', StandardScaler()), ('LR', LR1)] )

results = cross_validate(WE_pipe, X_word_emb, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2)) 


87.22 0.03
86.29 0.67
