### Load The Tweets Dataset 

In [1]:
import numpy as np
import pandas as pd
# Raise pandas' maximum displayed column width to 200 characters for the outputs below.
pd.set_option("display.max_colwidth", 200)

In [2]:
# Load the cleaned tweets table from a local pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
data = pd.read_pickle("tweets_cleaned.pkl")
data.head()

Unnamed: 0,id,label,tweet,cleaned_tweets_w/o_SW,cleaned_tweets_with_SW
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias,finally transparant silicon case thanks uncle yay sony xperia sonyexperias
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,love talk makememories unplug relax iphone smartphone wifi connect
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i am wired i know i am george i wa made that way iphone cute daventry home,wired know george way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk to me about a question i have unless i pay them for their stupid support,amazing service apple talk question unless pay stupid support


In [3]:
# 0 refers to positive sentiment, 1 is negative sentiment

# 10. Word Embeddings

In [5]:
# Tokenise every cleaned tweet on whitespace: the result is a list of tweets,
# where each tweet is itself a list of string tokens.
tweets_list = [tweet.split() for tweet in data['cleaned_tweets_w/o_SW']]
tweets_list[0]  # tokens of the first tweet

['fingerprint',
 'pregnancy',
 'test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [6]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 23.3.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Train a custom Word2Vec model on the tokenised tweets.
from gensim.models import Word2Vec

# sg=0 selects the CBOW training algorithm; each word gets a 300-dimensional vector.
cbow_model = Word2Vec(
    sentences=tweets_list,
    vector_size=300,
    window=3,
    min_count=5,
    sg=0,
)

In [8]:
# Summarize the trained model (prints its vocabulary size, vector size and alpha)
print(cbow_model)

Word2Vec<vocab=2420, vector_size=300, alpha=0.025>


In [9]:
cbow_model.wv.index_to_key[:20]  # first 20 words of the model's vocabulary

['iphone',
 'apple',
 'i',
 'my',
 'the',
 'to',
 'a',
 'is',
 'samsung',
 'it',
 'and',
 'you',
 'new',
 'twitter',
 'for',
 'com',
 'phone',
 'me',
 'sony',
 'not']

In [10]:
len(cbow_model.wv)  # number of words in the model's vocabulary

2420

In [11]:
# Each document vector will have dimension [1 x 300]

In [12]:
def document_vector(doc, keyed_vectors=None):
    """Create a document vector by averaging the vectors of its in-vocabulary words.

    Parameters
    ----------
    doc : str
        One document as a string of whitespace-separated tokens.
    keyed_vectors : optional
        Object exposing ``key_to_index``, ``get_vector`` and ``vector_size``
        (e.g. a gensim ``KeyedVectors``). Defaults to the notebook-level
        ``cbow_model.wv``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``keyed_vectors.vector_size`` holding the element-wise
        mean of the vectors of the words found in the vocabulary. If no word of
        ``doc`` is in the vocabulary, the array is filled with NaN so that such
        documents can be dropped afterwards.
    """
    if keyed_vectors is None:
        keyed_vectors = cbow_model.wv

    # key_to_index is a dict, so each membership test is O(1); testing against the
    # index_to_key list would scan the whole vocabulary for every single word.
    vocab = keyed_vectors.key_to_index
    known_words = [word for word in doc.split() if word in vocab]

    if not known_words:
        # Averaging an empty array would emit a RuntimeWarning and give a scalar NaN;
        # return an explicit NaN vector of the right length instead.
        return np.full(keyed_vectors.vector_size, np.nan)

    return np.mean([keyed_vectors.get_vector(word) for word in known_words], axis=0)

# np.mean(model[doc], axis=0)

In [13]:
# Turn every cleaned tweet into one averaged word vector (one entry per tweet).
tweets_temp = data['cleaned_tweets_w/o_SW'].apply(document_vector)

In [14]:
tweets_temp[:5]  # the first 5 tweets, shown as document vectors

0    [0.10934084, 0.2850865, -0.03702128, 0.0824658, 0.02008623, -0.45371807, 0.14882354, 0.62065077, -0.12167322, 0.047155283, -0.05799838, -0.16212794, -0.10676005, -0.08618663, -0.14493874, -0.09566...
1    [0.015915114, 0.18201299, -0.039599158, 0.045006834, -0.004629258, -0.28631586, 0.18919937, 0.47859988, 0.04846849, -0.31849703, 0.015452245, -0.3032209, -0.033871964, 0.09248772, -0.24006715, -0....
2    [0.0072273207, 0.1256742, 0.046073195, 0.1193809, -0.02070676, -0.16472878, 0.20651849, 0.45720258, 0.12597854, -0.1954561, 0.04576041, -0.22337957, -0.0361608, 0.04666811, -0.18539162, -0.0872967...
3    [0.026155995, 0.11249809, 0.073917255, 0.17779328, -0.03442652, -0.13926001, 0.2543105, 0.508533, 0.19848895, -0.26223925, 0.08050963, -0.28436878, -0.017680736, 0.07888284, -0.22647314, -0.078523...
4    [0.0011964325, 0.10570404, 0.044683237, 0.14343865, -0.03508732, -0.13002998, 0.23423897, 0.4684211, 0.19434296, -0.3004223, 0.07944932, -0.27818927, -0.030255383, 0.08189084,

In [15]:
tweets_temp[0].shape  # each document vector is 300-dimensional

(300,)

In [16]:
# Check which container type holds the document vectors.
type(tweets_temp)

pandas.core.series.Series

In [17]:
# Stack the per-tweet document vectors into a single 2-D numpy array (tweets_vec).
embedding_size = 300
# Pre-fill with NaN; each row is then overwritten by the matching document vector.
tweets_vec = np.full((len(tweets_temp), embedding_size), np.nan)
for i, doc_vec in enumerate(tweets_temp):
    tweets_vec[i, :] = doc_vec

tweets_vec.shape  # (number of tweets, embedding_size) -- this is the final feature matrix

(7920, 300)

In [18]:
# Put the document features into a DataFrame, attach the label as column 'y',
# and discard every row that contains at least one NaN.
df = (
    pd.DataFrame(tweets_vec)
    .assign(y=data['label'])
    .dropna(how='any', axis=0)
)

In [19]:
# Preview the first rows of the feature DataFrame (feature columns plus 'y').
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,0.109341,0.285087,-0.037021,0.082466,0.020086,-0.453718,0.148824,0.620651,-0.121673,0.047155,...,0.279967,0.067325,0.022342,0.247009,0.316534,0.070172,-0.171086,0.159013,-0.122541,0
1,0.015915,0.182013,-0.039599,0.045007,-0.004629,-0.286316,0.189199,0.4786,0.048468,-0.318497,...,0.269555,0.163191,0.025474,0.296438,0.272728,-0.029763,-0.203491,0.178734,-0.148404,0
2,0.007227,0.125674,0.046073,0.119381,-0.020707,-0.164729,0.206518,0.457203,0.125979,-0.195456,...,0.267637,0.134829,0.083807,0.297669,0.246951,0.021494,-0.07372,0.110414,-0.101642,0
3,0.026156,0.112498,0.073917,0.177793,-0.034427,-0.13926,0.25431,0.508533,0.198489,-0.262239,...,0.322102,0.169905,0.124331,0.395236,0.259743,0.03479,-0.042595,0.118608,-0.098591,0
4,0.001196,0.105704,0.044683,0.143439,-0.035087,-0.13003,0.234239,0.468421,0.194343,-0.300422,...,0.298727,0.174887,0.102851,0.360987,0.267,0.014301,-0.073648,0.106077,-0.109465,1


In [20]:
# (rows, columns) remaining after the NaN rows were dropped.
df.shape

(7920, 301)

In [21]:
# Separate the target column from the embedding features.
y = df['y']
X_word_emb = df.drop(columns='y')
X_word_emb.shape

(7920, 300)

In [22]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [23]:

# 5-fold stratified cross-validation of a standardised, L1-regularised logistic
# regression on the document-embedding features.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    penalty='l1',
    C=0.4,
    random_state=42,
)
WE_pipe = Pipeline(steps=[('SC', StandardScaler()), ('LR', LR1)])

results = cross_validate(
    WE_pipe, X_word_emb, y,
    cv=kfold, scoring='accuracy', return_train_score=True,
)

# Report mean and standard deviation of the fold accuracies (in %):
# first line is the train score, second line is the test score.
for score_key in ('train_score', 'test_score'):
    fold_scores = results[score_key]
    print(np.round(fold_scores.mean() * 100, 2), np.round(fold_scores.std() * 100, 2))


85.51 0.18
85.21 0.82


In [24]:
X = data['cleaned_tweets_w/o_SW']
y = data['label']

# Bag-of-words baseline: keep only words that occur in at least 5 documents (min_df=5)
# and, of those, only the 300 most frequent ones (max_features=300).
# NOTE(review): 300 presumably mirrors the 300-dimensional Word2Vec document vectors
# so both models use the same number of features -- confirm.
CV = CountVectorizer(min_df=5, max_features=300)

# random_state is fixed, as for the other LogisticRegression models in this notebook:
# the liblinear solver shuffles the data, so without a seed the scores can vary between runs.
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
CV_pipe = Pipeline([('CV', CV), ('LR', LR1)])
results = cross_validate(CV_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)

# Mean and standard deviation of the train accuracy over the folds (in %)
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2))

# Mean and standard deviation of the test accuracy over the folds (in %)
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2))

# cross_validate fits clones of the pipeline, so CV itself is still unfitted here.
# Fit it on the full corpus only to inspect the vocabulary; the transformed matrix is not needed.
CV.fit(X)
len(CV.vocabulary_)  # no. of bag-of-words features kept (no stop-word filtering is configured here)

88.95 0.07
87.75 1.01


300

# 11. Word Embeddings from GloVe Model

In [25]:
# Convert the GloVe text vectors into word2vec text format.
# gensim is already installed and imported earlier in this notebook, so it is not
# re-installed in this cell.
# NOTE(review): gensim 4.x documents glove2word2vec as deprecated in favour of
# KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True);
# the conversion is kept because the next cell loads 'word2vec.txt'.
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

Defaulting to user installation because normal site-packages is not writeable


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 23.3.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [26]:
# Load the converted vectors from the word2vec-format text file (hence binary=False).
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [27]:
# Look up the loaded vector of a sample word.
model.get_vector('analytics')

array([ 0.025135, -1.1037  , -0.014392,  0.175   ,  0.45659 , -0.86727 ,
       -0.057021, -0.66513 ,  0.35031 ,  0.46178 , -0.079201, -0.15928 ,
       -0.29051 , -0.37331 ,  0.58284 ,  0.47992 ,  0.47444 ,  0.018436,
        0.33742 ,  0.48474 , -1.0344  , -0.63262 , -0.043848,  0.33803 ,
       -0.27473 ,  0.46233 ,  0.92311 ,  1.6516  , -0.99585 , -0.41202 ,
       -0.22485 ,  0.17227 , -0.82582 ,  0.046938,  1.0012  , -0.22104 ,
       -0.81985 ,  0.072396,  0.67151 , -0.80752 ,  0.2998  , -0.20886 ,
       -1.3073  , -0.085651, -1.2405  , -0.59945 , -0.38276 , -0.014263,
        0.17119 ,  0.19705 , -0.17824 , -0.11378 ,  0.24159 ,  0.057804,
        0.044002, -1.1791  ,  0.48858 , -0.78541 ,  0.06117 ,  0.19021 ,
       -0.27743 , -0.9376  , -0.43884 ,  0.10984 , -0.59379 , -0.13567 ,
        0.050591, -0.062951,  1.2968  ,  0.35529 , -0.87356 ,  0.61764 ,
       -0.23356 , -0.74894 ,  0.35229 , -0.99631 ,  0.33625 , -0.027754,
       -0.85467 , -1.1996  ,  0.60355 ,  0.90339 , 

In [28]:
# model.index_to_key

In [29]:
def document_vector_GloVe(doc, keyed_vectors=None):
    """Create a document vector by averaging the vectors of its in-vocabulary words.

    Parameters
    ----------
    doc : str
        One document as a string of whitespace-separated tokens.
    keyed_vectors : optional
        Object exposing ``key_to_index``, ``get_vector`` and ``vector_size``
        (e.g. a gensim ``KeyedVectors``). Defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``keyed_vectors.vector_size`` holding the element-wise
        mean of the vectors of the words found in the vocabulary. If no word of
        ``doc`` is in the vocabulary, the array is filled with NaN so that such
        documents can be dropped afterwards.
    """
    if keyed_vectors is None:
        keyed_vectors = model

    # key_to_index is a dict, so each membership test is O(1); testing against the
    # index_to_key list would scan the whole vocabulary for every single word.
    vocab = keyed_vectors.key_to_index
    known_words = [word for word in doc.split() if word in vocab]

    if not known_words:
        # Averaging an empty array would emit a RuntimeWarning and give a scalar NaN;
        # return an explicit NaN vector of the right length instead.
        return np.full(keyed_vectors.vector_size, np.nan)

    return np.mean([keyed_vectors.get_vector(word) for word in known_words], axis=0)


In [30]:
# Turn every cleaned tweet into one averaged GloVe vector (one entry per tweet).
tweets_temp = data['cleaned_tweets_w/o_SW'].apply(document_vector_GloVe)

In [31]:
tweets_temp[:5]  # the first 5 tweets, shown as document vectors

0    [-0.12796232, 0.004934112, 0.2997002, -0.1567011, -0.15583865, 0.09799757, 0.11052724, 0.035929985, 0.22409555, 0.35114682, 0.115047574, -0.15497755, 0.023922225, -0.14416587, 0.61092, 0.21855089,...
1    [0.10285442, -0.03414092, 0.44059834, 0.019384174, 0.0109245, -0.039047826, 0.014194754, -0.07490217, -0.098265, 0.039055835, 0.2747599, 0.06436991, -0.052626252, 0.019468, -0.050155003, -0.234040...
2    [-0.22620347, 0.12656459, 0.42639765, -0.27715844, -0.15769385, 0.19056438, -0.24652052, 0.096992, 0.34241086, -0.13856545, 0.19085309, 0.1367206, 0.060512602, -0.19153553, 0.20073484, -0.24162795...
3    [-0.15686598, 0.19567013, 0.49366295, -0.3283021, -0.46933356, 0.2035618, -0.02267706, 0.1738718, 0.19721664, -0.04747359, 0.51096183, 0.06516355, 0.1846259, 0.033529352, 0.18381229, -0.37266436, ...
4    [-0.07447391, 0.26465738, 0.3381867, -0.2901459, -0.3280814, 0.20322695, -0.18568636, 0.209613, 0.060292058, -0.121835224, 0.14546065, 0.14423917, 0.21310659, -0.08253591, 0.0

In [32]:
# Stack the per-tweet document vectors into a single 2-D numpy array (tweets_vec).
embedding_size = 100
# Pre-fill with NaN; each row is then overwritten by the matching document vector.
tweets_vec = np.full((len(tweets_temp), embedding_size), np.nan)
for i, doc_vec in enumerate(tweets_temp):
    tweets_vec[i, :] = doc_vec

# Put the document features into a DataFrame, attach the label as column 'y',
# and discard every row that contains at least one NaN.
df1 = (
    pd.DataFrame(tweets_vec)
    .assign(y=data['label'])
    .dropna(how='any', axis=0)
)

# Separate the target column from the embedding features.
y = df1['y']
X_word_emb = df1.drop(columns='y')
X_word_emb.shape

(7920, 100)

In [33]:

# 5-fold stratified cross-validation of a standardised, L1-regularised logistic
# regression on the document-embedding features.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    penalty='l1',
    C=0.4,
    random_state=42,
)
WE_pipe = Pipeline(steps=[('SC', StandardScaler()), ('LR', LR1)])

results = cross_validate(
    WE_pipe, X_word_emb, y,
    cv=kfold, scoring='accuracy', return_train_score=True,
)

# Report mean and standard deviation of the fold accuracies (in %):
# first line is the train score, second line is the test score.
for score_key in ('train_score', 'test_score'):
    fold_scores = results[score_key]
    print(np.round(fold_scores.mean() * 100, 2), np.round(fold_scores.std() * 100, 2))


87.22 0.03
86.29 0.67
