In [1]:
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd
import numpy as np
pd.set_option("display.max_colwidth", 200)

In [2]:
# Load the tweet DataFrame (presumably the output of an earlier cleaning step — confirm provenance).
# NOTE(review): the path is an absolute Colab location (/content/...), so this cell only runs
# where that exact file exists; consider a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
df = pd.read_pickle('/content/cleaned_tweets_v1.pkl')
df.head()

Unnamed: 0,label,tweet,cleaned_tweets,cleaned_tweets_without_stopwords
0,1,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,1,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally transparant silicon case thanks uncle yay sony xperia sonyexperias,finally transparant silicon case thanks uncle yay sony xperia sonyexperias
2,1,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,love this would you talk makememories unplug relax iphone smartphone wifi connect,love talk makememories unplug relax iphone smartphone wifi connect
3,1,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,wired know george wa made that way iphone cute daventry home,wired know george way iphone cute daventry home
4,0,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk about question have unless pay them for their stupid support,amazing service apple talk question unless pay stupid support


# Dataset overview and train/test split

In [3]:
df.shape

(7920, 4)

In [4]:
df.columns

Index(['label', 'tweet', 'cleaned_tweets', 'cleaned_tweets_without_stopwords'], dtype='object')

In [5]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,5894
0,2026


In [6]:
# Separate the target labels from the model input.
# The input is kept as a one-column DataFrame (double brackets) rather than a Series.
y = df['label']
x = df[['cleaned_tweets_without_stopwords']]

# Preview the target first, then the features, with a blank line between them.
print(y.head(), x.head(), sep="\n\n")


0    1
1    1
2    1
3    1
4    0
Name: label, dtype: int64

                                                                  cleaned_tweets_without_stopwords
0  fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1                       finally transparant silicon case thanks uncle yay sony xperia sonyexperias
2                               love talk makememories unplug relax iphone smartphone wifi connect
3                                                  wired know george way iphone cute daventry home
4                                    amazing service apple talk question unless pay stupid support


In [7]:
# Hold out 20% of the rows for testing; stratifying on y keeps the label ratio
# the same in both partitions, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(6336, 1) (1584, 1) (6336,) (1584,)


In [8]:
# importing basic NLP cleaning utilities prior to applying Word2Vec
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def clean(doc):
    """Normalise one text document into a space-joined string of lemmas.

    Steps: replace every non-letter character with a space, lowercase, tokenise with
    ``nltk.word_tokenize``, drop English and custom stop words, then lemmatise each
    remaining token with the module-level ``lemmatizer``.

    Args:
        doc: Text to clean. Non-string values (e.g. ``None`` or a pandas ``NaN``)
            are treated as empty text.

    Returns:
        str: The surviving lemmas joined by single spaces (``""`` if none survive).
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if not isinstance(doc, str):
        return ""

    # Build the stop-word set once and cache it on the function object, instead of
    # re-querying the NLTK corpus and scanning a list for every token of every document.
    stop_words = getattr(clean, "_stop_words", None)
    if stop_words is None:
        # NOTE(review): tokens are lowercase letters only by the time they are filtered,
        # so the entries containing capitals, digits or trailing spaces ("iO", "i0", "1o",
        # "10", "l0", "io ", "i0 ") can never match. If plain "io" was meant to be removed
        # it has to be added explicitly; the list is left unchanged to preserve output.
        custom_stop_words = ["i", "a", "h","iO","i0","1o","10","lo","l0","io ", "i0 "]
        stop_words = frozenset(stopwords.words('english')).union(custom_stop_words)
        clean._stop_words = stop_words

    letters_only = re.sub("[^a-zA-Z]", " ", doc).lower()
    tokens = nltk.word_tokenize(letters_only)
    kept_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [9]:
x_train.head()

Unnamed: 0,cleaned_tweets_without_stopwords
7544,fucking apple iphone fucking lost fuck apple die apple
3918,sweet case cute iphoneography iphone
7839,bamboo vocation newyear trip thailand khaoko ilce sony vsco snapspeed
5673,given today samsung inch livingroom home black white yay thankyou
5966,soft grip battery case iphone powerplus perfect case fit snugly distributor


In [10]:
x_test.head()

Unnamed: 0,cleaned_tweets_without_stopwords
2221,moby buy cool gift today samsung iphone sony tablet
3794,restore ipod dumb ughh apple
823,preparing release foldable phone filed patten self repairing screen function apple influencer samsunggalaxy tech photooftheday money samsung save gadget tryit technology motorola
686,watchdog ubisoft playstation sony midnight gaming gamer videogames
7081,working saturday morning nofilter dreamjob sun apple sony poliedro


In [11]:
# Split each cleaned training tweet on whitespace to get a list of word tokens per row.
x_train['tokenised_sentences'] = x_train['cleaned_tweets_without_stopwords'].map(str.split)
x_train.head()

Unnamed: 0,cleaned_tweets_without_stopwords,tokenised_sentences
7544,fucking apple iphone fucking lost fuck apple die apple,"[fucking, apple, iphone, fucking, lost, fuck, apple, die, apple]"
3918,sweet case cute iphoneography iphone,"[sweet, case, cute, iphoneography, iphone]"
7839,bamboo vocation newyear trip thailand khaoko ilce sony vsco snapspeed,"[bamboo, vocation, newyear, trip, thailand, khaoko, ilce, sony, vsco, snapspeed]"
5673,given today samsung inch livingroom home black white yay thankyou,"[given, today, samsung, inch, livingroom, home, black, white, yay, thankyou]"
5966,soft grip battery case iphone powerplus perfect case fit snugly distributor,"[soft, grip, battery, case, iphone, powerplus, perfect, case, fit, snugly, distributor]"


In [12]:
# Split each cleaned test tweet on whitespace to get a list of word tokens per row.
x_test['tokenised_sentences'] = x_test['cleaned_tweets_without_stopwords'].map(str.split)
x_test.head()

Unnamed: 0,cleaned_tweets_without_stopwords,tokenised_sentences
2221,moby buy cool gift today samsung iphone sony tablet,"[moby, buy, cool, gift, today, samsung, iphone, sony, tablet]"
3794,restore ipod dumb ughh apple,"[restore, ipod, dumb, ughh, apple]"
823,preparing release foldable phone filed patten self repairing screen function apple influencer samsunggalaxy tech photooftheday money samsung save gadget tryit technology motorola,"[preparing, release, foldable, phone, filed, patten, self, repairing, screen, function, apple, influencer, samsunggalaxy, tech, photooftheday, money, samsung, save, gadget, tryit, technology, moto..."
686,watchdog ubisoft playstation sony midnight gaming gamer videogames,"[watchdog, ubisoft, playstation, sony, midnight, gaming, gamer, videogames]"
7081,working saturday morning nofilter dreamjob sun apple sony poliedro,"[working, saturday, morning, nofilter, dreamjob, sun, apple, sony, poliedro]"


# Applying Feature Transformation approach - Word2Vec

In [14]:
"""
## Capture the semantic relationship between words
## Embedding - dense vector (50-300 dimensions)

### Word2Vec Architecture

a) CBOW - Continuous Bag of Words
b) Skip-Gram

1) input layer
2) hidden layer
3) output layer

4) Word Embedding
5) Context Window
6) Negative Sampling method
7) Hierarchical softmax method
8) Cosine Similarity

### Significance of Word2Vec
a) Semantic similarity
b) Efficient Learning
c) Transfer Learning
"""

'\n## Capture the semantic relationship between words\n## Embedding - dense vector (50-300 dimensions)\n\n### Word2Vec Architecture\n\na) CBOW - Continous Bag of Words\nb) Skip-Gram\n\n1) input layer \n2) hidden layer\n3) output layer\n\n4) Word Embedding\n5) Context Window\n6) Negative Sampling method\n7) Hierarchical softmax method\n8) Cosine Similarity\n\n### Significance of Word2Vec\na) Semantic similarity\nb) Efficient Learning\nc) Transfer Learning\n'

In [15]:
from gensim.models import Word2Vec

In [17]:
%time word_vec = Word2Vec(x_train['tokenised_sentences'], vector_size=300, min_count=1)
print(word_vec)

CPU times: user 1.41 s, sys: 57.6 ms, total: 1.47 s
Wall time: 938 ms
Word2Vec<vocab=13262, vector_size=300, alpha=0.025>


In [22]:
13262 * 300

3978600

In [18]:
x_train.shape

(6336, 2)

In [19]:
word_vec.corpus_count

6336

In [21]:
word_vec.vector_size

300

In [20]:
word_vec.wv.index_to_key

['iphone',
 'apple',
 'samsung',
 'new',
 'phone',
 'sony',
 'follow',
 'pic',
 'like',
 'ipad',
 'love',
 'day',
 'life',
 'android',
 'io',
 'photo',
 'galaxy',
 'case',
 'gain',
 'cute',
 'today',
 'got',
 'photography',
 'fun',
 'music',
 'news',
 'app',
 'time',
 'instagood',
 'smile',
 'happy',
 'funny',
 'birthday',
 'beautiful',
 'ipod',
 'lol',
 'work',
 'fashion',
 'make',
 'game',
 'girl',
 'itunes',
 'finally',
 'photooftheday',
 'tech',
 'friend',
 'update',
 'iphonex',
 'apps',
 'good',
 'want',
 'amazing',
 'sougofollow',
 'selfie',
 'follower',
 'thanks',
 'note',
 'baby',
 'sale',
 'gift',
 'fuck',
 'mac',
 'year',
 'fail',
 'look',
 'rts',
 'hate',
 'igers',
 'going',
 'camera',
 'free',
 'art',
 'home',
 'fucking',
 'iphonesia',
 'family',
 'charger',
 'best',
 'cool',
 'christmas',
 'iphoneonly',
 'live',
 'smartphone',
 'health',
 'summer',
 'hour',
 'white',
 'need',
 'suck',
 'minute',
 'thing',
 'night',
 'laptop',
 'plus',
 'nature',
 'picoftheday',
 'mobile',


In [25]:
def get_document_vector(doc, model):
    """Average the word vectors of a tokenised document into one embedding.

    Args:
        doc: Iterable of token strings.
        model: Word-vector lookup that supports ``token in model``,
            ``model[list_of_tokens]`` and a ``vector_size`` attribute
            (the notebook passes ``word_vec.wv``).

    Returns:
        numpy.ndarray: The mean over axis 0 of ``model[tokens]`` for the tokens that
        ``model`` contains. If no token is known, a zero vector of length
        ``model.vector_size`` is returned instead.
    """
    known_tokens = [word for word in doc if word in model]
    if known_tokens:
        return np.mean(model[known_tokens], axis=0)

    # Match the dtype of the stored vectors when the model exposes them, so that real
    # and fallback embeddings do not end up with mixed dtypes in the same column.
    # Without a `vectors` attribute, dtype=None keeps numpy's default (float64).
    stored_vectors = getattr(model, "vectors", None)
    fallback_dtype = getattr(stored_vectors, "dtype", None)
    return np.zeros(model.vector_size, dtype=fallback_dtype)

In [26]:
# One averaged Word2Vec embedding per training tweet, stored as an array in each row.
x_train['w2v_doc_embedding'] = x_train['tokenised_sentences'].apply(get_document_vector, model=word_vec.wv)

In [28]:
x_train.head()

Unnamed: 0,cleaned_tweets_without_stopwords,tokenised_sentences,w2v_doc_embedding
7544,fucking apple iphone fucking lost fuck apple die apple,"[fucking, apple, iphone, fucking, lost, fuck, apple, die, apple]","[0.10904018, 0.4422333, -0.06355248, 0.18067737, -0.07075185, -0.49178082, 0.45827734, 1.0946965, 0.1061438, -0.27378878, 0.1408419, -0.35934407, -0.1168069, -0.03384682, -0.18869428, -0.27566332,..."
3918,sweet case cute iphoneography iphone,"[sweet, case, cute, iphoneography, iphone]","[0.10670489, 0.365905, -0.053823225, 0.15552959, -0.06294916, -0.44688863, 0.41107702, 0.9886174, 0.08370793, -0.24525985, 0.10891942, -0.31216264, -0.08813124, -0.029887468, -0.1608086, -0.247999..."
7839,bamboo vocation newyear trip thailand khaoko ilce sony vsco snapspeed,"[bamboo, vocation, newyear, trip, thailand, khaoko, ilce, sony, vsco, snapspeed]","[0.04010067, 0.16212907, -0.022837475, 0.065650575, -0.028646264, -0.17816392, 0.16534826, 0.39958256, 0.03534897, -0.09689092, 0.051683627, -0.13226894, -0.044564597, -0.011706686, -0.06866376, -..."
5673,given today samsung inch livingroom home black white yay thankyou,"[given, today, samsung, inch, livingroom, home, black, white, yay, thankyou]","[0.056507833, 0.228713, -0.029853532, 0.09165266, -0.03850908, -0.24799395, 0.22904718, 0.5550545, 0.05485321, -0.135446, 0.07885091, -0.18952033, -0.0663193, -0.012733105, -0.09599836, -0.1408234..."
5966,soft grip battery case iphone powerplus perfect case fit snugly distributor,"[soft, grip, battery, case, iphone, powerplus, perfect, case, fit, snugly, distributor]","[0.04913728, 0.1693698, -0.024648637, 0.07190624, -0.027800852, -0.20786975, 0.1888496, 0.4566024, 0.03907989, -0.11428294, 0.051610734, -0.1434694, -0.041360952, -0.012017837, -0.0746774, -0.1143..."


In [29]:
# One averaged Word2Vec embedding per test tweet, using the vectors learned from x_train.
x_test['w2v_doc_embedding'] = x_test['tokenised_sentences'].apply(get_document_vector, model=word_vec.wv)

In [30]:
x_test.head()

Unnamed: 0,cleaned_tweets_without_stopwords,tokenised_sentences,w2v_doc_embedding
2221,moby buy cool gift today samsung iphone sony tablet,"[moby, buy, cool, gift, today, samsung, iphone, sony, tablet]","[0.10592808, 0.39877066, -0.056871217, 0.16215344, -0.06669878, -0.44979033, 0.41716608, 1.0057712, 0.093354285, -0.24852265, 0.12735455, -0.33308443, -0.10700337, -0.027982507, -0.16919407, -0.25..."
3794,restore ipod dumb ughh apple,"[restore, ipod, dumb, ughh, apple]","[0.06129812, 0.26329464, -0.037120182, 0.104905605, -0.04158906, -0.28122175, 0.26317486, 0.626201, 0.06191513, -0.15508817, 0.086881235, -0.20973516, -0.07302944, -0.017649252, -0.11000731, -0.15..."
823,preparing release foldable phone filed patten self repairing screen function apple influencer samsunggalaxy tech photooftheday money samsung save gadget tryit technology motorola,"[preparing, release, foldable, phone, filed, patten, self, repairing, screen, function, apple, influencer, samsunggalaxy, tech, photooftheday, money, samsung, save, gadget, tryit, technology, moto...","[0.053042304, 0.24334298, -0.03678512, 0.078813106, -0.031064428, -0.24313839, 0.23846674, 0.5575647, 0.05607783, -0.14713919, 0.0883793, -0.1933668, -0.060884576, -0.02436373, -0.100825824, -0.14..."
686,watchdog ubisoft playstation sony midnight gaming gamer videogames,"[watchdog, ubisoft, playstation, sony, midnight, gaming, gamer, videogames]","[0.054687977, 0.2399372, -0.03392419, 0.09128714, -0.038643207, -0.25578597, 0.23841077, 0.5736857, 0.053922478, -0.1408017, 0.08004251, -0.19396956, -0.06897543, -0.017733455, -0.10255063, -0.145..."
7081,working saturday morning nofilter dreamjob sun apple sony poliedro,"[working, saturday, morning, nofilter, dreamjob, sun, apple, sony, poliedro]","[0.081314616, 0.34338775, -0.04594492, 0.13624294, -0.054900922, -0.36783323, 0.34190074, 0.82311726, 0.07788979, -0.20246853, 0.11329995, -0.27627012, -0.096559845, -0.023390234, -0.14536107, -0...."


# Random Forest Algorithm

In [33]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
%time rf.fit(list(x_train['w2v_doc_embedding']), y_train)
y_pred = rf.predict(list(x_test['w2v_doc_embedding']))
print(y_pred)

from sklearn import metrics

print(f'Model Accuracy : {metrics.accuracy_score(y_test, y_pred)} ')
print()
print(f"Model's Classification Performance : {metrics.classification_report(y_test, y_pred)}")

CPU times: user 12.9 s, sys: 27 ms, total: 12.9 s
Wall time: 13 s
[1 0 1 ... 1 0 1]
Model Accuracy : 0.7954545454545454 

Model's Classification Performance :               precision    recall  f1-score   support

           0       0.64      0.45      0.53       405
           1       0.83      0.92      0.87      1179

    accuracy                           0.80      1584
   macro avg       0.74      0.68      0.70      1584
weighted avg       0.78      0.80      0.78      1584



# Agenda for tomorrow :
# LSTM/GRU - Long Short-Term Memory / Gated Recurrent Unit
# CNN Model
# Transformers