In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('../artifacts/sentiment_analysis.csv')

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [4]:
##Data Preprocessing 

In [5]:
data.shape

(7920, 3)

In [6]:
data.duplicated().sum()

np.int64(0)

In [7]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
### Text Preprocessing 

In [9]:
import re
import string

convert uppercase to lowercase

In [10]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [11]:
data["tweet"].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

remove links

In [12]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE)for x in x.split()))

In [13]:
data["tweet"].head(5)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

Remove Punctuations 

In [14]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text 

data["tweet"] = data["tweet"].apply(remove_punctuations)

In [15]:
data["tweet"].head(5)

0    fingerprint pregnancy test  android apps beaut...
1    finally a transparant silicon case  thanks to ...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
Name: tweet, dtype: object

In [16]:
data["tweet"].head(20)

0     fingerprint pregnancy test  android apps beaut...
1     finally a transparant silicon case  thanks to ...
2     we love this would you go talk makememories un...
3     im wired i know im george i was made that way ...
4     what amazing service apple wont even talk to m...
5     iphone software update fucked up my phone big ...
6     happy for us  instapic instadaily us sony xper...
7     new type c charger cable uk  … bay amazon etsy...
8     bout to go shopping again listening to music i...
9     photo fun selfie pool water sony camera picoft...
10    hey apple when you make a new ipod dont make i...
11    ha not heavy machinery but it does what i need...
12    contemplating giving in to the iphone bandwago...
13    i just made another crazy purchase lol my theo...
14    shaqlockholmes samlouise1991 the battery is so...
15    from deepellum towards downtown dallas bigd sa...
16    like and share if you want this 3d phone case ...
17    go crazy  iphonesia iphone instagood insta

In [17]:
data["tweet"] = data["tweet"].str.replace('\d+', '', regex=True)

In [18]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    i am completely in love with the new iphone em...
7912    tune in turn on drop out  gtd in one app  mobi...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

remove stopwords 

In [19]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [20]:
import nltk

In [21]:
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [22]:
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [23]:
sw

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [24]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x for x in x.split()if x not in sw))

In [25]:
data["tweet"].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    im wired know im george made way iphone cute d...
4    amazing service apple wont even talk question ...
Name: tweet, dtype: object

stemming

In [26]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [27]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [28]:
data["tweet"].head()

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    im wire know im georg made way iphon cute dave...
4    amaz servic appl wont even talk question unles...
Name: tweet, dtype: object

In [29]:
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,im wire know im georg made way iphon cute dave...
4,5,1,amaz servic appl wont even talk question unles...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfi smile soni mus...
7916,7917,0,would like wish amaz day make everi minut coun...
7917,7918,0,help love year old neighbor ipad morn made rea...
7918,7919,0,final got smart pocket wifi stay connect anyti...


In [30]:
from collections import Counter
vocab = Counter()

In [31]:
 vocab

Counter()

In [32]:
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,im wire know im georg made way iphon cute dave...
4,5,1,amaz servic appl wont even talk question unles...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfi smile soni mus...
7916,7917,0,would like wish amaz day make everi minut coun...
7917,7918,0,help love year old neighbor ipad morn made rea...
7918,7919,0,final got smart pocket wifi stay connect anyti...


In [33]:
for sentence in data ['tweet']:
    vocab.update(sentence.split())

In [34]:
len(vocab)

15949

In [35]:
data.shape

(7920, 3)

In [36]:
tokens = [key for key in vocab if vocab[key] > 10 ]

In [37]:
len(tokens)

1145

In [38]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding = 'utf-8')
    file.write(data)
    file.close()

save_vocabulary(tokens, '../static/model/vocabulary.txt')

In [39]:
###Divide Dataset

In [40]:
x = data ['tweet']
y = data ['label']

In [41]:
x

0       fingerprint pregnanc test android app beauti c...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       im wire know im georg made way iphon cute dave...
4       amaz servic appl wont even talk question unles...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [42]:
y

0       0
1       0
2       0
3       0
4       1
       ..
7915    0
7916    0
7917    0
7918    0
7919    0
Name: label, Length: 7920, dtype: int64

In [43]:
!pip install scikit-learn



In [44]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [45]:
x_train,shape

NameError: name 'shape' is not defined

In [46]:
x_train.shape

(6336,)

In [47]:
x_test.shape

(1584,)

In [48]:
y_test

1858    1
4215    0
1847    0
3722    0
6963    0
       ..
1392    0
7202    0
356     0
5216    1
1555    1
Name: label, Length: 1584, dtype: int64

In [49]:
###vectorization

In [50]:
def vectorizer(ds, vocabulary):
    vectorized_lst = []

    for sentence in ds:
        sentence_lst = np.zeros(len(vocabulary))

        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] = 1

        vectorized_lst.append(sentence_lst)

    vectorized_lst_new = np.asarray(vectorized_lst, dtype=np.float32)

    return vectorized_lst_new


In [51]:
vectorized_x_train = vectorizer(x_train, tokens)

In [52]:
vectorized_x_test = vectorizer(x_test, tokens)

In [53]:
y_train

7334    1
507     1
5845    0
4575    0
6383    1
       ..
3047    0
7039    0
7828    0
862     0
461     0
Name: label, Length: 6336, dtype: int64

In [54]:
y_train.value_counts()

label
0    4687
1    1649
Name: count, dtype: int64

In [55]:
###Handle imbalance dataset

In [74]:
pip install --upgrade imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.11.0
    Uninstalling imbalanced-learn-0.11.0:
      Successfully uninstalled imbalanced-learn-0.11.0
Successfully installed imbalanced-learn-0.13.0
Note: you may need to restart the kernel to use updated packages.


In [75]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(9374, 1145) (9374,)


In [76]:
y_train_smote.value_counts()

label
1    4687
0    4687
Name: count, dtype: int64

In [None]:
###model training and evaluation 

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [79]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def training_scores(y_act, y_pred):
   acc = round(accuracy_score(y_act, y_pred), 3)
   pr = round(precision_score(y_act, y_pred), 3)
   rec = round(recall_score(y_act, y_pred), 3)
   f1 = round(f1_score(y_act, y_pred), 3)
   print(f'Training Scores:\n\nAccuracy = {acc}\nPrecision = {pr}\nRecall = {rec}\nF1-Score = {f1}')

def validation_scores(y_act, y_pred):
   acc = round(accuracy_score(y_act, y_pred), 3)
   pr = round(precision_score(y_act, y_pred), 3)
   rec = round(recall_score(y_act, y_pred), 3)
   f1 = round(f1_score(y_act, y_pred), 3)
   print(f'Testing Scores:\n\nAccuracy = {acc}\nPrecision = {pr}\nRecall = {rec}\nF1-Score = {f1}')

In [None]:
###Logistics Regression

In [85]:
lr = LogisticRegression()
lr.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = lr.predict(vectorized_x_train_smote)

y_test_pred = lr.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:

Accuracy = 0.941
Precision = 0.918
Recall = 0.968
F1-Score = 0.942
Testing Scores:

Accuracy = 0.864
Precision = 0.681
Recall = 0.809
F1-Score = 0.739


In [None]:
###Naive Bayes

In [86]:
mnb = MultinomialNB()
mnb.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = mnb.predict(vectorized_x_train_smote)

y_test_pred = mnb.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:

Accuracy = 0.91
Precision = 0.874
Recall = 0.959
F1-Score = 0.915
Testing Scores:

Accuracy = 0.855
Precision = 0.633
Recall = 0.926
F1-Score = 0.752


In [None]:
###Decision Tree

In [87]:
dt = DecisionTreeClassifier()
dt.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = dt.predict(vectorized_x_train_smote)

y_test_pred = dt.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:

Accuracy = 1.0
Precision = 1.0
Recall = 0.999
F1-Score = 1.0
Testing Scores:

Accuracy = 0.827
Precision = 0.63
Recall = 0.66
F1-Score = 0.645


In [None]:
###Random Forest 

In [88]:
rf = RandomForestClassifier()
rf.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = rf.predict(vectorized_x_train_smote)

y_test_pred = rf.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:

Accuracy = 1.0
Precision = 1.0
Recall = 0.999
F1-Score = 1.0
Testing Scores:

Accuracy = 0.866
Precision = 0.714
Recall = 0.729
F1-Score = 0.722


In [None]:
###SUPPORT VeCTOR Machine 

In [89]:
svm = SVC()
svm.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = svm.predict(vectorized_x_train_smote)

y_test_pred = svm.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:

Accuracy = 0.98
Precision = 0.964
Recall = 0.996
F1-Score = 0.98
Testing Scores:

Accuracy = 0.874
Precision = 0.708
Recall = 0.804
F1-Score = 0.753


In [None]:
import pickle

with open('../static/model/','wb') as file:
    pickle.dump(lr,file)