In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [3]:
df = pd.read_csv('twitter_training.csv')

In [4]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
df['Tag'] = df['Borderlands']
df['sentiment'] = df['Positive']
df['text'] = df['im getting on borderlands and i will murder you all ,']
df = df.drop(['2401','Borderlands','Positive','im getting on borderlands and i will murder you all ,','Tag'],axis=1)
df.head()

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [7]:
pd.get_dummies(df['sentiment'],drop_first=True)

Unnamed: 0,Negative,Neutral,Positive
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
74676,0,0,1
74677,0,0,1
74678,0,0,1
74679,0,0,1


In [8]:
df

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [9]:
df = df.drop(df[df['sentiment']=='Neutral'].index,axis=0)

In [10]:
df

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [11]:
df = df.drop(df[df['sentiment']=='Irrelevant'].index,axis=0)

In [12]:
df

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [13]:
df = pd.get_dummies(df,columns=['sentiment'],drop_first=True)
df

Unnamed: 0,text,sentiment_Positive
0,I am coming to the borders and I will kill you...,1
1,im getting on borderlands and i will kill you ...,1
2,im coming on borderlands and i will murder you...,1
3,im getting on borderlands 2 and i will murder ...,1
4,im getting into borderlands and i can murder y...,1
...,...,...
74676,Just realized that the Windows partition of my...,1
74677,Just realized that my Mac window partition is ...,1
74678,Just realized the windows partition of my Mac ...,1
74679,Just realized between the windows partition of...,1


In [14]:
df

Unnamed: 0,text,sentiment_Positive
0,I am coming to the borders and I will kill you...,1
1,im getting on borderlands and i will kill you ...,1
2,im coming on borderlands and i will murder you...,1
3,im getting on borderlands 2 and i will murder ...,1
4,im getting into borderlands and i can murder y...,1
...,...,...
74676,Just realized that the Windows partition of my...,1
74677,Just realized that my Mac window partition is ...,1
74678,Just realized the windows partition of my Mac ...,1
74679,Just realized between the windows partition of...,1


In [15]:
d = df.dropna()
df

Unnamed: 0,text,sentiment_Positive
0,I am coming to the borders and I will kill you...,1
1,im getting on borderlands and i will kill you ...,1
2,im coming on borderlands and i will murder you...,1
3,im getting on borderlands 2 and i will murder ...,1
4,im getting into borderlands and i can murder y...,1
...,...,...
74676,Just realized that the Windows partition of my...,1
74677,Just realized that my Mac window partition is ...,1
74678,Just realized the windows partition of my Mac ...,1
74679,Just realized between the windows partition of...,1


In [16]:
x = d.drop('sentiment_Positive',axis=1)
y = d['sentiment_Positive']
xtrain,xtest,ytran,ytest = train_test_split(x,y,test_size=0.3,random_state=101)

In [17]:
cv = CountVectorizer()
s_xtrain = cv.fit_transform(xtrain['text'])
s_xtrain
s_xtext = cv.transform(xtest['text'])

### Using Multinomial NB

In [18]:
nb = MultinomialNB()
nb.fit(s_xtrain,ytran)
ypred = nb.predict(s_xtext)


In [19]:
ypred

array([1, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [20]:
ytest

29064    1
27807    1
13800    0
47507    1
70463    0
        ..
42862    0
48571    0
67886    0
47682    0
33435    0
Name: sentiment_Positive, Length: 12904, dtype: uint8

Accuracy in **Multinomial NB**

In [21]:
accuracy_score(ytest,ypred)

0.8817420954742715

### Using Gaussian NB

In [22]:
gnb = GaussianNB()
gnb.fit(s_xtrain.toarray(),ytran)
ypred = gnb.predict(s_xtext.toarray())
ypred

array([1, 1, 1, ..., 0, 0, 0], dtype=uint8)

Accuracy in **Gaussian NB**

In [23]:
accuracy_score(ytest,ypred)

0.7985120892746436

Using Bernoulli NB

In [24]:
bnb = BernoulliNB()
bnb.fit(s_xtrain,ytran)
ypred = bnb.predict(s_xtext)
ypred

array([1, 1, 1, ..., 0, 1, 0], dtype=uint8)

Accuracy in **Bernoulli NB**

In [25]:
accuracy_score(ytest,ypred)

0.8413670179789212

## Validation

In [26]:
vf = pd.read_csv('twitter_validation.csv')

In [27]:
vf

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [28]:
vf['text'] = vf['I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣']
vf['sentiment'] = vf['Irrelevant']
vf = vf.drop(['3364','Facebook','Irrelevant','I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'],axis=1)

In [29]:
vf = vf.drop(vf[vf['sentiment']=='Irrelevant'].index,axis=0)
vf = vf.drop(vf[vf['sentiment']=='Neutral'].index,axis=0)


In [30]:
vf

Unnamed: 0,text,sentiment
1,@Microsoft Why do I pay for WORD when it funct...,Negative
2,"CSGO matchmaking is so full of closet hacking,...",Negative
4,Hi @EAHelp I’ve had Madeleine McCann in my cel...,Negative
5,Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...,Positive
6,"Rocket League, Sea of Thieves or Rainbow Six: ...",Positive
...,...,...
991,guess i'll broke.,Positive
992,Please explain how this is possible! How can t...,Negative
993,Good on Sony. As much as I want to see the new...,Positive
996,Today sucked so it’s time to drink wine n play...,Positive


In [31]:
vf.isnull().sum()

text         0
sentiment    0
dtype: int64

In [32]:
X = vf.drop('sentiment',axis=1)
Y = vf['sentiment']

In [33]:
S_X = cv.transform(X['text'])
S_X

<543x17922 sparse matrix of type '<class 'numpy.int64'>'
	with 9510 stored elements in Compressed Sparse Row format>

In [34]:
Y = (pd.get_dummies(Y,drop_first=False))
Y = Y.drop('Negative',axis=1)

In [35]:
nbpreds = nb.predict(S_X)
gnb_preds = gnb.predict(S_X.toarray())
bnb_preds = bnb.predict(S_X)

In [36]:
print(f"Validation Using Multinomial Naive Bayes: {accuracy_score(nbpreds,Y)}\n")
print(f"Validation Using Gaussian Naive Bayes: {accuracy_score(gnb_preds,Y)}\n")
print(f"Validation Using Bernoulli Naive Bayes: {accuracy_score(bnb_preds,Y)}\n")

Validation Using Multinomial Naive Bayes: 0.9208103130755064

Validation Using Gaussian Naive Bayes: 0.852670349907919

Validation Using Bernoulli Naive Bayes: 0.8931860036832413

