In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import  LabelEncoder,OneHotEncoder

In [3]:
df = pd.read_csv('twitter_training.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [6]:
df=df.dropna()

In [7]:
df['tag'] = df['Borderlands']
df['text'] = df['im getting on borderlands and i will murder you all ,']
df = df.drop(['2401','Borderlands','Positive','im getting on borderlands and i will murder you all ,'],axis=1)

In [8]:
ohe = OneHotEncoder(categories=['tag'])

In [9]:
y = df.drop('text',axis=1)

In [10]:
y

Unnamed: 0,tag
0,Borderlands
1,Borderlands
2,Borderlands
3,Borderlands
4,Borderlands
...,...
74676,Nvidia
74677,Nvidia
74678,Nvidia
74679,Nvidia


In [11]:
x = df.drop('tag',axis=1)
x

Unnamed: 0,text
0,I am coming to the borders and I will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...
...,...
74676,Just realized that the Windows partition of my...
74677,Just realized that my Mac window partition is ...
74678,Just realized the windows partition of my Mac ...
74679,Just realized between the windows partition of...


In [12]:
le = LabelEncoder()
le.fit(y['tag'])
dfle = le.fit_transform(y['tag'])
dfle

array([ 4,  4,  4, ..., 21, 21, 21])

In [13]:
xtrain,xtest,ytrain,ytest = train_test_split(x,dfle,random_state=101,test_size=0.2)

In [14]:
tfi = TfidfVectorizer()
s_xtrain= tfi.fit_transform(xtrain['text'])
s_xtest = tfi.transform(xtest['text'])

In [15]:
s_xtrain

<59196x29762 sparse matrix of type '<class 'numpy.float64'>'
	with 970379 stored elements in Compressed Sparse Row format>

In [16]:
s_xtest

<14799x29762 sparse matrix of type '<class 'numpy.float64'>'
	with 241358 stored elements in Compressed Sparse Row format>

In [17]:
gnb = GaussianNB()
gnb.fit(s_xtrain.toarray(),ytrain)

In [18]:
gnbpreds = gnb.predict(s_xtest.toarray())
accuracy_score(gnbpreds,ytest)

0.7748496520035137

In [19]:
mnb = MultinomialNB()
mnb.fit(s_xtrain,ytrain)
mnbpreds = mnb.predict(s_xtest)

In [20]:
accuracy_score(ytest,mnbpreds)

0.8211365632813028

In [21]:
bnb = BernoulliNB()
bnb.fit(s_xtrain,ytrain)
bnbpreds = bnb.predict(s_xtest)

In [22]:
accuracy_score(bnbpreds,ytest)

0.7570781809581728

In [23]:
print(f"Train Test Split Using Multinomial Naive Bayes: {accuracy_score(mnbpreds,ytest)}\n")
print(f"Train Test Split Using Gaussian Naive Bayes: {accuracy_score(gnbpreds,ytest)}\n")
print(f"Train Test Split Using Bernoulli Naive Bayes: {accuracy_score(bnbpreds,ytest)}\n")

Train Test Split Using Multinomial Naive Bayes: 0.8211365632813028

Train Test Split Using Gaussian Naive Bayes: 0.7748496520035137

Train Test Split Using Bernoulli Naive Bayes: 0.7570781809581728



## Validation Set

In [24]:
vf = pd.read_csv('twitter_validation.csv')

In [25]:
vf

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [26]:
vf.isnull().sum()

3364                                                                                                                                                                                                                                                  0
Facebook                                                                                                                                                                                                                                              0
Irrelevant                                                                                                                                                                                                                                            0
I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣    0
dtype: i

In [27]:
vf['text'] = vf['I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣']
vf['tag'] = vf['Facebook']
vf.drop(['3364','Irrelevant','Facebook','I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'],axis=1)

Unnamed: 0,text,tag
0,BBC News - Amazon boss Jeff Bezos rejects clai...,Amazon
1,@Microsoft Why do I pay for WORD when it funct...,Microsoft
2,"CSGO matchmaking is so full of closet hacking,...",CS-GO
3,Now the President is slapping Americans in the...,Google
4,Hi @EAHelp I’ve had Madeleine McCann in my cel...,FIFA
...,...,...
994,⭐️ Toronto is the arts and culture capital of ...,GrandTheftAuto(GTA)
995,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,CS-GO
996,Today sucked so it’s time to drink wine n play...,Borderlands
997,Bought a fraction of Microsoft today. Small wins.,Microsoft


In [28]:
y = df['tag']
y

0        Borderlands
1        Borderlands
2        Borderlands
3        Borderlands
4        Borderlands
            ...     
74676         Nvidia
74677         Nvidia
74678         Nvidia
74679         Nvidia
74680         Nvidia
Name: tag, Length: 73995, dtype: object

In [29]:
yle = le.transform(y)
yle

array([ 4,  4,  4, ..., 21, 21, 21])

In [30]:
x = df['text']
s_xv = tfi.transform(x)
s_xv

<73995x29762 sparse matrix of type '<class 'numpy.float64'>'
	with 1211737 stored elements in Compressed Sparse Row format>

In [31]:
mnbvpred = mnb.predict(s_xv)
#gnbvpred = gnb.predict(s_xv.toarray())
bnbvpred = bnb.predict(s_xv)

In [33]:
print(f"Validation Using Multinomial Naive Bayes: {accuracy_score(mnbvpred,yle)}\n")
#print(f"Validation Using Gaussian Naive Bayes: {accuracy_score(gnbvpred,yle)}\n")
print(f"Validation Using Bernoulli Naive Bayes: {accuracy_score(bnbvpred,yle)}\n")

Validation Using Multinomial Naive Bayes: 0.8617609297925536

Validation Using Bernoulli Naive Bayes: 0.7860260828434353

