In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# Dataset

In [2]:
df = pd.read_csv('./database/csv_files/spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

# Encoding Catagory feature

In [4]:
df2 = df.join(pd.get_dummies(df.Category, drop_first=True)).drop('Category', axis = 1)
df2.head()

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


# How do we deal with the text in Message Column?
- We can use "CountVectorizer" to make every unique word in the dataset as an independent column
- & the column contains the (count) frequency of that word in that message

# CountVectorizer
- In following example "CountVectorizer" takes all unique words present in all the sentence down below and converts them into a feature of their own
- the data they contain is their frequnecy in that sentence
![image.png](attachment:image.png)

# Conversion of Message Columns to Vectors 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
countvectorizer = CountVectorizer()

In [7]:
df3 = pd.DataFrame(countvectorizer.fit_transform(df2.Message).todense(), columns= countvectorizer.get_feature_names())
df3.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Taking Only Relevent Word

In [8]:
word_counts = []

for i in df3.columns:
    word_counts.append((i,df3[i].value_counts()[1:].sum()/df3.shape[0]*100))

In [9]:
word_counts = pd.DataFrame(word_counts,columns=['Word','Count_percent'])
word_counts.head(10)

Unnamed: 0,Word,Count_percent
0,00,0.179469
1,000,0.484566
2,000pes,0.017947
3,008704050406,0.035894
4,0089,0.017947
5,0121,0.017947
6,01223585236,0.017947
7,01223585334,0.035894
8,0125698789,0.017947
9,02,0.143575


In [10]:
word_counts.shape

(8709, 2)

In [11]:
word_counts[word_counts.Count_percent>0.4].shape

(496, 2)

In [12]:
relevent_cols = word_counts[word_counts.Count_percent>0.4].Word
relevent_cols[:10]

1         000
295        10
296       100
297      1000
307       10p
350      150p
354    150ppm
362        16
367        18
385       1st
Name: Word, dtype: object

In [13]:
df4 = df3[relevent_cols]
df4.shape

(5572, 496)

# Train Test Split

In [14]:
y = df3.spam

In [15]:
df3_x_train,df3_x_test, df3_y_train, df3_y_test = train_test_split(df3,y,train_size = 0.6)
df4_x_train,df4_x_test, df4_y_train, df4_y_test = train_test_split(df4,y,train_size = 0.6)

# Naive Bayes

### Gaussian NB

In [16]:
# df3 : has all features

GNB_df3 = GaussianNB()
GNB_df3.fit(df3_x_train,df3_y_train)
GNB_df3.score(df3_x_test,df3_y_test)

0.9995513683266039

In [17]:
# df4 : has important features

GNB_df4 = GaussianNB()
GNB_df4.fit(df4_x_train,df4_y_train)
GNB_df4.score(df4_x_test,df4_y_test)

1.0

### Multinomial NB

In [18]:
# df3 : has all features

MNB_df3 = MultinomialNB()
MNB_df3.fit(df3_x_train,df3_y_train)
MNB_df3.score(df3_x_test,df3_y_test)

0.9995513683266039

In [19]:
# df4 : has important features

MNB_df4 = MultinomialNB()
MNB_df4.fit(df4_x_train,df4_y_train)
MNB_df4.score(df4_x_test,df4_y_test)

1.0

### Bernaulli NB

In [20]:
# df3 : has all features

BNB_df3 = BernoulliNB()
BNB_df3.fit(df3_x_train,df3_y_train)
BNB_df3.score(df3_x_test,df3_y_test)

0.9995513683266039

In [21]:
# df4 : has important features

BNB_df4 = BernoulliNB()
BNB_df4.fit(df4_x_train,df4_y_train)
BNB_df4.score(df4_x_test,df4_y_test)

1.0

# Example Prediction

In [22]:
email = ['Rofl. Its true to its name']

In [23]:
email_vectorized = countvectorizer.transform(email).toarray()
email_vectorized.shape

(1, 8709)

In [24]:
MNB_df3.predict(df3_x_test[1050:1060])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [25]:
df3_y_train.unique()

array([0], dtype=int64)