#### Naive Bayes Classification Algorithm
    Prediction is based on the probability that an object belongs to each class.

    naive: assumes that the features are independent of each other (given the class)
    bayes: uses Bayes' theorem of conditional probability

In [104]:
import pandas as pd

# Read the Titanic passenger records from disk and preview the first rows.
df = pd.read_csv(filepath_or_buffer='./data/titanic.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [105]:
# Keep only the columns used for modelling: four features plus the target.
# This leaves the same five columns, in the same order, as dropping
# PassengerId, Name, SibSp, Parch, Ticket, Cabin and Embarked from the frame
# loaded above. Selecting an explicit allow-list instead of dropping in place
# keeps the cell idempotent: re-running it no longer raises a KeyError for
# columns that were already removed.
df = df[['Pclass', 'Sex', 'Age', 'Fare', 'Survived']]
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [106]:
# The label to predict: the Survived column of the frame.
target = df.loc[:, 'Survived']

In [107]:
# Feature matrix: every remaining column except the label.
inputs = df.drop(columns='Survived')
inputs.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [108]:
# One-hot encode the categorical Sex column into Sex_female / Sex_male indicators.
# NOTE(review): the two indicator columns are exact complements of each other,
# which is at odds with the feature-independence assumption of Naive Bayes —
# consider keeping only one of them.
inputs = pd.get_dummies(data=inputs, columns=['Sex'])
inputs.head(n=5)

Unnamed: 0,Pclass,Age,Fare,Sex_female,Sex_male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [109]:
# Shorten the generated indicator names to plain 'female' / 'male'.
inputs = inputs.rename(columns={'Sex_female':'female', 'Sex_male':'male'})
inputs.head(n=5)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [110]:
# List the feature columns that contain at least one missing value.
inputs.columns[inputs.isnull().any(axis=0)]

Index(['Age'], dtype='object')

In [111]:
# Replace missing ages with the mean age of the column.
# NOTE(review): the mean is computed over all rows, before the train/test
# split further down, so test rows influence the imputed value — confirm
# whether that is acceptable for this demo.
inputs = inputs.fillna({'Age': inputs['Age'].mean()})
inputs.head(n=5)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [112]:
# Re-check which columns still contain missing values.
inputs.columns[inputs.isnull().any(axis=0)]

Index([], dtype='object')

In [113]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    inputs,
    target,
    random_state=10,
    test_size=0.2,
)
# Print the shape of each of the four partitions on one line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(712, 5) (179, 5) (712,) (179,)


In [114]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split; fit() returns
# the estimator itself, so it can be bound and displayed directly.
model = GaussianNB().fit(X_train, Y_train)
model

In [115]:
# Per-class probabilities for the first ten test rows.
model.predict_proba(X_test.head(10))

array([[9.88386356e-01, 1.16136435e-02],
       [9.86517813e-01, 1.34821869e-02],
       [9.87819352e-01, 1.21806477e-02],
       [1.69366845e-05, 9.99983063e-01],
       [2.50380494e-03, 9.97496195e-01],
       [9.86311511e-01, 1.36884891e-02],
       [9.87645433e-01, 1.23545669e-02],
       [9.73960508e-01, 2.60394920e-02],
       [9.88379286e-01, 1.16207142e-02],
       [9.87084311e-01, 1.29156895e-02]])

In [116]:
# Predicted labels for the first ten test rows.
model.predict(X_test.iloc[:10])

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0], dtype=int64)

In [117]:
# Actual labels for the same ten test rows, for comparison with the predictions.
Y_test.head(10)

590    0
131    0
628    0
195    1
230    1
646    0
75     0
586    0
569    1
287    0
Name: Survived, dtype: int64

In [118]:
# Score the fitted classifier on the held-out test split.
model.score(X=X_test, y=Y_test)

0.8268156424581006

#### Spam Email Detection using Naive Bayes

In [119]:
# Read the labelled messages (Category, Message) and preview them.
df = pd.read_csv(filepath_or_buffer='./data/spam.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [120]:
# Expand Category into one indicator column per label (Category_ham / Category_spam).
df = pd.get_dummies(data=df, columns=['Category'])
df.head(n=5)

Unnamed: 0,Message,Category_ham,Category_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


In [121]:
# A single indicator column is enough for a two-class label: discard the ham
# column and keep the spam one under the name 'Spam'.
df = (
    df
    .drop(columns='Category_ham')
    .rename(columns={'Category_spam' : 'Spam'})
)
df.head(n=5)

Unnamed: 0,Message,Spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Show the raw message text column.
df.loc[:, 'Message']

In [122]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['Message'],
    df['Spam'],
    random_state=10,
    test_size=0.2,
)
# Print the shape of each of the four partitions on one line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(4457,) (1115,) (4457,) (1115,)


In [124]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer converts the message text into a matrix of token counts:
# one row per training message, one column per vocabulary term.
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# fit_transform returns a sparse matrix. Densify only the first few rows for
# the preview: calling .toarray() on the whole matrix would allocate a dense
# array of every message by every vocabulary term just to display it.
X_train_count[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [126]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training token counts;
# fit() returns the estimator itself, so it can be bound and displayed directly.
model = MultinomialNB().fit(X_train_count, Y_train)
model

In [129]:
# Vectorize the test messages with the vocabulary learned from the training
# set (transform only, no refit), then score the classifier on them.
X_test_counts = v.transform(X_test)
model.score(X=X_test_counts, y=Y_test)

0.9829596412556054

In [130]:
from sklearn.pipeline import Pipeline

# Bundle the vectorizer and the classifier into one estimator so raw message
# text can be passed straight to fit/score without a separate transform step.
clf = Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [131]:
# Fit both pipeline steps on the raw training messages and their labels.
clf.fit(X=X_train, y=Y_train)

In [132]:
# Score the pipeline on the raw held-out messages.
clf.score(X=X_test, y=Y_test)

0.9829596412556054