In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### Titanic Dataset

In [4]:
df = pd.read_csv('Titanictrain.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'], axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [6]:
target = df['Survived']
inputs = df.drop('Survived',axis=1)

In [7]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [8]:
inputs = pd.concat([inputs,dummies],axis=1)
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [9]:
inputs = inputs.drop('Sex',1)

In [11]:
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [12]:
inputs.isnull().sum()

Pclass      0
Age       177
Fare        0
female      0
male        0
dtype: int64

In [13]:
inputs.Age.fillna(inputs.Age.mean(),inplace=True)

In [14]:
inputs.isnull().sum()

Pclass    0
Age       0
Fare      0
female    0
male      0
dtype: int64

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(712, 5) (179, 5) (712,) (179,)


In [17]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [18]:
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [19]:
model.score(X_test, y_test)

0.8100558659217877

In [20]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
468,3,29.699118,7.725,0,1
29,3,29.699118,7.8958,0,1
47,3,29.699118,7.75,1,0
844,3,17.0,8.6625,0,1
630,1,80.0,30.0,0,1
309,1,30.0,56.9292,1,0
438,1,64.0,263.0,0,1
679,1,36.0,512.3292,0,1
714,2,52.0,13.0,0,1
153,3,40.5,14.5,0,1


In [21]:
y_test[:10]

468    0
29     0
47     1
844    0
630    1
309    1
438    0
679    1
714    0
153    0
Name: Survived, dtype: int64

In [22]:
model.predict(X_test[:10])

array([0, 0, 1, 0, 0, 1, 1, 1, 0, 0], dtype=int64)

In [24]:
model.predict_proba(X_test[:10])

array([[9.88672451e-01, 1.13275492e-02],
       [9.88680772e-01, 1.13192284e-02],
       [8.99553149e-02, 9.10044685e-01],
       [9.87132203e-01, 1.28677968e-02],
       [7.93936208e-01, 2.06063792e-01],
       [4.65395600e-03, 9.95346044e-01],
       [7.71319140e-11, 1.00000000e+00],
       [1.86766084e-43, 1.00000000e+00],
       [9.71944485e-01, 2.80555154e-02],
       [9.88939409e-01, 1.10605905e-02]])

> #### The array shows the probabilities of each sample from Titanic data being in the category 0 (not survived) or 1 (survived). The category for which the probability is the highest is chosen as the predicted outcome (i.e. survived or not survived).

### Wine Classification Dataset

In [33]:
from sklearn import datasets
wine = datasets.load_wine()


In [28]:
df1 = pd.DataFrame(wine.data, columns = wine.feature_names)
df1.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [34]:
df1['target'] = wine.target

In [37]:
df1.tail(10)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
168,13.58,2.58,2.69,24.5,105.0,1.55,0.84,0.39,1.54,8.66,0.74,1.8,750.0,2
169,13.4,4.6,2.86,25.0,112.0,1.98,0.96,0.27,1.11,8.5,0.67,1.92,630.0,2
170,12.2,3.03,2.32,19.0,96.0,1.25,0.49,0.4,0.73,5.5,0.66,1.83,510.0,2
171,12.77,2.39,2.28,19.5,86.0,1.39,0.51,0.48,0.64,9.899999,0.57,1.63,470.0,2
172,14.16,2.51,2.48,20.0,91.0,1.68,0.7,0.44,1.24,9.7,0.62,1.71,660.0,2
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740.0,2
174,13.4,3.91,2.48,23.0,102.0,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840.0,2
177,14.13,4.1,2.74,24.5,96.0,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560.0,2


In [38]:
from sklearn.model_selection import train_test_split

In [40]:
y = df1['target']
X = df1.drop('target',1)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [42]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [43]:
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [44]:
model.score(X_test,y_test)

1.0

In [45]:
model.predict(X_test)

array([1, 2, 0, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 0, 2, 0, 1, 0, 2, 0,
       1, 1, 0, 0, 1, 1, 1, 2, 2, 1, 0, 1, 2, 2, 1, 1, 2, 2, 0, 2, 2, 2,
       0, 2, 2, 2, 0, 0, 0, 1, 0, 1])

In [56]:
model.predict_proba(X_test[:10])

array([[1.11832486e-11, 1.00000000e+00, 1.34473804e-10],
       [1.20229672e-12, 4.49146971e-14, 1.00000000e+00],
       [9.99999989e-01, 1.13918472e-08, 1.24913715e-29],
       [1.21093960e-03, 9.98789060e-01, 2.60804876e-29],
       [1.45797573e-06, 7.60101068e-14, 9.99998542e-01],
       [8.53559168e-21, 1.08291883e-13, 1.00000000e+00],
       [4.98860110e-08, 9.99999950e-01, 7.90048182e-48],
       [7.40165409e-13, 9.99529638e-01, 4.70361694e-04],
       [3.96171935e-14, 9.99999961e-01, 3.85074914e-08],
       [2.18993106e-02, 9.78100689e-01, 8.47052651e-32]])

> #### The array shows the probabilities of each sample being from the category 0, 1 or 2. The category for which the probability is the highest is selected as the category of the wine.

In [47]:
print(y_test)

88     1
159    2
11     0
74     1
158    2
149    2
99     1
96     1
90     1
95     1
134    2
65     1
171    2
165    2
169    2
15     0
145    2
7      0
77     1
41     0
150    2
32     0
118    1
92     1
40     0
1      0
75     1
114    1
64     1
163    2
147    2
69     1
26     0
97     1
146    2
151    2
111    1
119    1
170    2
142    2
29     0
152    2
136    2
167    2
46     0
174    2
177    2
139    2
20     0
31     0
28     0
102    1
44     0
59     1
Name: target, dtype: int32


In [49]:
model2 = MultinomialNB()
model2.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [51]:
model2.score(X_test,y_test)

0.7777777777777778

In [52]:
model2.predict(X_test)

array([2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 2, 0, 1, 0, 1, 0,
       1, 1, 2, 0, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 1, 1, 2, 1, 0, 1, 2, 2,
       0, 2, 2, 1, 1, 0, 0, 1, 0, 2])

In [55]:
print(y_test)

88     1
159    2
11     0
74     1
158    2
149    2
99     1
96     1
90     1
95     1
134    2
65     1
171    2
165    2
169    2
15     0
145    2
7      0
77     1
41     0
150    2
32     0
118    1
92     1
40     0
1      0
75     1
114    1
64     1
163    2
147    2
69     1
26     0
97     1
146    2
151    2
111    1
119    1
170    2
142    2
29     0
152    2
136    2
167    2
46     0
174    2
177    2
139    2
20     0
31     0
28     0
102    1
44     0
59     1
Name: target, dtype: int32


> #### The accuracy of Gaussian Naive Bayes is more than the Multinomial Naive Bayes method for this data because all our features are actually continuous in nature.

In [62]:
import scipy.stats as st

#### Let's check for the normality in data
> #### H0 (null hypothesis): Data is normally distributed.
> #### H1 (alternate hypothesis): Data is not normally distributed.

In [63]:
st.shapiro(df1)

(0.3382537364959717, 0.0)

> #### The p value (0.0) is less than alpha (0.05). Thus, we fail to reject H0. This implies that the data is normally distributed. This explains the reason for the high accuracy of Gaussian Naive Bayes model because GaussianNB works best when the data distribution is normal.