In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.naive_bayes import GaussianNB

In [4]:
df = pd.read_csv('titanic.csv')

In [5]:
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


<h3> Pre Processing Data </h3>

In [6]:
df.columns

Index(['PassengerId', 'Name', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived'],
      dtype='object')

In [7]:
df.drop(columns = ['PassengerId', 'Name','SibSp', 'Parch','Ticket','Cabin', 'Embarked'],inplace = True)

In [8]:
df.isna().sum()

Pclass        0
Sex           0
Age         177
Fare          0
Survived      0
dtype: int64

In [9]:
mean_age = df['Age'].mean()
mean_age

29.69911764705882

In [10]:
df['Age'].replace(np.NaN,mean_age,inplace = True)

In [11]:
df.isna().sum()

Pclass      0
Sex         0
Age         0
Fare        0
Survived    0
dtype: int64

In [12]:
target = df['Survived']

In [13]:
features = df.drop(columns='Survived')

In [14]:
dummies = pd.get_dummies(features['Sex'],dtype=int)

In [15]:
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [16]:
features = pd.concat([features,dummies],axis ='columns')

In [17]:
features.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [18]:
features.drop(columns='Sex',inplace = True)

In [19]:
features.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [20]:
x_train,x_test,y_train,y_test = train_test_split(features,target,test_size=0.2)

In [21]:
x_train.head()

Unnamed: 0,Pclass,Age,Fare,female,male
98,2,34.0,23.0,1,0
155,1,51.0,61.3792,0,1
497,3,29.699118,15.1,0,1
85,3,33.0,15.85,1,0
532,3,17.0,7.2292,0,1


In [22]:
model  = GaussianNB()

In [23]:
model.fit(x_train.values,y_train.values)

In [24]:
model.predict([[3,20,30,1,0]])

array([1], dtype=int64)

In [25]:
model.predict([[1,20,5,0,1]])

array([0], dtype=int64)

In [26]:
model.score(x_test.values,y_test.values)

0.7541899441340782

In [27]:
cross_val_score(GaussianNB(),x_train,y_train)

array([0.7972028 , 0.83216783, 0.78169014, 0.73239437, 0.79577465])

In [29]:
model.predict_proba(x_test[:10].values)

array([[0.99024515, 0.00975485],
       [0.84863823, 0.15136177],
       [0.97897456, 0.02102544],
       [0.9878903 , 0.0121097 ],
       [0.97996464, 0.02003536],
       [0.01259944, 0.98740056],
       [0.99048592, 0.00951408],
       [0.7733887 , 0.2266113 ],
       [0.98841379, 0.01158621],
       [0.04852459, 0.95147541]])