In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import GaussianNB

In [2]:
titanic  = pd.read_csv("titanic_dataset.csv")
print(titanic.head())

   PassengerId                                               Name  Pclass  \
0            1                            Braund, Mr. Owen Harris       3   
1            2  Cumings, Mrs. John Bradley (Florence Briggs Th...       1   
2            3                             Heikkinen, Miss. Laina       3   
3            4       Futrelle, Mrs. Jacques Heath (Lily May Peel)       1   
4            5                           Allen, Mr. William Henry       3   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  \
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S   
1  female  38.0      1      0          PC 17599  71.2833   C85        C   
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S   
3  female  35.0      1      0            113803  53.1000  C123        S   
4    male  35.0      0      0            373450   8.0500   NaN        S   

   Survived  
0         0  
1         1  
2         1  
3         1  
4         0  


In [3]:
titanic.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis="columns",inplace=True)
print(titanic.head())

   Pclass     Sex   Age     Fare  Survived
0       3    male  22.0   7.2500         0
1       1  female  38.0  71.2833         1
2       3  female  26.0   7.9250         1
3       1  female  35.0  53.1000         1
4       3    male  35.0   8.0500         0


In [4]:
sex_dummies = pd.get_dummies(titanic.Sex)
print(sex_dummies)

     female  male
0         0     1
1         1     0
2         1     0
3         1     0
4         0     1
..      ...   ...
886       0     1
887       1     0
888       1     0
889       0     1
890       0     1

[891 rows x 2 columns]


In [5]:
merge = pd.concat([titanic,sex_dummies],axis="columns")
titanic_final = merge.drop(['Sex'],axis="columns")
print(titanic_final)

     Pclass   Age     Fare  Survived  female  male
0         3  22.0   7.2500         0       0     1
1         1  38.0  71.2833         1       1     0
2         3  26.0   7.9250         1       1     0
3         1  35.0  53.1000         1       1     0
4         3  35.0   8.0500         0       0     1
..      ...   ...      ...       ...     ...   ...
886       2  27.0  13.0000         0       0     1
887       1  19.0  30.0000         1       1     0
888       3   NaN  23.4500         0       1     0
889       1  26.0  30.0000         1       0     1
890       3  32.0   7.7500         0       0     1

[891 rows x 6 columns]


In [6]:
age_median = titanic_final.Age.median()
titanic_final.Age = titanic_final.Age.fillna(age_median)
print(titanic_final)

     Pclass   Age     Fare  Survived  female  male
0         3  22.0   7.2500         0       0     1
1         1  38.0  71.2833         1       1     0
2         3  26.0   7.9250         1       1     0
3         1  35.0  53.1000         1       1     0
4         3  35.0   8.0500         0       0     1
..      ...   ...      ...       ...     ...   ...
886       2  27.0  13.0000         0       0     1
887       1  19.0  30.0000         1       1     0
888       3  28.0  23.4500         0       1     0
889       1  26.0  30.0000         1       0     1
890       3  32.0   7.7500         0       0     1

[891 rows x 6 columns]


In [7]:
X = titanic_final[['Pclass','Age','Fare','male','female']].values
y = titanic_final['Survived'].values

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.98)

In [9]:
model = GaussianNB()

In [10]:
model.fit(X_train,y_train)
y_predicted = model.predict(X_test)
y_actual =y_test
print("y_predicted: ",y_predicted)
print("y_actual:   ",y_actual)

y_predicted:  [0 0 0 0 1 0 0 0 0 1 0 1 1 1 1 0 0 1]
y_actual:    [0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1]


In [11]:
score = model.score(X_test,y_test)
print(score)

0.7222222222222222
