### Predicting whether a person survived or not using Gaussian Naive Bayes

<p>I have trained a Gaussian Naive Bayes model on a dataset describing passengers who survived or died in the Titanic disaster. The model uses several features, such as passenger class, sex, age and ticket fare, to classify whether a passenger survived.</p>

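<p>Surprisingly, the model reaches an accuracy of 100% on the held-out test split. A perfect score is unusual and should be interpreted with caution: in the rows previewed below, every female passenger is labelled as a survivor and every male passenger as a non-survivor, so the label may be fully determined by the sex feature in this dataset.</p>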

In [61]:
import pandas as pd
# Load the Titanic passenger records from disk and preview the first rows.
df = pd.read_csv("Titanic_dataset.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [34]:
# Remove the columns that are not used as model features.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: columns that were already removed are skipped
# rather than raising a KeyError.
df = df.drop(
    columns=["Name", "PassengerId", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"],
    errors="ignore",
)

In [35]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,34.5,7.8292
1,1,3,female,47.0,7.0
2,0,2,male,62.0,9.6875
3,0,3,male,27.0,8.6625
4,1,3,female,22.0,12.2875


In [36]:
# Separate the label column ("Survived") from the feature columns.
target = df["Survived"]
inputs = df.drop(columns=["Survived"])

In [37]:
# One-hot encode the categorical "Sex" column: one indicator column per
# distinct value (here "female" and "male").
dummies = pd.get_dummies(inputs["Sex"])
dummies.head(3)


Unnamed: 0,female,male
0,0,1
1,1,0
2,0,1


In [38]:
# Append the indicator columns to the feature frame, aligned on the row index.
inputs = pd.concat([inputs, dummies], axis=1)
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,34.5,7.8292,0,1
1,3,female,47.0,7.0,1,0
2,2,male,62.0,9.6875,0,1


In [39]:
# The original string-valued "Sex" column is now redundant with the
# indicator columns, so remove it.
inputs = inputs.drop(columns=["Sex"])

In [40]:
# Preview the feature frame after encoding.
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,34.5,7.8292,0,1
1,3,47.0,7.0,1,0
2,2,62.0,9.6875,0,1
3,3,27.0,8.6625,0,1
4,3,22.0,12.2875,1,0


In [41]:
# List the feature columns that contain at least one missing value.
inputs.columns[inputs.isnull().any(axis=0)]

Index(['Age', 'Fare'], dtype='object')

In [42]:
# Replace missing ages with the mean age of the column.
# NOTE(review): the mean is computed over all rows, i.e. before the
# train/test split performed later in the notebook.
inputs["Age"] = inputs["Age"].fillna(inputs["Age"].mean())
inputs.head(12)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,34.5,7.8292,0,1
1,3,47.0,7.0,1,0
2,2,62.0,9.6875,0,1
3,3,27.0,8.6625,0,1
4,3,22.0,12.2875,1,0
5,3,14.0,9.225,0,1
6,3,30.0,7.6292,1,0
7,2,26.0,29.0,0,1
8,3,18.0,7.2292,1,0
9,3,21.0,24.15,0,1


In [54]:
# Replace missing fares with the median fare of the column.
inputs["Fare"] = inputs["Fare"].fillna(inputs["Fare"].median())

In [55]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the split, the accuracy score and the sample
# predictions shown in later cells are reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=42
)

In [45]:
# Number of rows in the training split.
X_train.shape[0]

334

In [46]:
# Number of rows in the held-out test split.
X_test.shape[0]

84

In [47]:
# Gaussian Naive Bayes classifier from scikit-learn.
from sklearn.naive_bayes import GaussianNB

# Instantiate the classifier with its default settings.
model = GaussianNB()

In [56]:
# Fit the classifier on the training features and labels.
model.fit(X=X_train, y=y_train)

In [53]:
# Mean accuracy of the fitted classifier on the held-out test split.
model.score(X=X_test, y=y_test)

1.0

In [59]:
# Predicted labels for the first 20 rows of the test split.
model.predict(X_test.head(20))

array([1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [60]:
# True labels for the same first 20 test rows, for comparison with the
# predictions.
y_test.iloc[:20]

385    1
146    0
64     0
51     0
348    0
93     0
332    0
383    1
175    1
303    0
229    0
314    1
271    0
122    1
295    0
177    0
151    0
183    0
352    0
173    0
Name: Survived, dtype: int64