# Titanic Survival prediction using NAIVE BAYES

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

### Load the Dataset

In [2]:
data = pd.read_csv("titanicsurvival.csv")
data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


### Summarize Dataset

In [3]:
data.shape

(891, 5)

In [4]:
data.describe()

Unnamed: 0,Pclass,Age,Fare,Survived
count,891.0,714.0,891.0,891.0
mean,2.308642,29.699118,32.204208,0.383838
std,0.836071,14.526497,49.693429,0.486592
min,1.0,0.42,0.0,0.0
25%,2.0,20.125,7.9104,0.0
50%,3.0,28.0,14.4542,0.0
75%,3.0,38.0,31.0,1.0
max,3.0,80.0,512.3292,1.0


### Mapping the test data into binary data

In [5]:
income_set = set(data['Sex'])
data['Sex']= data['Sex'].map({'female':0,'male':1}).astype(int)
print(data.head())

   Pclass  Sex   Age     Fare  Survived
0       3    1  22.0   7.2500         0
1       1    0  38.0  71.2833         1
2       3    0  26.0   7.9250         1
3       1    0  35.0  53.1000         1
4       3    1  35.0   8.0500         0


### Segregate Dataset into X(Input/Independent variables) and Y(Output/Dependent variables)

In [6]:
X =data.drop('Survived',axis='columns')
print(X)
Y = data[['Survived']]
print(Y)

     Pclass  Sex   Age     Fare
0         3    1  22.0   7.2500
1         1    0  38.0  71.2833
2         3    0  26.0   7.9250
3         1    0  35.0  53.1000
4         3    1  35.0   8.0500
..      ...  ...   ...      ...
886       2    1  27.0  13.0000
887       1    0  19.0  30.0000
888       3    0   NaN  23.4500
889       1    1  26.0  30.0000
890       3    1  32.0   7.7500

[891 rows x 4 columns]
     Survived
0           0
1           1
2           1
3           1
4           0
..        ...
886         0
887         1
888         0
889         1
890         0

[891 rows x 1 columns]


### Finding and removing NA values from our features X

In [7]:
X.columns[X.isna().any()]

Index(['Age'], dtype='object')

In [8]:
X.Age = X.Age.fillna(X.Age.mean())

### Test again to check any na value

In [9]:
X.isna().sum()

Pclass    0
Sex       0
Age       0
Fare      0
dtype: int64

### Splitting Dataset into Train and Test

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state=0,test_size=0.25)

### Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training

In [12]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


### Predicting wheather new customers with Age,Sex,Pclass and fare will Survived or not

In [13]:
Pclass = int(input("Enter new Customers Pclass: "))
Sex = int(input("Enter new Customers Sex-For Male(1) or Female(0): "))
Age = int(input("Enter new Customers Age: "))
Fare = float(input("Enter new Customers Fare: "))
newCust = [[Pclass,Sex,Age,Fare]]
Status = model.predict(newCust)
print(Status)
if Status==1:
    print("Customer might be Survived")
else:
    print("Customer might not be Survived")

Enter new Customers Pclass:  3
Enter new Customers Sex-For Male(1) or Female(0):  1
Enter new Customers Age:  22
Enter new Customers Fare:  15.25


[1]
Customer might be Survived


### Prediction for all Test Data

In [14]:
y_pred = model.predict(X_test)
print(np.column_stack((y_pred,Y_test)))

[[0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]

In [15]:
### Evaluating Model - Confusion Matrix

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(Y_test,y_pred)
print("Confusion Matrix: ")
print(cm)

print("Accuracy Score: ",accuracy_score(Y_test,y_pred)*100)

Confusion Matrix: 
[[110  29]
 [ 21  63]]
Accuracy Score:  77.57847533632287
