# Import dependencies

In [108]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow import keras
from sklearn.model_selection import train_test_split


## Load titanic dataset and preprocess

In [109]:
# Load the Titanic dataset; the relative path expects titanic.csv in the working directory
df = pd.read_csv('titanic.csv')

In [110]:
# Preview the first 20 rows of the data
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [111]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Handling nan(empty values) in the dataset

There are NaN values in the Age, Cabin and Embarked columns, which must be handled.

### Get number of nan(empty values) in columns with NaN

In [112]:
# Count the missing entries once per column, then read off the columns of interest
nan_counts = df.isna().sum()
num_of_nan_Age = nan_counts['Age']
num_of_nan_Cabin = nan_counts['Cabin']
num_of_nan_Embarked = nan_counts['Embarked']
# Grand total across every column of the frame
total_nan_values = nan_counts.sum()


In [113]:
# Report the missing-value counts, one line per label
for label, count in (
    ("Number of empty values in Age: ", num_of_nan_Age),
    ("Number of empty values in Cabin: ", num_of_nan_Cabin),
    ("Number of empty values in Embarked: ", num_of_nan_Embarked),
    ("Total number of empty values in entire dataset: ", total_nan_values),
):
    print(label, count)

Number of empty values in Age:  177
Number of empty values in Cabin:  687
Number of empty values in Embarked:  2
Total number of empty values in entire dataset:  866


There are a total of 866 empty values in the entire dataset, which cannot simply be dropped. I will replace the NaN values in the Age column with the average age, and those in the Cabin and Embarked columns with the mode of each column.

### Get mean and mode values

In [114]:
# Replacement values: the mean of Age, and the most frequent entry of Cabin and Embarked
average_age = df.Age.mean()
cabin_mode = df.Cabin.mode().iloc[0]
embarked_mode = df.Embarked.mode().iloc[0]

In [115]:
# Show the values that will be used to fill the gaps
for label, value in (
    ("Average age: ", average_age),
    ("Cabin mode: ", cabin_mode),
    ("Embarked mode: ", embarked_mode),
):
    print(label, value)

Average age:  29.69911764705882
Cabin mode:  B96 B98
Embarked mode:  S


### Change values

The mode for Cabin is B96 B98 and that of Embarked is S.

In [116]:
# Fill the missing values by assigning the filled column back to the frame.
# The previous form, df.<col>.fillna(..., inplace=True), is a chained in-place
# update on a column selected from df: recent pandas versions warn about it,
# and with Copy-on-Write enabled it leaves df unchanged.
df['Age'] = df['Age'].fillna(average_age)
df['Cabin'] = df['Cabin'].fillna(cabin_mode)
df['Embarked'] = df['Embarked'].fillna(embarked_mode)


All empty values have been replaced.

In [117]:
# Display the frame after filling; pandas elides the middle rows of a long frame
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


Now all empty values have been replaced; the check below confirms that none remain.

In [118]:
# Sanity check: count the NaN entries remaining across the whole frame
print('Total number of NaN (empty values): ',  df.isnull().sum().sum())


Total number of NaN (empty values):  0


### Drop Name and ticket from data

In [119]:
# Work on a copy without the free-text Name and Ticket columns; df itself is left intact
new_df = df.drop(columns=['Name', 'Ticket'])

In [120]:
# Display the frame with Name and Ticket removed
new_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.000000,1,0,7.2500,B96 B98,S
1,2,1,1,female,38.000000,1,0,71.2833,C85,C
2,3,1,3,female,26.000000,0,0,7.9250,B96 B98,S
3,4,1,1,female,35.000000,1,0,53.1000,C123,S
4,5,0,3,male,35.000000,0,0,8.0500,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.000000,0,0,13.0000,B96 B98,S
887,888,1,1,female,19.000000,0,0,30.0000,B42,S
888,889,0,3,female,29.699118,1,2,23.4500,B96 B98,S
889,890,1,1,male,26.000000,0,0,30.0000,C148,C


### Converting categorical data to a numerical value by one-hot-encoding

Machine learning models understand only numbers, so categorical data needs to be converted to numerical values.

In [121]:
# One-hot encode Sex into two indicator columns (female, male).
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version: pandas >= 2.0 defaults to bool, and mixing bool with numeric columns
# can make the feature frame convert to an object array that Keras cannot turn
# into a tensor.
# Cabin and Embarked are not encoded; they are dropped from new_df further down.
one_hot_Sex = pd.get_dummies(new_df['Sex'], dtype='uint8')

In [122]:
# new_df is unchanged at this point: the encoded columns live in the separate one_hot_Sex frame
new_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.000000,1,0,7.2500,B96 B98,S
1,2,1,1,female,38.000000,1,0,71.2833,C85,C
2,3,1,3,female,26.000000,0,0,7.9250,B96 B98,S
3,4,1,1,female,35.000000,1,0,53.1000,C123,S
4,5,0,3,male,35.000000,0,0,8.0500,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.000000,0,0,13.0000,B96 B98,S
887,888,1,1,female,19.000000,0,0,30.0000,B42,S
888,889,0,3,female,29.699118,1,2,23.4500,B96 B98,S
889,890,1,1,male,26.000000,0,0,30.0000,C148,C


In [123]:
# Rebuild new_df from df (the same drop as earlier in the notebook), so the
# drop/join cells that follow start from a frame that still contains
# Sex, Cabin, Embarked and PassengerId
new_df = df.drop(columns=['Name', 'Ticket'])

In [124]:
# Remove Sex (its one-hot version is held in one_hot_Sex) together with
# Cabin, Embarked and PassengerId, which are not used as model features
new_df = new_df.drop(columns=['Sex', 'Cabin', 'Embarked', 'PassengerId'])


In [125]:
# Attach the one-hot Sex columns to the frame, matched on the row index
new_df = new_df.join(one_hot_Sex)

In [126]:
# Display the frame after the encoded columns have been joined
new_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male
0,0,3,22.000000,1,0,7.2500,0,1
1,1,1,38.000000,1,0,71.2833,1,0
2,1,3,26.000000,0,0,7.9250,1,0
3,1,1,35.000000,1,0,53.1000,1,0
4,0,3,35.000000,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,0,1
887,1,1,19.000000,0,0,30.0000,1,0
888,0,3,29.699118,1,2,23.4500,1,0
889,1,1,26.000000,0,0,30.0000,0,1


In [127]:
# Check column dtypes and non-null counts before building the feature matrix
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   female    891 non-null    uint8  
 7   male      891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(2)
memory usage: 43.6 KB


In [128]:
# Target: the Survived column. Features: every other column.
# Selecting by name instead of positionally (iloc[:, 1:]) keeps the target out
# of X even if the column order of new_df ever changes.
y = new_df['Survived']
X = new_df.drop(columns=['Survived'])

# Build prediction model

### training and test data

In [129]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [130]:
# (rows, features) of the training portion
print("Shape of training data:> ", X_train.shape)

Shape of training data:>  (712, 7)


### Build model

In [131]:
# Fully connected binary classifier: four ReLU hidden layers followed by a
# single sigmoid unit. The input width of 7 matches the feature columns of X_train.
model = Sequential([
    layers.Dense(512, activation="relu", input_shape=(7,)),
    layers.Dense(1024, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation='sigmoid'),
])

### Model architecture

In [132]:
# Print the layer-by-layer output shapes and parameter counts
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 512)               4096      
                                                                 
 dense_41 (Dense)            (None, 1024)              525312    
                                                                 
 dense_42 (Dense)            (None, 64)                65600     
                                                                 
 dense_43 (Dense)            (None, 32)                2080      
                                                                 
 dense_44 (Dense)            (None, 1)                 33        
                                                                 
Total params: 597,121
Trainable params: 597,121
Non-trainable params: 0
_________________________________________________________________


In [133]:
# Binary cross-entropy loss, Adam with a learning rate of 1e-4, accuracy as the reported metric
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [134]:
# Train for 30 epochs. Validation metrics are computed on 20% of the training
# data, which Keras sets apart itself; X_val / y_val are not passed here and
# stay untouched until the prediction step.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=30,
    validation_split=0.2,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### predictions

In [135]:
# Predict on the held-out split; the final layer is a sigmoid, so each output lies between 0 and 1
preds = model.predict(X_val)




In [136]:
# First five predicted values (raw sigmoid outputs, not thresholded to 0/1)
preds[:5]

array([[0.12658511],
       [0.1967811 ],
       [0.1450419 ],
       [0.76652974],
       [0.53948873]], dtype=float32)

Model accuracy was about 77%, whereas the validation accuracy was 79%. After that, a few predictions were made.