<a href="https://colab.research.google.com/github/sowmyarajesh/ML_ANN/blob/main/ANN_BinaryClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
print(tf.__version__)

2.8.0


### Load and preprocess the data

In [28]:
# import dataset from the local . download the data from titanic folder in this repo or from https://www.kaggle.com/competitions/titanic/data
from google.colab import files
upload  = files.upload()

Saving gender_submission.csv to gender_submission (3).csv
Saving test.csv to test (3).csv
Saving train.csv to train (3).csv


In [29]:
!ls

'gender_submission (1).csv'   sample_data     test.csv	       train.csv
'gender_submission (2).csv'  'test (1).csv'  'train (1).csv'
'gender_submission (3).csv'  'test (2).csv'  'train (2).csv'
 gender_submission.csv	     'test (3).csv'  'train (3).csv'


In [30]:
train = pd.read_csv('train.csv')
x_test = pd.read_csv('test.csv')
y_test = pd.read_csv('gender_submission.csv')

In [31]:
y_test.columns, x_test.columns

(Index(['PassengerId', 'Survived'], dtype='object'),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

In [32]:
# size of the training and testing data set
train.shape, x_test.shape, y_test.shape


((891, 12), (418, 11), (418, 2))

In [33]:
# merge all both tables for test
test = pd.merge(x_test,y_test,how='inner',on='PassengerId')
test.shape, test.columns

((418, 12),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived'],
       dtype='object'))

Understanding the data

In [34]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [35]:
train.columns, test.columns

(Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived'],
       dtype='object'))

In [36]:
# frequency table for each columns
for col in train.columns:
  print("========{}==========\n".format(col))
  freq = train[col].value_counts()
  print(freq)



1      1
599    1
588    1
589    1
590    1
      ..
301    1
302    1
303    1
304    1
891    1
Name: PassengerId, Length: 891, dtype: int64

0    549
1    342
Name: Survived, dtype: int64

3    491
1    216
2    184
Name: Pclass, dtype: int64

Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: Name, Length: 891, dtype: int64

male      577
female    314
Name: Sex, dtype: int64

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88

In [37]:
# Missing values
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [38]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Survived         0
dtype: int64

columns in the data data set are ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp','Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
here we will not need the following columns in the model:

- PassengerId - it is an unique identifier to each record. 
- Name - refer to the name of the induvidual. No significance to output
- Ticket - refers to the ticket ID for reference.
- Fare - refers to the total fair. 
- cabin - lots oif missing values. Frequency table doesnot provide enough information for imputation.

**Handle missing value**

Age: dropping the column may have some effect on the output. The missing rows can be imputed with mean age

Embarked - drop the missing rows since it is only 2



In [39]:
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Fare','Cabin']
train = train.copy().drop(labels=drop_cols, axis=1)
test = test.copy().drop(labels=drop_cols, axis=1)

In [40]:
# handle missing value
train['Age'] = train['Age'].fillna(np.mean(train['Age']))
test['Age'] = test['Age'].fillna(np.mean(test['Age']))

In [41]:
np.mean(train['Age'])

29.699117647058763

In [42]:
train = train[~train['Embarked'].isna()]
test = test[~test['Embarked'].isna()]

In [43]:
train.isna().sum()


Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [44]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [47]:
# one hot encoding of the categorical features
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
train['Sex']=LE.fit_transform(train['Sex'])
test['Sex']=LE.fit_transform(test['Sex'])
train= pd.get_dummies(train, drop_first=True)
test= pd.get_dummies(train, drop_first=True)

In [48]:
train.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,0,1
1,1,1,0,38.0,1,0,0,0
2,1,3,0,26.0,0,0,0,1
3,1,1,0,35.0,1,0,0,1
4,0,3,1,35.0,0,0,0,1


In [56]:
x_train = train.copy()
y_train = train['Survived']
x_train.drop(labels=['Survived'], inplace=True, axis=1 )
x_test = test.copy()
y_test = test['Survived']
x_test.drop(labels=['Survived'], inplace=True, axis=1 )

x_train.columns, x_test.columns, y_train.shape, y_test.shape

(Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked_Q', 'Embarked_S'], dtype='object'),
 Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked_Q', 'Embarked_S'], dtype='object'),
 (889,),
 (889,))

In [58]:
x_train.shape

(889, 7)

### Build the model

To build the model,first initialize the model


In [57]:
# initialize the sequence
model = tf.keras.models.Sequential()


**Input layer**

 we will use the following parameters in input layer
(units=128,  activation='relu',  input_shape=(784,))


units : This can be any number with in the number of inputs to be filters from the input features. We start with different 128. 

activation: relu is the activation function as it is faster than sigmoid/ tanh functions

input_shape : it shows the number of columns in the input. 

In [63]:
inputLayer = tf.keras.layers.Dense(units=7, activation='relu', input_dim=7)
hiddenLayer1 =  tf.keras.layers.Dense(units=5, activation='relu')
outputLayer = tf.keras.layers.Dense(units=1, activation="sigmoid")
model.add(inputLayer)
model.add(hiddenLayer1)
model.add(outputLayer)

### Train the model

**Compile the model:**

First step to training is compile the defined model. 

The compiling step requires three parameters

optimizer = the function used to minimize the loss function. "adam" is the most common optimizer used

loss function = acts as a guide to the optimizer in the right direction so that optimizer can reach the global minimum. Since this is a binary classification problem, we will be using BinaryCrossentropy

metrices = this will list the performance metrices to be considered in guiding the optimizer in the right direction. In this model, we will be using the BinaryAccuracy

In [88]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [89]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 7)                 56        
                                                                 
 dense_1 (Dense)             (None, 7)                 56        
                                                                 
 dense_2 (Dense)             (None, 7)                 56        
                                                                 
 dense_3 (Dense)             (None, 7)                 56        
                                                                 
 dense_4 (Dense)             (None, 1)                 8         
                                                                 
 dense_5 (Dense)             (None, 7)                 14        
                                                                 
 dense_6 (Dense)             (None, 5)                 4

In [90]:
model.fit(x=x_train.to_numpy(), y=y_train.to_numpy(), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2862c7c5d0>

### Evaluate the model

In [85]:
y_test.value_counts()

0    549
1    340
Name: Survived, dtype: int64

In [91]:
test_loss, test_accuracy = model.evaluate(x_test,y_test)



In [92]:
print("Accuracy value = ", test_accuracy)

Accuracy value =  0.8402699828147888


In [96]:
from sklearn import metrics

y_pred =np.round(model.predict(x_test))
# print(y_pred)
metrics.confusion_matrix(y_test,y_pred)

array([[496,  53],
       [ 89, 251]])

In [97]:
metrics.accuracy_score(y_test,y_pred)

0.8402699662542182