<a href="https://colab.research.google.com/github/ramkumarr02/Titanic/blob/master/Titanic_DL_v2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Env Setup

## Packages



In [0]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
tf.executing_eagerly()

print(tf.__version__)

2.0.0


## Load Data


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Deep Learning/Titanic/train.csv')
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Deep Learning/Titanic/test.csv')

train_copy = train.copy()
test_copy = test.copy()

# Functions

## Feature Engineering (Split columns)


In [0]:
def column_split(df):

    try:
        df['SurName'], df['Name'] = df['Name'].str.split(',', 1).str
        df['Title'], df['Name'] = df['Name'].str.split('.', 1).str
        df['Cabin_Section'] = df[df['Cabin'].notna()]['Cabin'].astype(str).str[0]
        df['Cabin_Nums'] = df[df['Cabin'].notna()]['Cabin'].str.count(" ") + 1
    except:
        pass
    
    return(df)

## One hot Encoder

In [0]:
def encode_str(df):

    df = pd.get_dummies(df, columns=['Sex', 'Embarked','Title','Cabin_Section', 'Ticket'])
    df = df.round(1)

    text_cols = list(df.select_dtypes(include=['object']).columns)
    df = df.drop(text_cols, axis=1)
    
    return(df)

## Data PreProcessor (All above processing)


In [0]:
def data_preprocess(df):

    df = column_split(df)
    df = encode_str(df)
    df.fillna(0, inplace = True)
 
    return(df)

## Scale Data

In [0]:
def scale_data(df):

    scaled_features = StandardScaler().fit_transform(df.values)
    df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
 
    return(df)

## Add & Remove columns to match the training data

In [0]:
def match_cols(df1, df2, col_name):
    miss_cols = set(df1[col_name]) - set(df2[col_name])
    print(len(miss_cols))    

    for col in miss_cols:
        df2 = df2.append([{col_name:col}], ignore_index=True)

    miss_cols = set(df1[col_name]) - set(df2[col_name])
    print(len(miss_cols)) 

    return(df2)  

# Training

## PreP, Split and Scale data

In [0]:
# Replicate dataframe multiple times
train = pd.concat([train]*20, ignore_index=True)

In [0]:
df = data_preprocess(train)

In [0]:
train_x, valid_x, train_y, valid_y = train_test_split(df.loc[:, df.columns != 'Survived'], df['Survived'],train_size = 0.8,random_state = 1)

In [0]:
scaled_train_x = scale_data(train_x)

## Model Layers

In [0]:
model = tf.keras.Sequential([
  tf.keras.layers.Dense(128, activation='relu', input_shape=[len(train_x.keys())]),
  tf.keras.layers.Dense(128, activation='relu'),              
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

## Model Compiler

In [0]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',    
    metrics=['accuracy'])

## Fitting

In [0]:
model.fit(scaled_train_x, train_y, epochs=20)

Train on 14256 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3365ed3eb8>

## Validation

### Scale and Convert valid_dfs to Tensor

In [0]:
scaled_valid_x = scale_data(valid_x)

### Model Evaluation

In [0]:
test_loss, test_accuracy = model.evaluate(scaled_valid_x, valid_y)
print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))



Test Loss 0.007930606924168767, Test Accuracy 0.9957912564277649


# Testing

## Data Prep

### Add Train-Test Categories to Test

In [0]:
test_org = test.copy()

test = match_cols(train_copy, test, 'Cabin')
test = match_cols(train_copy, test, 'Name')
test = match_cols(train_copy, test, 'Ticket')

110
0
889
0
566
0


### PreP Full_train & Test data

In [0]:
pp_train = data_preprocess(train_copy)
pp_test = data_preprocess(test)

### Remove Test-Train categories from Test

In [0]:
miss_cols = set(pp_test.columns) - set(pp_train.columns)
print(len(miss_cols))   

pp_test = pp_test.drop(miss_cols, axis=1)    

## Remove excess rows
pp_test = pp_test[pp_test.sum(axis=1) >= 4]
pp_train = pp_train[pp_train.sum(axis=1) >= 4]

249


### Scale Test and train Data

In [0]:
scaled_test_data = scale_data(pp_test)

target = pp_train.pop('Survived')
scaled_full_data = scale_data(pp_train)

## Fit Model to Full Data

In [0]:
model.fit(scaled_full_data, target, epochs=20)

Train on 891 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f33665a45c0>

## Predict

In [0]:
predictions = model.predict(scaled_test_data)
predictions = np.round(predictions, decimals= 0).tolist()
flat_list = [item for sublist in predictions for item in sublist]
flat_list = np.array(flat_list).astype(int)

test_org['Survived'] = flat_list



## Write into a CSV

In [0]:
df_results = test_org[['PassengerId', 'Survived']]
df_results.to_csv('results.csv', index = False)

In [0]:
#scaled_train_x.head(100).to_csv('train.csv')
#scaled_test_data.head(100).to_csv('test.csv')

In [0]:
df_results['Survived'].value_counts()

0    261
1    157
Name: Survived, dtype: int64

In [0]:
df_results

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
