In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import warnings
# Silence every warning for the rest of the notebook (deprecation notices included).
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# plotly
# NOTE(review): `py` is not referenced in the visible cells — confirm this import is still needed.
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's notebook mode before any iplot() call below.
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# NOTE(review): `tools` is not referenced in the visible cells either.
from plotly import tools

# matplotlib
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Titanic: Machine Learning from Disaster

This kernel predicts whether a passenger on the Titanic survived, using a Sequential neural network built with the Keras API for TensorFlow.

Further, I will be using Grid Search for Hyperparameter tuning of the model.

## 1. Data Understanding

First, let us take a look at what our data looks like, to get more details on the characteristics of the attributes/features.

**1.1 Peek into the Data:**

In [None]:
# Load the train and test CSV files into DataFrames (train first, test second).
trainData, testData = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

# Preview the first rows of the training data.
trainData.head()

**1.2  Summary of the data:**

In [None]:
# Descriptive statistics of the training data.
trainData.describe()

**1.3 Some insights :**
* The data set consists of 12 attributes (PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked)
    1. PassengerId - Numeric data, it is unique for each passenger, not useful.
    2. Survived - Target data to be predicted, with indicators 0 (Passenger Died) or 1 (Passenger Survived).
    3. Pclass - Numeric data, has 3 categories (1st, 2nd or 3rd), useful.
    4. Name - String data type, mostly unique for each passenger, not useful.
    5. Sex - Categorical data, has 2 categories (male, female), useful.
    6. Age - Numeric data, max age is 80, it contains few nulls (can be imputed), useful.
    7. SibSp - Numeric data, max number of siblings / spouses aboard is 8, useful.
    8. Parch - Numeric data, max number of parents / children aboard is 6, useful.
    9. Ticket - Categorical data, it has a prefix to the number,  it can be useful.
    10. Fare - Numeric data, the higher the class, the higher the fare (need to check), max fare is 512.32 (currency is $ I guess), useful.
    11. Cabin - Categorical data, most of its values are null (about 77%, see section 2.2; they could be imputed), it can be useful.
    12. Embarked - Categorical data, with 3 categories, C = Cherbourg, Q = Queenstown, S = Southampton, it contains few nulls (can be imputed), useful. 
* The attribute 'Survived' would be the attribute to be predicted, i.e, target attribute.
* Based on the values in the target attribute, i.e., 0/1 which states if the passenger survived or not, it is a Binary Classification.

## 2. Exploratory Data Analysis

**2.1 Data Types:**

Let us take a look at the type of data being handled.

In [None]:
# Data type of each column in the training data.
trainData.dtypes

**2.2 Missing Data:**

Checking and calculating the amount of missing values in the dataset.

These missing values will be handled later.

In [None]:
# One boolean per column: True when the column contains at least one null.
trainData.isnull().any()

In [None]:
# Share of null entries per column, expressed as a percentage of all rows.
trainData.isnull().sum().mul(100).div(len(trainData)).to_frame('percent_missing')

**2.3 Unique Values:**

Checking for the amount of unique values in each feature.

In [None]:
# Share of distinct values per column (a null counts as one distinct value), in percent of all rows.
trainData.nunique(dropna=False).div(len(trainData)).mul(100).to_frame('percent_unique')

From the results of sections 2.2 and 2.3, we see that the features Cabin, PassengerId and Name can be eliminated from the set used to build the model: Cabin has 77.1% missing values, and PassengerId and Name are 100% unique.

In [None]:
# Name of the target variable; it must not be part of the feature list.
targetCol = 'Survived'
n_rows = trainData.shape[0]

# Candidate features extracted from the data: every column except
#   * the target variable,
#   * columns whose values are all distinct (identifier-like, one value per row),
#   * 'Cabin', which has a high percentage of missing values.
# The list is built in one pass instead of calling .remove() on the list that is
# being iterated: removing during iteration shifts the remaining items and the
# element following each removal is never examined. Rebuilding from the DataFrame
# columns also keeps this cell safe to re-run.
selFeatures = [
    col for col in trainData.columns.values
    if col != targetCol
    and len(trainData[col].unique()) != n_rows
    and col != 'Cabin'
]

**2.4 Visualizations:**

Visualizing the data using interactive plots. Starting with a matrix scatter plot showing the relation between the features that will be used for the training. Followed by plots to check the survival rate based on features.

In [None]:
import seaborn as sns

sns.set(style="ticks")
# Scatter-plot matrix of the selected features plus the target, coloured by the target class.
plotFeatures = selFeatures + ["Survived"]
sns.pairplot(trainData[plotFeatures], hue="Survived")

In [None]:
# Class balance of the target.
# value_counts() orders its result by frequency, not by class label, so the counts are
# re-indexed explicitly as [0, 1]; this guarantees they line up with the positional
# labels ['Died', 'Survived'] no matter which class is the larger one.
targetClass = trainData.Survived.value_counts().reindex([0, 1], fill_value=0).tolist()
data = [go.Pie(labels=['Died', 'Survived'], values=targetClass,
               hoverinfo='label+percent', textinfo='value')]
layout = dict(
    title="Comparison of Classes (Died/Survived)",
    autosize=False,
    width=500,
    height=500,
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [None]:
def plotGraph(plotData, msg):
    """Show a grouped bar chart built from a two-row count table.

    plotData: table whose first row is drawn as the 'No' bars and whose second
        row is drawn as the 'Yes' bars. Its column labels are the x-axis
        categories and the name of its column index is the x-axis title.
    msg: title of the chart.
    """
    categories = plotData.columns.values
    row_counts = plotData.values
    bars = [
        go.Bar(x=categories, y=row_counts[0], name='No'),
        go.Bar(x=categories, y=row_counts[1], name='Yes'),
    ]
    figure_layout = {
        'title': msg,
        'xaxis': {'title': plotData.columns.name},
        'yaxis': {'title': 'Number of people'},
        'barmode': 'group',
        'autosize': False,
        'width': 800,
        'height': 500,
    }
    iplot({'data': bars, 'layout': figure_layout})

In [None]:
# Passenger counts per class (columns), split by the Survived flag (rows).
pclass = pd.crosstab(index=[trainData['Survived']], columns=trainData['Pclass'])
plotGraph(plotData=pclass, msg='Survived based on Pclass')

In [None]:
# Passenger counts per sex (columns), split by the Survived flag (rows).
sex = pd.crosstab(index=[trainData['Survived']], columns=trainData['Sex'])
plotGraph(plotData=sex, msg='Survived based on sex')

In [None]:
# Passenger counts per port of embarkation (columns), split by the Survived flag (rows).
embarked = pd.crosstab(index=[trainData['Survived']], columns=trainData['Embarked'])
plotGraph(plotData=embarked, msg='Survived based on embarked')

In [None]:
# Passenger counts per SibSp value (columns), split by the Survived flag (rows).
SibSp = pd.crosstab(index=[trainData['Survived']], columns=trainData['SibSp'])
plotGraph(plotData=SibSp, msg='Survived based on SibSp')

In [None]:
# Passenger counts per Parch value (columns), split by the Survived flag (rows).
Parch = pd.crosstab(index=[trainData['Survived']], columns=trainData['Parch'])
plotGraph(plotData=Parch, msg='Survived based on Parch')

In [None]:
def plotLine(plotData, msg):
    """Show a two-series line chart built from a two-row count table.

    plotData: table whose first row is drawn as the 'No' line and whose second
        row is drawn as the 'Yes' line. Its column labels are the x values and
        the name of its column index is the x-axis title.
    msg: title of the chart.
    """
    x_values = plotData.columns.values
    row_counts = plotData.values
    lines = [
        go.Scatter(x=x_values, y=row_counts[0], mode='lines', name='No'),
        go.Scatter(x=x_values, y=row_counts[1], mode='lines', name='Yes'),
    ]
    figure_layout = {
        'title': msg,
        'xaxis': {'title': plotData.columns.name},
        'yaxis': {'title': 'Number of people'},
        'autosize': False,
        'width': 800,
        'height': 500,
    }
    iplot({'data': lines, 'layout': figure_layout})

In [None]:
# Passenger counts per distinct age (columns), split by the Survived flag (rows).
Age = pd.crosstab(index=[trainData['Survived']], columns=trainData['Age'])
plotLine(plotData=Age, msg='Survival based on Age')

In [None]:
# Passenger counts per distinct fare (columns), split by the Survived flag (rows).
Fare = pd.crosstab(index=[trainData['Survived']], columns=trainData['Fare'])
plotLine(plotData=Fare, msg='Survival based on Fare')

## 3. Data Preparation

**3.1 Feature Selection**

Selecting the columns required for building the model based on EDA.

In [None]:
# 'Ticket' is dropped for the initial run as well ('Cabin' was already removed in section 2.3).
# Filtering instead of list.remove() keeps this cell re-runnable: executing it a
# second time no longer raises ValueError because 'Ticket' is already gone.
selFeatures = [feature for feature in selFeatures if feature != 'Ticket']

print("Target Class: '"+ targetCol + "'")
print('Features to be investigated: ')
print(selFeatures)

In [None]:
def handle_categorical_na(df):
    """Return a copy of ``df`` with missing 'Age' and 'Embarked' values filled.

    * 'Age'      - nulls are replaced by the mean of the non-null ages in ``df``.
    * 'Embarked' - nulls are replaced by the placeholder category 'X'.

    All other columns are returned unchanged ('Cabin' and 'Ticket' are not
    handled here).

    The frame passed in is never modified. The earlier implementation called
    ``fillna(..., inplace=True)`` on ``df.Age`` / ``df.Embarked``, which mutates
    the caller's data, is a chained assignment when ``df`` is a slice of another
    frame, and has no effect at all when pandas copy-on-write is active.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns imputed.
    """
    filled = df.copy()
    # Explicit column assignment works regardless of pandas' copy-on-write mode.
    filled['Age'] = filled['Age'].fillna(filled['Age'].mean())
    filled['Embarked'] = filled['Embarked'].fillna('X')
    return filled

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global random state before splitting; no random_state is passed to the split itself.
seed = 7
np.random.seed(seed)
X_train, X_test, Y_train, Y_test = train_test_split(
    trainData[selFeatures], trainData.Survived, test_size=0.2
)

# For each split: fill the missing values, then one-hot encode the categorical columns.
categorical_cols = ['Embarked', 'Sex']
X_train, X_test = (
    pd.get_dummies(handle_categorical_na(split), columns=categorical_cols, prefix=categorical_cols)
    for split in (X_train, X_test)
)

In [None]:
# Align the hold-out columns with the training columns. Both splits are one-hot
# encoded independently, so a category present in only one of them leads to
# different column sets.
# reindex() drops columns X_train does not have, adds the absent ones filled
# with 0 and puts everything in X_train's column order.
# The previous insert() of a freshly built pd.Series was aligned on a 0..n-1 index,
# which does not match the shuffled row labels of X_test and therefore produced
# NaN values; it also appended the new columns at the end instead of at the
# position they have in X_train.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

## 4. Modeling

**4.1 Neural Network**

Using the Sequential model of the Keras API to build a neural network. The model is tuned and optimized using Grid Search.

The optimized parameters are:
* Number of neurons in input and hidden layers
* Activation type
* Optimizer 
* Epochs
* Batch size
* Dropout ratio

Because the grid search takes a long time to run, it is not executed here; the best parameters it found are used directly in the model below.

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm

# Number of input features, i.e. the width of the first Dense layer's input.
in_shape = X_train.shape[1]

def create_model(optimizer='Adam', neurons=50, activation='relu', dropout_rate=0.2):
    """Build and compile the binary-classification network used by the grid search.

    Architecture: Dense -> Dropout -> Dense -> Dropout -> Dense -> Dense(1, sigmoid).

    ``activation`` and ``dropout_rate`` are now real parameters. They were
    previously referenced without being defined, so every call raised NameError
    and the grid search could not pass its 'activation' / 'dropout_rate' values
    to this builder.

    Parameters
    ----------
    optimizer : optimizer passed to ``model.compile``.
    neurons : number of units in each of the three hidden Dense layers.
    activation : activation used by the three hidden Dense layers.
    dropout_rate : rate used by both Dropout layers.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    # Initialize the constructor
    model = Sequential()
    # Input - Layer (reads the notebook-level in_shape)
    model.add(Dense(neurons, input_dim=in_shape, activation=activation))
    # Hidden - Layers
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons, activation=activation))
    # Output - Layer
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

def nn_hyperparameter_optimization():
    """Grid-search the Keras classifier's hyper-parameters with 5-fold cross-validation.

    Fits on the notebook-level X_train / Y_train, prints the best score with its
    parameters followed by the mean and standard deviation of the test score for
    every parameter combination, and returns the fitted GridSearchCV object.
    """
    # Search space: every combination of these values is evaluated.
    search_space = {
        'neurons': [65, 75, 85],
        'optimizer': ['RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'],
        'batch_size': [10, 20, 30, 40],
        'epochs': [10, 20, 30, 40],
        'activation': ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'],
        'dropout_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
    }
    estimator = KerasClassifier(build_fn=create_model, verbose=0)
    search = GridSearchCV(estimator=estimator, param_grid=search_space, n_jobs=-1, cv=5)
    grid_result = search.fit(X_train, Y_train)

    # Report the winner first, then one line per evaluated combination.
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    cv_results = grid_result.cv_results_
    for row in zip(cv_results['mean_test_score'],
                   cv_results['std_test_score'],
                   cv_results['params']):
        print("%f (%f) with: %r" % row)
    return grid_result

## based on the hyper parameter optimization, the below model is built.
nn_model = Sequential([
    # Input - Layer
    Dense(65, input_dim=in_shape, activation='relu'),
    # Hidden - Layers
    Dropout(0.2),
    Dense(65, activation='relu'),
    Dropout(0.2),
    Dense(65, activation='relu'),
    # Output - Layer
    Dense(1, activation='sigmoid'),
])
nn_model.compile(loss='binary_crossentropy', optimizer='Nadam', metrics=['accuracy'])
# Train on the training split; the hold-out split is used for per-epoch validation.
nn_model.fit(
    X_train, Y_train,
    batch_size=20,
    epochs=20,
    verbose=1,
    validation_data=(X_test, Y_test),
)

# The model is compiled with a single metric, so evaluate() yields [loss, accuracy].
score = nn_model.evaluate(X_test, Y_test, verbose=2)
test_loss, score_nn = score
print('Test loss:', test_loss)
print('Test accuracy:', score_nn)

**4.2 Random Forest**

Using Random Forest to build a model. The model uses Random Search to find the best hyperparameter values.

The optimized parameters are:
* Number of trees
* Number of features to consider at every split
* Maximum number of levels in tree
* Minimum number of samples required to split a node
* Minimum number of samples required at each leaf node
* Method of selecting samples for training each tree


In [None]:
 def rf_hyperparameter_optimization():
    """Random-search the random forest's hyper-parameters with 5-fold cross-validation.

    Samples 100 parameter combinations from the grid below, fits each on the
    notebook-level X_train / Y_train and returns the best combination found.

    The tuned estimator is a RandomForestClassifier, matching the classifier
    that is trained with the resulting parameters. The previous version tuned a
    RandomForestRegressor, so candidates were ranked with a regression score on
    a 0/1 target rather than with classification accuracy.

    Returns
    -------
    dict
        ``best_params_`` of the fitted RandomizedSearchCV.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    random_grid = {
        # Number of trees in random forest
        'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
        # Number of features to consider at every split.
        # 'auto' is dropped: for a classifier it is the same as 'sqrt', and
        # newer scikit-learn releases no longer accept it.
        'max_features': ['sqrt', 'log2'],
        # Maximum number of levels in tree (None = unlimited)
        'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 5, 10],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4],
        # Method of selecting samples for training each tree
        'bootstrap': [True, False],
    }

    # Base model to tune
    rf = RandomForestClassifier()
    # Random search of parameters, using 5 fold cross validation
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100,
                                   cv=5, verbose=2, random_state=42, n_jobs=-1)
    # Fit the random search model
    rf_random.fit(X_train, Y_train)
    return rf_random.best_params_

## based on the hyper parameter optimization, the below model is built.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Hyper-parameters of the final forest.
rf_params = {
    'n_estimators': 1400,
    'min_samples_split': 5,
    'min_samples_leaf': 4,
    'max_features': 'sqrt',
    'max_depth': 80,
    'bootstrap': True,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, Y_train)

# Predict the hold-out split and measure the share of correct predictions.
Y_pred = rf_model.predict(X_test)
score_rf = metrics.accuracy_score(Y_test, Y_pred)
print("Test Accuracy:",score_rf)

**4.3 Submit Predictions**

Comparing the accuracy of neural network and random forest models and submitting the predictions from the best model.

In [None]:
from sklearn.impute import SimpleImputer

# Prepare the competition test set the same way as the training features.
# .copy() makes xTest independent of testData, so nothing below touches the raw frame.
xTest = handle_categorical_na(testData[selFeatures].copy())
## using One Hot Encoding for handling categorical data
xTest = pd.get_dummies(xTest, columns=['Embarked', 'Sex'], prefix=['Embarked', 'Sex'])
# Give xTest exactly X_train's columns in X_train's order: unknown columns are
# dropped and absent ones are added as all-zero columns. Appending the missing
# columns at the end (as done before) left the features in a different order than
# the one the models were trained on, so values were fed into the wrong inputs.
xTest = xTest.reindex(columns=X_train.columns, fill_value=0)
# Mean-impute any value that is still missing, keeping column names and row labels.
my_imputer = SimpleImputer()
xTest = pd.DataFrame(my_imputer.fit_transform(xTest), columns=xTest.columns, index=xTest.index)

## Comparing and submitting the best result
if score_nn > score_rf:
    # Threshold the sigmoid output at 0.5 to obtain 0/1 labels. This is what
    # Sequential.predict_classes returned; that helper is not available in
    # recent Keras releases, whereas predict() exists in all of them.
    predictions = (nn_model.predict(xTest) > 0.5).astype('int32').ravel()
else:
    predictions = rf_model.predict(xTest)
submission = pd.DataFrame({'PassengerId': testData.PassengerId, 'Survived': predictions})
submission.to_csv('submission.csv', index=False)
submission.head()
