In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# plotly
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly import tools

# matplotlib
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the "../input" directory.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.

First, let us take a look at what our data looks like, to figure out which features are important to us.

In [None]:
# Load the training and test sets from the "../input" directory.
trainData = pd.read_csv(filepath_or_buffer='../input/train.csv')
testData = pd.read_csv(filepath_or_buffer='../input/test.csv')

# Preview the first rows of the training set to see which columns are available.
trainData.head(n=5)

Displaying the summary of the training data

In [None]:
# Show pandas' descriptive-statistics summary of the training frame.
trainData.describe()

Selecting the columns required for building the model and checking for relations using visualization techniques.

In [None]:
# Names of the features extracted from the data.
targetCol = 'Survived'

# Columns that are never used as model inputs: the target itself, plus the
# cabin and ticket features, which are left out for the initial run.
excludedCols = {targetCol, 'Ticket', 'Cabin'}

# Keep every remaining column unless all of its values are distinct (one
# value per row). A comprehension is used on purpose: calling
# list.remove() while iterating over the same list skips the element that
# follows each removal, so some columns would never be checked.
selFeatures = [
    col for col in trainData.columns
    if col not in excludedCols and not trainData[col].is_unique
]

print("Target Class: '"+ targetCol + "'")
print('Features to be investigated: ')
print(selFeatures)

In [None]:
def plotGraph(plotData,msg):
    """Render a grouped bar chart comparing the first two rows of a table.

    Parameters
    ----------
    plotData : pandas.DataFrame
        Table (e.g. a crosstab) with at least two rows. Its column labels
        become the x-axis categories; row 0 is drawn as the 'No' series and
        row 1 as the 'Yes' series. The column index name is used as the
        x-axis title.
    msg : str
        Title shown above the chart.

    The figure is displayed with plotly's ``iplot``; nothing is returned.
    """
    categories = plotData.columns.values
    counts = plotData.values

    bars = [
        go.Bar(x=categories, y=counts[0], name='No'),
        go.Bar(x=categories, y=counts[1], name='Yes'),
    ]

    # Fixed-size 500x500 figure with the two series placed side by side.
    figLayout = {
        'title': msg,
        'xaxis': {'title': plotData.columns.name},
        'yaxis': {'title': 'Number of people'},
        'barmode': 'group',
        'autosize': False,
        'width': 500,
        'height': 500,
    }

    iplot({'data': bars, 'layout': figLayout})

In [None]:
# Survival counts broken down by passenger class.
pclass = pd.crosstab(index=[trainData.Survived], columns=trainData.Pclass)
plotGraph(plotData=pclass, msg='Survived based on Pclass')

In [None]:
# Survival counts broken down by sex.
sex = pd.crosstab(index=[trainData.Survived], columns=trainData.Sex)
plotGraph(plotData=sex, msg='Survived based on sex')

In [None]:
# Survival counts broken down by the Embarked column.
embarked = pd.crosstab(index=[trainData.Survived], columns=trainData.Embarked)
plotGraph(plotData=embarked, msg='Survived based on embarked')

In [None]:
# Survival counts broken down by the SibSp column.
SibSp = pd.crosstab(index=[trainData.Survived], columns=trainData.SibSp)
plotGraph(plotData=SibSp, msg='Survived based on SibSp')

In [None]:
# Survival counts broken down by the Parch column.
Parch = pd.crosstab(index=[trainData.Survived], columns=trainData.Parch)
plotGraph(plotData=Parch, msg='Survived based on Parch')

In [None]:
def plotLine(plotData,msg):
    """Render a line chart comparing the first two rows of a table.

    Parameters
    ----------
    plotData : pandas.DataFrame
        Table (e.g. a crosstab) with at least two rows. Its column labels
        are the x values; row 0 is drawn as the 'No' line and row 1 as the
        'Yes' line. The column index name is used as the x-axis title.
    msg : str
        Title shown above the chart.

    The figure is displayed with plotly's ``iplot``; nothing is returned.
    """
    xValues = plotData.columns.values

    # One line per outcome row, in row order.
    lines = []
    for rowPos, label in enumerate(('No', 'Yes')):
        lines.append(
            go.Scatter(x=xValues, y=plotData.values[rowPos], mode='lines', name=label)
        )

    # Fixed-size 500x500 figure.
    figLayout = {
        'title': msg,
        'xaxis': {'title': plotData.columns.name},
        'yaxis': {'title': 'Number of people'},
        'autosize': False,
        'width': 500,
        'height': 500,
    }

    iplot({'data': lines, 'layout': figLayout})

In [None]:
# Survival counts for each distinct Age value, drawn as lines.
Age = pd.crosstab(index=[trainData.Survived], columns=trainData.Age)
plotLine(plotData=Age, msg='Survival based on Age')

In [None]:
# Survival counts for each distinct Fare value, drawn as lines.
Fare = pd.crosstab(index=[trainData.Survived], columns=trainData.Fare)
plotLine(plotData=Fare, msg='Survival based on Fare')

In [None]:
# value_counts() orders its result by descending frequency, not by class
# label, so the counts are re-indexed explicitly to line up with the fixed
# ['Died', 'Survived'] bar labels below.
# NOTE(review): assumes Survived is coded 0 = died, 1 = survived - confirm.
survivedCounts = trainData.Survived.value_counts().reindex([0, 1], fill_value=0)
targetClass = survivedCounts.values.tolist()

data = [go.Bar(x=['Died','Survived'],y=targetClass)]
# Fixed-size 500x500 figure.
layout = dict(
        title = "Comparison of Classes (Died/Survived)",
        yaxis= dict(title= 'Number of people'),
        autosize=False,
        width=500,
        height=500
    )
fig = dict(data=data, layout=layout)
iplot(fig)

A scatter-plot matrix showing the relationships between the features that will be used for training. It helps in finding correlations between the features.

In [None]:
import seaborn as sns

# Use seaborn's "ticks" style for the figure.
sns.set(style="ticks")

# Plot the selected features together with the target column, which is
# used to colour the points.
plotFeatures = list(selFeatures) + ["Survived"]
sns.pairplot(data=trainData[plotFeatures], hue="Survived")

Building a neural network using Keras (with scikit-learn used for splitting and scaling the data)

In [None]:
# Integer codes substituted for the categorical string values.
sexCodes = {"male": 1, "female": 2}
embarkedCodes = {"Q": 1, "S": 2, "C": 3}
replaceStr = {"Sex": sexCodes, "Embarked": embarkedCodes}

# Apply the same clean-up to both frames, in place: missing values become 0
# first, then the categorical strings are swapped for their codes.
for frame in (trainData, testData):
    frame.fillna(0, inplace=True)
    frame.replace(replaceStr, inplace=True)

# Preview the selected feature columns after the clean-up.
trainData.loc[:, selFeatures].head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. A fixed random_state
# makes the split itself reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    trainData[selFeatures],
    trainData.Survived,
    test_size=0.2,
    random_state=42,
)

In [None]:
## standardise the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on the training split only. Fitting on all of trainData would let the
# hold-out rows influence the scaling statistics and leak into evaluation.
scaler.fit(X_train)

xTrain = scaler.transform(X_train)
# Re-select the hold-out rows from trainData by index instead of
# transforming X_test itself, so re-running this cell does not scale
# already-scaled values a second time. This relies on trainData keeping the
# unique default index it got from read_csv.
X_test = scaler.transform(trainData.loc[y_test.index, selFeatures])
xTest = scaler.transform(testData[selFeatures])

In [None]:
# Import `Sequential` from `keras.models`
from keras.models import Sequential

# Import `Dense` and `Dropout` from `keras.layers`
from keras.layers import Dense, Dropout

# Initialize the constructor
model = Sequential()

# Add an input layer. The input width is taken from the scaled training
# matrix so it always matches the number of selected features.
model.add(Dense(24, activation='relu', input_shape=(xTrain.shape[1],)))
model.add(Dropout(0.5))
# Add one hidden layer
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
# Add an output layer: a single sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(xTrain, y_train,epochs=20, batch_size=5, verbose=1)

In [None]:
# Score the fitted model on the hold-out split.
model.evaluate(x=X_test, y=y_test, verbose=1)

In [None]:
# Sequential.predict_classes() has been removed from current Keras releases.
# For a single sigmoid output, the equivalent is to threshold the predicted
# probability at 0.5.
probabilities = model.predict(xTest, batch_size=1)
predictions = (probabilities > 0.5).astype("int32")

# predictions has one column; flatten it to one label per passenger.
submission = pd.DataFrame({'PassengerId': testData.PassengerId, 'Survived': predictions.ravel()})
submission.to_csv('submission.csv', index=False)
submission.head()
