In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

#load training data
train = pd.read_csv('Data/train.csv');

train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


**Qualitative Attributes:** PassengerId, Survived, Pclass, Sex, Ticket, Cabin & Embarked.

**Quantitative Attributes:** SlibSp, Parch & Fare.

In [2]:
#load test data
test = pd.read_csv('Data/test.csv');

test.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [3]:
complete_set = [train, test]

for data in complete_set:
    print(pd.isnull(data).sum())
    print('........................')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
........................
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
........................


**PassengerId** is a Nominal attribute, this means if we reassign all passengers ids, it will not make any difference on the data set hence we can ignore this for now.

**Survived** is target attribute.

**Pclass** is numeric attribute and it does not have any missing value, lets check it’s impact on target attribute.

In [4]:
print(train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


Here we have taken mean, this data represents that Pclass=1 survival rate is 62.96%.

**Sex** attribute is categorical attribute having values male or female and does not have missing values.

In [5]:
print(train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean())

      Sex  Survived
0  female  0.742038
1    male  0.188908


**Females** are more secure, Female survival rate is more than male.

**SibSp** is a numerical attribute represents the siblings/spouse.

In [6]:
print(train[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean())

   SibSp  Survived
0      0  0.345395
1      1  0.535885
2      2  0.464286
3      3  0.250000
4      4  0.166667
5      5  0.000000
6      8  0.000000


**Parch** is also numerical attribute represents the children/parents.

In [7]:
print(train[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean())

   Parch  Survived
0      0  0.343658
1      1  0.550847
2      2  0.500000
3      3  0.600000
4      4  0.000000
5      5  0.200000
6      6  0.000000


**Family Size**

Based on the impact of SibSp & Parch, lets create a new attribute called Family size.

In [8]:
for data in complete_set:
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

print(train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())

   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


This seems to have a good effect on our prediction, lets check if what is the effect of being alone on this ship and create another attribute called **IsAlone**.

In [9]:
for data in complete_set:
    data['IsAlone'] = 0
    data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1

print(train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())

   IsAlone  Survived
0        0  0.505650
1        1  0.303538


**Embarked** attribute is having missing values, best way to fill this would be by most occurred value. In this data set most occurred value for Embarked attribute is ‘S’.

In [10]:
for data in complete_set:
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

print(train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())

  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


**Fare** is a numeric attribute and it has missing values too. In case of Fare, filling missing values with median will be better choice than mean. It is a numeric attribute hence we need to convert this to categorical attribute to make it ready for ML algorithms we are going to use. Here we have categorized fare into 4 categories.

In [11]:
for data in complete_set:
    data['Fare'] = data['Fare'].fillna(train['Fare'].median())
    
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

print(train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())

   CategoricalFare  Survived
0   (-0.001, 7.91]  0.197309
1   (7.91, 14.454]  0.303571
2   (14.454, 31.0]  0.454955
3  (31.0, 512.329]  0.581081


**Age** is also a numeric attribute and having missing values. Filling the missing values in age attribute is tricky one. We are generating random numbers between (mean — std) and (mean + std) and filling the same. We categorized age into 5 range.

In [12]:
for data in complete_set:
    age_avg = data['Age'].mean()
    age_std = data['Age'].std()
    age_null_count = data['Age'].isnull().sum()
    
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size = age_null_count)
    
    data['Age'][np.isnan(data['Age'])] = age_null_random_list
    data['Age'] = data['Age'].astype(int)
    
train['CategoricalAge'] = pd.cut(train['Age'], 5)

print(train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

  CategoricalAge  Survived
0  (-0.08, 16.0]  0.559633
1   (16.0, 32.0]  0.350877
2   (32.0, 48.0]  0.365854
3   (48.0, 64.0]  0.434783
4   (64.0, 80.0]  0.090909


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


**Name**, could have been ignored but if you look closely names has titles which can have impact on target attribute. Lets cross check the same.

In [13]:
for data in complete_set:
    data['Title'] = ''
    data['Title']=data['Name'].str.extract('([A-Za-z]+)\.')
    
    data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    
    data['Title'] = data['Title'].replace('Mlle', 'Miss')
    data['Title'] = data['Title'].replace('Ms', 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')

print(train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()) 
#train.head()

    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826


In [14]:
for data in complete_set:
    #Mapping dummies for Sex
    data['Sex'] = data['Sex'].map({'female':0, 'male':1}).astype(int)
    
    #Mapping dummies for Titles
    title_mapping = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
    data['Title'] = data['Title'].map(title_mapping)
    data['Title'] = data['Title'].fillna(0)
    
    #Mapping dummies for Embarked
    data['Embarked'] = data['Embarked'].map({'S':0, 'C':1, 'Q':2})
    
    #Mapping dummies for Fare
    data.loc[data['Fare'] <= 7.91, 'Fare'] = 0
    data.loc[(data['Fare'] > 7.91) & (data['Fare'] <= 14.454), 'Fare'] = 1
    data.loc[(data['Fare'] > 14.454) & (data['Fare'] <= 31), 'Fare'] = 2
    data.loc[data['Fare'] > 31, 'Fare'] = 3
    data['Fare'] = data['Fare'].astype(float)
    
    #Mapping dummies for Age
    data.loc[data['Age'] <= 16, 'Age'] = 0
    data.loc[(data['Age'] > 16) & (data['Age'] <= 32), 'Age'] = 1
    data.loc[(data['Age'] > 32) & (data['Age'] <= 48), 'Age'] = 2
    data.loc[(data['Age'] > 48) & (data['Age'] <= 64), 'Age'] = 3
    data.loc[data['Age'] > 64, 'Age'] = 4
    
#Feature Selection
drop_elements = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'FamilySize']
train.drop(drop_elements, axis=1, inplace=True)
train.drop(['CategoricalAge', 'CategoricalFare', 'PassengerId'], axis=1, inplace=True)
test.drop(drop_elements, axis=1, inplace=True)
    

In [15]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,IsAlone,Title
0,0,3,1,1,0.0,0,0,1
1,1,1,0,2,3.0,1,0,3
2,1,3,0,1,1.0,0,1,2
3,1,1,0,2,3.0,0,0,3
4,0,3,1,2,1.0,0,1,1


In [16]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,IsAlone,Title
0,892,3,1,2,0.0,2,1,1
1,893,3,0,2,0.0,0,0,3
2,894,2,1,3,1.0,2,1,1
3,895,3,1,1,1.0,0,1,1
4,896,3,0,1,1.0,0,0,3


Now the data looks clean.

### Train Test  Split

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y = train['Survived']

X = train.drop('Survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [108]:
import tensorflow as tf

In [109]:
#feature columns for estimator

feat_cols = []

for col in X.columns:
    feat_cols.append(tf.feature_column.numeric_column(col))

In [110]:
feat_cols

[_NumericColumn(key='Pclass', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Sex', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Embarked', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='IsAlone', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Title', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [111]:
#creating input function for the classifier
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=100, num_epochs=10, shuffle=True)

In [112]:
classifier = tf.estimator.DNNClassifier(hidden_units=[20,10,5,10,20], n_classes=2, feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_log_step_count_steps': 100, '_global_id_in_cluster': 0, '_train_distribute': None, '_is_chief': True, '_num_worker_replicas': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002E25FD382E8>, '_task_id': 0, '_service': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_master': '', '_session_config': None, '_keep_checkpoint_max': 5, '_model_dir': 'C:\\Users\\SAURAV~1.PRA\\AppData\\Local\\Temp\\tmpvlcl944r', '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_tf_random_seed': None, '_task_type': 'worker', '_device_fn': None}


In [119]:
classifier.train(input_fn=input_func, steps=11000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\SAURAV~1.PRA\AppData\Local\Temp\tmpvlcl944r\model.ckpt-63
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 63 into C:\Users\SAURAV~1.PRA\AppData\Local\Temp\tmpvlcl944r\model.ckpt.
INFO:tensorflow:loss = 36.14674, step = 64
INFO:tensorflow:Saving checkpoints for 126 into C:\Users\SAURAV~1.PRA\AppData\Local\Temp\tmpvlcl944r\model.ckpt.
INFO:tensorflow:Loss for final step: 8.963756.


<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x2e2611b2d30>

In [120]:
pred_fn = tf.estimator.inputs.pandas_input_fn(x=X_test, batch_size=len(X_test), shuffle=False)

In [121]:
predictions = list(classifier.predict(input_fn=pred_fn))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\SAURAV~1.PRA\AppData\Local\Temp\tmpvlcl944r\model.ckpt-126
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [122]:
final_preds = []

for pred in predictions:
    final_preds.append(pred['class_ids'][0])

In [123]:
final_preds[1:10]

[0, 1, 1, 0, 1, 0, 0, 0, 0]

In [124]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, final_preds))
print('\n')
print(classification_report(y_test, final_preds))
print('\n')
print(accuracy_score(y_test, final_preds))




'''test_temp = test.drop('PassengerId', axis=1)
df = pd.concat([test['PassengerId'], pd.DataFrame(predictions, columns= ['Survived'])], axis=1)
df.to_csv('Survival_TF', index=False)'''

[[145  20]
 [ 34  69]]


              precision    recall  f1-score   support

           0       0.81      0.88      0.84       165
           1       0.78      0.67      0.72       103

   micro avg       0.80      0.80      0.80       268
   macro avg       0.79      0.77      0.78       268
weighted avg       0.80      0.80      0.80       268



0.7985074626865671


"test_temp = test.drop('PassengerId', axis=1)\ndf = pd.concat([test['PassengerId'], pd.DataFrame(predictions, columns= ['Survived'])], axis=1)\ndf.to_csv('Survival_TF', index=False)"