## Titanic Survival Prediction

**Project intended to compute wether a passenger survived or not when RMS titanic ship struck in iceberg and sank**

In [1]:
#importing some useful libraries for computational purpose along with data modeling 

import numpy as np
import pandas as pd

from sklearn.cross_validation import ShuffleSplit

#load dataset 
data = pd.read_csv('titanic_data.csv')

#removing the NaN values from Age class 

data['Age'].dropna(inplace = True)
data.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# cleansing of dataset for mathematical computation

#converting age into mathematical way i.e. is label encoding
data = data.replace(['male','female'],[1,0])

#encoding Embarked into numeric format.
data['Embarked'] = data['Embarked'].replace(['S','C','Q'],[2,0,1])
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2.0


In [3]:
#breaking full data set into features and output 
data.dropna(inplace=True)
#creating output i.e is survived or not 
survived = data['Survived']
data = data.drop('Survived',axis=1)

#picking features from our data set  
features = data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

#printing or showcasing head of features i.e is first 5 entries of features.
features.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,0,38.0,1,0,71.2833,0.0
3,1,0,35.0,1,0,53.1,2.0
6,1,1,54.0,0,0,51.8625,2.0
10,3,0,4.0,1,1,16.7,2.0
11,1,0,58.0,0,0,26.55,2.0


In [4]:
#creating a performance mertircs function

#importing r2_score from sklearn 

from sklearn.metrics import r2_score

#defining function for performance
def performance_mertics(y_predict,y_true):
    """Function to compute performance metrics based on r2_score"""
    
    
    #computing score 
    score = r2_score(y_predict,y_true)
    
    #returning score i.e is computed value 
    return score

In [5]:
# spliting data set into train and test subset 

#importing train_test_split from sklearn for model building
from sklearn.cross_validation import train_test_split

#spliting value of train and test into X_train,X_test,y_train,y_test
X_train,X_test,y_train,y_test = train_test_split(features,survived,test_size = 0.25,random_state=20)

In [6]:
# building a model for prediction 

#importing some important functions from libraries for GridSearch,make_scorer,DescisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

def make_model(X,y):
    """Making a model for classifing wether a person survived or not"""
    
    #creating a cv_set for increasing variablity of data 
    cv_set = ShuffleSplit(X.shape[0],n_iter=10,test_size=0.1,random_state=20)
    
    #creating a param list for better prediction
    params = {'max_depth':list(range(1,11))}
    
    #making a regressor instance of DescisionTreeClassifier 
    reg = DecisionTreeClassifier()
    
    #make a scoring function for gridserach
    scoring_fun = make_scorer(performance_mertics)
    
    #making a grid of GridSearch
    grid = GridSearchCV(reg,param_grid=params,scoring=scoring_fun,cv=cv_set)
    
    #fitting the values X,y in grid model
    grid.fit(X,y)
    
    #returning best estimator for the the prediction
    return grid.best_estimator_



In [7]:
#making a classifier for our training dataset 
clf = make_model(X_train,y_train)

In [30]:
X_test.shape[0]
survival_predict = []
survival_true = []
for i in range(0,X_test.shape[0]):
    predict = clf.predict([X_test.iloc[i]])
    survival_predict.append(predict[0])
    survival_true.append(y_test.iloc[i])
output =  np.array([survival_predict,survival_true],dtype=int)
output

array([[1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
        0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
        0, 1]])