# 2.Problem Statement
Predicting Survival in the Titanic Data Set
We will be using a decision tree to make predictions about the Titanic data
set from Kaggle. This data set provides information on the Titanic
passengers and can be used to predict whether a passenger survived or
not.

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sb 
import matplotlib.pyplot as plt 
import sklearn 
from pandas import Series, DataFrame 
from pylab import rcParams 
from sklearn import preprocessing 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import accuracy_score,classification_report

In [2]:
Url= "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv "
titanic_df = pd.read_csv(Url)

titanic_df.columns

titanic_df.describe()

#  Function for data preprocessing
1. Feature selection (as given in the assignment)
2. Handling missing values

In [5]:
def data_preprocessing(df):
    #titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin','Embarked'], 1, inplace=True)
    #print(titanic.tail())
    df = df.loc[:,['Survived','Pclass','Sex','Age','SibSp','Parch','Fare']]
    df.convert_objects(convert_numeric=True)
    df.fillna(0, inplace=True) 
    
    return df

# Function to handle categorical / Non-numerical data

In [7]:
def handle_non_numeric_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            
            x = 0            
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1                    
            df[column] = list(map(convert_to_int, df[column]))
            
    return df


# Data preprocessing
1. Call function data_preprocessing on titanic dataset to select features and handle missing data
2. Call function handle_non_numeric_data on titanic dataset to handle non-numeric data

In [8]:
titanic_df1 = data_preprocessing(titanic_df)
print(titanic_df1.tail())

     Survived  Pclass     Sex   Age  SibSp  Parch   Fare
886         0       2    male  27.0      0      0  13.00
887         1       1  female  19.0      0      0  30.00
888         0       3  female   0.0      1      2  23.45
889         1       1    male  26.0      0      0  30.00
890         0       3    male  32.0      0      0   7.75


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """


In [9]:
titanic_df1 = handle_non_numeric_data(titanic_df1)
print(titanic_df1.tail())

     Survived  Pclass  Sex   Age  SibSp  Parch   Fare
886         0       2    0  27.0      0      0  13.00
887         1       1    1  19.0      0      0  30.00
888         0       3    1   0.0      1      2  23.45
889         1       1    0  26.0      0      0  30.00
890         0       3    0  32.0      0      0   7.75


# Define input parameters for machine learning model

In [10]:

X = np.array(titanic_df1.drop(['Survived'], 1).astype(float))
Y = np.array(titanic_df1['Survived'])

# Apply model
1. Split dataset into training and test dataset
2. Apply machine learning model (Decision tree classifier)
3. Fit the model on input parameters X,Y

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)
d_tree = DecisionTreeClassifier(min_samples_split=20, random_state=99)
d_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=99,
            splitter='best')

# Model evaluation
1. Predict outcome for test dataset using the model applied on trainning dataset
2. Calculate accuracy using the actual and predicted outcome

In [12]:
Y_pred = d_tree.predict(X_test)
print("Accuracy is ", accuracy_score(Y_test,Y_pred)*100)

Accuracy is  78.73134328358209


In [13]:
# this produces a 2x2 numpy array (matrix)
confusion = metrics.confusion_matrix(Y_test, Y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FN = confusion[1, 0]
print(f"Accuracy calculation using confusion metrics : {((TP + TN) / float(TP + TN + FP + FN))}")

Accuracy calculation using confusion metrics : 0.7873134328358209


In [14]:
print(f"classification_error using accuracy_score is : {1 - metrics.accuracy_score(Y_test, Y_pred)}")
print(f"classification_error using confusion metrics is : {(FP + FN) / float(TP + TN + FP + FN)}")

classification_error using accuracy_score is : 0.2126865671641791
classification_error using confusion metrics is : 0.2126865671641791


# Model deployment (Check outcome for unseen data)

In [15]:
arr = np.array([[3,1,22,1,0,7.25],[3.,1.,20.,0.,0.,7.8958],[1,0,38,1,0,71.2833]])
d_tree.predict(arr)

array([0, 0, 0], dtype=int64)