In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
import warnings
warnings.filterwarnings('ignore')

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the three competition files from the Kaggle input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
true_result = pd.read_csv('../input/gender_submission.csv')

# Preview a few rows; the seed keeps the displayed rows identical on every
# Restart & Run All.
train.sample(3, random_state=123)

In [None]:
# Report the (rows, columns) of the train and test frames, one per line.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Some imputation needs to be done for these variables: Age, Cabin and Embarked
train.isnull().sum()

Too many Age records are missing: imputing them with the mean age across the dataset may unbalance the data. I recommend treating this group as a separate bucket from all other ages. These passengers may have been in a lower class and may not have provided much profile information when they bought a ticket.

In [None]:
# describe one column to get its quartile values
train.Fare.describe()

__Variable Transformation__

In [None]:
import re
def clean_variable(df):
    """Impute, bucket and derive the raw passenger columns used as features.

    The work is done on a copy, so the frame passed in is left untouched.

    Transformations:
      * Age: missing values become -0.5, then the column is binned into
        'Unknown' (-1, 0], 'Child' (0, 18], 'Adult' (18, 65] and
        'Senior' (65, 100].
      * Embarked, Cabin: missing values become 'N'; Cabin is then reduced to
        its first character.
      * Fare: missing values become -0.5, then the column is binned into
        'Economy' (-1, 15] and 'Business' (15, 1000]. A missing fare
        therefore lands in 'Economy'.
      * Title: the text between ', ' and the next '.' in Name.
      * Name is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age, Embarked, Cabin, Fare and Name.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformed columns, an added Title column and
        no Name column.
    """
    df = df.copy()  # keep the caller's frame intact

    df['Age'] = pd.cut(df['Age'].fillna(-0.5), (-1, 0, 18, 65, 100),
                       labels=['Unknown', 'Child', 'Adult', 'Senior'])
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Cabin'] = df['Cabin'].fillna('N').str[0]
    df['Fare'] = pd.cut(df['Fare'].fillna(-0.5), (-1, 15, 1000),
                        labels=['Economy', 'Business'])
    df['Title'] = df['Name'].apply(lambda x: x.split(', ')[1].split('.')[0])
    return df.drop(columns=['Name'])
train = clean_variable(train)
test = clean_variable(test)
train.head()

In [None]:
sns.barplot(x="Age", y="Survived", hue="Sex", data=train);

In [None]:
# Distinct titles found across the train and test frames combined
pd.concat([train['Title'], test['Title']]).unique()

In [None]:
# Lookup table that collapses the raw titles into a handful of groups
normalized_titles = {
    # military, medical and clerical titles
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    # nobility
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Lady": "Royalty",
    # married women
    "Mme": "Mrs",
    "Ms": "Mrs",
    "Mrs": "Mrs",
    # unmarried women
    "Mlle": "Miss",
    "Miss": "Miss",
    # unchanged
    "Mr": "Mr",
    "Master": "Master",
}

# Apply the table to both frames. A value that is not a key of the table is
# kept as it is instead of becoming NaN: an unexpected title survives, and
# re-running this cell no longer wipes out the already-normalized
# "Officer" / "Royalty" values.
train['Title'] = train['Title'].map(lambda title: normalized_titles.get(title, title))
test['Title'] = test['Title'].map(lambda title: normalized_titles.get(title, title))

In [None]:
# Distinct titles across train and test at this point in the notebook
pd.concat([train['Title'], test['Title']]).unique()

In [None]:
sns.barplot(x="Cabin", y="Survived", hue="Sex", data=train);

__Label Encoding__

In [None]:
from sklearn import preprocessing
def encode_features(df_train, df_test):
    """Replace the categorical feature columns of both frames with integer codes.

    For each of Fare, Cabin, Age, Sex, Title and Embarked a LabelEncoder is
    fitted on the train and test values stacked together, so both frames
    share one mapping and a value present in only one frame still gets a code.

    The columns are overwritten on the frames passed in, and those same two
    frames are returned as a (train, test) tuple.
    """
    categorical_columns = ['Fare', 'Cabin', 'Age', 'Sex', 'Title', 'Embarked']
    stacked = pd.concat([df_train[categorical_columns], df_test[categorical_columns]])

    for column in categorical_columns:
        # Fit on the union of both frames, then encode each frame separately.
        encoder = preprocessing.LabelEncoder().fit(stacked[column])
        df_train[column] = encoder.transform(df_train[column])
        df_test[column] = encoder.transform(df_test[column])

    return df_train, df_test
    
train, test = encode_features(train, test)
train.head()

__Splitting up the Training Data__

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors; the row identifier and the raw
# ticket string are left out of the feature matrix.
y_all = train['Survived']
X_all = train.drop(columns=['Survived', 'PassengerId', 'Ticket'])

# Hold out 30% of the labelled rows for evaluation, with a fixed seed so the
# split is repeatable.
num_test = 0.30
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=num_test,
    random_state=123,
)
X_train.head()

__Fitting and Tuning an Algorithm__

__DecisionTreeClassifier__ (https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier.
clf = DecisionTreeClassifier()

# Hyper-parameter grid searched exhaustively by GridSearchCV
parameters = {'criterion': ['entropy', 'gini'],
              'splitter': ['best', 'random'],
              'max_depth': [2, 3, 5, 10, 20],
              'min_samples_split': [3, 5, 8, 10],
              'min_samples_leaf': [1, 2, 5],
              # 'auto' is not listed: for this classifier it was an alias of
              # 'sqrt', and recent scikit-learn releases reject it.
              'max_features': ['log2', 'sqrt'],
              'random_state': [1234]
             }
# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search with 3-fold cross-validation. The `iid` argument is not
# passed: current scikit-learn releases no longer accept it.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer, cv=3)
grid_obj = grid_obj.fit(X_train, y_train)

# Keep the best combination of parameters. With the default refit=True the
# search has already refitted this estimator on all of X_train, so no extra
# fit call is needed.
clf = grid_obj.best_estimator_
print(clf)

# Accuracy on the rows used for fitting and on the held-out rows
train_predictions = clf.predict(X_train)
print('train accuracy: '+str(accuracy_score(y_train, train_predictions)))
test_predictions = clf.predict(X_test)
print('test accuracy: '+str(accuracy_score(y_test, test_predictions)))

# Feature importances of the fitted tree, largest first
feature_importance_df = pd.DataFrame(
                    {'Features': X_train.columns,
                     'Importances': clf.feature_importances_
                    }).sort_values(by=['Importances'], ascending=False)
feature_importance_df

In [None]:
#Feature importance plot
sns.barplot(x="Importances", y="Features", data=feature_importance_df,palette="Blues_d")

__Validate with KFold__

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

Parameters:	
- n_splits : int, default=3: Number of folds. Must be at least 2. - Changed in version 0.20: n_splits default value will change from 3 to 5 in v0.22.
- shuffle : boolean, optional: Whether to shuffle the data before splitting into batches.
- random_state : int, RandomState instance or None, optional, default=None. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. Used when shuffle == True.

Methods:
- get_n_splits([X, y, groups])	Returns the number of splitting iterations in the cross-validator
- split(X[, y, groups])	Generate indices to split data into training and test set.

In [None]:
from sklearn.model_selection  import KFold

def run_kfold(clf):
    """Print the accuracy of `clf` on 5 consecutive folds, then the mean.

    Reads the notebook-level `X_all` and `y_all`. Every fold trains a fresh
    clone of `clf`, so the estimator passed in is not refitted and can still
    be used afterwards with whatever fit it already had.

    Parameters
    ----------
    clf : scikit-learn estimator
        Estimator whose configuration is evaluated.

    Returns
    -------
    None
        The per-fold accuracies and their mean are printed, not returned.
    """
    from sklearn.base import clone  # local import: only needed by this helper

    kf = KFold(n_splits=5)  # each round trains on 4/5 of the rows and tests on the remaining 1/5
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(X_all), start=1):
        fold_X_train, fold_X_test = X_all.values[train_index], X_all.values[test_index]
        fold_y_train, fold_y_test = y_all.values[train_index], y_all.values[test_index]

        # Unfitted copy with the same hyper-parameters as `clf`
        fold_clf = clone(clf)
        fold_clf.fit(fold_X_train, fold_y_train)

        accuracy = accuracy_score(fold_y_test, fold_clf.predict(fold_X_test))
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))

run_kfold(clf)

__Predict the Actual Test Data__

And now for the moment of truth. Make the predictions, export the CSV file, and upload them to Kaggle.

In [None]:
# Predict with `clf` on the competition test rows (identifier and ticket
# columns removed) and save the result as a two-column CSV.
ids = test.PassengerId
predictions = clf.predict(test.drop(columns=['PassengerId', 'Ticket']))

output = pd.DataFrame({'PassengerId': ids, 'Survived': predictions})
output.to_csv('titanic-predictions.csv', index=False)
output.head()

In [None]:
# Check that both frames list the same PassengerIds in the same order
y_pred_id = output['PassengerId'].tolist()
y_true_id = true_result['PassengerId'].tolist()
print('If PassengerId is in the right order: '+str(y_pred_id == y_true_id))
# Accuracy of the predictions measured against the labels in true_result.
# NOTE(review): true_result is read from gender_submission.csv; on Kaggle
# that file is normally a sample submission rather than ground truth, so
# confirm before treating this number as a real score.
y_pred = output['Survived'].tolist()
y_true = true_result['Survived'].tolist()
print('Public score: '+str(accuracy_score(y_true, y_pred)))