#Titanic Challenge - Kaggle
###Implementing a notebook to better understand data science best practices

I - Import all necessary packages and libraries needed to complete the tasks

In [39]:
import sys #access to system parameters
print("Python version: {}".format(sys.version))

import pandas as pd ##collection of functions for data processing and analysis modeled after 
                    # R dataframes with SQL like features
print("\nPandas version: {}".format(pd.__version__))

import matplotlib
print("Matpltlib version: {}".format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython.display import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time


#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)



# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print('Input Data Files:\n' + check_output(["ls", "../rmsTitanic/input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.


#Highly used modeling algorithms
from sklearn import \
    svm, tree, linear_model, \
    neighbors, naive_bayes, \
    ensemble, discriminant_analysis, \
    gaussian_process

from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics 

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import pandas.tools.plotting

#show plots in Jupyter Notebook browser
%matplotlib inline 
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

Python version: 2.7.15 |Anaconda, Inc.| (default, May  1 2018, 18:37:05) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]

Pandas version: 0.23.2
Matpltlib version: 2.2.2
NumPy version: 1.14.5
SciPy version: 1.1.0
IPython version: 5.7.0
scikit-learn version: 0.19.1
-------------------------
Input Data Files:
gender_submission.csv
test.csv
train.csv



II - Import Raw data into a pandas data frame and analyze the data at a high level.

In [40]:
#Import Data from csv file
data_raw = pd.read_csv('../rmsTitanic/input/train.csv')

#Use the test data provided for validation
data_val = pd.read_csv('../rmsTitanic/input/test.csv')

#Create a copy of the input data set
data1 = data_raw.copy(deep = True)

data_cleaner = [data1, data_val]

#preview data
print(data_raw.info())
data_raw.sample(10)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
858,859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24.0,0,3,2666,19.2583,,C
641,642,1,1,"Sagesser, Mlle. Emma",female,24.0,0,0,PC 17477,69.3,B35,C
634,635,0,3,"Skoog, Miss. Mabel",female,9.0,3,2,347088,27.9,,S
169,170,0,3,"Ling, Mr. Lee",male,28.0,0,0,1601,56.4958,,S
626,627,0,2,"Kirkland, Rev. Charles Leonard",male,57.0,0,0,219533,12.35,,Q
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
380,381,1,1,"Bidois, Miss. Rosalie",female,42.0,0,0,PC 17757,227.525,,C
55,56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S
553,554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22.0,0,0,2620,7.225,,C
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q


III - Clean and pre-process data

Correct, Complete, Create and Convert
	-- Correct: Correct any non-acceptable data inputs 
	-- Complete: Fill missing values and fields (Ex: Mode for categorical impute, mean/median for quantitative impute)
	-- Create: Add new features that will bring value to predicting the independent variable
	-- Covert: Convert to necessary format (Ex: Categorical to dummy vars, formatting dates, etc.)

In [79]:
## SUmmarize null values in the dataset

display("Null values in training data by column:", data1.isnull().sum() )

display("Null values in test data by column:", data_val.isnull().sum())

data_raw.describe(include= 'all')


'Null values in training data by column:'

Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         2
familySize       0
isAlone          0
Title            0
fareBins         0
ageBins          0
Sex_Code         0
Embarked_Code    0
AgeBin_Code      0
FareBin_Code     0
Title_Code       0
dtype: int64

'Null values in test data by column:'

PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            327
Embarked           0
familySize         0
isAlone            0
Title              0
fareBins           0
ageBins            0
Sex_Code           0
Embarked_Code      0
AgeBin_Code        0
FareBin_Code       0
Title_Code         0
dtype: int64

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Graham, Mr. George Edward",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [42]:
#Impute missing vals

for dataset in data_cleaner:
    dataset['Age'].fillna(dataset['Age'].median(), inplace=True)
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace=True)
    dataset['Embarked'].fillna(dataset['Embarked'].mode(), inplace=True)
    
#Deleting cabin feature column and Unique ID columns like PassengerId and Ticket from training data
deleteCols = ['PassengerId', 'Ticket', 'Cabin']
data1.drop(deleteCols, axis=1, inplace=True)

display(data1.isnull().sum())
display(data_val.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [43]:
## Create new features - feature engineering

for dataset in data_cleaner:
    #Family Size = Siblings/Spouse + Parents/Children + passenger
    dataset['familySize'] = dataset['SibSp']  + dataset['Parch'] + 1
    
    dataset['isAlone'] = 1 #Initialize feature to all true
    dataset['isAlone'].loc[dataset['familySize']>1] = 0
    
    #Splitting title from name of passenger
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    
    #Fare Bins -- 4
    dataset['fareBins'] = pd.qcut(dataset['Fare'], 4)
    #Age Bins -- 5
    dataset['ageBins'] = pd.qcut(dataset['Age'], 6)
    
#cleanup rare title names
title_names = (data1['Title'].value_counts() < 10)
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)

data1.head()

print('-'*20)

data1.info()

data_val.info()

display(data1.sample(10))

--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      889 non-null object
familySize    891 non-null int64
isAlone       891 non-null int64
Title         891 non-null object
fareBins      891 non-null category
ageBins       891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch    

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,familySize,isAlone,Title,fareBins,ageBins
536,0,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,26.55,S,1,1,Misc,"(14.454, 31.0]","(40.5, 80.0]"
551,0,2,"Sharp, Mr. Percival James R",male,27.0,0,0,26.0,S,1,1,Mr,"(14.454, 31.0]","(25.0, 28.0]"
804,1,3,"Hedman, Mr. Oskar Arvid",male,27.0,0,0,6.975,S,1,1,Mr,"(-0.001, 7.91]","(25.0, 28.0]"
306,1,1,"Fleming, Miss. Margaret",female,28.0,0,0,110.8833,C,1,1,Miss,"(31.0, 512.329]","(25.0, 28.0]"
249,0,2,"Carter, Rev. Ernest Courtenay",male,54.0,1,0,26.0,S,2,0,Misc,"(14.454, 31.0]","(40.5, 80.0]"
747,1,2,"Sinkkonen, Miss. Anna",female,30.0,0,0,13.0,S,1,1,Miss,"(7.91, 14.454]","(28.0, 31.0]"
194,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44.0,0,0,27.7208,C,1,1,Mrs,"(14.454, 31.0]","(40.5, 80.0]"
232,0,2,"Sjostedt, Mr. Ernst Adolf",male,59.0,0,0,13.5,S,1,1,Mr,"(7.91, 14.454]","(40.5, 80.0]"
542,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11.0,4,2,31.275,S,7,0,Miss,"(31.0, 512.329]","(0.419, 19.0]"
823,1,3,"Moor, Mrs. (Beila)",female,27.0,0,1,12.475,S,2,0,Mrs,"(7.91, 14.454]","(25.0, 28.0]"


In [75]:
# Convert to necessary format for mathematical analysis
label = LabelEncoder()

for dataset in data_cleaner:
    dataset['Sex_Code'] = label.fit_transform(dataset['Sex'])
    dataset['Embarked_Code'] = label.fit_transform(dataset['Embarked'])
    dataset['AgeBin_Code'] = label.fit_transform(dataset['ageBins'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['fareBins'])
    dataset['Title_Code'] = label.fit_transform(dataset['Title'])
    
#define y variable aka target/outcome
Target = ['Survived']

#define x variables for original features aka feature selection
data1_x = ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'Age', 'Fare', 'familySize', 'isAlone'] #pretty name/values for charts
data1_x_calc = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare'] #coded for algorithm calculation
data1_xy =  Target + data1_x
print('Original X Y:  ', data1_xy)


#define x variables for original w/bin features to remove continuous variables
data1_x_bin = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'familySize', 'AgeBin_Code', 'FareBin_Code']
data1_xy_bin = Target + data1_x_bin
print('Bin X Y:  ', data1_xy_bin)

#define x and y variables for dummy features original
data1_dummy = pd.get_dummies(data1[data1_x])
data1_x_dummy = data1_dummy.columns.tolist()
data1_xy_dummy = Target + data1_x_dummy
print('Dummy X Y:  ', data1_xy_dummy)
data1_dummy.sample(10)



('Original X Y:  ', ['Survived', 'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'familySize', 'isAlone'])
('Bin X Y:  ', ['Survived', 'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'familySize', 'AgeBin_Code', 'FareBin_Code'])
('Dummy X Y:  ', ['Survived', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'familySize', 'isAlone', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Misc', 'Title_Miss', 'Title_Mr', 'Title_Mrs'])


Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,familySize,isAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
746,3,1,1,16.0,20.25,3,0,0,1,0,0,1,0,0,0,1,0
381,3,0,2,1.0,15.7417,3,0,1,0,1,0,0,0,0,1,0,0
368,3,0,0,28.0,7.75,1,1,1,0,0,1,0,0,0,1,0,0
112,3,0,0,22.0,8.05,1,1,0,1,0,0,1,0,0,0,1,0
631,3,0,0,51.0,7.0542,1,1,0,1,0,0,1,0,0,0,1,0
470,3,0,0,28.0,7.25,1,1,0,1,0,0,1,0,0,0,1,0
760,3,0,0,28.0,14.5,1,1,0,1,0,0,1,0,0,0,1,0
4,3,0,0,35.0,8.05,1,1,0,1,0,0,1,0,0,0,1,0
205,3,0,1,2.0,10.4625,2,0,1,0,0,0,1,0,0,1,0,0
799,3,1,1,30.0,24.15,3,0,1,0,0,0,1,0,0,0,0,1


Double Check data cleaning

In [76]:
print('Train columns with null values:')
display(data1.isnull().sum())
print("-"*10)
data1.info()
print("-"*10)

print('Test/Validation columns with null values:' )
display(data_val.isnull().sum())
print("-"*10)
print (data_val.info())
print("-"*10)

data_raw.describe(include = 'all')

Train columns with null values:


Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         2
familySize       0
isAlone          0
Title            0
fareBins         0
ageBins          0
Sex_Code         0
Embarked_Code    0
AgeBin_Code      0
FareBin_Code     0
Title_Code       0
dtype: int64

----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
Embarked         889 non-null object
familySize       891 non-null int64
isAlone          891 non-null int64
Title            891 non-null object
fareBins         891 non-null category
ageBins          891 non-null category
Sex_Code         891 non-null int64
Embarked_Code    891 non-null int64
AgeBin_Code      891 non-null int64
FareBin_Code     891 non-null int64
Title_Code       891 non-null int64
dtypes: category(2), float64(2), int64(11), object(4)
memory usage: 120.3+ KB
----------
Test/Validation columns with null values:


PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            327
Embarked           0
familySize         0
isAlone            0
Title              0
fareBins           0
ageBins            0
Sex_Code           0
Embarked_Code      0
AgeBin_Code        0
FareBin_Code       0
Title_Code         0
dtype: int64

----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 21 columns):
PassengerId      418 non-null int64
Pclass           418 non-null int64
Name             418 non-null object
Sex              418 non-null object
Age              418 non-null float64
SibSp            418 non-null int64
Parch            418 non-null int64
Ticket           418 non-null object
Fare             418 non-null float64
Cabin            91 non-null object
Embarked         418 non-null object
familySize       418 non-null int64
isAlone          418 non-null int64
Title            418 non-null object
fareBins         418 non-null category
ageBins          418 non-null category
Sex_Code         418 non-null int64
Embarked_Code    418 non-null int64
AgeBin_Code      418 non-null int64
FareBin_Code     418 non-null int64
Title_Code       418 non-null int64
dtypes: category(2), float64(2), int64(11), object(6)
memory usage: 63.1+ KB
None
----------


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Graham, Mr. George Edward",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [78]:
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(data1[data1_x_calc], data1[Target], random_state = 0)

train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = model_selection.train_test_split(data1[data1_x_bin], data1[Target] , random_state = 0)

train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = model_selection.train_test_split(data1_dummy[data1_x_dummy], data1[Target], random_state = 0)

print("Data1 Shape: {}".format(data1.shape))
print("Train1 Shape: {}".format(train1_x.shape))
print("Test1 Shape: {}".format(test1_x.shape))

train1_x_bin.head()

Data1 Shape: (891, 19)
Train1 Shape: (668, 8)
Test1 Shape: (223, 8)


Unnamed: 0,Sex_Code,Pclass,Embarked_Code,Title_Code,familySize,AgeBin_Code,FareBin_Code
105,1,3,4,3,1,2,0
68,0,3,4,2,7,0,1
253,1,3,4,3,2,3,2
320,1,3,4,3,1,1,0
706,0,2,4,4,1,5,1


IV - Exploratory Data Analysis

In [105]:
#Variable corelation with survival
for x in data1_x:
    if data1[x].dtype != 'float64':
        print('Survival correlation by: %s' % x)
        display(data1[[x, Target[0]]].groupby(x, as_index= True).mean())
        print('-'*10 + '\n')
    
#Crosstab view example
print('Crosstab View example:')
display(pd.crosstab(data1['Title'], data1[Target[0]]))


Survival correlation by: Sex


Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


----------

Survival correlation by: Pclass


Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


----------

Survival correlation by: Embarked


Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,0.553571
Q,0.38961
S,0.336957


----------

Survival correlation by: Title


Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Master,0.575
Misc,0.444444
Miss,0.697802
Mr,0.156673
Mrs,0.792


----------

Survival correlation by: SibSp


Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
0,0.345395
1,0.535885
2,0.464286
3,0.25
4,0.166667
5,0.0
8,0.0


----------

Survival correlation by: Parch


Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
0,0.343658
1,0.550847
2,0.5
3,0.6
4,0.0
5,0.2
6,0.0


----------

Survival correlation by: familySize


Unnamed: 0_level_0,Survived
familySize,Unnamed: 1_level_1
1,0.303538
2,0.552795
3,0.578431
4,0.724138
5,0.2
6,0.136364
7,0.333333
8,0.0
11,0.0


----------

Survival correlation by: isAlone


Unnamed: 0_level_0,Survived
isAlone,Unnamed: 1_level_1
0,0.50565
1,0.303538


----------

Crosstab View example:


Survived,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,17,23
Misc,15,12
Miss,55,127
Mr,436,81
Mrs,26,99
