In [65]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas_profiling
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

## Data Dictionary
- Survived: 0 = No, 1 = Yes
- pclass: Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
- sibsp: # of siblings / spouses aboard the Titanic
- parch: # of parents / children aboard the Titanic
- ticket: Ticket number
- cabin: Cabin number
- embarked: Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton

## Variable Notes
- pclass: A proxy for socio-economic status (SES)
- 1st = Upper
- 2nd = Middle
- 3rd = Lower
- age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
- sibsp: The dataset defines family relations in this way...
- Sibling = brother, sister, stepbrother, stepsister
- Spouse = husband, wife (mistresses and fiancés were ignored)
- parch: The dataset defines family relations in this way...
- Parent = mother, father
- Child = daughter, son, stepdaughter, stepson
- Some children travelled only with a nanny, therefore parch=0 for them.

## Load Data

In [66]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [67]:
#missing value check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Age and Embarked have missing values

In [68]:
sns.pairplot(df, hue='Survived')
#plt.show()

<seaborn.axisgrid.PairGrid at 0x7fd0f5945c10>

In [69]:
df['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [70]:
df['Cabin'].describe()

count         204
unique        147
top       B96 B98
freq            4
Name: Cabin, dtype: object

## Drop columns and split data

In [71]:
X = df.drop(['Survived', 'Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)
y = df['Survived']
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state= 42)

In [73]:
corrMat = X_train.corr()
mask = np.array(corrMat)
mask[np.tril_indices_from(mask)] = False
fig,ax= plt.subplots()
fig.set_size_inches(20,10)
sns.heatmap(corrMat, mask=mask,vmax=.8, square=True,annot=True)

<AxesSubplot:>

## Create Pipeline

Create numeric transformer

In [74]:
numeric_features = ['Age', 'Fare', 'SibSp']

In [75]:
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler()
    )
#age has missing value

In [76]:
categorical_features = ['Embarked', 'Sex', 'Pclass']

In [77]:
# handle unknown means it will ignore nan's if it finds them
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [78]:
# you can also create custom functions
def name_length(df):
    length = df[df.columns[0]].str.len()
    return length.values.reshape(-1, 1)

In [79]:
# pre-process transformations
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        #('name', FunctionTransformer(name_length), ['Name'])
    ],
    remainder='drop') 


## Pipeline for logistic regression

In [80]:
# create the model pipeline
pipeline＿lr = make_pipeline(preprocessor, LogisticRegression(max_iter=300))

In [81]:
# fit the pipeline to training data
pipeline_lr.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare', 'SibSp']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Embarked', 'Sex',
                                                   'Pclass'])])),
                ('logisticregression', LogisticRegression(max_iter=300))])

In [82]:
# calculate the accuracy score from train data
print('logistic regression accuracy score(train): %.4f' % pipeline_lr.score(X_train, y_train))

# calculate the accuracy score from test data
print('logistic regression accuracy score(test): %.4f' % pipeline_lr.score(X_test, y_test))

logistic regression accuracy score(train): 0.8315
logistic regression accuracy score(test): 0.8149


## Pipeline for Random forest classifier

In [83]:
pipeline＿rfc = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, max_depth=5))

In [84]:
pipeline_rfc.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare', 'SibSp']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Embarked', 'Sex',
                                                   'Pclass'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=5))])

In [85]:
print('random forest accuracy score(train): %.4f' % pipeline_rfc.score(X_train, y_train))
print('random forest accuracy score(test): %.4f' % pipeline_rfc.score(X_test, y_test))

random forest accuracy score(train): 0.9045
random forest accuracy score(test): 0.8008


## Pipeline for Xgboost classifier

In [86]:
stopper

NameError: name 'stopper' is not defined

In [None]:
pipeline＿xgbc = make_pipeline(preprocessor, xgb.XGBClassifier(use_label_encoder=False, random_state=42))

In [None]:
pipeline_xgbc.fit(X_train, y_train)

In [None]:
print('logistic regression accuracy score(train): %.4f' % pipeline_xgbc.score(X_train, y_train))
print('logistic regression accuracy score(test): %.4f' % pipeline_xgbc.score(X_test, y_test))

NameError: name 'pipeline_xgbc' is not defined

## Hyperparameter optimization

In [87]:
pipeline_rfc_opt = make_pipeline(preprocessor, RandomForestClassifier())
pprint(pipeline_rfc_opt.get_params())

{'columntransformer': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Age', 'Fare', 'SibSp']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['Embarked', 'Sex', 'Pclass'])]),
 'columntransformer__cat': OneHotEncoder(handle_unknown='ignore'),
 'columntransformer__cat__categories': 'auto',
 'columntransformer__cat__drop': None,
 'columntransformer__cat__dtype': <class 'numpy.float64'>,
 'columntransformer__cat__handle_unknown': 'ignore',
 'columntransformer__cat__sparse': True,
 'columntransformer__n_jobs': None,
 'columntransformer__num': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
  

In [88]:
# define our hyperparameters to combine
hyperparam_grid = {
    'randomforestclassifier__max_depth': [2, 5, 10, 20, 30],
    'randomforestclassifier__n_estimators': [5, 10, 50, 100, 200],
    'randomforestclassifier__min_samples_leaf': [1,5,10]
}

In [89]:
grid_cv = GridSearchCV(estimator=pipeline_rfc_opt,            # unfitted model/estimator
                       param_grid=hyperparam_grid,  
                       cv=5,                        # number of folds, k
                       scoring='accuracy')                # scoring metric

In [90]:
# fit all models with all the different hyperparamters
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         ['Age',
                                                                          'Fare',
                                                                          'SibSp']),
                                                                        ('cat',
                                                             

In [91]:
results_df = pd.DataFrame(grid_cv.cv_results_)

In [92]:
results_df.shape

(75, 16)

In [93]:
results_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_randomforestclassifier__max_depth',
       'param_randomforestclassifier__min_samples_leaf',
       'param_randomforestclassifier__n_estimators', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

In [94]:
results_df[['param_randomforestclassifier__max_depth', 'param_randomforestclassifier__min_samples_leaf', 'param_randomforestclassifier__n_estimators', 'mean_test_score',
       'std_test_score', 'rank_test_score']].sort_values('rank_test_score')
#the order of rank_test_score is the order od mean_test_score

Unnamed: 0,param_randomforestclassifier__max_depth,param_randomforestclassifier__min_samples_leaf,param_randomforestclassifier__n_estimators,mean_test_score,std_test_score,rank_test_score
60,30,1,5,0.832698,0.099033,1
47,20,1,50,0.821429,0.093062,2
64,30,1,200,0.821429,0.099475,2
48,20,1,100,0.821270,0.092998,4
20,5,5,5,0.821111,0.072768,5
...,...,...,...,...,...,...
42,10,10,50,0.781905,0.083688,71
2,2,1,50,0.781746,0.064955,72
31,10,1,10,0.771111,0.122470,73
0,2,1,5,0.765397,0.118938,74


In [95]:
grid_cv.best_params_  

{'randomforestclassifier__max_depth': 30,
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__n_estimators': 5}

In [102]:
#estimating the test performance after tuning
pipeline_rfc_best = make_pipeline(preprocessor ,RandomForestClassifier(max_depth=30, n_estimators=5, min_samples_leaf=1))

cross_acc = cross_val_score(estimator=pipeline_rfc_best,  
                            X=X_train,           
                            y=y_train,          
                            cv=5,                 
                            scoring='accuracy')    
                              

In [103]:
cross_acc

array([0.77777778, 0.69444444, 0.80555556, 0.91428571, 0.82857143])

In [104]:
cross_acc.mean()

0.8041269841269841

In [115]:
pipeline_rfc_best.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare', 'SibSp']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Embarked', 'Sex',
                                                   'Pclass'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=30, n_estimators=5))])

In [116]:
pipeline_best.score(X_train, y_train)

0.9887640449438202

In [117]:
pipeline_best.score(X_test, y_test)

0.7868162692847125

## Kaggle submission

In [118]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [119]:
X_k = test.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)
X_k


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0000,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,S
414,1,female,39.0,0,0,108.9000,C
415,3,male,38.5,0,0,7.2500,S
416,3,male,,0,0,8.0500,S


In [120]:
predictions_lr = pipeline_lr.predict(X_k)
submission_lr = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions_lr})

In [121]:
predictions_rfc = pipeline_rfc.predict(X_k)
submission_rfc = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions_rfc})

In [122]:
predictions_rfc_best = pipeline_rfc_best.predict(X_k)
submission_rfc_best = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions_rfc_best})

In [123]:
filename = 'Titanic Predictions LR.csv'
submission_lr.to_csv(filename,index=False)
print('Saved file: ' + filename)

filename = 'Titanic Predictions RFC.csv'
submission_rfc.to_csv(filename,index=False)
print('Saved file: ' + filename)

filename = 'Titanic Predictions RFC best.csv'
submission_rfc_best.to_csv(filename,index=False)
print('Saved file: ' + filename)

Saved file: Titanic Predictions LR.csv
Saved file: Titanic Predictions RFC.csv
Saved file: Titanic Predictions RFC best.csv


## next
- randomeforest, xgboost, SVM
- cross validation, grid search
- change preprocessor depending on column
- thinka bout 'Name', 'Cabin', 'TIcket'
- regularization
- feature importance