In [None]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns

In [None]:
# Load OpenML's "titanic" dataset (version 1): X receives the feature frame,
# y the target that fetch_openml returns alongside it.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

>Author: Frank E. Harrell Jr., Thomas Cason  Source: [Vanderbilt Biostatistics](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html)   
<br/><br/>

>The original Titanic dataset, describing the survival status of individual passengers on the Titanic. The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. The principal source for data about Titanic passengers is the Encyclopedia Titanica. The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.  
<br/><br/>
>Thomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.   For more information about how this dataset was constructed: http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt   

### Attribute information
The variables on our extracted dataset are pclass, survived, name, age, embarked, home.dest, room, ticket, boat, and sex. 


- survival: Survival (0 = No, 1 = Yes)
- pclass:	Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- sex:	Sex	
- Age:	Age in years	
- sibsp:	# of siblings / spouses aboard the Titanic	
- parch:	# of parents / children aboard the Titanic	
- ticket:	Ticket number	
- fare:	Passenger fare	
- cabin:	Cabin number	
- embarked:	Port of Embarkation(C = Cherbourg, Q = Queenstown, S = Southampton)

# Explore the data

In [None]:
# Overview of the feature frame's columns (pandas `info` output).
X.info()

In [None]:
# Summary statistics of the feature frame, using pandas' `describe` defaults.
X.describe()

In [None]:
# First two passengers, transposed so each feature is shown on its own row.
X.iloc[:2].transpose()

In [None]:
# Fare distribution for each value of the target y (passed directly as the x vector).
sns.boxplot(data=X, x=y, y="fare")

There seems to be a correlation between the fare paid and the chance of survival.

In [None]:
# Age distribution for each value of the target y (passed directly as the x vector).
sns.boxplot(data=X, x=y, y="age")

Age by itself does not seem to have an impact on survival.

In [None]:
import pandas as pd

# Join features and target so that "survived" is available as a plotting column.
p = pd.concat([X, y], axis=1)

# One age-by-survival box plot per (pclass, sex) facet.
# sns.catplot replaces FacetGrid.map(sns.boxplot, ...): mapping a categorical
# plot without an explicit `order` makes seaborn warn that the result may be
# incorrect, whereas catplot is the figure-level interface seaborn provides
# for categorical plots on a facet grid. It still returns a FacetGrid, so `g`
# keeps its type.
g = sns.catplot(
    data=p,
    x="survived",
    y="age",
    col="sex",
    row="pclass",
    kind="box",
    height=3,  # FacetGrid's default facet height, so the figure keeps its size
)

But in a facet grid, there seems to be a correlation for some groups.

In [None]:
# Age distribution within each ticket class.
sns.boxplot(data=X, x="pclass", y="age")

But it's hard to tell if that is a causal relationship, or if it's just because of the correlation with other factors.

In [None]:
# Fraction of missing values per column.
# DataFrame.mean() is called directly instead of np.mean(...): NumPy forwards
# axis=None to pandas, and pandas >= 2.0 interprets that as "reduce over both
# axes", which would collapse the per-column fractions into a single number.
X.isna().mean()

We have some NaNs.

In [None]:
# Remove the columns this notebook treats as probably not informative.
# X is rebound to the result (no inplace=True) and errors="ignore" is passed so
# the cell can be re-run: a second execution no longer raises KeyError because
# the columns have already been removed.
X = X.drop(
    columns=["cabin", "ticket", "boat", "body", "home.dest", "name"],
    errors="ignore",
)

Let's drop some variables that are probably not informative. It could still be the case that they correlate with something else. E.g., imagine home destination to be correlated with the location on the deck, which in turn could be correlated with survival chances.

A train-test split:

In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fraction of missing values per column of the training split.
# DataFrame.mean() is called directly instead of np.mean(...): NumPy forwards
# axis=None to pandas, and pandas >= 2.0 interprets that as "reduce over both
# axes", which would collapse the per-column fractions into a single number.
X_train.isna().mean()

We still have some NaNs. Let's use a `SimpleImputer` for that. For the numeric data we use the median.

In [None]:
# Numeric columns: impute gaps with the median, then apply StandardScaler.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

And for the categorical features, let's use the most frequent value.

In [None]:
# Categorical columns: impute gaps with the most frequent value, then one-hot
# encode (the encoder is configured with handle_unknown="ignore").
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

We combine this into a preprocessor.

In [None]:
# Route each column group through its own sub-pipeline. Columns that appear in
# neither list are handled by ColumnTransformer's default `remainder` setting.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [None]:
# Smoke test: fit the preprocessor on the training split and display the shape
# of the transformed matrix.
X_ = preprocessor.fit_transform(X_train)
X_.shape

That seems to work as expected!

Let's test a basic logistic regression.

In [None]:
# Baseline model: the shared preprocessing followed by a default LogisticRegression.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)
clf.fit(X_train, y_train)

# Score on the held-out test split.
logreg_test_score = clf.score(X_test, y_test)
print("model score: %.3f" % logreg_test_score)

A support vector classifier (SVC):

In [None]:
# The experimental flag has to be imported before the halving search classes.
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV, HalvingGridSearchCV
import scipy.stats as stats

# C and gamma are sampled from exponential distributions; the "svc__" prefix
# addresses the pipeline step named "svc".
param_dist = {
    "svc__C": stats.expon(scale=10),
    "svc__gamma": stats.expon(scale=1),
}

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("svc", SVC()),
    ]
)

# Randomised successive-halving search with 5-fold CV and a fixed seed.
sh = HalvingRandomSearchCV(clf, param_dist, cv=5, factor=2, random_state=0)
sh.fit(X_train, y_train)
sh.best_params_

In [None]:
# Score of the fitted search object on the held-out test split.
search_test_score = sh.score(X_test, y_test)
print("model score: %.3f" % search_test_score)

And a random forest classifier (RFC):

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Untuned random forest on top of the shared preprocessing.
# The forest is seeded with random_state=0 (the seed the train/test split and
# the searches in this notebook already use); without it the fitted trees, and
# therefore the printed score, change from run to run.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('rfc', RandomForestClassifier(random_state=0))])
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

In [None]:
# Number of columns in the training frame; used as the upper bound for
# max_features in the forest search.
# NOTE(review): this counts X_train's raw columns, not the columns produced by
# the preprocessor — confirm that this is the intended bound.
n_feats = len(X_train.columns)

In [None]:
from scipy.stats import randint

# Search space for the forest; the "rfc__" prefix addresses the pipeline step
# named "rfc".
param_dist = {"rfc__max_depth": [3, None],
              "rfc__max_features": randint(1, n_feats+1),  # integers 1..n_feats
              "rfc__bootstrap": [True, False],
              "rfc__criterion": ["gini", "entropy"]}

# The forest itself is seeded as well: the search's random_state only controls
# the search procedure, so an unseeded forest would still make the selected
# parameters and the final score vary between runs.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('rfc', RandomForestClassifier(n_estimators=20,
                                                     random_state=0))])

# Randomised successive-halving search with 5-fold CV and a fixed seed.
sh = HalvingRandomSearchCV(clf, param_dist, cv=5,
                           factor=2,
                           random_state=0)
sh.fit(X_train, y_train)
sh.best_params_

In [None]:
# Score of the fitted search object on the held-out test split.
search_test_score = sh.score(X_test, y_test)
print("model score: %.3f" % search_test_score)