# Titanic data
### getting started with kaggle

In [1]:
#load data
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from xgboost import XGBClassifier, XGBRegressor

data_dir = 'data/'

# Read the training set, keep the target column aside as `labels`, and leave
# only the explanatory columns in `data`.
data = pd.read_csv(data_dir + 'train.csv')
labels = data['Survived']
data = data.drop(columns=['Survived'])
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [57]:
# Print the output column names reported by the first ColumnTransformer, then
# preview the leading rows of X as a DataFrame.
# NOTE(review): ColumnTransformer.get_feature_names is not available in recent
# scikit-learn releases (get_feature_names_out replaces it) — confirm the
# pinned version before re-running.
print(imputer_1.get_feature_names())
pd.DataFrame(X[0:6, :]).head()

['drp__Pclass', 'drp__Sex', 'drp__Age', 'drp__SibSp', 'drp__Parch', 'drp__Fare', 'cabin_imp__Cabin', 'Emb_imp__Embarked']


Unnamed: 0,0,1,2,3,4,5,6,7
0,3,male,22,1,0,7.25,Z,S
1,1,female,38,1,0,71.2833,C,C
2,3,female,26,0,0,7.925,Z,S
3,1,female,35,1,0,53.1,C,S
4,3,male,35,0,0,8.05,Z,S


In [5]:
# Dimensions of the frame, followed by its column labels (one per line).
print(data.shape, data.columns, sep='\n')

(891, 12)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


**Pclass** : ticket class <br>
**SibSp** : # sibling or spouse aboard <br>
**Parch** : # parents or children aboard <br>
**Embarked** : port of embarkation

In [9]:
# shape of the subset of third-class passengers whose Cabin is recorded (not null)
data.loc[(data.Pclass == 3) & data.Cabin.notnull()].shape

(12, 12)

In [14]:
# check which columns contain nan values
def findNaNCol(data):
    """Return the labels of the columns of ``data`` that contain a null value.

    The labels are listed in the order in which they appear in
    ``data.columns``; an empty list is returned when no column has nulls.
    """
    return [label for label in data.columns if data[label].isnull().any()]

# columns of the training frame that contain at least one missing value
nancols = findNaNCol(data)
nancols

['Age', 'Cabin', 'Embarked']

In [16]:
# check whether the test data has missing values in the same columns
test_data = pd.read_csv(data_dir + 'test.csv')
findNaNCol(test_data)

['Age', 'Fare', 'Cabin']

### Note:
We need to find ways to impute 'Age' and 'Fare'. <br>
I don't think 'Embarked' is important at all. <br>
'Cabin' can probably be converted into a categorical variable (this still needs to be checked).

In [17]:
# list the Pclass values that occur among the rows whose Cabin is missing
print(data.Pclass[data.Cabin.isnull()].unique())

[3 2 1]


In [23]:
# shape of the array of distinct Cabin values, and the number of rows whose Cabin is null
# NOTE(review): Series.unique() keeps NaN as one of the returned values, so the
# first figure presumably includes one entry for "missing" — confirm.
print(f'unique_cabin : {data.Cabin.unique().shape}\ntotal_Cabin_nan : {sum(data.Cabin.isnull())}')

unique_cabin : (148,)
total_Cabin_nan : 687


### Note2:
After imputing 'Age' and 'Fare', the list of explanatory variables becomes <br>
{id, Pclass, sex, age, sibsp, parch, fare, cabin}

Using SibSp and Parch can be tricky; I may need to build a separate model for them that gives the probability of surviving when a passenger has extra family members aboard.

### Note3:
I think the number of siblings or parents/children can definitely tell something about the age group of a person. The remaining variables, which make little to no sense here, are {Pclass, Sex, Fare}.

# Create Pipelines

-- Drop Name and Ticket, add fam_n_frnd; impute and categorize Cabin and Embarked <br>
-- One-hot encode Cabin, Embarked, Sex, Pclass <br>
-- Create ordinal fare groups (note that the groups can only be derived from the training set) <br>
-- Impute the ordinal fare (the transformation will be tricky as it contains NaN values) <br>
-- Impute Age <br>
====== cleaning is done at this point <br>
-- Finally, fit xgboost, svm and knn and use hard voting

In [93]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
import re

class CabinImputer(BaseEstimator, TransformerMixin):
    """Fill missing cabin values and reduce every cabin to its leading letter.

    The input is expected to be 2-D with the cabin values in column 0.

    Parameters
    ----------
    imp : str, default='Z'
        Placeholder written in place of missing cabin values. It is also used
        for values that do not start with an upper-case letter.
    """

    # Leading upper-case letter of a cabin code, e.g. the 'C' of 'C85'.
    _LETTER = re.compile(r'^[A-Z]')

    def __init__(self, imp='Z'):
        # Only the constructor parameter is stored here; helpers live on the
        # class so the instance holds no lambda (which could not be pickled).
        self.imp = imp

    def _letter(self, value):
        """Return the leading upper-case letter of ``value``, else ``self.imp``."""
        found = self._LETTER.match(str(value))
        return found.group() if found else self.imp

    def categorizer(self, values):
        """Map every element of ``values`` to its cabin letter, keeping the shape."""
        values = np.asarray(values, dtype=object)
        letters = [self._letter(value) for value in values.ravel()]
        return np.array(letters, dtype=object).reshape(values.shape)

    def fit(self, X=None, y=None):
        """Create the constant imputer and fit it on ``X`` when ``X`` is given.

        Returns
        -------
        self
        """
        # np.nan rather than the np.NaN alias, which newer NumPy no longer provides.
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='constant',
                                     fill_value=self.imp)
        if X is not None:
            self.imputer.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` with missing cabins filled and column 0 reduced to letters."""
        if hasattr(self.imputer, 'statistics_'):
            # Reuse the state learned in fit instead of re-fitting on new data.
            X = self.imputer.transform(X)
        else:
            # fit() was called without data; fall back to fitting here.
            X = self.imputer.fit_transform(X)
        X[:, 0] = self.categorizer(X[:, 0])
        return X

    def get_feature_names(self):
        """Return the name of the single output column."""
        return np.array(['Cabin'])

class ModeImputer(SimpleImputer):
    """SimpleImputer that reports 'Embarked' as the name of its output column."""

    def get_feature_names(self):
        feature_names = ['Embarked']
        return np.array(feature_names)

class DropColumns(BaseEstimator, TransformerMixin):
    """Delete columns from a 2-D input by position.

    Parameters
    ----------
    drop_ix : sequence of int, default=(0, 2, 7)
        Positions of the columns to delete.
    features : sequence of str, default=()
        Names of the input columns in input order. Only used by
        ``get_feature_names`` to report the names that remain.
    """

    def __init__(self, drop_ix=(0, 2, 7), features=()):
        # Stored under the constructor argument names so that the inherited
        # get_params / set_params read and write the values actually used.
        self.drop_ix = drop_ix
        self.features = features

    @property
    def drop_list(self):
        """Alias of ``drop_ix``, kept for code that reads the former attribute."""
        return self.drop_ix

    def fit(self, X=None, y=None):
        """Record the positions to delete; nothing is learned from ``X``."""
        self.drops = self.drop_ix
        return self

    def transform(self, X):
        """Return ``X`` as an array without the configured columns."""
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        return np.delete(X, self.drops, axis=1)

    def get_feature_names(self):
        """Return ``features`` with the dropped positions removed."""
        return np.delete(np.array(self.features), self.drop_ix, axis=0)

class DoNothing(BaseEstimator, TransformerMixin):
    """Pass the input through unchanged while reporting fixed feature names.

    Parameters
    ----------
    features : sequence of str, default=('Age', 'SibSp', 'Parch')
        Names returned by ``get_feature_names``.
    """

    def __init__(self, features=('Age', 'SibSp', 'Parch')):
        # Immutable default: a shared list default could be altered by any
        # caller that modified the value returned by get_feature_names.
        self.features = features

    def fit(self, X=None, y=None):
        """Record the number of input columns (None when no data is given)."""
        self.feature_len = None if X is None else X.shape[1]
        return self

    def transform(self, X):
        """Return ``X`` itself."""
        return X

    def get_feature_names(self):
        """Return the configured feature names as a new list."""
        return list(self.features)

class FareCat(BaseEstimator, TransformerMixin):
    """ Bin column 0 of the input into six groups labelled 0-5.

    Written under the assumption that no value is missing in the Fare column.
    """

    def __init__(self):
        self.imputing_model = None

    def fit(self, X=None, y=None):
        # the bin edges are fixed, so there is nothing to learn
        return self

    def transform(self, X):
        bin_edges = [-1, 0.1, 10, 30, 100, 200, np.inf]
        group_labels = np.arange(0, 6)
        fares = pd.Series(X[:, 0])
        groups = pd.cut(fares, bin_edges, labels=group_labels)
        # hand back a single-column 2-D array
        return groups.to_numpy().reshape(-1, 1)

    def get_feature_names(self, deep=True):
        return ['Fare']

# Stage 1: every column except Cabin and Embarked goes through DropColumns,
# which deletes positions 0, 2 and 7 of `cols`; Cabin and Embarked get their
# own imputers. Per the feature-name printout earlier in the notebook the
# stage emits: Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked.
cols = data.columns.drop(['Cabin', 'Embarked'])
imputer_1 = ColumnTransformer([
    ('drp', DropColumns(drop_ix=[0, 2, 7], features=cols), cols),
    ('cabin_imp', CabinImputer(), ['Cabin']),
    ('Emb_imp', ModeImputer(strategy='most_frequent'), ['Embarked'])
])

# Fixed category lists for the one-hot encoder, in the same order as ohe_cols
# (Pclass, Sex, Cabin letter, Embarked).
ohe_categories = [np.array([1, 2, 3], dtype=object),
                np.array(['female', 'male'], dtype=object),
                np.array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Z'], dtype=object),
                np.array(['C', 'Q', 'S'], dtype=object)]
                # the category lists were obtained by fitting an encoder once and copying its categories
# Stage 2 addresses the stage-1 output by position:
#   0, 1, 6, 7 -> one-hot encoded; 2, 3, 4 -> passed through; 5 -> binned by FareCat.
ohe_cols = [0, 1, 6, 7]
nthn_cols = [2, 3, 4]
imputer_2 = ColumnTransformer([
    ('ohe', OneHotEncoder(categories=ohe_categories), ohe_cols),
    ('nthn', DoNothing(), nthn_cols),
    ('farecut', FareCat(), [5])
])

# The two column transformers chained, without the model-based imputers.
half_pipeline = Pipeline([
    ('imp1', imputer_1),
    ('imp2', imputer_2),
])

# Run both stages on the training frame and label the result with the names
# reported by the second transformer.
X = half_pipeline.fit_transform(data)
X = pd.DataFrame(X)
X.columns = imputer_2.get_feature_names()
X.head()

Unnamed: 0,ohe__x0_1,ohe__x0_2,ohe__x0_3,ohe__x1_female,ohe__x1_male,ohe__x2_A,ohe__x2_B,ohe__x2_C,ohe__x2_D,ohe__x2_E,ohe__x2_F,ohe__x2_G,ohe__x2_T,ohe__x2_Z,ohe__x3_C,ohe__x3_Q,ohe__x3_S,nthn__Age,nthn__SibSp,nthn__Parch,farecut__Fare
0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,22,1,0,1
1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,38,1,0,3
2,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,26,0,0,1
3,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,35,1,0,3
4,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,35,0,0,1


In [3]:
#trying to keep indexing clean as much as possible
# 2-D index tuples into the encoded feature matrix: all rows, then either a
# span of one-hot columns or a single column position.
Pclass_ix = (slice(None), slice(0, 3))
Sex_ix = (slice(None), slice(3, 5))
Cabin_ix = (slice(None), slice(5, 14))
Embarked_ix = (slice(None), slice(14, 17))
Age_ix = (slice(None), 17)
SibSp_ix = (slice(None), 18)
Parch_ix = (slice(None), 19)
Fare_ix = (slice(None), 20)

#create a function to name data frames
namedict = {'Pclass': ['1', '2', '3'],
            'Sex': ['female', 'male'],
            'Cabin': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Z'],
            'Embarked': ['C', 'Q', 'S'],
            # Features without categories map to an empty string: iterating it
            # yields nothing, so they add no '<feature>_<category>' names.
            'Age': '',
            'SibSp': '',
            'Parch': '',
            'Fare': '',
            }


def col_names(features=(), extra=()):
    """Build column names of the form '<feature>_<category>'.

    Parameters
    ----------
    features : iterable of str
        Keys of ``namedict``; each contributes one name per listed category.
    extra : iterable of str
        Names appended unchanged after the generated ones.

    Returns
    -------
    list of str

    Raises
    ------
    KeyError
        If a feature is not a key of ``namedict``.
    """
    namelist = []
    for feature in features:
        namelist.extend(f'{feature}_{category}' for category in namedict[feature])
    namelist.extend(extra)
    return namelist

### Pipeline: (continued)
Imputing Fare and Age

In [90]:
# Imputing classes for Fare and age
class ImputeFare(BaseEstimator, TransformerMixin):
    """Fill missing values of the fare column with XGBoost class predictions.

    The target is the column selected by ``Fare_ix``. The predictors are the
    Pclass, Sex, Cabin and Embarked column blocks plus the sum SibSp + Parch.
    """

    def __init__(self):
        # NOTE(review): max_leaf_nodes is kept exactly as written; confirm the
        # installed xgboost honours a parameter of that name.
        self.model = XGBClassifier(max_depth=3, max_leaf_nodes=3, n_estimators=30)

    @staticmethod
    def _predictors(X):
        """Assemble the predictor matrix shared by fit and transform."""
        return np.c_[X[Pclass_ix], X[Sex_ix], X[Cabin_ix],
                     X[Embarked_ix], X[SibSp_ix] + X[Parch_ix]]

    @staticmethod
    def _missing(X):
        """Return a boolean row mask that is True where the fare column is NaN."""
        return np.isnan(X[Fare_ix].astype(float))

    def fit(self, X=None, y=None):
        """Train the model on the rows of ``X`` whose fare is present.

        ``y`` is ignored; the target is taken from ``X`` itself.
        """
        known = X[~self._missing(X)]
        self.model = self.model.fit(self._predictors(known), known[Fare_ix])
        return self

    def transform(self, X):
        """Return ``X`` with missing fares replaced by model predictions.

        When nothing is missing ``X`` is returned as is. Otherwise a copy is
        filled and returned, so the caller's array is left unmodified.
        """
        nan_ix = self._missing(X)
        if not nan_ix.any():
            # Nothing to impute: avoid calling the model on an empty matrix.
            return X
        X = X.copy()
        X[nan_ix, Fare_ix[1]] = self.model.predict(self._predictors(X[nan_ix]))
        return X

class ImputeAge(BaseEstimator, TransformerMixin):
    """Fill missing values of the age column with XGBoost regression predictions.

    The target is the column selected by ``Age_ix``. The predictors are the
    Pclass block, SibSp, Parch, the Sex block and the fare column.
    """

    def __init__(self):
        # NOTE(review): max_leaf_nodes is kept exactly as written; confirm the
        # installed xgboost honours a parameter of that name.
        self.model = XGBRegressor(max_depth=3, max_leaf_nodes=4, n_estimators=13)

    @staticmethod
    def _predictors(X):
        """Assemble the predictor matrix shared by fit and transform."""
        return np.c_[X[Pclass_ix], X[SibSp_ix], X[Parch_ix],
                     X[Sex_ix], X[Fare_ix]]

    @staticmethod
    def _missing(X):
        """Return a boolean row mask that is True where the age column is NaN."""
        return np.isnan(X[Age_ix].astype(float))

    def fit(self, X=None, y=None):
        """Train the model on the rows of ``X`` whose age is present.

        ``y`` is ignored; the target is taken from ``X`` itself.
        """
        known = X[~self._missing(X)]
        self.model = self.model.fit(self._predictors(known), known[Age_ix])
        return self

    def transform(self, X):
        """Return ``X`` with missing ages replaced by model predictions.

        When nothing is missing ``X`` is returned as is. Otherwise a copy is
        filled and returned, so the caller's array is left unmodified.
        """
        nan_ix = self._missing(X)
        if not nan_ix.any():
            # Nothing to impute: avoid calling the model on an empty matrix.
            return X
        X = X.copy()
        X[nan_ix, Age_ix[1]] = self.model.predict(self._predictors(X[nan_ix]))
        return X

In [97]:
# Full cleaning pipeline: the two column transformers followed by the
# model-based imputers. Fare is imputed before Age because ImputeAge reads the
# fare column as one of its predictors.
cleaning_pipeline = Pipeline([
    ('imp1', imputer_1),
    ('imp2', imputer_2),
    ('fare_imp', ImputeFare()),
    ('age_imp', ImputeAge()),
])

# Fit every step on the training frame and keep the cleaned matrix.
X = cleaning_pipeline.fit_transform(data)

array([22.0, 38.0, 26.0, 35.0, 35.0, 28.847219467163086, 54.0, 2.0, 27.0,
       14.0, 4.0, 58.0, 20.0, 39.0, 14.0, 55.0, 2.0, 33.31815719604492,
       31.0, 24.800701141357422, 35.0, 34.0, 15.0, 28.0, 8.0, 38.0,
       28.847219467163086, 19.0, 24.800701141357422, 28.847219467163086,
       40.0, 35.076568603515625, 24.800701141357422, 66.0, 28.0, 42.0,
       28.847219467163086, 21.0, 18.0, 14.0, 40.0, 27.0,
       28.847219467163086, 3.0, 19.0, 28.847219467163086,
       25.979949951171875, 24.800701141357422, 24.37738800048828, 18.0,
       7.0, 21.0, 49.0, 29.0, 65.0, 40.654579162597656, 21.0, 28.5, 5.0,
       11.0, 22.0, 38.0, 45.0, 4.0, 44.0030517578125, 19.965560913085938,
       29.0, 19.0, 17.0, 26.0, 32.0, 16.0, 21.0, 26.0, 32.0, 25.0,
       28.847219467163086, 28.847219467163086, 0.83, 30.0, 22.0, 29.0,
       24.800701141357422, 28.0, 17.0, 33.0, 16.0, 28.847219467163086,
       23.0, 24.0, 29.0, 20.0, 46.0, 26.0, 59.0, 28.847219467163086, 71.0,
       23.0, 34.0, 34.0,

In [91]:
# NOTE(review): model_pipeline is not defined in any cell visible here —
# presumably an earlier or renamed pipeline object; confirm before re-running.
X = model_pipeline.fit_transform(data)
# NOTE(review): within this cell nan_ix is only assigned on the last line, so
# this print appears to rely on a value left over from an earlier execution.
print(sum(nan_ix))
testdata = pd.read_csv(data_dir + 'test.csv')
# NOTE(review): fit_transform re-fits the pipeline on the test set; transform()
# would reuse what was fitted on the training data — confirm the intent.
X_test = model_pipeline.fit_transform(testdata)
# Boolean mask of the test rows whose fare column is NaN.
y = X_test[Fare_ix]
nan_ix = np.isnan(y.astype(float))

1


In [92]:
# Fit a fresh fare imputer directly on the transformed test matrix and show
# the fare values of the rows flagged by nan_ix after imputation.
fim = ImputeFare()
X1 = fim.fit_transform(X_test)
X1[nan_ix, Fare_ix[1]]

array([1.0], dtype=object)