In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder, LabelBinarizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import pickle
import PipelineHelper

In [2]:
# Load the training data from train.csv (path is relative to the notebook's
# working directory) into the DataFrame used by every later cell.
df = pd.read_csv('train.csv')

In [3]:
# Show the column labels of the loaded frame to see which fields are available.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# predict Survived from Pclass, Sex, Age and Fare

In [5]:
# Label column the classifier is trained to predict.
target = 'Survived'

# Model inputs, split by the preprocessing branch each group is fed into:
# categorical columns go through the per-feature one-hot pipelines,
# numerical columns through the impute-and-scale pipeline.
categorical_features = ['Pclass', 'Sex']
numerical_features = ['Age', 'Fare']

In [6]:
# Distinct values present in the Pclass column.
{pclass for pclass in df['Pclass'].tolist()}

{1, 2, 3}

In [7]:
# Numerical branch of the feature pipeline: selector -> median imputation ->
# standard scaling, applied to the columns listed in numerical_features.
# NOTE(review): MultiItemSelector lives in the project-local PipelineHelper
# module; presumably it keeps only the columns named in key_list — confirm there.
mi = PipelineHelper.MultiItemSelector(key_list=numerical_features)
imputer = Imputer(strategy='median')
scaler = StandardScaler()

numerical_steps = [
    ('selector', mi),
    ('imputer', imputer),
    ('scaler', scaler),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

In [8]:
# One (name, pipeline) pair per categorical column. Each pipeline selects the
# single column and binarizes it; the pair is named
# '<feature>_categorical_pipeline' so it can be used as a FeatureUnion entry.
# NOTE(review): unseen='__New__' is passed straight to the project-local
# CustomLabelBinarizer; presumably it is the placeholder for categories not
# seen during fit — confirm in PipelineHelper.
categorical_pipelines = [
    (
        feature + '_categorical_pipeline',
        Pipeline([
            ('selector', PipelineHelper.ItemSelector(key=feature)),
            ('one_hot', PipelineHelper.CustomLabelBinarizer(unseen='__New__')),
        ]),
    )
    for feature in categorical_features
]

In [9]:
# Feature extraction: the numerical branch plus every categorical branch,
# combined by a single FeatureUnion.
union_branches = [('numerical_pipeline', numerical_pipeline)]
union_branches.extend(categorical_pipelines)
feature_union = FeatureUnion(transformer_list=union_branches)

# Full model: feature extraction followed by a seeded random forest.
forest = RandomForestClassifier(n_jobs=-1, random_state=1)
model_pipeline = Pipeline(steps=[
    ('feature_union', feature_union),
    ('algorithm', forest),
])

# Hyperparameter search space. The 'algorithm__' prefix addresses the
# 'algorithm' step of model_pipeline.
# NOTE(review): for classifiers 'auto' is documented as the same setting as
# 'sqrt', so that candidate appears twice in the space — confirm against the
# installed scikit-learn version before changing it.
parameters = {
    'algorithm__n_estimators': [50, 100, 200, 500],
    'algorithm__criterion': ['gini', 'entropy'],
    'algorithm__max_features': ['auto', 'sqrt', 'log2'],
    'algorithm__max_depth': [1, 3, 10, 100, None],
    'algorithm__bootstrap': [True, False],
}

In [10]:
# Randomized hyperparameter search over `parameters`, scored by ROC AUC and
# refit on the full data with the best candidate.
# random_state pins which n_iter candidates are drawn from the search space;
# without it a Restart & Run All samples different candidates each time and the
# reported best parameters are not reproducible. The value matches the seed
# already used for the random forest.
# The variable keeps the name `grid_search` because later cells refer to it.
grid_search = RandomizedSearchCV(
    model_pipeline,
    parameters,
    n_iter=2,
    scoring='roc_auc',
    refit=True,
    n_jobs=1,
    verbose=1,
    random_state=1,
)
# The whole frame is passed as X; the selector steps at the head of each
# branch are what pick the feature columns out of it.
grid_search.fit(df, df[target].values)
print('best param', grid_search.best_params_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.4s finished


best param {'algorithm__n_estimators': 500, 'algorithm__max_features': 'sqrt', 'algorithm__max_depth': None, 'algorithm__criterion': 'entropy', 'algorithm__bootstrap': True}


In [11]:
# Retrain the pipeline on the full data with the best parameters found by the
# search, then pickle the fitted pipeline to model_pipeline.pkl.

# Strip only the leading '<step>__' prefix (maxsplit=1): a parameter name that
# itself contains '__' (a nested estimator parameter) is kept whole instead of
# being truncated to its first component.
cleaned_best_params = {
    key.split('__', 1)[1]: value
    for key, value in grid_search.best_params_.items()
}
model_pipeline.named_steps['algorithm'].set_params(n_jobs=-1, **cleaned_best_params)
model_pipeline.fit(df, df[target].values)

# The context manager guarantees the file is flushed and closed, even if
# pickling raises, before the next cell reads it back.
with open('model_pipeline.pkl', 'wb') as model_file:
    pickle.dump(model_pipeline, model_file)
print('finished pickling model')

finished pickling model


In [12]:
# Test the pickled model: reload it from disk and score two hand-made rows.
# NOTE: unpickling executes code stored in the file, so only load pickles
# produced by this notebook or another trusted source.
with open('model_pipeline.pkl', 'rb') as model_file:
    loaded_model_pipeline = pickle.load(model_file)

# Deliberately awkward inputs: a missing Sex, and Pclass / Sex values outside
# what the training data showed (Pclass was {1, 2, 3}), to check that the
# pipeline still produces a prediction for them.
input_data1 = {'Age': 12, 'Fare': 1, 'Pclass': 1, 'Sex': None}
input_data2 = {'Age': 90, 'Fare': 100, 'Pclass': 100, 'Sex': 'Haha'}

# Column 1 of predict_proba is the probability of the second class
# (presumably Survived == 1 — confirm via the classifier's classes_).
[each[1] for each in loaded_model_pipeline.predict_proba(pd.DataFrame([input_data1, input_data2]))]

[0.596, 0.424]