In [15]:
import numpy as np
import pandas as pd
import os
from utils import get_label, load_data, preprocess_dataframe, get_tuner, save_model, load_model, print_scores
from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import LabelEncoder
import ydata_profiling as ydp
from sklearn.pipeline import Pipeline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
train_df, test_df = load_data()
label = get_label()

In [4]:
input_features = train_df.columns.tolist()
input_features.remove('PassengerId')
input_features.remove('Cabin')
input_features.remove('Parch')
input_features.remove('SibSp')
input_features.remove('Name')
input_features.remove('Survived')
input_features

['Pclass', 'Sex', 'Age', 'Ticket', 'Fare', 'Embarked']

In [7]:
X_train, y_train, X_val, y_val = preprocess_dataframe(
    train_df,
    input_features=input_features,
    drop_na=False,
    fill_na=True,
    enable_categorical=False,
    drop_duplicates=True,
)
X_test = preprocess_dataframe(
    test_df,
    input_features=input_features,
    drop_na=False,
    fill_na=True,
    enable_categorical=False,
    test_split=None
)

X_train.head(5)

Converting Pclass to label
Converting Sex to label
Converting Embarked to label
Converting Pclass to label
Converting Sex to label
Converting Embarked to label


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Has_Sibsp,Has_Parch,Has_Family
331,0,1,1.112223,-0.078684,2,0,0,0
733,1,1,-0.445495,-0.377145,2,0,0,0
382,2,1,0.177592,-0.474867,2,0,0,0
704,2,1,-0.237799,-0.47623,2,1,0,1
813,2,0,-1.622436,-0.025249,2,1,1,1


In [8]:
X_train.duplicated().sum()

0

In [9]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Has_Sibsp', 'Has_Parch',
       'Has_Family'],
      dtype='object')

In [10]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

In [11]:
feature_subset = ['Sex', 'Age', 'Fare', 'Has_Family']
rf_clf.fit(X_train[feature_subset], y_train)

In [12]:
rf_clf.score(X_val[feature_subset], y_val)

0.7932960893854749

In [13]:
hparam_grid = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'classifier__min_samples_split': [2, 5, 10, 15, 20],
    'classifier__min_samples_leaf': [1, 2, 5, 10, 15, 20],
    # 'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__bootstrap': [True, False],
    'classifier__criterion': ['gini', 'entropy'],
    # 'classifier__warm_start': [True, False],
    # 'classifier__oob_score': [True, False],
    'classifier__class_weight': ['balanced', 'balanced_subsample', None],
    'classifier__ccp_alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'classifier__max_leaf_nodes': [None, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
}
pipeline = Pipeline([('classifier', RandomForestClassifier())])
tuner = get_tuner(pipeline, hparam_grid, use_random_search=True)

In [14]:
tuner.fit(X_train[feature_subset], y_train)

In [16]:
print_scores(tuner, X_train[feature_subset],
             y_train, X_val[feature_subset], y_val)

Train Score: 0.7822085889570553
Val score: 0.7821229050279329


(0.7822085889570553, 0.7821229050279329)

In [17]:
tuner.best_estimator_.get_params('classifier')

{'memory': None,
 'steps': [('classifier',
   RandomForestClassifier(ccp_alpha=0.1, class_weight='balanced',
                          criterion='entropy', max_depth=20, max_leaf_nodes=50,
                          min_samples_split=5, n_estimators=400))],
 'verbose': False,
 'classifier': RandomForestClassifier(ccp_alpha=0.1, class_weight='balanced',
                        criterion='entropy', max_depth=20, max_leaf_nodes=50,
                        min_samples_split=5, n_estimators=400),
 'classifier__bootstrap': True,
 'classifier__ccp_alpha': 0.1,
 'classifier__class_weight': 'balanced',
 'classifier__criterion': 'entropy',
 'classifier__max_depth': 20,
 'classifier__max_features': 'sqrt',
 'classifier__max_leaf_nodes': 50,
 'classifier__max_samples': None,
 'classifier__min_impurity_decrease': 0.0,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 5,
 'classifier__min_weight_fraction_leaf': 0.0,
 'classifier__n_estimators': 400,
 'classifier__n_jobs': None,
 '