In [1]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.pipeline import Pipeline
import time
import numpy as np
import random

In [2]:
# Seed NumPy's and the standard library's global RNGs from one constant so the
# two can never drift apart and the seed is changed in a single place.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

In [8]:
%run "/Users/rileyfox/Code/Feature_Engineering/logger.py"

In [9]:
def simple_grid_search(x_train, y_train, x_test, y_test, feature_engineering_pipeline):
    '''
    Grid search an ExtraTreesClassifier and print a classification report
    for the best parameter settings, evaluated on the test set.
    Best is the best cross-validated accuracy on the training set.

    Parameters:
        x_train, y_train: training features and labels. The feature engineering
            pipeline (if any) and the grid search are fit on these only.
        x_test, y_test: held-out features and labels used for the printed report.
        feature_engineering_pipeline: object exposing fit_transform/transform,
            or a falsy value (e.g. None) to use the inputs as they are.

    Returns:
        The best estimator found by the grid search (best_estimator_).

    Side effects:
        Logs parsing, training and overall durations through the notebook-level
        `logger`, and prints the classification report.
    '''

    params = {
        'max_depth': [10, None],
        'n_estimators': [10, 50, 100, 500],
        'criterion': ['gini', 'entropy']
    }

    base_model = ExtraTreesClassifier()

    model_grid_search = GridSearchCV(base_model, param_grid=params, cv=3)  # cv is number of folds in stratified K fold
    start_time = time.time()  # start time
    if feature_engineering_pipeline:
        # Fit the pipeline on the training data only, then reuse the fitted
        # transform on the test data so nothing from the test set leaks in.
        parsed_x_train = feature_engineering_pipeline.fit_transform(x_train, y_train)
        parsed_x_test = feature_engineering_pipeline.transform(x_test)
    else:
        logger.info('No feature engineering pipeline specified, using inputs as is')
        parsed_x_train = x_train
        parsed_x_test = x_test

    parse_time = time.time()
    logger.info(f'Parsing took {(parse_time - start_time):.2f} seconds')

    model_grid_search.fit(parsed_x_train, y_train)
    fit_time = time.time()
    # Measure from the end of parsing so this figure covers the grid search
    # alone instead of double counting the feature engineering time.
    logger.info(f'Training took {(fit_time - parse_time):.2f} seconds')

    best_model = model_grid_search.best_estimator_

    print(classification_report(y_true=y_test, y_pred=best_model.predict(parsed_x_test)))
    end_time = time.time()
    logger.info(f'Overall took {(end_time - start_time):.2f} seconds')

    return best_model

In [10]:
def advanced_grid_search(x_train, y_train, x_test, y_test, ml_pipeline, params, cv=3, include_probas=False, is_regression=False):
    '''
    Helper function to grid search a ML pipeline with feature engineering included.
    For classification, prints a classification report for the best parameter
    settings; for regression, logs the RMSE on the test set instead.
    Best is whatever GridSearchCV selects by cross-validation on the training set.

    Parameters:
        x_train, y_train: training features and targets used for the grid search.
        x_test, y_test: held-out features and targets used for evaluation.
        ml_pipeline: estimator/pipeline handed to GridSearchCV.
        params: parameter grid handed to GridSearchCV as param_grid.
        cv: cross-validation setting forwarded to GridSearchCV (default 3).
        include_probas: when True, also return the highest predicted
            probability per test row (requires predict_proba on the best model).
        is_regression: when True, report RMSE rather than a classification report.

    Returns:
        (best_model, y_preds), or (best_model, y_preds, y_probas) when
        include_probas is True.
    '''

    # error_score=-1 is forwarded so a failing parameter combination gets that
    # score rather than aborting the whole search.
    model_grid_search = GridSearchCV(ml_pipeline, param_grid=params, cv=cv, error_score=-1)
    start_time = time.time()

    model_grid_search.fit(x_train, y_train)

    best_model = model_grid_search.best_estimator_

    y_preds = best_model.predict(x_test)

    if is_regression:
        # Score against the y_test argument; relying on a notebook-level
        # variable here would tie the result to hidden kernel state.
        rmse = np.sqrt(mean_squared_error(y_pred=y_preds, y_true=y_test))
        logger.info(f'RMSE: {rmse:.5f}')
    else:
        print(classification_report(y_true=y_test, y_pred=y_preds))

    logger.info(f'Best params: {model_grid_search.best_params_}')
    end_time = time.time()
    # Elapsed time is end minus start, formatted with two decimals.
    logger.info(f'Overall took {(end_time - start_time):.2f} seconds')

    if include_probas:
        # Keep only the probability of the most likely class for each row.
        y_probas = best_model.predict_proba(x_test).max(axis=1)
        return best_model, y_preds, y_probas

    return best_model, y_preds