In [11]:
import abc

class ModelPipeline(abc.ABC):
    """Template for a model-training workflow.

    Subclasses provide the individual stages; ``execute_pipeline`` runs them
    in a fixed order.  The class derives from ``abc.ABC`` so that the
    ``@abc.abstractmethod`` markers are actually enforced: on Python 3 a
    ``__metaclass__`` class attribute is ignored, which would let incomplete
    subclasses (and this base class itself) be instantiated.
    """

    @abc.abstractmethod
    def get_data(self, data, label, train_test_col):
        """First stage; receives the arguments given to ``execute_pipeline``."""
        pass

    @abc.abstractmethod
    def optimize_hyperparam(self):
        """Second stage, run after ``get_data``."""
        pass

    @abc.abstractmethod
    def fit_model(self):
        """Third stage, run after ``optimize_hyperparam``."""
        pass

    @abc.abstractmethod
    def transform_model(self):
        """Fourth stage, run after ``fit_model``."""
        pass

    # Hook: optional extension point, not abstract.
    def ensemble_model(self):
        """Optional hook with a no-op default.

        NOTE(review): ``execute_pipeline`` does not call this hook; confirm
        whether it was meant to run between ``transform_model`` and
        ``evaluate_result``.
        """
        pass

    @abc.abstractmethod
    def evaluate_result(self):
        """Final stage, run after ``transform_model``."""
        pass

    # Defines the order in which the stage methods are executed.
    def execute_pipeline(self, data, label, train_test_col):
        """Run every stage in order.

        Args:
            data: forwarded unchanged to ``get_data``.
            label: forwarded unchanged to ``get_data``.
            train_test_col: forwarded unchanged to ``get_data``.

        Returns:
            None.  Stages communicate through state stored on ``self``.
        """
        self.get_data(data, label, train_test_col)
        self.optimize_hyperparam()
        self.fit_model()
        self.transform_model()
        self.evaluate_result()

In [12]:
import ast
import csv
import logging
import os
import random
from timeit import default_timer as timer

import lightgbm as lgb
import numpy as np
import pandas as pd
# https://github.com/hyperopt/hyperopt
from hyperopt import STATUS_OK, hp, tpe, Trials, fmin
from hyperopt.pyll.stochastic import sample
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# Number of candidate evaluations: used as the random-search loop count and
# as hyperopt's ``max_evals`` in TrainLGBModel.optimize_hyperparam.
MAX_EVALS = 10
# Default number of cross-validation folds for the two objective functions.
N_FOLDS = 5
# Discrete LightGBM hyperparameter grid; the random search draws one value
# per key from these lists.
PARAM_GRID = {
    'class_weight': [None, 'balanced'],
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(30, 150)),
    # 1000 values spaced evenly on a natural-log scale between 0.005 and 0.2.
    'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10))
}
# Candidate subsample ratios for the random search. They are drawn only when
# the sampled boosting type is NOT 'goss'; for 'goss' the search fixes
# subsample at 1.0.
SUBSAMPLE_DIST = list(np.linspace(0.5, 1, 100))

# Stand-alone hyperopt domain for the boosting type. In this file it is only
# sampled and printed as a preview; the search itself uses SPACE below.
BOOSTING_TYPE = {'boosting_type': hp.choice('boosting_type', 
                                            [{'boosting_type': 'gbdt', 'subsample': hp.uniform('subsample', 0.5, 1)}, 
#                                              {'boosting_type': 'dart', 'subsample': hp.uniform('subsample', 0.5, 1)},
                                             {'boosting_type': 'goss', 'subsample': 1.0}])}

# hyperopt search space passed to ``fmin`` for the Bayesian search.
# 'boosting_type' is a nested choice (type plus its subsample setting) that
# the objective flattens into top-level keys. Unlike PARAM_GRID, 'dart' is
# disabled here and learning_rate starts at 0.01 rather than 0.005.
# NOTE(review): the hyperopt labels 'gdbt_subsample' and 'colsample_by_tree'
# are spelled differently from the parameter names; they are runtime labels,
# so confirm nothing keys on them before renaming.
SPACE = {
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    'boosting_type': hp.choice('boosting_type', [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}, 
#                                                  {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                                 {'boosting_type': 'goss', 'subsample': 1.0}]),
    # quniform yields floats; the objective casts the integer-valued
    # parameters (num_leaves, subsample_for_bin, min_child_samples) to int.
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0)
}


class TrainLGBModel(ModelPipeline):
    """LightGBM binary classifier pipeline with a selectable hyperparameter search.

    ``search_type`` decides how ``optimize_hyperparam`` builds ``self.model``:

    * ``'default'``  - ``lgb.LGBMClassifier()`` with library defaults,
    * ``'random'``   - random search over ``PARAM_GRID``,
    * ``'bayesian'`` - hyperopt TPE search over ``SPACE``.
    """

    # Header row of the per-trial CSV log written during the Bayesian search.
    _TRIAL_LOG_HEADER = ['loss', 'params', 'iteration', 'estimators', 'train_time']

    def __init__(self, search_type):
        """Store the search strategy name and initialise all pipeline state.

        Args:
            search_type: 'default', 'random' or 'bayesian'.  The value is
                validated when ``optimize_hyperparam`` runs.
        """
        self.search_type = search_type
        self.model = None
        self.fitted_model = None
        # Test-set probabilities, filled in by ``transform_model``.
        self.predictions = None
        # Legacy spelling kept for backward compatibility; mirrors ``predictions``.
        self.prediction = None
        self.features = None
        self.labels = None
        self.test_labels = None
        self.test_features = None
        self.train_set = None
        # CSV log of the Bayesian search trials (one row per evaluation).
        self.out_file = 'results/gbm_trials.csv'

    def get_data(self, data, label, train_test_col):
        """Read a CSV file and split it into train/test feature and label arrays.

        Args:
            data: path (or buffer) handed to ``pd.read_csv``.
            label: name of the target column; cast to int32.
            train_test_col: name of the column whose values 'train' / 'test'
                select the two splits.
        """
        frame = pd.read_csv(data)
        train = frame[frame[train_test_col] == 'train']
        test = frame[frame[train_test_col] == 'test']

        self.labels = np.array(train[label].astype(np.int32)).reshape((-1,))
        self.test_labels = np.array(test[label].astype(np.int32)).reshape((-1,))

        # Everything except the split marker and the target is a feature.
        train = train.drop(columns = [train_test_col, label])
        test = test.drop(columns = [train_test_col, label])
        self.features = np.array(train)
        self.test_features = np.array(test)
        print('Train shape: ', train.shape)
        print('Test shape: ', test.shape)
        print("Successfully load training data!")
        self.train_set = lgb.Dataset(self.features, label = self.labels)

    def optimize_hyperparam(self):
        """Build ``self.model`` according to ``self.search_type``.

        Raises:
            ValueError: if ``search_type`` is not one of 'default',
                'bayesian' or 'random' (previously this left ``self.model``
                as ``None`` and failed later inside ``fit_model``).
        """
        if self.search_type == 'default':
            self.model = lgb.LGBMClassifier()
        elif self.search_type == 'bayesian':
            self.model = self._bayesian_search()
        elif self.search_type == 'random':
            self.model = self._random_search()
        else:
            raise ValueError(
                "Unknown search_type {!r}; expected 'default', 'bayesian' or 'random'"
                .format(self.search_type))

    def _bayesian_search(self):
        """Run the hyperopt TPE search and return the best (unfitted) classifier."""
        # The objective counts its evaluations in this module-level global.
        global ITERATION
        ITERATION = 0

        # Preview draws from the search spaces.  They are printed only and do
        # not influence the optimisation below.
        print(sample(BOOSTING_TYPE))
        preview = sample(SPACE)
        print(preview)
        # Flatten the nested boosting_type choice into top-level keys.
        subsample = preview['boosting_type'].get('subsample', 1.0)
        preview['boosting_type'] = preview['boosting_type']['boosting_type']
        preview['subsample'] = subsample
        print(preview)

        # Start a fresh trial log; create its directory if it is missing.
        log_dir = os.path.dirname(self.out_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok = True)
        with open(self.out_file, 'w', newline = '') as log_file:
            csv.writer(log_file).writerow(self._TRIAL_LOG_HEADER)

        # Run optimization; every evaluation appends one row to the log.
        # NOTE(review): recent hyperopt releases may expect a
        # ``np.random.Generator`` for ``rstate`` - confirm against the
        # installed version.
        bayes_trials = Trials()
        fmin(fn = self._gradient_boosting_objective, space = SPACE, algo = tpe.suggest,
             max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(50))

        # Lowest loss (highest AUC) first.
        results = pd.read_csv(self.out_file)
        results = results.sort_values('loss', ascending = True).reset_index(drop = True)

        # The params column holds the dict's string form; parse it back.
        best_bayes_estimators = int(results.loc[0, 'estimators'])
        best_bayes_params = ast.literal_eval(results.loc[0, 'params'])

        # Re-create the best model; it is trained later by ``fit_model``.
        model = lgb.LGBMClassifier(n_estimators = best_bayes_estimators, n_jobs = -1,
                                   objective = 'binary', random_state = 50, **best_bayes_params)
        print('This was achieved after {} search iterations'.format(results.loc[0, 'iteration']))
        return model

    def _random_search(self):
        """Cross-validate MAX_EVALS random draws from PARAM_GRID; return the best classifier."""
        if MAX_EVALS < 1:
            raise ValueError('MAX_EVALS must be at least 1 for a random search')

        random.seed(100)
        records = []
        for i in range(MAX_EVALS):
            # Randomly sample parameters for gbm
            params = {key: random.sample(value, 1)[0] for key, value in PARAM_GRID.items()}
            if params['boosting_type'] == 'goss':
                # Cannot subsample with goss
                params['subsample'] = 1.0
            else:
                # Subsample supported for gbdt and dart
                params['subsample'] = random.sample(SUBSAMPLE_DIST, 1)[0]
            # Evaluate every draw (this call used to sit outside the loop, so
            # only the last sampled parameter set was ever scored).
            records.append(self._random_objective(params, i))

        random_results = pd.DataFrame(
            records, columns = ['loss', 'params', 'iteration', 'estimators', 'time'])
        # Best validation score on top.
        random_results = random_results.sort_values('loss', ascending = True).reset_index(drop = True)

        best_random_params = random_results.loc[0, 'params'].copy()
        best_random_estimators = int(random_results.loc[0, 'estimators'])
        print(best_random_params)
        model = lgb.LGBMClassifier(n_estimators = best_random_estimators, n_jobs = -1,
                                   objective = 'binary', **best_random_params, random_state = 50)
        print('This was achieved using {} search iterations.'.format(random_results.loc[0, 'iteration']))
        return model

    def fit_model(self):
        """Fit ``self.model`` on the training arrays and report the wall-clock time."""
        start = timer()
        self.fitted_model = self.model.fit(self.features, self.labels)
        lgb_time = timer() - start
        print("The best model from {} search training time is {:.4f} seconds".format(self.search_type, lgb_time))

    def transform_model(self):
        """Store the positive-class probabilities for the test set and print their AUC."""
        self.predictions = self.model.predict_proba(self.test_features)[:, 1]
        # Keep the legacy attribute in sync with the one the class actually uses.
        self.prediction = self.predictions
        print('The best model from {} search scores {:.4f} on the test data.'.format(self.search_type, roc_auc_score(self.test_labels, self.predictions)))

    def evaluate_result(self):
        """Print the test-set ROC AUC of the stored predictions."""
        auc = roc_auc_score(self.test_labels, self.predictions)
        print("The AUC from {} search is {:.4f}".format(self.search_type, auc))

    def _cross_validate(self, params, n_folds):
        """Run ``lgb.cv`` with the AUC metric on ``self.train_set``.

        Returns:
            tuple: ``(loss, n_estimators, elapsed_seconds)`` where ``loss`` is
            ``1 - best mean AUC`` and ``n_estimators`` is the 1-based boosting
            round that reached that AUC.
        """
        cv_kwargs = dict(num_boost_round = 10000, nfold = n_folds, metrics = 'auc', seed = 50)
        # 'dart' runs are made without early stopping; every other boosting
        # type stops after 100 rounds without improvement.
        if params['boosting_type'] != 'dart':
            cv_kwargs['early_stopping_rounds'] = 100

        start = timer()
        cv_results = lgb.cv(params, self.train_set, **cv_kwargs)
        elapsed = timer() - start

        auc_means = cv_results['auc-mean']
        # Loss must be minimized
        loss = 1 - np.max(auc_means)
        n_estimators = int(np.argmax(auc_means) + 1)
        return loss, n_estimators, elapsed

    def _random_objective(self, params, iteration, n_folds = N_FOLDS):
        """Random search objective function. Takes in hyperparameters
           and returns a list of results to be saved.

        Returns:
            list: ``[loss, params, iteration, n_estimators, elapsed_seconds]``.
        """
        loss, n_estimators, elapsed = self._cross_validate(params, n_folds)
        return [loss, params, iteration, n_estimators, elapsed]

    def _gradient_boosting_objective(self, params, n_folds = N_FOLDS):
        """Objective function for Gradient Boosting Machine Hyperparameter Optimization.

        Flattens the nested ``boosting_type`` choice in ``params`` (in place),
        cross-validates, appends one row to ``self.out_file`` and returns the
        result dictionary expected by hyperopt.
        """
        # Keep track of evals
        global ITERATION
        ITERATION += 1

        # Retrieve the subsample if present otherwise set to 1.0, then replace
        # the nested choice by the plain boosting type name.
        choice = params['boosting_type']
        params['subsample'] = choice.get('subsample', 1.0)
        params['boosting_type'] = choice['boosting_type']
        # Make sure parameters that need to be integers are integers
        for parameter_name in ('num_leaves', 'subsample_for_bin', 'min_child_samples'):
            params[parameter_name] = int(params[parameter_name])

        loss, n_estimators, run_time = self._cross_validate(params, n_folds)

        # Append to the trial log; the context manager guarantees the row is
        # flushed and the handle closed before the log is read back.
        with open(self.out_file, 'a', newline = '') as log_file:
            csv.writer(log_file).writerow([loss, params, ITERATION,
                                           n_estimators, run_time])

        # Dictionary with information for evaluation
        return {'loss': loss, 'params': params, 'iteration': ITERATION,
                'estimators': n_estimators,
                'train_time': run_time, 'status': STATUS_OK}

In [13]:
if __name__ == '__main__':
    # Run the whole pipeline on the caravan insurance CSV using the
    # Bayesian hyperparameter search.
    lgb_model = TrainLGBModel(search_type = 'bayesian')
    lgb_model.execute_pipeline(
        data = 'caravan-insurance-challenge.csv',
        label = 'CARAVAN',
        train_test_col = 'ORIGIN',
    )

Train shape:  (5822, 85)
Test shape:  (4000, 85)
Successfully load training data!
{'boosting_type': {'boosting_type': 'gbdt', 'subsample': 0.6040735905270593}}
{'boosting_type': {'boosting_type': 'gbdt', 'subsample': 0.9560323200138281}, 'class_weight': None, 'colsample_bytree': 0.8310823192173599, 'learning_rate': 0.029642399954777503, 'min_child_samples': 260.0, 'num_leaves': 43.0, 'reg_alpha': 0.7654396659123995, 'reg_lambda': 0.6753631548946817, 'subsample_for_bin': 120000.0}
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8310823192173599, 'learning_rate': 0.029642399954777503, 'min_child_samples': 260.0, 'num_leaves': 43.0, 'reg_alpha': 0.7654396659123995, 'reg_lambda': 0.6753631548946817, 'subsample_for_bin': 120000.0, 'subsample': 0.9560323200138281}
100%|██████████| 10/10 [00:09<00:00,  1.01it/s, best loss: 0.23465460856485376]
This was achieved after 5 search iterations
The best model from bayesian search training time is 0.2890 seconds
The best model fr