In [111]:
import abc

class ModelPipeline(abc.ABC):
    """Template for a model-training pipeline.

    Subclasses must implement the four abstract steps (``get_data``,
    ``fit_model``, ``transform_model`` and ``evaluate_result``).
    ``optimize_hyperparam`` and ``ensemble_model`` are optional hooks whose
    default implementation does nothing.  ``execute_pipeline`` fixes the
    order in which the steps run.

    The class derives from ``abc.ABC`` so that the abstract methods are
    actually enforced: on Python 3 a ``__metaclass__`` class attribute is
    ignored, which would allow incomplete subclasses (and this base class
    itself) to be instantiated.
    """

    @abc.abstractmethod
    def get_data(self, *args, **kwargs):
        """Load the data the later steps work on.

        ``execute_pipeline`` calls this with ``(data, label, train_test_col)``.
        """

    @abc.abstractmethod
    def fit_model(self, *args, **kwargs):
        """Fit the model; called without arguments by ``execute_pipeline``."""

    def optimize_hyperparam(self, *args, **kwargs):
        """Optional hook run before ``fit_model``; does nothing by default."""

    @abc.abstractmethod
    def transform_model(self, *args, **kwargs):
        """Apply the fitted model; called without arguments by ``execute_pipeline``."""

    def ensemble_model(self, *args, **kwargs):
        """Optional hook; does nothing by default.

        Note: ``execute_pipeline`` does not call this hook.
        """

    @abc.abstractmethod
    def evaluate_result(self, *args, **kwargs):
        """Evaluate the result; called without arguments by ``execute_pipeline``."""

    # Define the order in which the pipeline methods are executed.
    def execute_pipeline(self, data, label, train_test_col):
        """Run the pipeline steps in their fixed order.

        Args:
            data: Forwarded as the first argument of ``get_data``.
            label: Forwarded as the second argument of ``get_data``.
            train_test_col: Forwarded as the third argument of ``get_data``.
        """
        self.get_data(data, label, train_test_col)
        self.optimize_hyperparam()
        self.fit_model()
        self.transform_model()
        self.evaluate_result()

In [178]:
import logging
import random
import lightgbm as lgb
import pandas as pd
import numpy as np
from timeit import default_timer as timer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Number of parameter sets sampled by the random-search loop in
# TrainLGBModel.optimize_hyperparam.
MAX_EVALS = 500
# Default number of cross-validation folds used by
# TrainLGBModel._random_objective.
N_FOLDS = 5
# LightGBM hyperparameter grid: each key maps to the list of candidate values
# from which the random search draws one value per sampled parameter set.
PARAM_GRID = {
    'class_weight': [None, 'balanced'],
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(30, 150)),
    # 1000 values spaced evenly on a natural-log scale between 0.005 and 0.2.
    'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    # np.linspace defaults to 50 evenly spaced points.
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10))
}
# Candidate 'subsample' ratios. The random search samples from this list only
# when boosting_type is NOT 'goss'; for 'goss' it forces subsample to 1.0.
SUBSAMPLE_DIST = list(np.linspace(0.5, 1, 100))


class TrainLGBModel(ModelPipeline):
    """LightGBM classifier pipeline with a selectable hyperparameter search.

    ``search_type`` selects how ``optimize_hyperparam`` builds the model:

    * ``'default'``  -- ``lgb.LGBMClassifier()`` with library defaults.
    * ``'random'``   -- random search over ``PARAM_GRID`` (``MAX_EVALS``
      samples, each scored by cross-validated AUC).
    * ``'bayesian'`` -- not implemented; raises ``NotImplementedError``.

    NOTE(review): the parameter sets passed to ``lgb.cv`` during random search
    contain no ``'objective'`` entry, while the final classifier is built with
    ``objective='binary'`` -- confirm this is intended.
    """

    def __init__(self, search_type):
        """Store the search strategy and initialise all pipeline state to None.

        Args:
            search_type: One of ``'default'``, ``'random'`` or ``'bayesian'``.
        """
        self.search_type = search_type
        self._model = None
        self._fitted_model = None
        # Name matches the attribute written by transform_model and read by
        # evaluate_result.
        self._predictions = None
        self._features = None
        self.labels = None
        self.test_labels = None
        self._test_features = None
        self._train_set = None

    def get_data(self, data, label, train_test_col):
        """Read a CSV and split it into train/test features and labels.

        Rows whose ``train_test_col`` value is ``'train'`` form the training
        set and rows with ``'test'`` form the test set.  Both the split column
        and the label column are removed from the feature matrices.

        Args:
            data: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
            label: Name of the label column; values are cast to int32.
            train_test_col: Name of the column holding ``'train'``/``'test'``.
        """
        frame = pd.read_csv(data)
        train = frame[frame[train_test_col] == 'train']
        test = frame[frame[train_test_col] == 'test']

        train_labels = np.array(train[label].astype(np.int32)).reshape((-1,))
        self.test_labels = np.array(test[label].astype(np.int32)).reshape((-1,))

        train = train.drop(columns=[train_test_col, label])
        test = test.drop(columns=[train_test_col, label])
        self._features = np.array(train)
        self._test_features = np.array(test)
        self.labels = train_labels
        print('Train shape: ', train.shape)
        print('Test shape: ', test.shape)
        print("Successfully load training data!")
        # Dataset consumed by lgb.cv during random search.
        self._train_set = lgb.Dataset(self._features, label=self.labels)

    def optimize_hyperparam(self):
        """Build ``self._model`` according to ``self.search_type``.

        Raises:
            NotImplementedError: If ``search_type`` is ``'bayesian'``.
            ValueError: If ``search_type`` is not a known strategy.
        """
        if self.search_type == 'default':
            self._model = lgb.LGBMClassifier()
        elif self.search_type == 'random':
            self._model = self._random_search()
        elif self.search_type == 'bayesian':
            # Fail here with a clear message instead of leaving _model as None
            # and crashing later inside fit_model.
            raise NotImplementedError(
                "search_type 'bayesian' is not implemented yet.")
        else:
            raise ValueError(
                "Unknown search_type {!r}; expected 'default', 'random' or "
                "'bayesian'.".format(self.search_type))

    def _random_search(self):
        """Run random search and return a classifier built from the best sample.

        Draws ``MAX_EVALS`` parameter sets from ``PARAM_GRID``, scores every
        one of them with ``_random_objective`` and picks the lowest loss.
        """
        records = []
        for i in range(MAX_EVALS):
            # Randomly sample one value per hyperparameter.
            params = {key: random.choice(values)
                      for key, values in PARAM_GRID.items()}
            if params['boosting_type'] == 'goss':
                # Cannot subsample with goss.
                params['subsample'] = 1.0
            else:
                # Subsample supported for gbdt and dart.
                params['subsample'] = random.choice(SUBSAMPLE_DIST)
            # Evaluate EVERY sampled parameter set (the evaluation must live
            # inside the loop, otherwise only the last sample is scored).
            records.append(self._random_objective(params, i))

        # Build the frame once from all records and sort by best (lowest) loss.
        random_results = pd.DataFrame(
            records,
            columns=['loss', 'params', 'iteration', 'estimators', 'time'])
        random_results = (random_results
                          .sort_values('loss', ascending=True)
                          .reset_index(drop=True))

        best_random_params = random_results.loc[0, 'params'].copy()
        best_random_estimators = int(random_results.loc[0, 'estimators'])
        print(best_random_params)
        model = lgb.LGBMClassifier(n_estimators=best_random_estimators,
                                   n_jobs=-1, objective='binary',
                                   random_state=50, **best_random_params)
        print('This was achieved using {} search iterations.'.format(
            random_results.loc[0, 'iteration']))
        return model

    def fit_model(self):
        """Fit ``self._model`` on the training data and report the wall time.

        Raises:
            RuntimeError: If no model has been built yet.
        """
        if self._model is None:
            raise RuntimeError(
                "No model to fit; call optimize_hyperparam() first.")
        start = timer()
        self._fitted_model = self._model.fit(self._features, self.labels)
        lgb_time = timer() - start
        print("The best model from {} search training time is {:.4f} seconds".format(
            self.search_type, lgb_time))

    def transform_model(self):
        """Predict positive-class probabilities for the test set and print the AUC."""
        self._predictions = self._model.predict_proba(self._test_features)[:, 1]
        print('The best model from {} search scores {:.4f} on the test data.'.format(
            self.search_type, roc_auc_score(self.test_labels, self._predictions)))

    def evaluate_result(self):
        """Print the ROC AUC of the stored test predictions."""
        auc = roc_auc_score(self.test_labels, self._predictions)
        print("The AUC from {} search is {:.4f}".format(self.search_type, auc))

    def _random_objective(self, params, iteration, n_folds = N_FOLDS):
        """Random search objective function.

        Cross-validates ``params`` on the training set with AUC as metric.

        Args:
            params: Hyperparameter dict passed to ``lgb.cv``.
            iteration: Index of the search iteration, stored with the result.
            n_folds: Number of cross-validation folds.

        Returns:
            ``[loss, params, iteration, n_estimators, elapsed_seconds]`` where
            ``loss`` is ``1 - best mean AUC`` and ``n_estimators`` is the
            1-based boosting round that reached that AUC.
        """
        start = timer()
        # The original code skipped early stopping for 'dart'; keep that.
        # Using the callback instead of the ``early_stopping_rounds`` keyword
        # works on LightGBM releases with and without that keyword.
        callbacks = None
        if params['boosting_type'] != 'dart':
            callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=False)]
        # Perform n_folds cross validation.
        cv_results = lgb.cv(params, self._train_set, num_boost_round=10000,
                            nfold=n_folds, metrics='auc', seed=50,
                            callbacks=callbacks)
        end = timer()
        # The mean-AUC key is 'auc-mean' or 'valid auc-mean' depending on the
        # LightGBM version, so look it up by suffix.
        auc_key = next(key for key in cv_results if key.endswith('auc-mean'))
        mean_auc = cv_results[auc_key]
        best_score = np.max(mean_auc)
        # Loss must be minimized.
        loss = 1 - best_score
        # Boosting rounds that returned the highest cv score.
        n_estimators = int(np.argmax(mean_auc) + 1)
        return [loss, params, iteration, n_estimators, end - start]

In [182]:
if __name__ == '__main__':
    # Train and evaluate a LightGBM classifier with library-default
    # hyperparameters on the caravan insurance data set.
    lgb_model = TrainLGBModel('default')
    lgb_model.execute_pipeline(
        'caravan-insurance-challenge.csv',
        label='CARAVAN',
        train_test_col='ORIGIN',
    )
    

Train shape:  (5822, 85)
Test shape:  (4000, 85)
Successfully load training data!
The best model from default search training time is 0.1311 seconds
The best model from default search scores 0.7092 on the test data.
The AUC from default search is 0.7092
