Reference: https://www.kaggle.com/willkoehrsen/intro-to-model-tuning-grid-and-random-search

In [93]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [94]:
# Load data 
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [97]:
# Prepare data
features = [c for c in train.columns if c not in ['ID_code', 'target']]
y = train['target']
X = train[features]

# Split into training and test data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 50)

print("Training X shape: ", train_X.shape, "\nTraining y shape: ", train_y.shape)
print("Testing X shape: ", test_X.shape, "\nTesting y shape: ", test_y.shape)

Training X shape:  (700, 200) 
Training y shape:  (700,)
Testing X shape:  (300, 200) 
Testing y shape:  (300,)


In [98]:
# Create a training and test dataset
train_set = lgb.Dataset(data = train_X, label = train_y)
test_set = lgb.Dataset(data = test_X, label = test_y)

# Define the classifier and grab the default params
clf = lgb.LGBMClassifier()
default_params = clf.get_params()

In [99]:
# Baseline ROC AUC with default params

# CV to determine the best num_boost_round
cv_results = lgb.cv(default_params, train_set, num_boost_round = 10000, early_stopping_rounds = 100, 
                    metrics = 'auc', nfold = 5, seed = 42)

print('The maximum validation ROC AUC was: {:.5f} with a standard deviation of {:.5f}.'.format(cv_results['auc-mean'][-1], cv_results['auc-stdv'][-1]))
print('The optimal number of boosting rounds (estimators) was {}.'.format(len(cv_results['auc-mean'])))

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


The maximum validation ROC AUC was: 0.63035 with a standard deviation of 0.05757.
The optimal number of boosting rounds (estimators) was 92.


In [100]:
# Optimal number of esimators
clf.n_estimators = len(cv_results['auc-mean'])

# Train and make predicions
clf.fit(train_X, train_y)
preds = clf.predict_proba(test_X)[:, 1]
baseline_auc = roc_auc_score(test_y, preds)

print('The baseline model scores {:.5f} ROC AUC on the test set.'.format(baseline_auc))

The baseline model scores 0.64903 ROC AUC on the test set.


In [101]:
### Hyper-parameters Tuning ###

# Define the objective function
def objective(hyperparameters):
    
    # Number of estimators will be found using early stopping
    if 'n_estimators' in hyperparameters.keys():
        del hyperparameters['n_estimators']
    
    # Perform n_folds cross validation
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10000, nfold = 5, 
                        early_stopping_rounds = 100, metrics = 'auc', seed = 42)
    
    # results to return
    score = cv_results['auc-mean'][-1]
    hyperparameters['n_estimators'] = len(cv_results['auc-mean'])
    
    return [score, hyperparameters]

In [102]:
# Define the search space
param_grid = {
    'boosting_type': ['gbdt'],
    'class_weight': ['balanced'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.1, 1, 10)),
    'subsample': list(np.linspace(0.1, 1, 100)),
    'subsample_freq': list(range(3,10))
}

In [103]:
# Define the maximum of evaluations
MAX_EVALS = 10

# Define the random search
def random_search(param_grid, max_evals = MAX_EVALS):
    
    # Dataframe for results
    results = pd.DataFrame(columns = ['score', 'params'], index = list(range(MAX_EVALS)))
    
    # Keep searching until reach max evaluations
    for i in range(MAX_EVALS):
        
        # Choose random hyperparameters
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}

        # Evaluate randomly selected hyperparameters
        results.loc[i, :] = objective(hyperparameters)
    
    # Sort with best score on top
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    
    return results 

In [107]:
# The parameter combination with best AUC
random_search_results = random_search(param_grid)
random_search_results.head()

print('The best model from random search scores {:.5f} ROC AUC by cross validation.'
      .format(random_search_results.loc[0, 'score']))

The best model from random search scores 0.68110 ROC AUC by cross validation.


In [108]:
# Get the best parameters
best_params = random_search_results.loc[0, 'params']

# Fit the model with best parameters
clf = lgb.LGBMClassifier(**best_params, random_state = 42)
clf.fit(train_X, train_y)

# Make predictions on test set and print the ROC AUC
preds = clf.predict_proba(test_X)[:, 1]

print('The best model from random search scores {:.5f} ROC AUC on the test set.'
      .format(roc_auc_score(test_y, preds)))

The best model from random search scores 0.58141 ROC AUC on the test set.
