In [9]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import lightgbm as lgb
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

pd.options.display.max_columns = 500
#import pandas_profiling

In [10]:
# Read the labelled training table and the unlabelled table to be scored.
train_df, test_df = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [11]:
# Seed NumPy's global RNG once so code drawing from np.random is repeatable.
random_state = 42
np.random.seed(seed=random_state)

In [12]:

def _shuffle_columns(frame):
    """Return a float copy of ``frame`` with each column permuted independently.

    The result has the same column labels as ``frame`` and a fresh default
    index. ``frame`` itself is never modified.
    """
    shuffled = np.empty(frame.shape)
    for col_idx in range(frame.shape[1]):
        # permutation() shuffles a copy, so this also works when pandas hands
        # back a read-only array (copy-on-write mode).
        shuffled[:, col_idx] = np.random.permutation(frame.iloc[:, col_idx].to_numpy())
    return pd.DataFrame(shuffled, columns=frame.columns)


def augmentation(x=None,y=None,upsample_times=2):
    """Up-sample the positive class by shuffling feature columns independently.

    Each synthetic replica takes the rows with ``y == 1`` and permutes every
    column on its own, so the per-column values of the positive class are kept
    while the pairing of values across columns is broken.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix (numeric columns).
    y : array-like aligned with ``x``
        Binary target; rows with ``y == 1`` are positives, ``y == 0`` negatives.
    upsample_times : int, default 2
        Number of shuffled replicas appended after the original positives.
        With ``0`` the positives are *replaced* by a single shuffled replica
        (no original positive rows are kept).

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Augmented features ordered as positives (original first, then the
        replicas) followed by the untouched negatives, and the matching float
        label array. Replica rows carry a fresh 0..n-1 index, so the returned
        frame's index is not unique.

    Raises
    ------
    ValueError
        If ``upsample_times`` is negative.
    """
    if upsample_times < 0:
        raise ValueError("upsample_times must be >= 0, got {}".format(upsample_times))

    pos_mask = y == 1
    neg_mask = y == 0
    x_pos = x[pos_mask]

    if upsample_times == 0:
        # Shuffle-only mode: same number of positives, originals not kept.
        pos_frames = [_shuffle_columns(x_pos)]
    else:
        pos_frames = [x_pos.copy()]
        pos_frames.extend(_shuffle_columns(x_pos) for _ in range(upsample_times))

    n_pos_rows = sum(len(frame) for frame in pos_frames)

    # DataFrame.append was removed in pandas 2.0; concat gives the same layout.
    aug_x = pd.concat(pos_frames + [x[neg_mask]])
    aug_y = np.append(np.ones(n_pos_rows), y[neg_mask])

    return aug_x, aug_y
    
    
    
    

In [13]:
# train_df[var_list] = np.exp(train_df[var_list])
# test_df[var_list] = np.exp(test_df[var_list])

In [6]:
# Run configuration: CV fold count, fold-split seed and positive-class
# up-sampling factor. The model label below is derived from these values.
n_folds, random_seed, upsample_times = 5, 26, 3

model = f'augment_{upsample_times}_times'
model_name = "{0}_{1}_folds".format(model, n_folds)

print("Model: {}".format(model_name))

Model: augment_3_times_5_folds


In [14]:
#exclusion = ['ID_code', 'target']+ rank_var_list
# Columns that must not be fed to the model: the row identifier and the label.
exclusion = ['ID_code', 'target']
# Every remaining training column, in its original order, is a feature.
feats = [
    column
    for column in train_df.columns
    if column not in exclusion
]

In [9]:
# Stratified K-fold training of LightGBM on augmented folds.
# For each fold: up-sample the positives of the training part only, fit a
# booster with early stopping on the untouched validation part, store the
# out-of-fold predictions and accumulate the test predictions. Finally the
# test predictions are averaged over folds and written as a submission file.
clfs = []  # one trained booster per fold
folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
oof_preds = np.zeros((len(train_df), 1))
test_preds = np.zeros((len(test_df), 1))


X = train_df[feats]
y = train_df['target']
X_test = test_df[feats]
test_ids = test_df.ID_code.values


parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    #'scale_pos_weight': 400,
    #'device' : 'gpu' ,
    'boosting': 'gbdt',
    'num_leaves': 31, #31
    'feature_fraction': 0.5,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'learning_rate': 0.05, #0.05
    'verbose': 30
    #'min_data_in_leaf': 200
}

fold_importances = []  # per-fold importance frames, concatenated once after the loop
for fold_, (trn_, val_) in enumerate(folds.split(X, y)):
    print("Current Fold: {}".format(fold_+1))
    # split() yields positional indices, so X and y are both indexed with
    # .iloc; label-based y[trn_] is only right for a default RangeIndex.
    trn_x, trn_y = X.iloc[trn_, :], y.iloc[trn_]
    val_x, val_y = X.iloc[val_, :], y.iloc[val_]
    # Augment the training part only, so validation stays on real rows.
    trn_x, trn_y = augmentation(trn_x,trn_y,upsample_times)

    trn_lgb = lgb.Dataset(trn_x, trn_y)
    val_lgb = lgb.Dataset(val_x, val_y)
    # Early stopping is passed as a callback: the early_stopping_rounds /
    # verbose_eval keyword arguments were removed from lgb.train in LightGBM 4.
    clf = lgb.train(parameters,
                    train_set=trn_lgb,
                    #valid_sets=[valid_data_lgb,holdout_data_lgb],
                    valid_sets=[trn_lgb, val_lgb],
                    num_boost_round=3000,
                    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
    clfs.append(clf)

    val_pred = clf.predict(val_x)
    test_fold_pred = clf.predict(X_test)

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    oof_preds[val_, :] = val_pred.reshape((-1, 1))
    test_preds += test_fold_pred.reshape((-1, 1))

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

feature_importance_df = pd.concat(fold_importances, axis=0)

# Average the accumulated per-fold test predictions.
test_preds /= n_folds
roc_score = roc_auc_score(y, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))


print("Saving submission file")
sample = pd.read_csv('../data/sample_submission.csv')
# Bracket assignment with a flat 1-D array: attribute assignment only updates
# a column that already exists, and test_preds is stored as an (n, 1) array.
sample['target'] = test_preds.ravel().astype(float)
sample['ID_code'] = test_ids
os.makedirs('../submissions', exist_ok=True)
sample.to_csv('../submissions/{}_{}.csv'.format(model_name,str(roc_score)), index=False)

#display_importances(feature_importance_df)


Current Fold: 1


LightGBMError: GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1

In [8]:
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization If the last two positions at which the objective has been evaluated are less than delta

In [15]:
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a search optimizer, print a timing/score summary and return its best parameters.

    optimizer = a sklearn or a skopt optimizer exposing fit(), best_score_,
                best_index_, best_params_ and cv_results_
    X = the training set
    y = our target
    title = a string label for the experiment
    callbacks = optional callbacks forwarded to optimizer.fit(callback=...);
                when empty or None, fit() is called without that argument

    Returns the optimizer's best_params_.
    """
    # Imported locally so the function does not depend on cell execution
    # order: in the cells visible here `time` is only imported after this
    # definition and `pprint` is not imported at all.
    import pprint
    from time import time

    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    best_score = optimizer.best_score_
    # Spread of the CV test score for the winning candidate.
    best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           +u"\u00B1"+" %.3f") % (time() - start,
                                  len(optimizer.cv_results_['params']),
                                  best_score,
                                  best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [23]:
from sklearn.metrics import average_precision_score, roc_auc_score, mean_absolute_error
from sklearn.metrics import make_scorer
from time import time


In [20]:
# Scorer object wrapping roc_auc_score, requesting probability outputs and
# treating larger values as better; it is passed as `scoring` to the search.
# NOTE(review): `needs_proba` appears to be deprecated in recent scikit-learn
# releases in favour of `response_method` — confirm against the installed version.
auc = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)


In [None]:
# Bayesian hyper-parameter search over an LGBMClassifier, scored with the
# `auc` scorer on stratified folds; the run is bounded by two stopper callbacks.
X = train_df[feats]
y = train_df['target']

folds = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=random_seed,
)

# is_unbalance and n_jobs are deliberately left at their defaults here.
clf = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    verbose=0,
)

# One search dimension per tunable estimator parameter.
search_spaces = dict(
    learning_rate=Real(0.01, 1.0, prior='log-uniform'),
    num_leaves=Integer(2, 500),
    max_depth=Integer(0, 500),
    min_child_samples=Integer(0, 200),
    max_bin=Integer(100, 100000),
    subsample=Real(0.01, 1.0, prior='uniform'),
    subsample_freq=Integer(0, 10),
    colsample_bytree=Real(0.01, 1.0, prior='uniform'),
    min_child_weight=Integer(0, 10),
    subsample_for_bin=Integer(100000, 500000),
    reg_lambda=Real(1e-9, 1000, prior='log-uniform'),
    reg_alpha=Real(1e-9, 1.0, prior='log-uniform'),
    scale_pos_weight=Real(1e-6, 500, prior='log-uniform'),
    n_estimators=Integer(500, 30000),
)

opt = BayesSearchCV(
    estimator=clf,
    search_spaces=search_spaces,
    scoring=auc,
    cv=folds,
    n_iter=40,
    n_jobs=-1,
    return_train_score=False,
    refit=True,
    optimizer_kwargs={'base_estimator': 'GP'},
    random_state=22,
)

# DeltaXStopper(0.001) and DeadlineStopper(60*5) are forwarded to opt.fit()
# through report_perf's `callbacks` argument.
best_params = report_perf(
    opt, X, y, 'LightGBM',
    callbacks=[
        DeltaXStopper(0.001),
        DeadlineStopper(60*5),
    ],
)