In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss

from hyperopt import fmin, hp, tpe, Trials, space_eval
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample
from functools import partial

from lightgbm import LGBMClassifier

from imblearn.pipeline import Pipeline
#from imblearn.over_sampling import ADASYN
#from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks
from imblearn.under_sampling import RandomUnderSampler

In [None]:
def evalue_model(model, y_test, X_test, model_name):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)``.
    y_test : true labels for ``X_test``.
    X_test : features passed to ``model.predict_proba``.
    model_name : label stored under the ``'model'`` key of the result.

    Returns
    -------
    dict with keys ``'model'``, ``'auc'``, ``'aucpr'`` and ``'logloss'``.
    """
    # Column index 1 of predict_proba is used as the score
    # (presumably the positive class of a binary classifier — confirm).
    scores = np.asarray(model.predict_proba(X_test))[:, 1]

    return {
        'model': model_name,
        'auc': roc_auc_score(y_true=y_test, y_score=scores),
        'aucpr': average_precision_score(y_true=y_test, y_score=scores),
        'logloss': log_loss(y_test, scores),
    }

In [None]:
# Load the competition files: training data, the test features to predict on,
# and the submission template whose 'target' column is filled in later.
train = pd.read_csv(filepath_or_buffer='../input/tabular-playground-series-mar-2021/train.csv')
X_test = pd.read_csv(filepath_or_buffer='../input/tabular-playground-series-mar-2021/test.csv')
submission = pd.read_csv(filepath_or_buffer='../input/tabular-playground-series-mar-2021/sample_submission.csv')

In [None]:
# The "id" column is removed from both frames so it is not used as a feature.
train = train.drop(columns=["id"])
X_test = X_test.drop(columns=["id"])

In [None]:
# Convert every object-dtype column to pandas 'category', first in the
# training frame and then in the test frame.
for frame in (train, X_test):
    object_columns = frame.columns[frame.dtypes == "object"].tolist()
    for col in object_columns:
        frame[col] = frame[col].astype('category')

In [None]:
# Separate features from the label.
X = train.drop('target', axis=1)
y = train['target']

# Stratified 80/20 hold-out; X_val / y_val are reused below for early stopping
# and for the final validation score.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

high_cardinality = ["cat5", "cat7", "cat8", "cat10"]

# Remaining categorical columns, kept in the frame's column order.
# (A set difference here would yield a hash-dependent order that can change
# between kernel restarts, making the position list below irreproducible.)
categorical_cols = [col for col in X.columns[X.dtypes == "category"].tolist()
                    if col not in high_cardinality]

# Positional indices of the categorical columns: the regular ones first,
# followed by the high-cardinality ones.
column_order = X.columns.tolist()
cat_columns_position = [column_order.index(col) for col in categorical_cols + high_cardinality]

In [None]:
# Baseline: LightGBM with default tree parameters, a large tree budget and
# early stopping evaluated on the hold-out split.
lgbm = LGBMClassifier(
    n_estimators=20000,
    learning_rate=0.1,
    random_state=42,
    # device="gpu",
)

lgbm.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=200,
    verbose=False,
)

# Column 1 of predict_proba is used as the score for the AUC.
predictions = lgbm.predict_proba(X_val)[:, 1]
auc_baseline = roc_auc_score(y_true=y_val, y_score=predictions)

print(f'Baseline: {auc_baseline}')

In [None]:
# Write the baseline predictions into a copy of the submission template,
# leaving `submission` itself untouched for the tuned model.
submission_baseline = submission.assign(target=lgbm.predict_proba(X_test)[:, 1])
submission_baseline.to_csv('submission_baseline.csv', index=False)

In [None]:
# Hyperopt search space.
# 'undersample' toggles the RandomUnderSampler pipeline step; everything under
# 'clf' is forwarded as keyword arguments to LGBMClassifier.
hp_space = {
    'undersample': hp.choice(label = 'undersample', options = [True, False]),
    'clf': {
        'boosting_type': hp.choice(label = 'boosting_type', options = ['gbdt', 'goss']),

        'num_leaves': hp.choice(label = 'num_leaves', options = [15, 31, 63, 127, 255, 511, 1023, 2047, 4095]),
        #'max_depth': ho_scope.int(hp.quniform('max_depth',1,32,1)), # default 'max_depth': -1
        # Kept as a float on the 0.001 grid: wrapping this draw in ho_scope.int
        # truncated every value in [0, 0.01] to 0, so the parameter was never
        # actually explored.
        'min_child_weight': hp.quniform('min_child_weight', 0, 0.01, 0.001),
        'min_child_samples': ho_scope.int(hp.quniform('min_child_samples',1,300,1)),

        'max_bin': ho_scope.int(hp.quniform('max_bin',128,1024,128)), # Typical: 255
        'max_delta_step': ho_scope.int(hp.quniform('max_delta_step',1,10,1)),

        # NOTE(review): 'subsample' is not searched (line below is disabled), so
        # it presumably stays at the LightGBM default — confirm that
        # 'subsample_freq' has any effect in that case.
        'subsample_freq': ho_scope.int(hp.quniform('subsample_freq',0,10, 1)),
       # 'subsample': hp.uniform('subsample',0.2,1),
        'colsample_bytree': hp.uniform('colsample_bytree',0.2,1),

        # Regularisation strengths are sampled on a log scale in [1e-4, 3].
        'reg_lambda': hp.loguniform('reg_lambda',np.log(1e-4),np.log(3)),
        'reg_alpha': hp.loguniform('reg_alpha',np.log(1e-4),np.log(3)),

        # Categorical-feature handling parameters.
        'min_data_per_group': ho_scope.int(hp.quniform('min_data_per_group',50,200,1)),
        'cat_smooth':  ho_scope.int(hp.quniform('cat_smooth',5,100,1)),
        'cat_l2': ho_scope.int(hp.quniform('cat_l2',1,20,1))
    }
}

#ho_sample(hp_space)

In [None]:
# Trials store handed to fmin below; its `.trials` and `.results` are read
# back afterwards to build the search-history table.
iteracoes = Trials()

In [None]:
def instancia_modelo(hiperparametros):
    """Build the pipeline (optional under-sampling, then LightGBM) for one trial.

    Parameters
    ----------
    hiperparametros : dict
        Must contain ``'undersample'`` (truthy to enable the
        ``RandomUnderSampler`` step) and ``'clf'`` (keyword arguments
        forwarded to ``LGBMClassifier``).

    Returns
    -------
    imblearn Pipeline with steps ``'undersample'`` and ``'clf'``. The
    ``'undersample'`` step is ``None`` when under-sampling is disabled.
    """
    clf = LGBMClassifier(**hiperparametros['clf'],
                         random_state = 42,
                         #device = "gpu",
                         learning_rate = 0.1,
                         n_estimators = 20000)

    if hiperparametros['undersample']:
        # Seeded like the classifier, the CV splitter and the search itself,
        # so that repeating a trial resamples the majority class identically.
        undersample = RandomUnderSampler(sampling_strategy='majority',
                                         random_state = 42)
    else:
        undersample = None

    pipe = Pipeline([('undersample', undersample),
                     ('clf', clf) ])

    return pipe

In [None]:
def funcao_para_minimizar(hiperparametros, features, target):
    """Hyperopt objective: negative mean ROC AUC over 5 stratified folds.

    Parameters
    ----------
    hiperparametros : dict
        One sampled point of the search space, passed to ``instancia_modelo``.
    features, target :
        Data handed to ``cross_val_score`` as ``X`` and ``y``.

    Returns
    -------
    float
        ``-mean(fold AUCs)``, so that minimising it maximises the AUC.

    Notes
    -----
    Early stopping inside every fold is evaluated on the notebook-level
    ``X_val`` / ``y_val`` hold-out, not on the fold's own validation part.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = cross_val_score(
        estimator=instancia_modelo(hiperparametros),
        X=features,
        y=target,
        scoring="roc_auc",
        cv=folds,
        n_jobs=-1,
        error_score="raise",
        # The 'clf__' prefix routes these fit arguments to the classifier step.
        fit_params={
            'clf__early_stopping_rounds': 200,
            'clf__eval_metric': 'auc',  # logloss
            'clf__verbose': False,
            'clf__eval_set': (X_val, y_val),
        },
    )

    return -fold_scores.mean()

In [None]:
%%time

# Bind the training split to the objective so fmin only supplies the
# sampled hyperparameters.
objetivo = partial(funcao_para_minimizar, features=X_train, target=y_train)

# TPE search over hp_space for 180 evaluations, recorded in `iteracoes`.
# NOTE(review): some hyperopt releases expect a numpy Generator for `rstate`
# instead of a RandomState — confirm against the installed version.
otimizacao = fmin(
    fn=objetivo,
    space=hp_space,
    algo=tpe.suggest,
    max_evals=180,
    trials=iteracoes,
    rstate=np.random.RandomState(42),
)

In [None]:
def extrai_space_eval(hp_space, trial):
    """Resolve one recorded trial back into concrete search-space values.

    Parameters
    ----------
    hp_space : the search space the trial was sampled from.
    trial : a trial record; ``trial['misc']['vals']`` maps each label to a
        list of sampled values.

    Returns
    -------
    Whatever ``space_eval`` returns for the trial's assignment.
    """
    assignment = {}
    for label, values in trial['misc']['vals'].items():
        # Labels with an empty value list are skipped; otherwise the first
        # entry is taken as the label's value.
        if len(values) > 0:
            assignment[label] = values[0]

    return space_eval(space=hp_space, hp_assignment=assignment)

In [None]:
def desempacota_dicionario(dicionario):
    """Flatten a nested dict into a single-level dict.

    Keys whose value is itself a dict are dropped and replaced by that dict's
    (recursively flattened) items. When the same key appears more than once,
    the value encountered last wins.

    Parameters
    ----------
    dicionario : dict, possibly containing nested dicts as values.

    Returns
    -------
    dict with no dict-valued entries.
    """
    plano = {}
    for chave, valor in dicionario.items():
        if not isinstance(valor, dict):
            plano[chave] = valor
            continue
        # Merge the flattened children in place.
        plano.update(desempacota_dicionario(valor))

    return plano

In [None]:
# One row per evaluated trial: the flattened hyperparameters it used.
linhas_historico = []
for trial in iteracoes.trials:
    parametros_trial = extrai_space_eval(hp_space, trial)
    linhas_historico.append(desempacota_dicionario(parametros_trial))

historico = pd.DataFrame(linhas_historico)

# The objective returns the negated AUC as the loss, so flip the sign back.
historico['auc'] = [-resultado['loss'] for resultado in iteracoes.results]

In [None]:
# Translate fmin's result into the actual search-space values.
hiperparametros_selecionados = space_eval(space=hp_space, hp_assignment=otimizacao)
print('Selected hyperparameters:\n%s' % (hiperparametros_selecionados,))

In [None]:
import plotly.express as px

# Encode the boolean flag as 0/1 so it can be drawn as a numeric axis.
# Whole-column assignment always replaces the column (and its dtype), whereas
# writing through `.loc[:, col]` targets the existing boolean column and,
# depending on the pandas version, may not end up with an integer dtype.
historico['undersample'] = historico['undersample'].astype(int)

# Parallel-coordinates view of the search history, coloured by the trial AUC.
fig = px.parallel_coordinates(historico, color="auc", width = 1200)
fig.show()

In [None]:
# Recreate the sampler step chosen by the search for the final model.
if hiperparametros_selecionados['undersample']:
    # Seeded like the classifier and the data split so the final fit is
    # repeatable across runs.
    undersample = RandomUnderSampler(sampling_strategy='majority', random_state = 42)
else:
    undersample = None

In [None]:
# Hold-out split used for early stopping in the final fit.
eval_set = (X_val, y_val)

# Final model: the selected hyperparameters, refitted with a learning rate of
# 0.05 (the search itself used 0.1).
clf = LGBMClassifier(
    n_estimators=20000,
    learning_rate=0.05,
    random_state=42,
    # device="gpu",
    **hiperparametros_selecionados['clf'],
)

pipe = Pipeline(steps=[
    ('undersample', undersample),
    ('clf', clf),
])

In [None]:
%%time

# Fit arguments routed to the 'clf' step through the 'clf__' prefix.
final_fit_kwargs = {
    'clf__eval_set': eval_set,
    'clf__eval_metric': 'auc',  # logloss
    'clf__early_stopping_rounds': 200,
    'clf__verbose': False,
}

final_fit = pipe.fit(X_train, y_train, **final_fit_kwargs)

In [None]:
# Score the tuned pipeline on the hold-out split and print it next to the
# baseline AUC.
predictions = final_fit.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_true=y_val, y_score=predictions)

print(f'Baseline: {auc_baseline}', f'Tunning: {auc}', sep='\n')

In [None]:
# Fill the submission template with the tuned model's scores and write it out.
submission['target'] = final_fit.predict_proba(X_test)[:, 1]
submission.to_csv(path_or_buf='submission.csv', index=False)