<h1 div class='alert alert-success'><center> Tunning Hyperparameters LGBM
    

 </center></h1>

![](https://storage.googleapis.com/kaggle-competitions/kaggle/26480/logos/header.png?t=2021-04-09-00-57-05)

# <div class="alert alert-success">  1. INSTALAÇÕES </div>

In [None]:
!pip install --quiet optuna
!pip install --q GPUtil

## 1.1. Preparar ambiente para LGBM

In [None]:
# https://stackoverflow.com/questions/58707252/get-lightgbm-lgbm-run-with-gpu-on-google-colabratory
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
%cd LightGBM
!mkdir build
%cd build
!cmake ../../LightGBM
!make -j4

In [None]:
!git clone --recursive https://github.com/Microsoft/LightGBM.git
%cd LightGBM/python-package

In [None]:
!python3 setup.py install --gpu
!pip install cmake

In [None]:
! git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

# 1.1. IMPORTAÇÕES

## 0.1. Bibliotecas

In [1]:
import warnings
import os
import gc
import random
import glob
import optuna

In [2]:
import pandas               as pd
import numpy                as np
import matplotlib.pyplot    as plt 
import seaborn              as sns
import joblib               as jb

In [3]:
import torch
import torch.nn             as nn

In [4]:
import lightgbm             as lgbm

In [5]:
from sklearn.model_selection       import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing         import QuantileTransformer,  KBinsDiscretizer, StandardScaler
from sklearn.preprocessing         import RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn                       import metrics
from sklearn.feature_selection     import SelectKBest, SelectPercentile, f_classif

In [6]:
from optuna.samplers               import TPESampler
from optuna.visualization          import plot_edf
from optuna.visualization          import plot_optimization_history
from optuna.visualization          import plot_parallel_coordinate
from optuna.visualization          import plot_param_importances
from optuna.visualization          import plot_slice
from optuna.visualization          import plot_intermediate_values
from optuna.visualization          import plot_contour
from optuna.pruners                import MedianPruner

In [7]:
from GPUtil                        import showUtilization as gpu_usage
from numba                         import cuda
from sklearn.ensemble              import IsolationForest
from psutil                        import virtual_memory
from datetime                      import datetime

## 0.2. Funções

In [8]:
def jupyter_setting():
    
    %matplotlib inline
     
    pd.options.display.max_columns = None
    
    optuna.logging.set_verbosity(optuna.logging.WARNING)
      
    warnings.filterwarnings(action='ignore')
    warnings.simplefilter('ignore')
    warnings.filterwarnings('ignore')
    warnings.filterwarnings('ignore', category=DeprecationWarning)
    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=RuntimeWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    #pd.set_option('display.max_rows', 150)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.max_colwidth', None)

    icecream = ["#00008b", "#960018","#008b00", "#00468b", "#8b4500", "#582c00"]
    #sns.palplot(sns.color_palette(icecream))
    
    return icecream

icecream = jupyter_setting()

In [9]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [10]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.rcParams['font.size'] = 12
    plt.title('Precision Recall vs threshold')
    plt.xlabel('Threshold')
    plt.legend(loc="lower left")
    
    plt.grid(True)

In [11]:
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls[:-1], precisions[:-1], "b-", label="Precision")
    
    plt.rcParams['font.size'] = 12
    plt.title('Precision vs recall')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # plt.legend(loc="lower left")
    
    plt.grid(True)

In [12]:
def plot_roc_curve(fpr, tpr, label=None):
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, "r-", label=label)
    ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.rcParams['font.size'] = 12
    plt.title('ROC curve for TPS 09')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.legend(loc="lower right")
    plt.grid(True)

In [13]:
def graf_corr(df):
    
    df = df.corr().round(5)

    # Máscara para ocultar a parte superior direita do gráfico, pois é uma duplicata
    mask = np.zeros_like(df)
    mask[np.triu_indices_from(mask)] = True

    # Making a plot
    plt.figure(figsize=(16,16))
    ax = sns.heatmap(df, annot=True, mask=mask, cmap="RdBu", annot_kws={"weight": "bold", "fontsize":13})

    ax.set_title("Mapa de calor de correlação das variável", fontsize=17)

    plt.setp(ax.get_xticklabels(), 
             rotation      = 90, 
             ha            = "right",
             rotation_mode = "anchor", 
             weight        = "normal")

    plt.setp(ax.get_yticklabels(), 
             weight        = "normal",
             rotation_mode = "anchor", 
             rotation      = 0, 
             ha            = "right");

In [14]:
def correlation(dataset, threshold):

    col_corr    = set()  # Conjunto de todos os nomes de colunas correlacionadas
    corr_matrix = dataset.corr()
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) >= threshold: # estamos interessados no valor coeficiente absoluto
                colname = corr_matrix.columns[i]        # obtendo o nome da coluna
                col_corr.add(colname)
    
    return col_corr

In [15]:
def free_gpu_cache():
    
    # https://www.kaggle.com/getting-started/140636
    #print("Initial GPU Usage")
    #gpu_usage()                             

    #cuda.select_device(0)
    #cuda.close()
    #cuda.select_device(0)   
    gc.enable()
    gc.collect()
    torch.cuda.empty_cache()

## 0.3. GPU

### 0.3.1. Informações

In [16]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)

if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Nov 28 21:23:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.06       Driver Version: 510.06       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   65C    P8    N/A /  N/A |   1008MiB /  4096MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 0.3.2. Memória

In [17]:
ram_gb = virtual_memory().total / 1e9

print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 17.0 gigabytes of available RAM

Not using a high-RAM runtime


## 0.4. Carregar Dados

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [18]:
path   = '/content/drive/MyDrive/kaggle/Tabular Playground Series/2021/11 - Novembro/'
path   = ''
target = 'target'

In [19]:
df3_train     = jb.load(path + 'Data/pkl/df2_nb_02_train.pkl.z')
df3_test      = jb.load(path + 'Data/pkl/df2_nb_02_test.pkl.z')
df_submission = pd.read_csv(path + 'Data/sample_submission.csv')

df3_train.shape, df3_test.shape, df_submission.shape

((600000, 111), (540000, 110), (540000, 2))

In [20]:
df3_train.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99,target,fe_cluster_0,fe_cluster_1,fe_cluster_2,fe_cluster_3,fe_cluster_4,fe_mean,fe_median,fe_min,fe_max,fe_skew
0,0.106628,3.59375,132.75,3.183594,0.08197,1.188477,3.732422,2.265625,2.099609,0.012329,1.607422,-0.318115,0.560059,2.806641,1.351562,2.535156,0.19751,0.67627,1.990234,-3.841797,0.037384,0.230347,3.330078,0.009399,0.144775,3.050781,1.303711,0.033234,-0.01828,2.748047,-0.009293,-0.036285,-0.049866,0.019485,3.898438,11.289062,1.137695,3.367188,4.945312,-0.105774,2.113281,3.453125,0.789551,1.113281,1.491211,2.439453,0.041809,3.355469,0.05368,1.701172,0.908691,0.09491,0.030212,0.597168,4.445312,1.586914,-0.068665,-0.108276,0.061035,0.046112,0.017105,-0.027557,0.019485,-0.048828,0.050751,3.728516,5.015625,4.1875,0.063354,0.121033,1.37207,4.015625,0.167603,0.039764,2.042969,-0.016617,0.107666,3.507812,0.013657,-0.097046,5.394531,0.244507,3.492188,0.113098,-0.015472,4.207031,4.105469,0.037231,-0.118835,0.067078,0.010742,1.098633,0.013329,-0.011719,0.052765,0.06543,4.210938,1.978516,0.085999,0.240479,0,1,0,0,0,0,2.650391,0.242432,-3.841797,132.75,9.6875
1,0.125,1.673828,76.5625,3.378906,0.099426,5.09375,1.275391,-0.471436,4.546875,0.03772,0.331787,0.325195,0.062042,2.261719,4.339844,-0.224976,0.233643,3.380859,1.90332,0.067871,-0.05127,0.006134,2.603516,0.103455,0.067627,4.753906,1.855469,-0.181885,0.008362,3.166016,0.011848,0.022293,0.069336,0.117126,0.315186,24.484375,1.671875,-0.40918,4.953125,0.092346,2.603516,1.955078,0.005898,3.289062,2.564453,0.817871,0.026001,4.617188,1.575195,0.066101,0.681641,0.025253,0.183472,0.110046,2.746094,0.835449,0.188232,4.960938,0.136108,-0.008492,-0.015266,-0.010841,0.064575,0.102539,0.093628,0.963867,0.630371,4.308594,0.091309,-0.036346,3.617188,3.103516,0.000657,0.0513,1.924805,0.123291,-0.022675,1.547852,-0.010399,0.058319,3.662109,-0.118408,2.357422,-0.009109,0.178711,4.097656,3.533203,0.005245,0.121399,0.109985,0.135864,3.460938,0.017059,0.124878,0.154053,0.606934,-0.267822,2.578125,-0.020874,0.024719,0,1,0,0,0,0,2.136719,0.14502,-0.471436,76.5625,8.429688
2,0.036316,1.49707,233.5,2.195312,0.026917,3.126953,5.058594,3.849609,1.801758,0.057007,0.328613,2.96875,0.105225,2.070312,5.308594,1.354492,-0.261963,1.378906,1.480469,0.020538,-0.008804,0.109375,1.683594,0.038177,0.123718,1.112305,3.572266,0.120605,0.082092,2.234375,0.00227,0.045197,0.014404,0.011597,-0.50293,33.75,1.417969,1.071289,3.222656,2.121094,3.082031,0.637695,-0.006821,-0.390869,17.34375,3.701172,-0.0336,1.578125,0.051971,-0.002005,2.691406,0.018372,-0.030472,0.111389,2.1875,-0.324951,-0.019943,3.455078,0.068115,-0.009811,-0.010628,0.027573,-0.007122,-0.04892,-0.002575,1.865234,2.404297,0.411621,0.057739,0.525391,2.167969,0.828125,0.089844,0.09375,4.949219,-0.010979,0.07666,0.266846,0.038696,0.382812,3.847656,-0.12146,3.740234,0.147095,-0.016571,0.614746,2.125,0.078857,0.97998,0.026764,0.11731,4.882812,0.085205,0.03241,0.116089,-0.001689,-0.52002,2.140625,0.124451,0.148193,0,0,0,0,0,1,3.814453,0.124084,-0.52002,233.5,9.609375
3,-0.014076,0.245972,780.0,1.890625,0.006947,1.53125,2.697266,4.515625,4.503906,0.123474,1.00293,4.871094,0.058411,2.498047,1.238281,2.347656,0.175415,1.609375,2.029297,0.042084,0.005142,0.076477,1.651367,0.111816,0.121643,0.589355,4.238281,-0.032837,0.058167,0.712891,0.097473,0.072754,0.000324,0.063354,4.0625,25.375,0.57666,2.025391,2.96875,1.085938,1.710938,1.37207,0.034637,0.722656,71.4375,3.035156,0.092224,3.453125,0.04483,0.027191,4.082031,0.046967,0.063721,0.029221,0.671875,0.185303,0.164307,3.804688,0.062317,-0.021408,0.009468,0.110901,0.02684,2.931641,0.068115,-0.495117,1.345703,2.242188,0.035614,-0.139282,4.742188,3.292969,0.117859,0.065613,0.556641,-0.058044,0.070496,1.101562,0.068542,0.162964,4.070312,-0.008835,3.896484,0.913574,-0.163208,3.074219,4.355469,-0.048889,4.917969,0.069946,-0.01535,3.474609,-0.017105,-0.008102,0.062012,0.041199,0.511719,1.96875,0.040009,0.044861,0,0,0,1,0,0,9.859375,0.18042,-0.495117,780.0,9.867188
4,-0.00326,3.714844,156.125,2.148438,0.01828,2.097656,4.15625,-0.038239,3.371094,0.03418,0.711426,0.77002,0.057556,0.957031,3.710938,5.464844,0.287109,2.617188,1.383789,0.07489,-0.010544,0.109131,2.275391,0.008026,0.045227,4.359375,5.074219,-0.009377,0.528809,4.054688,0.020004,0.106812,0.0513,0.045929,3.402344,15.5625,1.635742,0.047028,4.019531,0.155762,5.289062,4.117188,0.072144,2.751953,3.171875,0.693359,-0.105835,3.320312,0.090698,0.112915,4.621094,0.126831,0.1427,0.055725,4.707031,-0.055115,0.523926,2.972656,0.115356,0.125244,0.067444,0.075562,0.032104,-0.042297,0.047974,-0.294189,5.066406,1.049805,0.034027,0.024612,3.125,2.263672,0.082458,-0.0233,5.617188,0.086243,0.157593,3.726562,0.061249,0.086609,0.607422,1.411133,2.060547,-0.023148,0.011238,2.15625,0.914551,0.044525,0.375732,0.134399,0.013779,1.910156,-0.042938,0.105591,0.125122,0.037506,1.043945,1.075195,-0.012817,0.072815,1,1,0,0,0,0,2.949219,0.14917,-0.294189,156.125,9.710938


In [21]:
df3_train.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99,target,fe_cluster_0,fe_cluster_1,fe_cluster_2,fe_cluster_3,fe_cluster_4,fe_mean,fe_median,fe_min,fe_max,fe_skew
0,0.106628,3.59375,132.75,3.183594,0.08197,1.188477,3.732422,2.265625,2.099609,0.012329,1.607422,-0.318115,0.560059,2.806641,1.351562,2.535156,0.19751,0.67627,1.990234,-3.841797,0.037384,0.230347,3.330078,0.009399,0.144775,3.050781,1.303711,0.033234,-0.01828,2.748047,-0.009293,-0.036285,-0.049866,0.019485,3.898438,11.289062,1.137695,3.367188,4.945312,-0.105774,2.113281,3.453125,0.789551,1.113281,1.491211,2.439453,0.041809,3.355469,0.05368,1.701172,0.908691,0.09491,0.030212,0.597168,4.445312,1.586914,-0.068665,-0.108276,0.061035,0.046112,0.017105,-0.027557,0.019485,-0.048828,0.050751,3.728516,5.015625,4.1875,0.063354,0.121033,1.37207,4.015625,0.167603,0.039764,2.042969,-0.016617,0.107666,3.507812,0.013657,-0.097046,5.394531,0.244507,3.492188,0.113098,-0.015472,4.207031,4.105469,0.037231,-0.118835,0.067078,0.010742,1.098633,0.013329,-0.011719,0.052765,0.06543,4.210938,1.978516,0.085999,0.240479,0,1,0,0,0,0,2.650391,0.242432,-3.841797,132.75,9.6875
1,0.125,1.673828,76.5625,3.378906,0.099426,5.09375,1.275391,-0.471436,4.546875,0.03772,0.331787,0.325195,0.062042,2.261719,4.339844,-0.224976,0.233643,3.380859,1.90332,0.067871,-0.05127,0.006134,2.603516,0.103455,0.067627,4.753906,1.855469,-0.181885,0.008362,3.166016,0.011848,0.022293,0.069336,0.117126,0.315186,24.484375,1.671875,-0.40918,4.953125,0.092346,2.603516,1.955078,0.005898,3.289062,2.564453,0.817871,0.026001,4.617188,1.575195,0.066101,0.681641,0.025253,0.183472,0.110046,2.746094,0.835449,0.188232,4.960938,0.136108,-0.008492,-0.015266,-0.010841,0.064575,0.102539,0.093628,0.963867,0.630371,4.308594,0.091309,-0.036346,3.617188,3.103516,0.000657,0.0513,1.924805,0.123291,-0.022675,1.547852,-0.010399,0.058319,3.662109,-0.118408,2.357422,-0.009109,0.178711,4.097656,3.533203,0.005245,0.121399,0.109985,0.135864,3.460938,0.017059,0.124878,0.154053,0.606934,-0.267822,2.578125,-0.020874,0.024719,0,1,0,0,0,0,2.136719,0.14502,-0.471436,76.5625,8.429688
2,0.036316,1.49707,233.5,2.195312,0.026917,3.126953,5.058594,3.849609,1.801758,0.057007,0.328613,2.96875,0.105225,2.070312,5.308594,1.354492,-0.261963,1.378906,1.480469,0.020538,-0.008804,0.109375,1.683594,0.038177,0.123718,1.112305,3.572266,0.120605,0.082092,2.234375,0.00227,0.045197,0.014404,0.011597,-0.50293,33.75,1.417969,1.071289,3.222656,2.121094,3.082031,0.637695,-0.006821,-0.390869,17.34375,3.701172,-0.0336,1.578125,0.051971,-0.002005,2.691406,0.018372,-0.030472,0.111389,2.1875,-0.324951,-0.019943,3.455078,0.068115,-0.009811,-0.010628,0.027573,-0.007122,-0.04892,-0.002575,1.865234,2.404297,0.411621,0.057739,0.525391,2.167969,0.828125,0.089844,0.09375,4.949219,-0.010979,0.07666,0.266846,0.038696,0.382812,3.847656,-0.12146,3.740234,0.147095,-0.016571,0.614746,2.125,0.078857,0.97998,0.026764,0.11731,4.882812,0.085205,0.03241,0.116089,-0.001689,-0.52002,2.140625,0.124451,0.148193,0,0,0,0,0,1,3.814453,0.124084,-0.52002,233.5,9.609375
3,-0.014076,0.245972,780.0,1.890625,0.006947,1.53125,2.697266,4.515625,4.503906,0.123474,1.00293,4.871094,0.058411,2.498047,1.238281,2.347656,0.175415,1.609375,2.029297,0.042084,0.005142,0.076477,1.651367,0.111816,0.121643,0.589355,4.238281,-0.032837,0.058167,0.712891,0.097473,0.072754,0.000324,0.063354,4.0625,25.375,0.57666,2.025391,2.96875,1.085938,1.710938,1.37207,0.034637,0.722656,71.4375,3.035156,0.092224,3.453125,0.04483,0.027191,4.082031,0.046967,0.063721,0.029221,0.671875,0.185303,0.164307,3.804688,0.062317,-0.021408,0.009468,0.110901,0.02684,2.931641,0.068115,-0.495117,1.345703,2.242188,0.035614,-0.139282,4.742188,3.292969,0.117859,0.065613,0.556641,-0.058044,0.070496,1.101562,0.068542,0.162964,4.070312,-0.008835,3.896484,0.913574,-0.163208,3.074219,4.355469,-0.048889,4.917969,0.069946,-0.01535,3.474609,-0.017105,-0.008102,0.062012,0.041199,0.511719,1.96875,0.040009,0.044861,0,0,0,1,0,0,9.859375,0.18042,-0.495117,780.0,9.867188
4,-0.00326,3.714844,156.125,2.148438,0.01828,2.097656,4.15625,-0.038239,3.371094,0.03418,0.711426,0.77002,0.057556,0.957031,3.710938,5.464844,0.287109,2.617188,1.383789,0.07489,-0.010544,0.109131,2.275391,0.008026,0.045227,4.359375,5.074219,-0.009377,0.528809,4.054688,0.020004,0.106812,0.0513,0.045929,3.402344,15.5625,1.635742,0.047028,4.019531,0.155762,5.289062,4.117188,0.072144,2.751953,3.171875,0.693359,-0.105835,3.320312,0.090698,0.112915,4.621094,0.126831,0.1427,0.055725,4.707031,-0.055115,0.523926,2.972656,0.115356,0.125244,0.067444,0.075562,0.032104,-0.042297,0.047974,-0.294189,5.066406,1.049805,0.034027,0.024612,3.125,2.263672,0.082458,-0.0233,5.617188,0.086243,0.157593,3.726562,0.061249,0.086609,0.607422,1.411133,2.060547,-0.023148,0.011238,2.15625,0.914551,0.044525,0.375732,0.134399,0.013779,1.910156,-0.042938,0.105591,0.125122,0.037506,1.043945,1.075195,-0.012817,0.072815,1,1,0,0,0,0,2.949219,0.14917,-0.294189,156.125,9.710938


# <div class="alert alert-success"> 1.  TUNNING </div>

## 1.0. Split Train/Test

In [22]:
X      = df3_train.drop([target], axis=1)    
y      = df3_train[target].copy()
X_test = df3_test

X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size    = 0.2,
                                                      shuffle      = True, 
                                                      stratify     = y,
                                                      random_state = 0)

del df3_train , df3_test

free_gpu_cache() 

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape 

((480000, 110), (480000,), (120000, 110), (120000,))

## 1.1. Classe Tunning 

In [28]:
class TunningModels(nn.Module):
    
    from sklearn.linear_model import RidgeClassifier
    
    def __init__(self, name_model_, X_trn_, y_trn_, X_ts_, feature_=None, 
                 seed_=12359, scaler_=StandardScaler(), feature_bin_=None, 
                 target_='target', path_='', level_='1', save_predict_=True):
        
        super(TunningModels,self).__init__() 

        self.name_clf     = name_model
        self.X_trn        = X_trn_
        self.y_trn        = y_trn_
        self.X_ts         = X_ts_         
        self.feature      = feature_
        self.seed         = seed_
        self.scaler       = scaler_
        self.feature_bin  = feature_bin_ 
        self.target       = target_
        self.path         = path_
        self.level        = level_
        self.save_predict = save_predict_

    def recover_prediction_first_level():
        
        preds_train1 = glob.glob("model/train/*.pkl.z")
        preds_test   = glob.glob("model/test/*.pkl.z")
        preds_val1   = glob.glob("model/valid/*.pkl.z")

        df_train1     = []
        scores_traint = dict()

        for p_name in preds_train1:    
            p    = jb.load(p_name)
            p_df = pd.DataFrame(p, columns=[p_name.replace('model/train\\', '')])    
            df_train1.append(p_df)    
            scores_traint[p_name] = f1_score(y_train1, (p_df>.5))

        df_val1     = [] 
        scores_val1 = dict()
        for p_name in preds_val1:    
            p    = jb.load(p_name)
            p_df = pd.DataFrame(p, columns=[p_name.replace('model/valid\\', '')])    
            df_val1.append(p_df)    
            scores_val1[p_name] = f1_score(y_val1, (p_df>.5))

        df_test     = [] 
        scores_test = dict()
        for p_name in preds_test:    
            p         = jb.load(p_name)
            p_df_test = pd.DataFrame(p, columns=[p_name.replace('model/test\\', '')])    
            df_test.append(p_df_test)

        df_train1 = pd.concat(df_train1, axis=1)
        df_val1   = pd.concat(df_val1, axis=1)
        df_test   = pd.concat(df_test, axis=1)

        return df_train1, df_val1, df_test.shape
        
    def delete_files(namefile):

        path = ['model/train', 'model/test', 'model/valid', 'model/params', 'model/score',
                'model/test_f', 'model/cv_model', 'model/preds', 'model/optuna', 
                'model/preds/train', 'model/preds/test', 'model/preds/test/n1', 
                'model/preds/test/n2', 'model/preds/test/n3', 'model/preds/train/n1', 
                'model/preds/train/n2', 'model/preds/train/n3','model/preds/param', 
                'Data/submission/tunning', 'Data/submission'
                
               ]

        for path_ in path:
            for raiz, diretorios, arquivos in os.walk(path_):
                for arquivo in arquivos:
                    if arquivo.startswith(namefile):
                        os.remove(os.path.join(raiz, arquivo))
 
    def logging_callback(study, frozen_trail):
        prev_best = study.user_attrs.get('prev_best', None)
        if prev_best != study.best_value:
            study.set_user_attr('prev_best', study.best_value)
            print(f"Trail {frozen_trail.number} finished with best value {frozen_trail.value}")

    def df_return_preds_tunning(model_name=None, level=1, target_='target', 
                                train_shape_row=0, test_shape_row=0): 
    
        if level==1: 
            level_ = 'n1'
        else: 
            if level==2:
                level_ = 'n2'
            else: 
                level_ = 'n3'
        
        paths = ['model/preds/test/n1', 'model/preds/train/' + level_ ]    

        if model_name==None: 
            model_name=''
            
        for i, path in enumerate(paths): 

            name_file_pkl     = glob.glob(path + '/'+ model_name + '*.pkl.z')
            dic_preds_mdl_pkl = dict()

            for p_name in name_file_pkl:    
                y_model_pkl_name_col  = p_name.replace(path + '\\', '').replace('.pkl.z','') 
                y_model_pkl           = jb.load(p_name)   

                if i==0:
                    if len(y_model_pkl)==test_shape_row:
                        dic_preds_mdl_pkl[y_model_pkl_name_col] = y_model_pkl
                        
                if i==1:
                    if len(y_model_pkl)==train_shape_row:                        
                        dic_preds_mdl_pkl[y_model_pkl_name_col] = y_model_pkl
                
                gc.collect()

            if i==0:         
                X_test_pred_nivel_1 = pd.DataFrame(dic_preds_mdl_pkl)
            else:
                X_train_pred_nivel_1 = pd.DataFrame(dic_preds_mdl_pkl)

            gc.collect()

        X_train_pred_nivel_1[target_] = y
        
        return X_train_pred_nivel_1, X_test_pred_nivel_1
    
    def feature_select(mdl, feature=[], best_score=0):
    
        best_feature = ''

        for col in df_train1.columns:

            if col not in feature:
                Xtr  = df_train1[feature+[col]].copy()
                Xval = df_val1[feature+[col]].copy()                

                mdl.fit(Xtr, y_train1)

                p = mdl.predict(Xval)
                c = f1_score(y_val1, p)

                if c > best_score:
                    best_score = c
                    best_feature = col 

        return best_score, best_feature

    def permutation_test(mdl, feature_selected):

        dist = []

        for seed in range(100):

            Xtr  = df_train1[feature_selected].copy()
            Xval = df_val1[feature_selected].copy()

            np.random.seed(seed)

            Xtr['random']  = np.random.permutation(Xtr.iloc[:, -1].values)
            Xval['random'] = np.random.permutation(Xval.iloc[:, -1].values)

            mdl.fit(Xtr, y_train1)

            p = mdl.predict(Xval)
            c = f1_score(y_val1, p)

            dist.append(c)

        dist = np.array(dist)

        return dist.max()

    def feature_selected_model(model = RidgeClassifier(alpha=1.) ):
   
        score_feature, best_feature =  TunningModels.feature_select(model)
        print('Score: {:2.4f} => Feature: {}'. format(score_feature*100 , best_feature))

        feature_selected = []
        feature_selected.append(best_feature)

        loop = True

        while loop:

            best_score = TunningModels.permutation_test(model, feature_selected) 
            best_score = best_score + 1e-4

            score_feature, best_feature = TunningModels.feature_select(model, feature=feature_selected, best_score=best_score)
            

            if score_feature <= best_score:  
                print('Fim')
                loop= False
            else: 
                feature_selected.append(best_feature)
                print('Score: {:2.4f} => Feature: {}'. format(score_feature*100 , best_feature))

        return feature_selected
    
    def model_of_diversity_feature_group(model, name_model_, X_, y_, X_ts_, scaler_=None, feature_bin_= None, 
                                         feature_imp_num=5, seed_=12359):

        TunningModels.delete_files(name_model_)
    
        mdl ,score , y_hat = TunningModels.cross_valid( model        = model, 
                                                        model_name_  = name_model_, 
                                                        X_           = X_, 
                                                        y_           = y_, 
                                                        X_test_      = X_ts_, 
                                                        type_model   = 1, 
                                                        feature      = None,
                                                        seed         = seed_, 
                                                        tunning      = 1, 
                                                        scaler       = scaler_,
                                                        print_result = False, 
                                                        feature_bin  = feature_bin_, 
                                                        save_predict = False,
                                                        n_splits     = 2
                                                        )
        
        df               = pd.DataFrame()
        df["feature"]    = X_.columns.to_list()
        df["importance"] = mdl.feature_importances_
        
        df.sort_values("importance", axis=0, ascending=False, inplace=True)

        feature_import = df[:feature_imp_num]['feature'].to_list()
        
        print(feature_import)
        print()

        for feature_imp in  feature_import:

            score_                =  0.09
            feature_best          = []
            feature               = X_ts_.columns            
            feature               = [s for s in feature if s not in feature_import]
            feature_number        = len(feature)
            feature_select_number = np.round(np.sqrt(len(feature)))
            feature_number_sample = int(np.round((feature_number/feature_select_number)))
            feature_sample        = []

            print('='*60)
            print(' Divercidade de Grupos de Features => ({})'.format(feature_imp))
            print('='*60)

            for i in  range(0,5):

                feature            = [s for s in feature if s not in feature_sample]
                feature_sample     = pd.Series(feature).sample(feature_number_sample).to_list() 
                name_model_xgb_div = name_model_ + '_' + str(i+1)   

                feature_sample.append(feature_imp)

                feature_sample_bin = []

                for x in feature_sample: 
                    if x in feature_bin_: 
                        feature_sample_bin.append(x)

                if len(feature_sample_bin)==0:
                    feature_sample_bin = None
                
                mdl ,score , y_hat = TunningModels.cross_valid( model        = model, 
                                                                model_name_  = name_model_xgb_div, 
                                                                X_           = X_, 
                                                                y_           = y_, 
                                                                X_test_      = X_ts_, 
                                                                type_model   = 2, 
                                                                feature      = feature_sample,
                                                                seed         = seed_, 
                                                                tunning      = 1, 
                                                                scaler       = scaler_,
                                                                print_result = False, 
                                                                feature_bin  = feature_sample_bin, 
                                                                save_predict = True,
                                                                n_splits     = 3
                                                                )
                
                if score >=.6:
                    create = '*'
                else: 
                    create = ' '
            
                feature_best.append(feature)
                print('Score: {:2.5f} => {} Gr.Feature: {} {}'.format(score, create, i+1, feature_sample))

            print('')
            
        print('')
        print('FIM')
        print('')
        
    def model_of_diversity_feature_one_(model, name_model,  X_, y_, X_test_,  scaler_=None, feature_bin_=None, seed_=12359):

        score_       =  0.09
        feature_best = []

        print('')
        print('Feature apenas uma')
        print('-'*20)
        print()

        TunningModels.delete_files(name_model)

        for feature in X_train.columns:

            name_model_xgb_div = name_model + feature 

            mdl ,score , y_hat = TunningModels.cross_valid(model       = model, 
                                              model_name_  = name_model_xgb_div, 
                                              X_           = X_, 
                                              y_           = y_, 
                                              X_test_      = X_test_, 
                                              type_model   = 1, 
                                              feature      = feature,
                                              seed         = seed_, 
                                              tunning      = 1, 
                                              scaler       = scaler_,
                                              print_result = False, 
                                              feature_bin  = feature_bin_, 
                                              save_predict = True,
                                              n_splits     = 3
                                              )
    
            if score >.6:
                create = '*'
            else: 
                create = ' '
                
            if score > score_:
                score_ = np.abs(score)
                feature_best.append(feature)
                print('F1-score: {:2.5f} => {} feature: {}'.format(score, create, feature ))        

        print('')
        print('Feature dupla')
        print('-'*20)

        for feature in feature_best:

            for feature_ in feature_best:
                if feature != feature_:            
                    name_model_xgb_div = name_model + feature + '_' + feature_     
                            
                    mdl ,score , y_hat = TunningModels.cross_valid( model       = model, 
                                                                    model_name_  = name_model_xgb_div, 
                                                                    X_           = X_, 
                                                                    y_           = y_, 
                                                                    X_test_      = X_test_, 
                                                                    type_model   = 1, 
                                                                    feature      = [feature, feature_],
                                                                    seed         = seed_, 
                                                                    tunning      = 1, 
                                                                    scaler       = scaler_,
                                                                    print_result = False, 
                                                                    feature_bin  = feature_bin_, 
                                                                    save_predict = True,
                                                                    n_splits     = 3
                                                                    )
            
                    if score >.59:
                        create = '*'
                    else: 
                        create = ' '

                    print('F1-score: {:.4f} => {} feature: {} | {}'.format(score*100, create,  feature, feature_ )) 

        print('')
        print('FIM')
        print('')
         
    def save_data_model(model_, model_name_, path_, y_pred_train_prob_, y_pred_test_prob_,
                     score_, seed_, level_='1', target_='target'):
        
        level_ = 'n'+ level_ + '/'

        if score_>.6:          

            path_name_param = path_ + 'model/preds/param/' + model_name_.format(score_, seed_)
            path_name_train = path_ + 'model/preds/train/' + level_ + model_name_.format(score_, seed_)
            path_name_test  = path_ + 'model/preds/test/'  + level_ + model_name_.format(score_, seed_)    
            path_name_model = path_ + 'model/mdl/'         + model_name_.format(score_, seed_)    

            jb.dump(y_pred_train_prob_, path_name_train)
            jb.dump(y_pred_test_prob_, path_name_test)
            jb.dump(model_, path_name_model)
            jb.dump(pd.DataFrame([model_.get_params()]), path_name_param)   

            if score_>.6:
                # Gerar o arquivo de submissão 
                df_submission[target_] = y_pred_test_prob_
                name_file_sub =  path_ + 'Data/submission/tunning/' + model_name_.format(score_, seed_) + '.csv'
                df_submission.to_csv(name_file_sub, index = False)
                
    def diff(t_a, t_b):
        from dateutil.relativedelta import relativedelta
        t_diff = relativedelta(t_b, t_a)  # later/end time comes first!
        return '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds)
        
    def feature_scaler(df_, scaler_=None, feature_bin_=None):
    
        if scaler_!=None: 
            
            if feature_bin_!=None:
                disc = KBinsDiscretizer(n_bins=50, encode='ordinal', strategy='uniform')
                df_[feature_bin_] = disc.fit_transform(df_[feature_bin_])

            df_ = pd.DataFrame(scaler_.fit_transform(df_), columns=df_.columns)
    
        return df_

    def cross_valid(model_, model_name_, X_train_, y_train_, X_test_, fold_=5, target_='target', 
                    path_='', level_='1', save_predict_=True, print_result_=True, seed_=12359, 
                    feature_=None, feature_bin_=None, scaler_=StandardScaler(), threshold=.5):
        
        if feature_!=None: 
            X_train_ = X_train_[feature_]
            X_test_  = X_test_[feature_]

        #--------------------------------------------------------  
        # Scorpo de variáveis
        #--------------------------------------------------------

        time_pred_start    = datetime.now()
        preds_valid_f      = {}
        preds_test         = []
        total_auc          = []
        f_scores           = []
        auc_mean           = []
        f1_mean            = []
        lloss_mean         = []
        preds_test_prob    = 0    
        df_score_history   = pd.DataFrame()
        df_train_pred_fold = pd.DataFrame()
        random             = str(np.random.rand(1)[0]).replace('.','')
        model_name_        = model_name_ + '_score_{:2.5f}_{}_' + random + '.pkl.z'
        clf_name           = model_.__class__.__name__
        pri_result         = 92
        learning_rate      = model_.learning_rate 
        feature_imp_values = np.zeros(X_train.shape[1])
        out_of_fold        = np.zeros(X_train.shape[0]) 
        
        # Lists for recording validation and training scores
        valid_scores = []
        train_scores = []

        #--------------------------------------------------------  
        # Início do process de varilidação
        #--------------------------------------------------------
        have_observation=''

        #if dropout_>0: 
        #    is_dropout='*'

        if print_result_:
            num_parallel_tree = 1 #model_.get_params()['num_parallel_tree']
            learning_rate     = model_.learning_rate
            n_estimators      = model_.n_estimators * num_parallel_tree  
            max_depth         = model_.max_depth 
            msg               = 'Training model: {} - seed {} - n_estimators: {} - max_depth: {} {:2.5f}'

            print('='*pri_result)            
            print(msg.format(clf_name, seed_, n_estimators, max_depth, learning_rate))
            print('='*pri_result)

        kf = StratifiedKFold(n_splits=fold_, random_state=seed_, shuffle=True)

        for fold,(idx_train, idx_val) in enumerate(kf.split(X_train_, y_train_, groups=y_train_)):

            time_fold_start = datetime.now()

            #--------------------------------------------------------  
            # Seleção dos dados
            #--------------------------------------------------------
            X_trn, X_val = X_train_.iloc[idx_train], X_train_.iloc[idx_val]
            y_trn, y_val = y_train_.iloc[idx_train], y_train_.iloc[idx_val]
            index_valid  = X_val.index.tolist() 

            #--------------------------------------------------------  
            # Processamento
            #--------------------------------------------------------        
            X_trn = TunningModels.feature_scaler(X_trn, scaler_, feature_bin_) 
            X_val = TunningModels.feature_scaler(X_val, scaler_, feature_bin_) 

            #--------------------------------------------------------  
            # Modelo
            #--------------------------------------------------------
            #model = model_.fit(X_trn, y_trn, )

            eval_set     = [(X_trn, y_trn), (X_val, y_val)]   
            model = model_.fit(X_trn, y_trn, 
                               eval_set              = eval_set,
                               eval_names            = ['valid', 'train'],
                               early_stopping_rounds = int(n_estimators * .1), 
                               verbose               = False)
            
            best_iteration      = model.best_iteration_
            feature_imp_values += model.feature_importances_ / fold_

            #--------------------------------------------------------  
            # oof
            #--------------------------------------------------------
            preds_valid_proba  = model.predict_proba(X_val, num_iteration = best_iteration)[:, 1]
            y_pred_valid       = (preds_valid_proba>.5).astype(int)

            #--------------------------------------------------------  
            # Obtenha os valores médios de cada fold para a previsão
            #--------------------------------------------------------        
            preds_test_prob += model.predict_proba(X_test_, num_iteration = best_iteration)[:, 1] / fold_

            #--------------------------------------------------------  
            # Métricas 
            #--------------------------------------------------------
            auc   = metrics.roc_auc_score(y_val, y_pred_valid)
            f1    = metrics.f1_score(y_val, y_pred_valid)
            lloss = metrics.log_loss(y_val, preds_valid_proba) 

            #--------------------------------------------------------  
            # Concatenar validação e predição
            #--------------------------------------------------------        
            df_val_pred_fold = pd.DataFrame({'fold'       : fold+1,
                                            'index'       : index_valid, 
                                            'auc'         : auc, 
                                            'f1'          : f1,
                                            'lloss'       : lloss,
                                            'pred_val'    : preds_valid_proba, 
                                            'train_score' : model.best_score_['train']['auc'], 
                                            'valid_score' : model.best_score_['valid']['auc'],
                                            'target'      : y_val})
            
            df_train_pred_fold = pd.concat([df_train_pred_fold, df_val_pred_fold], axis=0)

            auc_mean.append(auc)   
            f1_mean.append(f1)    
            lloss_mean.append(lloss) 

            #--------------------------------------------------------  
            # Print resultado Fold
            #--------------------------------------------------------
            if print_result_:
                msg = 'Fold: {} - AUC: {:2.5f} - F1-score: {:2.5f} - L.Loss: {:2.5f} - {}'
                time_fold_start_end = TunningModels.diff(time_fold_start, datetime.now())
                print(msg.format(fold+1, auc, f1, lloss, time_fold_start_end))

            free_gpu_cache() 

        del X_trn, y_trn, X_val, y_val

        df_train_pred_fold.sort_values("index", axis=0, ascending=True, inplace=True)

        #--------------------------------------------------------  
        # Salvar predição em disco
        #--------------------------------------------------------
        X_train_prob      = df_train_pred_fold['pred_val'].to_list()
        score             = np.mean(auc_mean)
        y_pred_test_prob_ = preds_test_prob 

        if save_predict_:
            TunningModels.save_data_model(model_             = model_, 
                                            model_name_        = model_name_, 
                                            path_              = path_, 
                                            y_pred_train_prob_ = X_train_prob, 
                                            y_pred_test_prob_  = y_pred_test_prob_, 
                                            score_             = score, 
                                            seed_              = seed_, 
                                            level_             = level_, 
                                            target_            = target_
                                            )  

        #--------------------------------------------------------  
        # Print média dos Folds
        #--------------------------------------------------------
        time_pred_end = TunningModels.diff(time_pred_start, datetime.now())

        if print_result_:
            msg = '[Mean Fold]  AUC: {:.5f}(Std:{:.5f}) - F1: {:.5f} - L. Loss: {:.5f}  {}'        
            print('-'*pri_result)            
            print(msg.format(np.mean(auc_mean),np.std(auc_mean) , np.mean(f1_mean), np.mean(lloss_mean), time_pred_end))
            print('='*pri_result)
            print()

        free_gpu_cache() 

        return model, score, y_pred_test_prob_, df_train_pred_fold
        
    def lgbm(self, trial):
        
        # https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258
        # https://buildmedia.readthedocs.org/media/pdf/optuna/stable/optuna.pdf
        # https://medium.com/@am.sharma/lgbm-on-colab-with-gpu-c1c09e83f2af
        params = {'objective'         : trial.suggest_categorical('objective', ['binary']),     
                  'metric'            : trial.suggest_categorical('metric', ['auc']),                   
                  'boosting_type'     : trial.suggest_categorical('boosting_type', ['gbdt']),  
                  'importance_type'   : trial.suggest_categorical('importance_type', ['gain']),  
                  'class_weight'      : trial.suggest_categorical('class_weight', ['balanced']),                   
                  'learning_rate'     : trial.suggest_float('learning_rate', 0.0095, 0.11),               
                  'max_depth'         : trial.suggest_int('max_depth', 2, 8),
                  'n_estimators'      : trial.suggest_int('n_estimators', 100, 4000),
                  'min_child_samples' : trial.suggest_int('min_child_samples', 180, 250),
                  'extra_trees'       : trial.suggest_categorical('extra_trees', ['True']),  
                  'extra_seed'        : trial.suggest_int('extra_seed', self.seed, self.seed),
                  'max_delta_step'    : trial.suggest_float('max_delta_step', .75, .89), 
                  'reg_lambda'        : trial.suggest_float('reg_lambda', .95, 1.05), 
                  'subsample'         : trial.suggest_float('subsample', .59, .95),
                  'seed'              : trial.suggest_int('random_state', self.seed, self.seed),                  
                  'verbosity'         : trial.suggest_int('verbosity', -1, -1),
                  'n_jobs'            : trial.suggest_int('n_jobs', -1, -1),
                }
    
        if torch.cuda.is_available():       
            params.update({'device': trial.suggest_categorical('device', ['gpu'])})
                      
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'auc', valid_name='valid_1')
       
        mdl = lgbm.LGBMClassifier(**params) #, callbacks=[pruning_callback])
                
        _, score, _, _  = TunningModels.cross_valid(model_         = mdl, 
                                                    model_name_    = self.name_clf, 
                                                    X_train_       = self.X_trn, 
                                                    y_train_       = self.y_trn, 
                                                    X_test_        = self.X_ts,            
                                                    target_        = self.target,
                                                    path_          = self.path,
                                                    level_         = self.level,            
                                                    seed_          = self.seed,
                                                    feature_       = self.feature, 
                                                    feature_bin_   = self.feature_bin,
                                                    scaler_        = self.scaler, 
                                                    save_predict_  = self.save_predict
                                                    )
        
        print('param = {}'.format(params))
        print()

        return score

## 1.2. LGBM

In [29]:
feature_int      = X_test.filter(regex=r'f[0-9]').columns.to_list()
feature_cluster  = X_test.filter(regex=r'fe_clu').columns.to_list()
feature_static   = X_test.filter(regex=r'fe_[m-s]').columns.to_list()
feature_         = X[feature_int + ['fe_cluster_1', 'fe_cluster_3']].columns.to_list()

In [30]:
%%time 

SEED   = 12359  
params = {'objective'         : 'binary',     
          'metric'            : 'auc',                   
          'boosting_type'     : 'gbdt',  
          'importance_type'   : 'gain',  
          'class_weight'      : 'balanced',                   
          'learning_rate'     : 0.01,        # [.1, .11]       
          'max_depth'         : 4,           # [4, 9]
          'n_estimators'      : 1200,        # [1000, 5000 ]
          'min_child_samples' : 200,         # [180, 250]
          'extra_trees'       : True,
          'extra_seed'        : SEED,
          'max_delta_step'    : .8,          # [.75, 89]
          'reg_lambda'        : 1.0,         # [.95, 1.04]                   
          'subsample'         : .6,          # [.59, 95]
          'device'            : 'gpu',  
          'verbosity'         : -1, 
          'seed'              : SEED
          }

scalers = [#None, 
           StandardScaler(), 
           #RobustScaler(), 
           #MinMaxScaler(), 
           #MaxAbsScaler(), 
           #QuantileTransformer(output_distribution='normal', random_state=SEED)
           ]

score_best     = 0 
name_model_clf = 'lgbm_' 
name_model     = name_model_clf + '001_scaler_' 

TunningModels.delete_files(name_model)

for scaler in scalers: 
    
    print(scaler)
     
    X_test_scaler = TunningModels.feature_scaler(df_          = X_test.copy(), 
                                                 scaler_      = scaler, 
                                                 feature_bin_ = None)
        
    name_mdl            = name_model + str(scaler).lower()[:4]
    model, score, yp,df = TunningModels.cross_valid(model_         = lgbm.LGBMClassifier(**params), 
                                                    model_name_    = name_mdl, 
                                                    X_train_       = X.head(3000), 
                                                    y_train_       = y.head(3000), 
                                                    X_test_        = X_test_scaler, 
                                                    fold_          = 5, 
                                                    target_        = 'target',
                                                    path_          = path,
                                                    level_         = '1',
                                                    save_predict_  = False, 
                                                    print_result_  = True,
                                                    seed_          = SEED,
                                                    feature_       = feature_, 
                                                    feature_bin_   = None, 
                                                    scaler_        = scaler, 
                                                    threshold      =.5                                                      
                                                    )
    
    if score > score_best: 
        score_best  = score
        scaler_best = scaler
       
print()
print('Scaler best: {}'.format(scaler_best))
print('Score      : {:2.5f}'.format(score_best))
print()

StandardScaler()
Training model: LGBMClassifier - seed 12359 - n_estimators: 1200 - max_depth: 4 0.01000


ValueError: operands could not be broadcast together with shapes (110,) (102,) (110,) 

### 1.2.1. Tunning 
Nesta etapa de modelagem, vamos criar 20 modelos e salvá-los para a nossa `Stacking`.

In [None]:
%%time
name_model = name_model_clf + '002_tunning_' 
n_trials_  = 20

scaler_best = StandardScaler() 

TunningModels.delete_files(name_model)
    
modelOpt = TunningModels(name_model_     = name_model, 
                         X_trn_          = X, 
                         y_trn_          = y, 
                         X_ts_           = X_test_scaler,                                     
                         feature_        = feature_,  
                         scaler_         = scaler_best, 
                         seed_           = SEED, 
                         feature_bin_    = None, 
                         target_         = 'target', 
                         path_           = path, 
                         level_          = '1', 
                         save_predict_   = True)

pruner = MedianPruner(n_startup_trials = 5,
                      n_warmup_steps   = 0,
                      interval_steps   = 1,
                      n_min_trials     = 5,
                      )

study = optuna.create_study(direction = 'maximize',
                            sampler   = optuna.samplers.TPESampler(seed=SEED),
                            pruner    = optuna.pruners.MedianPruner(n_warmup_steps=10),
                            #pruner    = pruner,
                            study_name= 'lgbm_tuning'
                            ) 

study.optimize(modelOpt.lgbm, n_trials=n_trials_)

score_seed = study.best_value 
params     = study.best_params 
path_name  = path + 'model/optuna/' + name_model + '_{:2.5f}.pkl.z'.format(score_seed) 
  
seed_best   = SEED
score_best  = score_seed 
params_best = params

print()
print('-'*110)
print('Best score: {:2.5f}'.format(scare_best))
print('Seed      : {}'.format(SEED))
print('Parameters:\n\n{}'.format(params_best))
print()

<div class="alert alert-info" role="alert">

**`NOTA:`** <br>
    
Com os melhores parametros gerei uma nova submissão e obtive a AUC de 0.68785.
    
</div>

### Análise 

In [None]:
###################################################################################################
# Plot functions
# --------------
# Visualize the optimization history. See :func:`~optuna.visualization.plot_optimization_history` for the details.
plot_optimization_history(study)

In [None]:
#plot_intermediate_values(study)
###################################################################################################
# Visualize the learning curves of the trials. See :func:`~optuna.visualization.plot_intermediate_values` for the details.
#plot_intermediate_values(study)

In [None]:
###################################################################################################
# Visualize high-dimensional parameter relationships. See :func:`~optuna.visualization.plot_parallel_coordinate` for the details.
plot_parallel_coordinate(study)

In [None]:
###################################################################################################
# Select parameters to visualize.
plot_parallel_coordinate(study, params=['max_depth', 'subsample','learning_rate', 'n_estimators' ])

In [None]:
###################################################################################################
# Visualize hyperparameter relationships. See :func:`~optuna.visualization.plot_contour` for the details.
#plot_contour(study)

In [None]:
###################################################################################################
# Select parameters to visualize.
plot_contour(study, params=[ 'max_depth', 'subsample','learning_rate', 'n_estimators'])

In [None]:
###################################################################################################
# Visualize individual hyperparameters as slice plot. See :func:`~optuna.visualization.plot_slice` for the details.
plot_slice(study)

In [None]:
###################################################################################################
# Select parameters to visualize.
plot_slice(study, params=['max_depth', 'subsample','learning_rate', 'n_estimators'])

In [None]:
###################################################################################################
# Visualize parameter importances. See :func:`~optuna.visualization.plot_param_importances` for the details.
#plot_param_importances(study)

In [None]:
###################################################################################################
# Learn which hyperparameters are affecting the trial duration with hyperparameter importance.
# optuna.visualization.plot_param_importances( study, target=lambda t: t.duration.total_seconds(), target_name="duration")

In [None]:
###################################################################################################
# Visualize empirical distribution function. See :func:`~optuna.visualization.plot_edf` for the details.
plot_edf(study)

### 1.2.3. Modelo Final
Agora que temos os melhores parametros, vamos treinar uma modelo com esse parametros e fazer algumas análises, para o treinamento vamos utilizar o dataset de treino (train)  e validar a performance do modelo em dados que não foram utlizados no treinamento e  vamos fazer uma pequena análise. 

In [None]:
name_model_clf = 'lgbm_'
scaler_best = StandardScaler()
SEED = 12359

X_test_scaler = TunningModels.feature_scaler(df_          = X_test.copy(), 
                                                 scaler_      = StandardScaler(), 
                                                 feature_bin_ = None)

In [None]:
params_best = param = {'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt', 'importance_type': 'gain', 
                       'class_weight': 'balanced', 'learning_rate': 0.1086844835393899, 'max_depth': 2, 
                       'n_estimators': 2260, 'min_child_samples': 222, 'extra_trees': 'True', 'extra_seed': 12359, 'max_delta_step': 0.8094967474479909, 
                       'reg_lambda': 1.0104114282718522, 'subsample': 0.6463258626367244, 'seed': 12359, 'verbosity': -1, 'n_jobs': -1, 'device': 'gpu'}



In [None]:
%%time

name_model = name_model_clf + '003_tun'

model, score, _, _  = TunningModels.cross_valid(model_         = lgbm.LGBMClassifier(**params_best), 
                                                model_name_    = name_model, 
                                                X_train_       = X_train, 
                                                y_train_       = y_train, 
                                                X_test_        = X_test_scaler, 
                                                fold_          = 5, 
                                                target_        = 'target',
                                                path_          = path,
                                                level_         = '1',
                                                save_predict_  = False, 
                                                print_result_  = True,
                                                seed_          = SEED,
                                                feature_       = feature_, 
                                                feature_bin_   = None, 
                                                scaler_        = scaler_best, 
                                                threshold      =.5                                                      
                                                )

<div class="alert alert-info" role="alert"> 
    
**`NOTA:`** <br>
Observando os dados acima do treinamento, a AUC está na média em relação ao processo de tunning que foi realizado na etapa anterior, vamos fazer a predição em dados que o modelo não viu no treinamento e continuar com a análise, primeito vamos transformar os dados de validação.
    
</div>

In [None]:
X_valid_scaler = TunningModels.feature_scaler(X_valid[feature_].copy(), scaler_best, None )
X_valid_scaler.shape

In [None]:
y_pred_proba = model.predict_proba (X_valid_scaler)[:,1]
y_pred_proba

#### 1.2.3.1. Analise do Modelo

#### 1.2.3.1.1. Curva Roc

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_valid, y_pred_proba)
plot_roc_curve(fpr, tpr, label="LGBM")
plt.show()

<div class="alert alert-info" role="alert"> 
    
**`NOTA:`** <br>
Observando o gráfico acima, podemos concluir que o melhor ponto de corte (threshold) fica em torno de .4 à .45, isto é, esse ponto de corte melhora o F1-score,  vamos fazer esse teste.  

</div>


- **SEM PONTO DE CORTE**

In [None]:
threshold = .5
y_pred = (y_pred_proba >threshold)

f1_002  = metrics.f1_score (y_valid, y_pred)
auc_002 = metrics.roc_auc_score(y_valid, y_pred)

print(metrics.classification_report(y_valid, y_pred))
print('')
print('F1-score: {:2.5f}'.format(f1_002))
print('AUC     : {:2.5f}'.format(auc_002))

- **COM PONTO DE CORTE**

In [None]:
threshold = .49
y_pred  = (y_pred_proba >threshold)
f1_002  = metrics.f1_score (y_valid, y_pred)
auc_002 = metrics.roc_auc_score(y_valid, y_pred)

print(metrics.classification_report(y_valid, y_pred))
print('')
print('F1-score: {:2.5f}'.format(f1_002))
print('AUC     : {:2.5f}'.format(auc_002))

<div class="alert alert-info" role="alert"> 
    
**`NOTA:`** <br>
Podemos observar acima, com um ponto de corte de .45 obtivemos um F1-score de 0.69162 em relação ao ponto de corte padrão de .5 que gerou um F1-score de 0.66127, uma observação importante que tenho que destacar é que o ponto de corte ideal depende muito do entendimento do negócio que estamos modelando.
 
</div>

In [None]:
plt.figure(figsize=(15,5))
ax = plt.subplot(1,2,1)

metrics.plot_confusion_matrix(model, 
                              X_valid_scaler, 
                              y_valid, 
                              cmap          = 'inferno', 
                              values_format = 'd', 
                              ax            = ax) #true’, ‘pred’, ‘all’

plt.title('Confusion matrix')

ax= plt.subplot(1,2,2)
metrics.plot_confusion_matrix(model, X_valid_scaler, y_valid, cmap='inferno', normalize='all', ax=ax) 
plt.title('Confusion matrix');

<div class="alert alert-info" role="alert"> 
    
**`NOTA:`** <br>
com as matrizes de confusão acima podemos ter uma melhor noção para dos número que o modelo consegue gerar na predição, podemos obeservar que o modelo de Random Forest para esse conjunto de dados tem uma taxa razoavel na predição, principalmente na predição de verdadeiro positivo com um percentual de erro de 23%, isto é, o modelo classifica erradamente falsos positivos com sendo verdadeiro posito e acerta 33%, em relação ao falso positivo tem um erro de 17% com certo de 28%.
    
</div>

### 1.2.4. Divercidade

#### 1.2.4.1. Feature Select

#### 1.2.4.1. SEED
Nesta etapa vamos utilizar os melhores parametros, que encontramos na tunagem acima, com `seed` diferentes. 

In [None]:
%%time 

SEED_       = [42, 59, 100, 200, 1000, 1500, 2020, 2021, 5000, 10000, 7000]
name_model  = name_model_clf + '005_div_seed' 
df_seed     = pd.DataFrame()
params_seed = params_best.copy()

TunningModels.delete_files(name_model)

for i, seed_ in  enumerate (SEED_):     
    
    params_seed.update({'random_state': seed_})
    
    _, score, y_hat, _ = TunningModels.cross_valid(model_         = lgbm.LGBMClassifier(**params_best), 
                                                   model_name_    = name_mdl,
                                                   X_train_       = X,
                                                   y_train_       = y, 
                                                   X_test_        = X_test_scaler, 
                                                   fold_          = 5, 
                                                   target_        = 'target',
                                                   path_          = path,
                                                   level_         = '1',
                                                   save_predict_  = True, 
                                                   print_result_  = True,
                                                   seed_          = seed_,
                                                   feature_       = feature_, 
                                                   feature_bin_   = None, 
                                                   scaler_        = scaler_best, 
                                                   threshold      =.5                                                      
                                                   )

    if score > score_best: 
        seed_best  = seed_
        score_best = score

    df_seed['seed_' + str(seed_)] = y_hat 
   
print('Seed best: {}'.format(seed_best))
print('Score    : {:2.5f}'.format(score_best))

In [None]:
 df_seed.head()

In [None]:
submission = pd.DataFrame({'id': df_submission.id, target: df_seed.mean(axis=1)})
submission.to_csv(path + 'Data/submission/lgbm_005_div_seed.csv', index=False)
# kaggle 0.74514

### 1.2.5. Ensable 

#### 1.2.5.1. Recuparar dataset
Vamos recuperar todas as previsões do LGBM para gerar um ensable. 

In [None]:
%%time 
df_train, df_test = TunningModels.df_return_preds_tunning(train_shape_row = X.iloc[X_sample_idx].shape[0], 
                                                          test_shape_row  = X_test.shape[0])
print(df_train.shape, df_test.shape)
print()

In [None]:
df_train.head()

In [None]:
y_pred      = df_train_rf['target']
df_train_rf = df_train_rf.filter(regex=r'_0.8' , axis=1)
df_test_rf  = df_test_rf.filter(regex=r'_0.8', axis=1)

df_train_rf['target'] = y_pred
df_train_rf.shape, df_test_rf.shape

In [None]:
df_train_rf.head()

#### 1.2.5.2. Descritiva

In [None]:
df_test_rf.mean(axis=1).describe()

#### 1.2.5.3. Gerar submission 
Vamos gerar uma submission com a media das previssões, para termos uma ideia de como estamos. 

In [None]:
y_hat_rf_mean = df_test_rf.mean(axis=1)
submission = pd.DataFrame({'id': df_submission.id, 'claim': y_hat_rf_mean })
submission.to_csv(path + 'Data/sumbmission/rf_003_feature_gr.csv', index=False) 
# score kaggle: 0.60423

In [None]:
y_hat_rf_mean

#### 1.2.5.4. Correlação

In [None]:
graf_corr(df_train)

Temos muitas previsões autocorrelacionadas, vamos fazer a exclusão de algumas.

In [None]:
corr_features = correlation(df_train, 0.75)
len(set(corr_features))

In [None]:
#df_train_lgbm.drop(labels=corr_features, axis=1, inplace=True)

graf_corr(df_train) 