# <b style='color:gold;'> LightGBM </b>
* Este Jupyter tem como função treinar um `LightGBM` com os melhores parâmetros determinados pelo `scikit-optimize`

---

Bibliotecas/Módulos

<details>    
<summary>
    <font size="3" color="magenta"><b>Install</b></font>
</summary>
<p>
<ul>
    <li> !pip install lightgbm</li>
    <li> !pip install scikit-optimize </li>

</ul>
</p>
</details>

In [2]:
%%capture

# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install lightgbm scikit-optimize

In [1]:
import logging
import numpy as np
import pandas as pd
import sys
from lightgbm import LGBMClassifier
# from sklearn.externals import joblib
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, matthews_corrcoef, classification_report, f1_score, confusion_matrix
from skopt import forest_minimize
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

Configurações dos logs

In [2]:
# Output side: a single stderr handler with the notebook's banner-style format.
formatter = logging.Formatter('%(name)s - %(levelname)s - [+] ------- %(message)s -------')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Logger side: DEBUG and above. Assigning the handler list (rather than calling
# addHandler) keeps exactly one handler even when this cell is re-run.
logger = logging.getLogger("SVM")
logger.setLevel(logging.DEBUG)
logger.handlers = [handler]

Cores

In [3]:
# ANSI escape sequences used to colour printed output.

# Bold foreground colours.
RED = "\033[1;31m"
GREEN = "\033[1;32m"
BLUE = "\033[1;34m"

# Bold text with attribute 45 (both names hold the same sequence).
PINK = "\033[1;45m"
MAG = "\033[1;45m"

# Bright foreground colours.
FAIL = "\033[91m"
OKGREEN = "\033[92m"
WARNING = "\033[93m"
OKBLUE = "\033[94m"
HEADER = "\033[95m"
OKCYAN = "\033[96m"

# Text attributes and reset.
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
ENDC = "\033[0m"

---

In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("dados_FEM_10_voluntarios_c_semanas.csv")

In [5]:
# Peek at one randomly chosen row.
df.sample()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1992,1993,1994,1995,1996,1997,1998,1999,label,semana
30647,0,1,0,-2,-3,-1,0,-1,-3,-2,...,0,-1,-2,-5,-4,-5,-5,-5,1,4


In [6]:
# Number of rows.
len(df)

38400

In [7]:
# Number of columns (still includes `label` and `semana` at this point).
len(df.columns)

2002

---

In [8]:
# Target vector: the `semana` column.
y = df["semana"]

In [9]:
# Remove the target column from the frame that feeds the PCA.
# NOTE: `df` is rebound here, so re-running this cell without reloading the CSV raises KeyError.
df = df.drop(columns=["semana"])

In [10]:
from sklearn.decomposition import PCA

In [11]:
# Keep the first 1000 principal components.
# random_state is fixed because, for an input of this size, PCA's default solver
# policy can select the randomized SVD, whose result otherwise changes between runs.
pca = PCA(n_components=1000, random_state=0)

In [12]:
# NOTE(review): the PCA is fitted on every row of `df`, i.e. before the train/test
# split done further down, so the held-out rows influence the projection.
# NOTE(review): only `semana` was dropped, so the `label` column is still among the
# PCA inputs — confirm that is intended.
pca.fit(df)

In [13]:
%%time

# Project every row of `df` onto the fitted principal components.
pca_samples = pca.transform(df)

CPU times: user 38.7 s, sys: 15.5 s, total: 54.2 s
Wall time: 4.54 s


In [14]:
# Wrap the projected array in a DataFrame (columns are the component indices) and preview it.
ps = pd.DataFrame(pca_samples)
ps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-627.251083,30.568203,16.833665,69.195151,12.280706,-94.428158,-123.645087,-20.408368,-42.381539,-5.982082,...,-35.306267,12.934203,-15.954227,27.322736,13.912132,-30.681279,-20.8853,-45.839703,-22.655295,31.701072
1,-629.120229,66.765675,30.657207,50.077044,-0.233826,-121.839505,-76.846392,-22.002857,-42.243671,-17.20599,...,-41.392834,7.39567,-3.973416,25.327145,14.767522,-38.536509,-21.293074,-40.867514,-25.117885,27.328072
2,-643.068394,92.164941,33.187273,16.964413,-0.983939,-154.585766,-34.821368,-23.286663,-36.066578,-37.748962,...,-46.023858,10.3785,10.937307,15.779199,22.172475,-26.632457,-1.585979,-37.204049,-15.152859,7.208884
3,-640.528456,112.703774,31.83808,2.859316,-7.447007,-173.481431,4.94617,-27.813234,-23.105233,-52.269841,...,-39.156055,9.66662,34.463056,11.133124,14.776412,-9.057023,8.878949,-31.782318,-4.172921,1.224581
4,-614.98094,109.588646,42.609763,-11.98713,-12.14519,-181.04146,30.963179,-33.286936,-12.361197,-72.572084,...,-22.934892,3.904587,44.722021,5.805603,3.629181,12.97028,11.071929,-22.326151,5.289277,-0.522228


In [15]:
# Row count of the projected frame.
len(ps)

38400

In [16]:
# Number of retained components.
len(ps.columns)

1000

In [17]:
# Feature matrix: the PCA-projected samples.
X = ps

In [18]:
# 80/20 hold-out split with a fixed seed; no `stratify` argument is passed.
x_treino, x_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.2, random_state=7)

---

### LGBM

Instanciando

In [None]:
%%time

# Baseline LightGBM: only the seed, class weighting and thread count are set,
# every other hyperparameter keeps its library default.
mdl_lgbm = LGBMClassifier(
    n_jobs=2,
    class_weight="balanced",
    random_state=0,
)
mdl_lgbm.fit(x_treino, y_treino)

Tuning do Lgbm

In [None]:
def tune_lgbm(params):
    """Objective for ``forest_minimize``: train one LightGBM candidate and score it.

    Parameters
    ----------
    params : sequence
        One point of the search space. The first six entries are, in order,
        ``learning_rate``, ``max_depth``, ``min_child_samples``, ``subsample``,
        ``colsample_bytree`` and ``n_estimators``. Any further entries (the
        search space also defines ``min_df`` and ``ngram_range``) are ignored,
        because nothing in the model consumes them.

    Returns
    -------
    float
        Negative accuracy on ``x_teste``/``y_teste``; negated because the
        optimizer minimizes its objective.

    Notes
    -----
    Reads ``x_treino``, ``y_treino``, ``x_teste`` and ``y_teste`` from the
    notebook namespace and prints the metrics of every candidate.
    NOTE(review): candidates are scored on the same hold-out split that the
    notebook later reports on, so the tuned score is optimistic — consider a
    separate validation split.
    """
    print(params)
    lr, max_depth, min_child_samples, subsample, colsample_bytree, n_estimators = params[:6]

    logger.info('Instanciando')
    # num_leaves is tied to max_depth so a tree can be fully grown to that depth.
    mdl_tune = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                              min_child_samples=min_child_samples, subsample=subsample,
                              colsample_bytree=colsample_bytree, bagging_freq=1,
                              n_estimators=n_estimators, random_state=0,
                              class_weight="balanced", n_jobs=2)

    logger.info('Treinando')
    mdl_tune.fit(x_treino, y_treino)

    logger.info('Predict')
    categorias = mdl_tune.predict(x_teste)

    print('Métricas: \n \n')

    logger.info('Acurácia')
    acc = accuracy_score(y_teste, categorias)
    print(OKCYAN, "Acc \n" + str(acc) + "\n")

    logger.info("Mcc")
    mcc = matthews_corrcoef(y_teste, categorias)
    print(WARNING, "Mcc \n" + str(mcc) + "\n")

    logger.info("F1")
    f1 = f1_score(y_teste, categorias, average="weighted")
    print(GREEN, "F1 \n" + str(f1) + "\n")

    logger.info('Class Report')
    resultados = classification_report(y_teste, categorias)
    print(ENDC, "Class Report \n" + resultados + "\n")

    # Reuse the accuracy computed above instead of scoring a second time.
    return -acc


In [None]:
# Search space for forest_minimize: one entry per position of `params` in tune_lgbm.
# Entries without a prior name are plain (low, high) bounds.
space = [(1e-3, 1e-1, 'log-uniform'), # lr (learning_rate), sampled on a log scale
          (1, 10),                    # max_depth (tune_lgbm also sets num_leaves = 2 ** max_depth)
          (1, 20),                    # min_child_samples
          (0.05, 1.),                 # subsample
          (0.05, 1.),                 # colsample_bytree
          (100,1000),                 # n_estimators
          (1,5),                      # min_df -- not passed to LGBMClassifier in tune_lgbm
          (1,5)]                      # ngram_range -- not passed to LGBMClassifier in tune_lgbm

[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]

In [None]:
%%time

# Run the hyperparameter search. A failure is logged together with its traceback
# and then re-raised, so later cells never run against a missing `res`.
# NOTE(review): recent scikit-optimize releases deprecate `n_random_starts` in
# favour of `n_initial_points` — confirm against the installed version.
try:
    res = forest_minimize(tune_lgbm, space, random_state=7, n_random_starts=20, n_calls=50, verbose=1)

except Exception as error:
    logger.warning('Aconteceu algum problema...')
    logger.critical(error, exc_info=True)
    raise

else:
    logger.info('Ok')

finally:
    logger.info('Treino finalizado')

In [None]:
# Best objective value found by the search (tune_lgbm returns the negated accuracy).
res.fun

 2 voluntarias
 
 accuracy                           0.79      1536

In [None]:
# Parameter values at the best point found, in the same order as `space`.
res.x

In [19]:
# Final model: one-vs-rest wrapper around a LightGBM classifier, evaluated on the hold-out split.
# NOTE(review): the inner LGBMClassifier uses library-default hyperparameters; the values
# found by the search (`res.x`) are not applied here — confirm that is intended.
logger.info('Instanciando')
mdl_tune = OneVsRestClassifier(LGBMClassifier(random_state=0, class_weight="balanced", n_jobs=2))

logger.info('Treinando')
mdl_tune.fit(x_treino, y_treino)

logger.info('Predict')
# NOTE(review): `p` keeps only the second probability column and is not used again in this cell.
p = mdl_tune.predict_proba(x_teste)[:, 1]
categorias = mdl_tune.predict(x_teste)

print('Métricas: \n \n')

logger.info('Acurácia')
acc = accuracy_score(y_teste, categorias)
print(OKCYAN, "Acc \n" + str(acc) + "\n")

logger.info("Mcc")
mcc = matthews_corrcoef(y_teste, categorias)
print(WARNING, "Mcc \n" + str(mcc) + "\n")

logger.info("F1")
# Weighted average: per-class F1 weighted by class support.
f1 = f1_score(y_teste, categorias, average="weighted")
print(GREEN, "F1 \n" + str(f1) + "\n")

logger.info('Class Report')
resultados = classification_report(y_teste, categorias)
print(ENDC, "Class Report \n" + resultados + "\n")

SVM - INFO - [+] ------- Instanciando -------
SVM - INFO - [+] ------- Treinando -------
SVM - INFO - [+] ------- Predict -------
SVM - INFO - [+] ------- Acurácia -------
SVM - INFO - [+] ------- Mcc -------
SVM - INFO - [+] ------- F1 -------
SVM - INFO - [+] ------- Class Report -------


Métricas: 
 

[96m Acc 
0.7046875

[93m Mcc 
0.6068434191082591

[1;32m F1 
0.7050263696799471

[0m Class Report 
              precision    recall  f1-score   support

           1       0.71      0.70      0.71      1948
           2       0.65      0.73      0.69      1932
           3       0.74      0.71      0.72      1879
           4       0.74      0.68      0.71      1921

    accuracy                           0.70      7680
   macro avg       0.71      0.70      0.71      7680
weighted avg       0.71      0.70      0.71      7680


