In [None]:
import pandas as pd
import os
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

import pickle

In [None]:
# Path prefix that is string-concatenated (not os.path.join'ed) in front of the
# input CSV file names and the 'results-gb/' output folder, so a non-empty
# value must end with a path separator. With the empty string all paths are
# relative to the current working directory.
# NOTE(review): the training cells also read DATASET_NAME, which is not defined
# in the cells shown here — confirm it is set before they run.
BASE_PATH = ''

In [None]:
def split_data(df, split=0.3, random_state=None):
    """Split a labelled frame into train/test features and targets.

    Args:
        df: DataFrame containing a 'label' column (the target); every other
            column is treated as a feature.
        split: Value forwarded to ``train_test_split`` as ``test_size``
            (default 0.3).
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the previous behaviour (a different shuffle
            on every call); pass an int to get a reproducible split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: if ``df`` has no 'label' column.
    """
    target = df['label']
    features = df.drop(columns=['label'])

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=split,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test

In [None]:
def create_balanced_dataset(df, n_per_class=2500, random_state=None):
    """Build a two-class dataset by sub-sampling labels 1 and 2 and merging them.

    All rows with label 0 are kept. ``n_per_class`` rows are drawn at random
    (without replacement) from label 1 and from label 2, and label 2 is then
    relabelled as 1.

    Only the 'label' column is remapped. Calling ``replace(2, 1)`` on the whole
    frame would also overwrite every feature value equal to 2.

    Args:
        df: DataFrame with an integer 'label' column holding 0, 1 and 2.
        n_per_class: Number of rows sampled from each of labels 1 and 2
            (default 2500).
        random_state: Optional seed for the sampling; ``None`` (default) draws
            a different sample on every call.

    Returns:
        A new DataFrame whose 'label' column contains only 0 and 1; feature
        columns are left untouched.

    Raises:
        ValueError: raised by ``DataFrame.sample`` when label 1 or label 2 has
            fewer than ``n_per_class`` rows.
    """
    df_0 = df[df['label'] == 0]
    df_1 = df[df['label'] == 1].sample(n=n_per_class, random_state=random_state)
    df_2 = df[df['label'] == 2].sample(n=n_per_class, random_state=random_state)

    balanced = pd.concat([df_0, df_1, df_2])
    # Restrict the 2 -> 1 remap to the target column so features stay intact.
    balanced['label'] = balanced['label'].replace(2, 1)
    return balanced

In [None]:
def create_grid():
    """Return the parameter distributions used for the randomized search.

    Returns:
        dict mapping estimator parameter names to frozen scipy distributions:
        'learning_rate' and 'subsample' are uniform on [0, 1];
        'n_estimators' is a random integer in [100, 1000) and 'max_depth'
        a random integer in [4, 10) (upper bounds exclusive).
    """
    # NOTE(review): a uniform draw on [0, 1] can land arbitrarily close to 0
    # for 'subsample' — confirm that such small fractions are intended.
    return dict(
        learning_rate=sp_randFloat(),
        subsample=sp_randFloat(),
        n_estimators=sp_randInt(100, 1000),
        max_depth=sp_randInt(4, 10),
    )

In [None]:
def load_data(path):
    """Read a semicolon-separated, UTF-8 encoded CSV file into a DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(path, sep=';', encoding='utf-8')

In [None]:
def train(path, name, binary=False):
    """Tune, evaluate and persist a gradient-boosting classifier for one CSV.

    Steps: load the CSV, optionally reduce it to a two-class dataset, split it
    into train/test parts, run a 5-candidate / 3-fold ``RandomizedSearchCV``
    over the distributions from ``create_grid()``, print the best CV score,
    the best estimator, the test accuracy and both confusion matrices, then
    write the results to disk.

    Files written under ``BASE_PATH + 'results-gb/'``:
        ``<name>cm.csv``       raw confusion matrix
        ``<name>norm_cm.csv``  confusion matrix normalized over the true labels
        ``<name>model.pkl``    the fitted ``RandomizedSearchCV`` object (pickle)

    Args:
        path: Path of the input CSV (read via ``load_data``).
        name: Prefix for the output file names.
        binary: When True the loaded frame is passed through
            ``create_balanced_dataset`` before splitting.

    Returns:
        None.
    """
    # Create the output folder before the expensive search: otherwise a
    # missing 'results-gb/' directory only surfaces at the first to_csv call,
    # after the fit has finished, and the fitted model is lost.
    results_dir = BASE_PATH + r'results-gb/'
    os.makedirs(results_dir, exist_ok=True)

    df = load_data(path)
    if binary:
        df = create_balanced_dataset(df)

    X_train, X_test, y_train, y_test = split_data(df)

    random_grid = create_grid()

    gb = GradientBoostingClassifier()

    # random_state fixes which parameter candidates are drawn; the estimator
    # itself and the train/test split are not seeded here.
    gb_random = RandomizedSearchCV(
        estimator=gb,
        param_distributions=random_grid,
        n_iter=5,
        cv=3,
        verbose=2,
        random_state=161194,
        n_jobs=-1
    )

    gb_random.fit(X_train, y_train)

    print('Score: ', gb_random.best_score_)
    print('Estimator: ', gb_random.best_estimator_)

    y_pred = gb_random.best_estimator_.predict(X_test)

    print("Accuracy: ", accuracy_score(y_test, y_pred))

    cm = pd.DataFrame(confusion_matrix(y_test, y_pred))
    # normalize='true' divides each row by the number of samples of that true class.
    cm_norm = pd.DataFrame(confusion_matrix(y_test, y_pred, normalize='true'))

    print(cm)
    print(cm_norm)

    cm.to_csv(results_dir + name + 'cm.csv', sep=';', encoding='utf-8', index=False)
    cm_norm.to_csv(results_dir + name + 'norm_cm.csv', sep=';', encoding='utf-8', index=False)

    # The whole search object is pickled, not just best_estimator_.
    with open(results_dir + f'{name}model.pkl', 'wb') as file:
        pickle.dump(gb_random, file)

In [None]:
%%time
# Each cell below runs the full train() pipeline on one input CSV and is timed
# with the %%time cell magic. `binary` is left at its default (False), so the
# data is used as loaded, without create_balanced_dataset().
# NOTE(review): DATASET_NAME is not defined in the cells shown here — confirm
# it is set before these cells run.
# Input '128_QF95-<DATASET_NAME>+.csv'; outputs prefixed 'GB_128_QF-95_<DATASET_NAME>_'.
train(BASE_PATH + f'128_QF95-{DATASET_NAME}+.csv', f'GB_128_QF-95_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '128_QF98' input file.
train(BASE_PATH + f'128_QF98-{DATASET_NAME}+.csv', f'GB_128_QF-98_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '128_QF100' input file.
train(BASE_PATH + f'128_QF100-{DATASET_NAME}+.csv', f'GB_128_QF-100_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '64_QF95' input file.
train(BASE_PATH + f'64_QF95-{DATASET_NAME}+.csv', f'GB_64_QF-95_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '64_QF98' input file.
train(BASE_PATH + f'64_QF98-{DATASET_NAME}+.csv', f'GB_64_QF-98_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '64_QF100' input file.
train(BASE_PATH + f'64_QF100-{DATASET_NAME}+.csv', f'GB_64_QF-100_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '64_asc' input file.
train(BASE_PATH + f'64_asc-{DATASET_NAME}+.csv', f'GB_64_ASC_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '64_desc' input file.
train(BASE_PATH + f'64_desc-{DATASET_NAME}+.csv', f'GB_64_DESC_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '128_asc' input file.
train(BASE_PATH + f'128_asc-{DATASET_NAME}+.csv', f'GB_128_ASC_{DATASET_NAME}_')

In [None]:
%%time
# Same pipeline on the '128_desc' input file.
train(BASE_PATH + f'128_desc-{DATASET_NAME}+.csv', f'GB_128_DESC_{DATASET_NAME}_')