In [8]:
from lib import *
from preprocess import *
from report import *

import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the ML
# libraries used further down; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Root directory for figures saved by this cell. abspath() normalises
# "<notebook file>/.." lexically against the current working directory, so
# the result is the working directory itself; it only matches the notebook's
# folder when the kernel was started there.
notebook_path = os.path.abspath("lgbm_classifier.ipynb"+ '/..')

# Constants
random_state = 23  # seed handed to StratifiedShuffleSplit and to the final LGBMClassifier
n_splits = 9       # n_splits argument of StratifiedShuffleSplit
test_size = 0.2    # test_size argument of StratifiedShuffleSplit
n_round = 4        # forwarded to Report.test_overfitting (presumably rounding digits - TODO confirm)

# Read data
# Both inputs are comma-separated files located relative to the working directory.
df1 = pd.read_csv('data/df1.csv', delimiter=',')
df2 = pd.read_csv('data/df2.csv', delimiter=',')

# Log the (rows, columns) of each frame so a size mismatch is visible in the output.
print(f'Shape of df1 is {df1.shape}')
print(f'Shape of df2 is {df2.shape}')

type_graph = ['distplot', 'hist']  # not referenced again in this cell
features = ['value', 'duration']   # columns passed to bootstrap() and Preparation as features
cat_feature = 'event_type'         # column passed as the categorical feature


# bootstrap() is a project helper; it is asked for n = number of rows in df1.
# NOTE(review): presumably it resamples df2 up to df1's row count to balance
# the two sources - confirm against the helper's implementation.
bootstrap_bot = bootstrap(n=df1.shape[0], arr=df2, cols=features, cat_col=cat_feature)

# NOTE: `df` is a Preparation object (project class), not a pandas DataFrame.
df = Preparation(df1=df1, df2=bootstrap_bot, features=features, cat_feature=cat_feature)

# X, y and spl are created here but not used again in this cell.
X, y = df.create_df()
spl = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)

# Split on train test and scale data
# prep_split_data() is called without arguments, so the split ratio and seed
# it uses are decided inside Preparation - TODO confirm they match the
# constants defined above.
X_train_pre, X_test_pre, y_train, y_test = df.prep_split_data()


def objective(trial, X, y, seed=23):
    """Optuna objective: fit one LightGBM classifier and score it by log loss.

    Every hyperparameter is drawn with ``trial.suggest_categorical`` from a
    small fixed set of candidate values. The data is split once into a
    stratified 80/20 train/validation pair, the model is fitted with early
    stopping on the validation part, and the validation log loss is returned.

    Parameters
    ----------
    trial : object
        Trial handle exposing ``suggest_categorical`` (an Optuna trial); it is
        also handed to ``LightGBMPruningCallback``.
    X : array-like
        Feature matrix to split and train on.
    y : array-like
        Binary target aligned with ``X``; used for stratification.
    seed : int, default 23
        Seed for both the train/validation split and the classifier. Keeping
        it fixed means every trial is evaluated on the same held-out rows, so
        trial scores are comparable and the study is reproducible.

    Returns
    -------
    list of float
        One-element list holding the validation log loss of this trial.
    """
    cv_scores = []

    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 500, 1000]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.001, 0.01, 0.1]),
        "num_leaves": trial.suggest_categorical("num_leaves", [3, 5, 10]),
        "max_depth": trial.suggest_categorical("max_depth", [3, 7]),
        "min_split_gain": trial.suggest_categorical("min_split_gain", [0, 0.5]),
        "min_child_samples": trial.suggest_categorical("min_child_samples", [1, 5]),
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt"]),
        "reg_alpha": trial.suggest_categorical("reg_alpha", [0, 1, 10]),
        "reg_lambda": trial.suggest_categorical("reg_lambda", [0, 1, 10])}

    # Seed the split: without random_state each trial would be scored on a
    # different random hold-out, mixing split noise into the comparison.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        random_state=seed)

    model = lgb.LGBMClassifier(objective="binary",
                               class_weight='balanced',
                               random_state=seed,
                               subsample=1.0,
                               subsample_freq=10,
                               colsample_bytree=0.85,
                               **param_grid)
    # The pruning callback reports the validation log loss to the trial so
    # the study can abandon unpromising configurations early.
    model.fit(X_train,
              y_train,
              eval_set=[(X_test, y_test)],
              eval_metric="binary_logloss",
              early_stopping_rounds=10,
              verbose=-1,
              callbacks=[LightGBMPruningCallback(trial, metric="binary_logloss")])

    preds = model.predict_proba(X_test)
    cv_scores.append(log_loss(y_test, preds))

    return cv_scores

#study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
##func = lambda trial: objective(trial, X_train_pre, y_train)

#n_trials = 100

#optim = study.optimize(func, n_trials=n_trials)
#print(f"\tBest value (log loss): {study.best_value:.5f}")
#print(f"\tBest params:")

#for key, value in study.best_params.items():
    #print(f"\t\t{key}: {value}")

# Hand-fixed hyperparameters for the final model.
# NOTE(review): max_depth=5 is not one of the values offered by the search
# space in objective() ([3, 7]), so these were not copied directly from that
# search - confirm where they come from.
best_params = {'n_estimators': 1000,
               'learning_rate': 0.01,
               'max_depth': 5,
               'boosting_type': 'gbdt'}

# Final classifier: binary objective, balanced class weights, seeded with the
# notebook-level random_state.
lgb_best = lgb.LGBMClassifier(objective="binary",
                          class_weight='balanced', 
                          random_state=random_state,
                          **best_params)

# Fit while recording binary log loss on both the train and the test split.
# NOTE(review): the test split is part of eval_set while early stopping is
# enabled; if stopping is driven by it, test data influences the number of
# trees kept - consider a separate validation split.
lgb_best_fit = lgb_best.fit(X_train_pre,
                            y_train,
                            eval_set = [(X_train_pre, y_train), (X_test_pre, y_test)],
                            eval_metric="binary_logloss",
                            verbose=-1,
                            early_stopping_rounds=10)

# NOTE(review): despite the "*_proba_*" names these hold the output of
# predict(), not predict_proba() - confirm which one Report expects,
# especially for the ROC / PR plot below.
y_pred_proba_test = lgb_best_fit.predict(X_test_pre)
y_pred_proba_train = lgb_best_fit.predict(X_train_pre)

# Report is a project helper; the calls below are made in this order:
# classification report, overfitting check, overfitting plot, ROC / PR plot.
report = Report(y_train=y_train, y_test=y_test, y_pred_train=y_pred_proba_train, y_pred_test=y_pred_proba_test)
# Rebinds the name `classification_report` to this call's return value.
classification_report= report.classification_report(name='lgbm_report')
report.test_overfitting(n_round=n_round)
# NOTE(review): the variable name says "logreg" but the model argument is 'lgbm'.
overfit_logreg = report.plot_overfitting(model='lgbm', save=True, name_fig='overfitting')
roc_pr = report.roc_auc_pr_plot(name_fig='lgbm_ROC-PR')

# Plot the metric recorded during fitting and save it under <notebook_path>/graph/.
# Assumes the graph/ directory already exists - TODO confirm.
lgb.plot_metric(lgb_best_fit)
plt.savefig(notebook_path + '/graph/' + 'lgbm_metric_plot.png', dpi=300)
# Close the figure so it is not rendered inline and its memory is released.
plt.close()

Shape of df1 is (100828, 5)
Shape of df2 is (7567, 5)
