In [1]:
from helpers.datasplit import S21SplitByThirds
from helpers.pipeline_manager import S21Pipeline
from algos.algorithms import S21DecisionTreeClassifier, S21DecisionTreeRegressor, S21RandomForestClassifier, S21GradientBoostingClassifier

from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd

In [2]:
# Name of the label column and the columns excluded from the feature matrices.
TARGET = "IsBadBuy"
drop_cols = [TARGET, "PurchDate"]

# Read the raw training table (path is relative to the notebook location).
df = pd.read_csv("../datasets/data/training.csv")

# Partition the frame into train / validation / test parts.
splitter = S21SplitByThirds(df)
df_train, df_val, df_test = splitter.split()

# Feature matrices: every part without the label and the excluded columns.
X_train, X_val, X_test = (
    part.drop(columns=drop_cols) for part in (df_train, df_val, df_test)
)

# Label vectors, in the same train / validation / test order.
y_train, y_val, y_test = (part[TARGET] for part in (df_train, df_val, df_test))

# Bundles handed to S21Pipeline in the cells below.
Xs = [X_train, X_val, X_test]
ys = [y_train, y_val, y_test]

In [3]:
# (display name, estimator) pairs: the custom S21* implementations plus the
# sklearn decision tree as a baseline. Every estimator is seeded with 42.
models = [
    (
        "S21DecisionTreeClassifier",
        S21DecisionTreeClassifier(random_state=42),
    ),
    (
        "sklearn DecisionTreeClassifier",
        DecisionTreeClassifier(random_state=42),
    ),
    (
        "S21DecisionTreeRegressor",
        S21DecisionTreeRegressor(random_state=42),
    ),
    (
        "S21RandomForestClassifier",
        S21RandomForestClassifier(random_state=42),
    ),
    (
        "S21GradientBoostingClassifier",
        S21GradientBoostingClassifier(
            number_of_trees=50,
            max_depth=3,
            learning_rate=0.1,
            random_state=42,
        ),
    ),
]

# Wrap each estimator in an S21Pipeline, evaluate it against the validation
# split and report the returned score (printed as "Gini") with 5 decimals.
for name, model in models:
    pipeline = S21Pipeline(name, model, Xs, ys)
    gini = pipeline.build_evaluate(X_val, y_val)
    print(f"{name} Gini: {gini:.5f}")

S21DecisionTreeClassifier Gini: 0.42693
sklearn DecisionTreeClassifier Gini: 0.19031
S21DecisionTreeRegressor Gini: 0.42661
S21RandomForestClassifier Gini: 0.46839
S21GradientBoostingClassifier Gini: 0.47210


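DecisionTreeClassifier из sklearn с параметрами по умолчанию показывает на валидационной выборке результат заметно хуже моей реализации S21DecisionTreeClassifier (Gini 0.19031 против 0.42693). Вероятная причина — дерево sklearn по умолчанию строится без ограничения глубины (max_depth=None) и переобучается; для корректного сравнения стоит задать обеим моделям одинаковые гиперпараметры.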

In [5]:
%%capture --no-stdout

library_models = [
    ("LGBMClassifier", LGBMClassifier(n_estimators=200, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, max_depth=-1, random_state=42)),
    #("CatBoostClassifier", CatBoostClassifier(iterations=200, learning_rate=0.05, depth=6, random_seed=42, verbose=False, allow_writing_files=False)),
    ("XGBClassifier", XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, objective="binary:logistic", eval_metric="auc", reg_lambda=1.0, gamma=0.0, random_state=42, use_label_encoder=False, n_jobs=-1)),
]

for name, model in library_models:
    pipeline = S21Pipeline(name, model, Xs, ys)
    gini = pipeline.build_evaluate(X_val, y_val)
    print(f"{name} Gini: {gini:.5f}")

[LightGBM] [Info] Number of positive: 2607, number of negative: 20452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3605
[LightGBM] [Info] Number of data points in the train set: 23059, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113058 -> initscore=-2.059881
[LightGBM] [Info] Start training from score -2.059881
LGBMClassifier Gini: 0.47834
XGBClassifier Gini: 0.48021
