Попробуем решить ту же задачу, но с использованием XGBoost
--

**Импортируем необходимые инструменты**

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

**Читаем тренировочные данные**

In [2]:
df = pd.read_excel('Training.xlsb', engine='pyxlsb')
df

Unnamed: 0,ID,A,B,C,D,E,F,G,H,I,...,P,Q,R,S,T,U,V,W,X,MARKER
0,1,0.198778,0.099389,0.00,799.90,1.777556,0.888778,13,3.49,Woman,...,2 Two,property,No,Works,Yes,Yes,No,Yes,No,0
1,2,0.043000,0.021264,49.97,173.03,0.384511,0.190143,13,3.49,Woman,...,2 Two,otherwise,No,Works,Yes,Yes,Yes,Yes,No,0
2,3,0.067073,0.067073,0.00,329.90,0.599818,0.599818,13,3.49,Woman,...,1 One,property,No,No couple,No,Yes,No,Yes,No,0
3,4,0.052700,0.052700,0.00,235.65,0.471300,0.471300,13,3.49,Woman,...,0 Zero,property,No,No couple,Yes,Yes,Few,Yes,No,0
4,5,0.141880,0.141880,0.00,634.45,1.268900,1.268900,13,3.49,Woman,...,0 Zero,property,No,No couple,Yes,No,No,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89729,89730,0.027941,0.014844,25.00,474.00,0.557647,0.296250,20,0.01,Man,...,0 Zero,property,No,Works,Yes,Yes,No,No,Yes,0
89730,89731,0.120017,0.068581,179.96,720.03,1.200050,0.685743,10,0.00,Woman,...,2 Two,property,No,Works,Yes,Yes,No,No,Yes,0
89731,89732,0.153033,0.122427,29.01,550.99,0.459158,0.367327,3,0.00,Man,...,0 Zero,otherwise,No,Works,Yes,Yes,No,Yes,Yes,0
89732,89733,0.107575,0.043030,53.82,215.18,0.537950,0.215180,5,0.00,Woman,...,1 One,property,Yes,Works,No,Yes,No,No,Yes,0


**Приводим данные к нужному формату, выделяем небольшую валидационную выборку**

Подробно изучал данные я в первой части, там же можно и прочитать мои развернутые пояснения, касающиеся удаления столбцов

In [3]:
X = df.drop(['MARKER', 'ID', 'P', 'T', 'V', 'E', 'C', 'A'], axis = 1)
y = df['MARKER']

X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.10, random_state=101)

X_train_num = X_train.select_dtypes(exclude = 'object')
X_train_cat = X_train.select_dtypes(include = 'object')

X_val_num = X_val.select_dtypes(exclude = 'object')
X_val_cat = X_val.select_dtypes(include = 'object')

scaler = StandardScaler()
scaled_X_train_num = scaler.fit_transform(X_train_num)
scaled_X_val_num = scaler.transform(X_val_num)

oe = OrdinalEncoder()
dum_X_train_cat = oe.fit_transform(X_train_cat)
dum_X_val_cat = oe.transform(X_val_cat)

X_tr = np.hstack([scaled_X_train_num, dum_X_train_cat])
X_vl = np.hstack([scaled_X_val_num, dum_X_val_cat])

**Создаем сетку параметров, scale_pos_weight рассчитывался по формуле из документации: sum(negative instances) / sum(positive instances)**

In [4]:
param_grid = {
    'max_depth': [1, 2, 3, 4],
    'learning_rate': [0.1, 0.01, 0.05],
    'gamma': [0, 0.25, 1],
    'reg_alpha': [10, 100],
    'reg_lambda': [0, 1, 10],
    'scale_pos_weight': [262]
    
}

In [5]:
XgCv = GridSearchCV(estimator= xgb.XGBClassifier(objective= 'binary:logistic',
                                                 colsample_bytree = 0.5),
                    param_grid = param_grid,
                    scoring = 'roc_auc'
                   )

**Поиск по сетке**

In [6]:
XgCv.fit(X_tr, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.5, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bi...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimator

In [7]:
XgCv.best_params_

{'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 2,
 'reg_alpha': 100,
 'reg_lambda': 10,
 'scale_pos_weight': 262}

In [8]:
XgCv.best_score_

0.8796137497151598

In [9]:
XgCv_pred = XgCv.predict(X_vl)
print(classification_report(y_val,XgCv_pred))
cm = confusion_matrix(y_val, XgCv_pred)
print(cm)
roc_auc_score(y_val, XgCv.predict_proba(X_vl)[:, 1])

              precision    recall  f1-score   support

           0       1.00      0.82      0.90      8939
           1       0.02      0.71      0.03        35

    accuracy                           0.82      8974
   macro avg       0.51      0.77      0.47      8974
weighted avg       0.99      0.82      0.90      8974

[[7341 1598]
 [  10   25]]


0.8627219407731769

**Как видно градиентный бустинг дал чуть лучший результат чем адаптивный. Интересно попробовать эту модель на тестовой выборке, но сначала обучим ее на всей тренировочной**

In [10]:
X_final_tr = np.vstack([X_tr, X_vl])
y_final_train = pd.concat([y_train, y_val], axis = 0)

In [11]:
final_model = xgb.XGBClassifier(gamma = 0, learning_rate = 0.1,
     max_depth = 2,
     reg_alpha = 100,
     reg_lambda = 0,
     scale_pos_weight = 262)

In [12]:
final_model.fit(X_final_tr, y_final_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

**Читаем тестовый датасет, готовим данные для работы модели**

In [13]:
test_df =  pd.read_excel('Test.xlsb', engine='pyxlsb')

In [14]:
X_test = test_df.drop(['MARKER', 'ID', 'P', 'T', 'V', 'E', 'C', 'A'], axis = 1)
y_test = test_df['MARKER']
X_test_num = X_test.select_dtypes(exclude = 'object')
X_test_cat = X_test.select_dtypes(include = 'object')
dum_X_test_cat = oe.transform(X_test_cat)
X_t = np.hstack([X_test_num, dum_X_test_cat])

In [15]:
final_pred = final_model.predict(X_t)

In [16]:
print(classification_report(y_test,final_pred))
cm = confusion_matrix(y_test, final_pred)
print(cm)
print(roc_auc_score(y_test, final_model.predict_proba(X_t)[:, 1]))

              precision    recall  f1-score   support

           0       1.00      0.90      0.94     38259
           1       0.02      0.50      0.04       146

    accuracy                           0.90     38405
   macro avg       0.51      0.70      0.49     38405
weighted avg       0.99      0.90      0.94     38405

[[34326  3933]
 [   73    73]]
0.8101110241049916


**Не самые удивительные результаты, адаптивный бустинг показывал себя намного лучше на этом датасете**