In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [36]:
from sklearn.metrics import f1_score, classification_report, roc_auc_score

In [5]:
data_path = '..\\data\\'

In [6]:
X_train_val = np.load(data_path + 'features_train_val.npy')
y_train_val = np.load(data_path + 'y_train.npy')
X_test = np.load(data_path + 'features_test.npy')
y_test = np.load(data_path + 'y_test.npy')

In [7]:
labels_train_val = np.array([np.where(r==1)[0][0] for r in y_train_val])
labels_test = np.array([np.where(r==1)[0][0] for r in y_test])

Подберём гиперпараметры на кросс-валидации с помощью случайного поиска по сетке

In [60]:
model = RandomForestClassifier(random_state=42)
cv_obj = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

parameters = {'n_estimators': [50, 100, 200, 400, 1000, 3000],
              'max_depth': [None, 2, 4, 5, 8, 10],
              'min_samples_split': [2, 4, 6, 8],
              'min_samples_leaf': [2, 4, 5, 10],
              'max_features': ['auto', 'sqrt', 'log2']}

clf = RandomizedSearchCV(model, parameters, scoring='f1_macro', n_jobs=-1, verbose=2, cv=cv_obj)
clf.fit(X_train_val, labels_train_val)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.0min finished


0.975330698806659
{'n_estimators': 100, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}


In [61]:
clf.cv_results_

{'mean_fit_time': array([  1.82880154,  12.87839866,  37.17513399,   6.34919791,
        138.46829162, 100.13618584,   2.66278715,   3.92815423,
         56.50195479,  58.00991445]),
 'std_fit_time': array([ 0.03725118,  0.18939855,  2.88502798,  0.16319543, 13.84991866,
         3.20882202,  0.20352448,  0.57122411,  0.38102183,  7.22804866]),
 'mean_score_time': array([0.05659833, 0.23440466, 0.45880423, 0.10340033, 2.223488  ,
        1.79786129, 0.05479741, 0.06180563, 1.36183138, 0.73131232]),
 'std_score_time': array([0.00728396, 0.02420084, 0.05651789, 0.01132439, 0.8471986 ,
        0.55279702, 0.0275168 , 0.0021397 , 0.07289159, 0.26514717]),
 'param_n_estimators': masked_array(data=[100, 400, 1000, 200, 3000, 3000, 50, 100, 3000, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[6, 8, 4, 4, 8, 4, 2, 8, 4, 4],
  

Определим модель с учётом лучшего набора параметров

In [62]:
final_model = RandomForestClassifier(n_estimators=100, min_samples_split=8, 
                                     min_samples_leaf=2, max_features='log2', max_depth=None, random_state=42)

In [63]:
final_model.fit(X_train_val, labels_train_val)

RandomForestClassifier(max_features='log2', min_samples_leaf=2,
                       min_samples_split=8, random_state=42)

Посчитаем прогноз на обучающем наборе

In [64]:
y_pred_train = final_model.predict(X_train_val)

In [65]:
confusion_matrix(labels_train_val, y_pred_train)

array([[1226,    0,    0,    0,    0,    0],
       [   0, 1073,    0,    0,    0,    0],
       [   0,    0,  986,    0,    0,    0],
       [   0,    0,    0, 1284,    2,    0],
       [   0,    0,    0,    0, 1374,    0],
       [   0,    0,    0,    0,    0, 1407]], dtype=int64)

Сделаем прогноз на тестовой части

In [66]:
y_pred = final_model.predict(X_test)

In [67]:
y_pred_proba = final_model.predict_proba(X_test)

In [68]:
confusion_matrix(labels_test, y_pred)

array([[436,  28,  32,   0,   0,   0],
       [ 54, 410,   7,   0,   0,   0],
       [ 49,  45, 326,   0,   0,   0],
       [  0,   1,   0, 419,  71,   0],
       [  0,   0,   0,  43, 489,   0],
       [  0,   0,   0,   0,   0, 537]], dtype=int64)

In [69]:
print(classification_report(labels_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       496
           1       0.85      0.87      0.86       471
           2       0.89      0.78      0.83       420
           3       0.91      0.85      0.88       491
           4       0.87      0.92      0.90       532
           5       1.00      1.00      1.00       537

    accuracy                           0.89      2947
   macro avg       0.89      0.88      0.88      2947
weighted avg       0.89      0.89      0.89      2947



In [70]:
f1_score(labels_test, y_pred, average='macro')

0.8844428168797478

In [71]:
roc_auc_score(labels_test, y_pred_proba, multi_class='ovo')

0.9889342316396819