In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [5]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [21]:
from sklearn.metrics import f1_score, classification_report, roc_auc_score

In [2]:
data_path = '..\\data\\'

In [3]:
X_train_val = np.load(data_path + 'features_train_val.npy')
y_train_val = np.load(data_path + 'y_train.npy')
X_test = np.load(data_path + 'features_test.npy')
y_test = np.load(data_path + 'y_test.npy')

In [4]:
labels_train_val = np.array([np.where(r==1)[0][0] for r in y_train_val])
labels_test = np.array([np.where(r==1)[0][0] for r in y_test])

Нормализуем данные

In [6]:
scaler = MinMaxScaler()

In [7]:
X_train_val[:, :-27] = scaler.fit_transform(X_train_val[:, :-27])
X_test[:, :-27] = scaler.transform(X_test[:, :-27])

Подберём гиперпараметры на кросс-валидации

In [10]:
model = LogisticRegression(multi_class='multinomial', random_state=42)
cv_obj = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

parameters = {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'C': [0.5, 0.75, 1, 1.25, 1.5, 2, 2.5],
              'solver': ['newton-cg', 'lbfgs', 'liblinear'],
              'max_iter': [100, 200, 500, 1000]}

clf = RandomizedSearchCV(model, parameters, scoring='f1_macro', n_jobs=-1, verbose=2, cv=cv_obj)
clf.fit(X_train_val, labels_train_val)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.8min finished


0.9506447960164847
{'solver': 'lbfgs', 'penalty': 'none', 'max_iter': 1000, 'C': 0.75}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Определим модель с учётом оптимального набора гиперпараметров

In [25]:
final_model = LogisticRegression(solver='lbfgs', penalty='l2', max_iter=1000, C=0.75, multi_class='multinomial', random_state=42)

In [26]:
final_model.fit(X_train_val, labels_train_val)

LogisticRegression(C=0.75, max_iter=1000, multi_class='multinomial',
                   random_state=42)

Сделаем прогноз для тренировочной части набора данных

In [27]:
y_pred_train = final_model.predict(X_train_val)

In [28]:
confusion_matrix(labels_train_val, y_pred_train)

array([[1182,   32,   12,    0,    0,    0],
       [  54, 1010,    9,    0,    0,    0],
       [  25,   17,  944,    0,    0,    0],
       [   0,    1,    0, 1077,  201,    7],
       [   0,    0,    0,  146, 1228,    0],
       [   0,    0,    0,    0,    0, 1407]], dtype=int64)

Сделаем прогноз на тестовом наборе

In [29]:
y_pred = final_model.predict(X_test)

In [30]:
y_pred_proba = final_model.predict_proba(X_test)

In [31]:
confusion_matrix(labels_test, y_pred)

array([[476,   0,  20,   0,   0,   0],
       [ 34, 423,  14,   0,   0,   0],
       [ 36,  14, 370,   0,   0,   0],
       [  0,   2,   0, 385, 104,   0],
       [  1,   0,   0,  63, 468,   0],
       [  0,   0,   0,   0,  27, 510]], dtype=int64)

Посчитаем метрики

In [33]:
print(classification_report(labels_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91       496
           1       0.96      0.90      0.93       471
           2       0.92      0.88      0.90       420
           3       0.86      0.78      0.82       491
           4       0.78      0.88      0.83       532
           5       1.00      0.95      0.97       537

    accuracy                           0.89      2947
   macro avg       0.90      0.89      0.89      2947
weighted avg       0.90      0.89      0.89      2947



In [32]:
f1_score(labels_test, y_pred, average='macro')

0.8937166334141492

In [34]:
roc_auc_score(labels_test, y_pred_proba, multi_class='ovo')

0.9880712942186649