In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score
from sklearn.metrics import accuracy_score, roc_auc_score

## Подготовка данных

In [2]:
df = pd.read_csv('USE.csv')

In [3]:
X, y = df.iloc[:, 5:], df['tag']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## KNN

In [7]:
knn_params = {'n_neighbors': range(10, 100, 4)}
f1 = make_scorer(f1_score, average='weighted')  # for scoring param
rand_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                              param_distributions=knn_params,
                              cv=5,
                              verbose=3,
                              n_iter = 50,
                              n_jobs=-1,
                              scoring=f1)
rand_knn.fit(X_train, y_train)
y_pred = rand_knn.predict(X_test)
y_prob = rand_knn.predict_proba(X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 23 candidates, totalling 115 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 115 out of 115 | elapsed:  1.5min finished


In [8]:
rand_knn.best_estimator_

KNeighborsClassifier(n_neighbors=10)

In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6475300400534045
Recall: 0.6475300400534045
Precision: 0.5797158384263055
F1: 0.5768435973150727
ROC AUC: 0.5729872795541696
Confusion matrix:
 [[878  64   0   6   0]
 [302  87   0   1   0]
 [ 79   6   0   1   0]
 [ 64   2   0   5   0]
 [  3   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.66      0.93      0.77       948
           1       0.55      0.22      0.32       390
           2       0.00      0.00      0.00        86
           3       0.38      0.07      0.12        71
           4       0.00      0.00      0.00         3

    accuracy                           0.65      1498
   macro avg       0.32      0.24      0.24      1498
weighted avg       0.58      0.65      0.58      1498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [11]:
ada_params = {
    'n_estimators': range(10, 200, 10),
    'learning_rate': [0.1, 0.25,0.5,0.75, 1]
}

rand_AdaBoost = RandomizedSearchCV(estimator=AdaBoostClassifier(),
                                   param_distributions=ada_params,
                                   cv=3,
                                   n_iter=50,
                                   verbose=3,
                                   scoring=f1,
                                   n_jobs=-1)

rand_AdaBoost.fit(X_train, y_train)
y_pred = rand_AdaBoost.predict(X_test)
y_prob = rand_AdaBoost.predict_proba(X_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.9min finished


In [12]:
y_pred = rand_AdaBoost.predict(X_test)
y_prob = rand_AdaBoost.predict_proba(X_test)

In [13]:
rand_AdaBoost.best_estimator_

AdaBoostClassifier(learning_rate=0.5, n_estimators=70)

In [14]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6428571428571429
Recall: 0.6428571428571429
Precision: 0.5853947151235555
F1: 0.5400627397923419
ROC AUC: 0.4861866292757625
Confusion matrix:
 [[922  19   0   7   0]
 [345  39   0   6   0]
 [ 81   2   0   3   0]
 [ 69   0   0   2   0]
 [  3   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.65      0.97      0.78       948
           1       0.65      0.10      0.17       390
           2       0.00      0.00      0.00        86
           3       0.11      0.03      0.04        71
           4       0.00      0.00      0.00         3

    accuracy                           0.64      1498
   macro avg       0.28      0.22      0.20      1498
weighted avg       0.59      0.64      0.54      1498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
import xgboost as xgb

In [16]:
XGB_param = { 
    "n_estimators": range(5, 100, 10), 
    'max_depth': range(4, 20, 2), 
    'eta': [0.1, 0.2, 0.3, 0.4, 0.5]
} 
# max_depth - максимальная глубина деревьев, eta - специальный параметр скорости обучения, silent - если 1, то программа
# работает по-тихому, objective - целевая функция, eval_metric - по какой метрике оцениваем качество. AUC совмещает в себе 
# precision и recall - альтернатива f1-score


grid = RandomizedSearchCV(estimator=xgb.XGBClassifier(objective='multi:softmax'), 
                          param_distributions=XGB_param, n_iter=30,
                          cv=3, verbose=3, scoring=f1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
y_prob = grid.predict_proba(X_test)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] n_estimators=5, max_depth=16, eta=0.5 ...........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=5, max_depth=16, eta=0.5, score=0.576, total=   3.5s
[CV] n_estimators=5, max_depth=16, eta=0.5 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.4s remaining:    0.0s


[CV]  n_estimators=5, max_depth=16, eta=0.5, score=0.590, total=   1.8s
[CV] n_estimators=5, max_depth=16, eta=0.5 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.3s remaining:    0.0s


[CV]  n_estimators=5, max_depth=16, eta=0.5, score=0.579, total=   1.8s
[CV] n_estimators=35, max_depth=18, eta=0.4 ..........................
[CV]  n_estimators=35, max_depth=18, eta=0.4, score=0.594, total=  11.8s
[CV] n_estimators=35, max_depth=18, eta=0.4 ..........................
[CV]  n_estimators=35, max_depth=18, eta=0.4, score=0.581, total=  12.2s
[CV] n_estimators=35, max_depth=18, eta=0.4 ..........................
[CV]  n_estimators=35, max_depth=18, eta=0.4, score=0.580, total=  12.2s
[CV] n_estimators=95, max_depth=8, eta=0.1 ...........................
[CV]  n_estimators=95, max_depth=8, eta=0.1, score=0.597, total=  19.2s
[CV] n_estimators=95, max_depth=8, eta=0.1 ...........................
[CV]  n_estimators=95, max_depth=8, eta=0.1, score=0.587, total=  27.2s
[CV] n_estimators=95, max_depth=8, eta=0.1 ...........................
[CV]  n_estimators=95, max_depth=8, eta=0.1, score=0.584, total=  24.8s
[CV] n_estimators=65, max_depth=4, eta=0.4 ........................

[CV]  n_estimators=75, max_depth=4, eta=0.1, score=0.594, total=  10.9s
[CV] n_estimators=75, max_depth=4, eta=0.1 ...........................
[CV]  n_estimators=75, max_depth=4, eta=0.1, score=0.585, total=  11.4s
[CV] n_estimators=75, max_depth=4, eta=0.1 ...........................
[CV]  n_estimators=75, max_depth=4, eta=0.1, score=0.584, total=  10.8s
[CV] n_estimators=75, max_depth=8, eta=0.4 ...........................
[CV]  n_estimators=75, max_depth=8, eta=0.4, score=0.596, total=  17.7s
[CV] n_estimators=75, max_depth=8, eta=0.4 ...........................
[CV]  n_estimators=75, max_depth=8, eta=0.4, score=0.583, total=  17.1s
[CV] n_estimators=75, max_depth=8, eta=0.4 ...........................
[CV]  n_estimators=75, max_depth=8, eta=0.4, score=0.581, total=  17.3s
[CV] n_estimators=15, max_depth=12, eta=0.1 ..........................
[CV]  n_estimators=15, max_depth=12, eta=0.1, score=0.596, total=   5.2s
[CV] n_estimators=15, max_depth=12, eta=0.1 .........................

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 28.1min finished


In [17]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.2, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.200000003, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=85, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
print("Accuracy:",accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average='micro'))
print("Precision:", precision_score(y_test, y_pred, average='micro'))
print("F1:", f1_score(y_test, y_pred, average='micro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6568758344459279
Recall: 0.6568758344459279
Precision: 0.6568758344459279
F1: 0.6568758344459279
Confusion matrix:
 [[865  61   3  19   0]
 [266 114   6   3   1]
 [ 76   5   3   2   0]
 [ 67   1   1   2   0]
 [  3   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.68      0.91      0.78       948
           1       0.63      0.29      0.40       390
           2       0.23      0.03      0.06        86
           3       0.08      0.03      0.04        71
           4       0.00      0.00      0.00         3

    accuracy                           0.66      1498
   macro avg       0.32      0.25      0.26      1498
weighted avg       0.61      0.66      0.60      1498



In [19]:
from sklearn.svm import SVC

In [20]:
svcclassifier = SVC(probability=True)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4, 5, 6, 7]
}

rand_svc = RandomizedSearchCV(estimator=svcclassifier,
                              param_distributions=param_grid,
                              n_jobs=-1,
                              cv=3,
                              verbose=3,
                              scoring=f1)

rand_svc.fit(X_train, y_train)

y_pred = rand_svc.predict(X_test)
y_proba = rand_svc.predict_proba(X_test)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.2min finished


In [21]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6662216288384513
Recall: 0.6662216288384513
Precision: 0.5995251776823977
F1: 0.6004242643942214
ROC AUC: 0.5610943699509693
Confusion matrix:
 [[885  55   1   7   0]
 [276 112   1   1   0]
 [ 79   6   0   1   0]
 [ 68   1   1   1   0]
 [  3   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.68      0.93      0.78       948
           1       0.64      0.29      0.40       390
           2       0.00      0.00      0.00        86
           3       0.10      0.01      0.02        71
           4       0.00      0.00      0.00         3

    accuracy                           0.67      1498
   macro avg       0.28      0.25      0.24      1498
weighted avg       0.60      0.67      0.60      1498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
