In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score
from sklearn.metrics import accuracy_score, roc_auc_score

## Подготовка данных

In [2]:
df = pd.read_csv('Doc2Vec.csv')

In [3]:
X, y = df.iloc[:, 5:], df['tag']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## KNN

In [6]:
knn_params = {'n_neighbors': range(10, 100, 4)}
f1 = make_scorer(f1_score, average='weighted')  # for scoring param
rand_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                              param_distributions=knn_params,
                              cv=5,
                              verbose=3,
                              n_iter = 50,
                              n_jobs=-1,
                              scoring=f1)
rand_knn.fit(X_train, y_train)
y_pred = rand_knn.predict(X_test)
y_prob = rand_knn.predict_proba(X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 23 candidates, totalling 115 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 115 out of 115 | elapsed:  1.1min finished


In [7]:
rand_knn.best_estimator_

KNeighborsClassifier(n_neighbors=50)

In [8]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7276368491321762
Recall: 0.7276368491321762
Precision: 0.7429746225629787
F1: 0.6920019149888185
ROC AUC: 0.8466001140134386
Confusion matrix:
 [[838 117   0   1   0]
 [131 241   0   0   0]
 [ 48  28   5   0   0]
 [ 74   3   0   6   0]
 [  5   1   0   0   0]]
              precision    recall  f1-score   support

           0       0.76      0.88      0.82       956
           1       0.62      0.65      0.63       372
           2       1.00      0.06      0.12        81
           3       0.86      0.07      0.13        83
           4       0.00      0.00      0.00         6

    accuracy                           0.73      1498
   macro avg       0.65      0.33      0.34      1498
weighted avg       0.74      0.73      0.69      1498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [15]:
ada_params = {
    'n_estimators': range(10, 200, 10),
    'learning_rate': [0.1, 0.25,0.5,0.75, 1]
}

rand_AdaBoost = RandomizedSearchCV(estimator=AdaBoostClassifier(),
                                   param_distributions=ada_params,
                                   cv=5,
                                   n_iter=50,
                                   verbose=3,
                                   scoring=f1,
                                   n_jobs=-1)

rand_AdaBoost.fit(X_train, y_train)
y_pred = rand_AdaBoost.predict(X_test)
y_prob = rand_AdaBoost.predict_proba(X_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 15.3min finished


In [16]:
y_pred = rand_AdaBoost.predict(X_test)
y_prob = rand_AdaBoost.predict_proba(X_test)

In [17]:
rand_AdaBoost.best_estimator_

AdaBoostClassifier(learning_rate=0.75, n_estimators=190)

In [18]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6335113484646195
Recall: 0.6335113484646195
Precision: 0.5317351659769168
F1: 0.522689982749957
ROC AUC: 0.6845919994281534
Confusion matrix:
 [[923  25   0   8   0]
 [346  26   0   0   0]
 [ 79   2   0   0   0]
 [ 83   0   0   0   0]
 [  6   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.64      0.97      0.77       956
           1       0.49      0.07      0.12       372
           2       0.00      0.00      0.00        81
           3       0.00      0.00      0.00        83
           4       0.00      0.00      0.00         6

    accuracy                           0.63      1498
   macro avg       0.23      0.21      0.18      1498
weighted avg       0.53      0.63      0.52      1498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
import xgboost as xgb

In [22]:
XGB_param = { 
    "n_estimators": range(5, 100, 10), 
    'max_depth': range(4, 20, 2), 
    'eta': [0.1, 0.2, 0.3, 0.4, 0.5]
} 
# max_depth - максимальная глубина деревьев, eta - специальный параметр скорости обучения, silent - если 1, то программа
# работает по-тихому, objective - целевая функция, eval_metric - по какой метрике оцениваем качество. AUC совмещает в себе 
# precision и recall - альтернатива f1-score


grid = RandomizedSearchCV(estimator=xgb.XGBClassifier(objective='multi:softmax'), 
                          param_distributions=XGB_param, n_iter=30,
                          cv=5, verbose=3, scoring=f1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
y_prob = grid.predict_proba(X_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] n_estimators=15, max_depth=8, eta=0.5 ...........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=15, max_depth=8, eta=0.5, score=0.716, total=   3.7s
[CV] n_estimators=15, max_depth=8, eta=0.5 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s remaining:    0.0s


[CV]  n_estimators=15, max_depth=8, eta=0.5, score=0.670, total=   3.5s
[CV] n_estimators=15, max_depth=8, eta=0.5 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.1s remaining:    0.0s


[CV]  n_estimators=15, max_depth=8, eta=0.5, score=0.711, total=   3.5s
[CV] n_estimators=15, max_depth=8, eta=0.5 ...........................
[CV]  n_estimators=15, max_depth=8, eta=0.5, score=0.725, total=   3.5s
[CV] n_estimators=15, max_depth=8, eta=0.5 ...........................
[CV]  n_estimators=15, max_depth=8, eta=0.5, score=0.690, total=   3.5s
[CV] n_estimators=65, max_depth=6, eta=0.5 ...........................
[CV]  n_estimators=65, max_depth=6, eta=0.5, score=0.734, total=  11.4s
[CV] n_estimators=65, max_depth=6, eta=0.5 ...........................
[CV]  n_estimators=65, max_depth=6, eta=0.5, score=0.685, total=   9.6s
[CV] n_estimators=65, max_depth=6, eta=0.5 ...........................
[CV]  n_estimators=65, max_depth=6, eta=0.5, score=0.730, total=   9.1s
[CV] n_estimators=65, max_depth=6, eta=0.5 ...........................
[CV]  n_estimators=65, max_depth=6, eta=0.5, score=0.734, total=   9.5s
[CV] n_estimators=65, max_depth=6, eta=0.5 ...........................

[CV]  n_estimators=25, max_depth=14, eta=0.3, score=0.703, total=   7.6s
[CV] n_estimators=25, max_depth=14, eta=0.3 ..........................
[CV]  n_estimators=25, max_depth=14, eta=0.3, score=0.669, total=   7.6s
[CV] n_estimators=25, max_depth=14, eta=0.3 ..........................
[CV]  n_estimators=25, max_depth=14, eta=0.3, score=0.722, total=   8.4s
[CV] n_estimators=25, max_depth=14, eta=0.3 ..........................
[CV]  n_estimators=25, max_depth=14, eta=0.3, score=0.715, total=   7.6s
[CV] n_estimators=25, max_depth=14, eta=0.3 ..........................
[CV]  n_estimators=25, max_depth=14, eta=0.3, score=0.686, total=   7.6s
[CV] n_estimators=15, max_depth=6, eta=0.1 ...........................
[CV]  n_estimators=15, max_depth=6, eta=0.1, score=0.707, total=   3.6s
[CV] n_estimators=15, max_depth=6, eta=0.1 ...........................
[CV]  n_estimators=15, max_depth=6, eta=0.1, score=0.675, total=   3.3s
[CV] n_estimators=15, max_depth=6, eta=0.1 ......................

[CV]  n_estimators=75, max_depth=14, eta=0.1, score=0.724, total=  21.6s
[CV] n_estimators=75, max_depth=14, eta=0.1 ..........................
[CV]  n_estimators=75, max_depth=14, eta=0.1, score=0.705, total=  22.1s
[CV] n_estimators=75, max_depth=4, eta=0.3 ...........................
[CV]  n_estimators=75, max_depth=4, eta=0.3, score=0.722, total=  12.2s
[CV] n_estimators=75, max_depth=4, eta=0.3 ...........................
[CV]  n_estimators=75, max_depth=4, eta=0.3, score=0.690, total=  15.0s
[CV] n_estimators=75, max_depth=4, eta=0.3 ...........................
[CV]  n_estimators=75, max_depth=4, eta=0.3, score=0.726, total=  11.2s
[CV] n_estimators=75, max_depth=4, eta=0.3 ...........................
[CV]  n_estimators=75, max_depth=4, eta=0.3, score=0.744, total=  10.5s
[CV] n_estimators=75, max_depth=4, eta=0.3 ...........................
[CV]  n_estimators=75, max_depth=4, eta=0.3, score=0.723, total=  11.0s
[CV] n_estimators=45, max_depth=4, eta=0.5 .........................

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 26.1min finished


In [23]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.2, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.200000003, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=95, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
print("Accuracy:",accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average='micro'))
print("Precision:", precision_score(y_test, y_pred, average='micro'))
print("F1:", f1_score(y_test, y_pred, average='micro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7309746328437917
Recall: 0.7309746328437917
Precision: 0.7309746328437917
F1: 0.7309746328437917
Confusion matrix:
 [[857  72  14  13   0]
 [162 205   3   2   0]
 [ 49  18  14   0   0]
 [ 59   5   0  19   0]
 [  6   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.76      0.90      0.82       956
           1       0.68      0.55      0.61       372
           2       0.45      0.17      0.25        81
           3       0.56      0.23      0.32        83
           4       0.00      0.00      0.00         6

    accuracy                           0.73      1498
   macro avg       0.49      0.37      0.40      1498
weighted avg       0.71      0.73      0.71      1498



  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
from sklearn.svm import SVC

In [32]:
svcclassifier = SVC(probability=True)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4, 5, 6, 7]
}

rand_svc = RandomizedSearchCV(estimator=svcclassifier,
                              param_distributions=param_grid,
                              n_jobs=-1,
                              cv=3,
                              verbose=3,
                              scoring=f1)

rand_svc.fit(X_train, y_train)

y_pred = rand_svc.predict(X_test)
y_proba = rand_svc.predict_proba(X_test)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.0min finished


In [33]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7356475300400535
Recall: 0.7356475300400535
Precision: 0.7171788264818796
F1: 0.7016976912629467
ROC AUC: 0.8208184199367197
Confusion matrix:
 [[872  74   4   6   0]
 [159 211   2   0   0]
 [ 53  18  10   0   0]
 [ 68   6   0   9   0]
 [  6   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.75      0.91      0.82       956
           1       0.68      0.57      0.62       372
           2       0.62      0.12      0.21        81
           3       0.60      0.11      0.18        83
           4       0.00      0.00      0.00         6

    accuracy                           0.74      1498
   macro avg       0.53      0.34      0.37      1498
weighted avg       0.72      0.74      0.70      1498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
