In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score
from sklearn.metrics import accuracy_score, roc_auc_score

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
data_titles = pd.read_csv("titles_bert.csv", sep=';', index_col=0)
scaled_data_titles = pd.DataFrame(data=StandardScaler().fit_transform(data_titles.drop('tag', axis='columns')),
                                  columns=data_titles.drop('tag', axis='columns').columns)
scaled_data_titles['tag'] = data_titles['tag']
scaled_data_titles

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,tag
0,1.982788,0.636280,0.026205,-0.428302,0.806110,0.888753,-0.201266,-0.300143,1.516755,-1.695836,...,-0.384019,0.451372,0.143713,0.639740,1.180286,-0.398132,-0.734662,-1.755183,1.785207,society
1,-0.308032,0.178448,-0.578047,0.626585,-0.399080,-0.137572,-0.427407,0.737020,0.634622,1.077456,...,1.202575,0.852453,-0.781315,-1.706979,0.431412,0.679543,0.179937,1.562960,1.246345,society
2,1.036312,-0.618237,0.757178,0.199037,-1.895047,1.105581,-0.267929,-0.452879,-0.038079,-1.298838,...,-0.835760,2.246387,-0.999044,0.241686,1.946924,-0.055742,-0.247282,1.241116,0.797510,society
3,-0.593587,0.111944,1.032377,1.098603,-0.897947,-1.040369,1.022640,-1.185127,-1.237272,-1.624552,...,-1.391719,0.759933,-0.637234,-0.395446,1.167238,-1.146398,0.504509,-1.403000,0.008811,society
4,-0.736228,0.762001,-1.717402,1.079271,-0.367102,-1.474535,-0.164572,0.317099,-0.656236,1.037916,...,-0.641369,0.269537,-0.918724,1.950812,-1.488772,-1.730112,-1.159047,-0.537253,1.700184,society
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4988,-0.842281,0.289164,0.404224,-0.588546,0.497505,-1.841256,0.768579,0.242663,-1.129596,-0.353730,...,0.123436,0.235004,-0.730740,1.318898,-0.117078,-0.380366,1.444778,0.380130,1.716711,politics
4989,-0.814783,0.903289,-0.649079,0.558818,0.581950,-1.087061,1.973154,0.881107,-1.676159,0.519693,...,0.965649,1.686916,0.233233,0.193365,-0.781087,0.599595,1.416381,0.227833,-0.548911,society
4990,1.652728,0.941576,-0.791736,0.359181,-0.238372,0.854603,0.298399,-0.103699,1.117458,2.126494,...,0.129304,0.222394,1.293075,1.427311,1.066479,1.203590,-0.287603,0.341568,0.421119,society
4991,-0.343395,-1.077136,0.081769,-0.961482,-0.401869,0.253316,0.559567,2.216032,-1.036015,-0.253903,...,0.321489,-0.502074,0.503779,0.395119,1.464168,1.493723,-1.525271,0.887973,1.123517,science


In [5]:
data_texts = pd.read_csv("texts_bert.csv", sep=';', index_col=0)
scaled_data_texts = pd.DataFrame(data=StandardScaler().fit_transform(data_texts.drop('tag', axis='columns')),
                                  columns=data_texts.drop('tag', axis='columns').columns)
scaled_data_texts['tag'] = data_texts['tag']
scaled_data_texts

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,tag
0,0.835533,1.376226,2.992430,0.101940,-1.560078,-0.084151,0.984520,-2.708209,1.797536,0.690925,...,-0.442746,-0.848235,-0.898623,0.116909,1.327458,-1.735393,1.354291,-0.653356,-0.849807,society
1,0.861976,0.432571,-0.423595,-0.825834,0.508335,1.655087,-0.374494,-2.021640,0.569279,0.298125,...,-0.540152,1.072514,-0.768824,-0.287196,-0.153840,-0.046399,-0.225241,-1.406981,-1.307059,society
2,0.506031,1.236236,-1.392696,0.737111,1.641614,-0.780017,-0.627772,-1.115202,1.615938,-0.574158,...,0.701086,1.249610,1.859676,-0.953304,-0.232084,-0.815959,-2.889244,0.669419,1.163325,society
3,0.434785,-0.851983,-0.677896,-1.619145,0.995476,0.820249,-0.594013,0.198919,0.801927,1.718985,...,0.006928,-0.194544,-0.749033,-0.598448,0.075641,0.350724,-0.852142,0.327182,0.480588,society
4,0.287372,0.493638,0.352622,1.077253,-0.140485,-0.501860,0.450711,-0.390699,0.920146,1.526652,...,0.083041,1.456687,0.285124,0.350316,-1.160809,-0.667480,-0.959954,0.791802,-0.572998,society
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4988,1.717120,1.008547,0.342709,-1.318128,-0.329470,1.016126,-0.165130,-0.755877,-0.461993,-1.174749,...,0.066041,2.050252,0.018005,-0.161464,1.291077,-0.485754,-0.440894,0.465067,-0.207532,politics
4989,-0.999162,-0.686018,0.999208,-0.547198,-0.460379,-1.151463,-0.203387,0.443674,-1.558448,-1.492466,...,0.243955,-1.069245,0.360546,1.479754,0.086263,-1.006575,0.919041,-0.183685,-0.740854,society
4990,-0.167977,0.475681,0.611242,0.041531,-1.619936,0.484497,0.760639,1.080717,-1.764248,-0.472329,...,-0.487375,0.597081,-0.013648,-0.709787,-0.301182,0.131828,0.539551,0.702470,-0.388982,society
4991,-1.577400,0.514657,0.281482,0.618723,-0.428352,-1.068472,0.721227,0.323401,-1.695198,-0.797388,...,-0.524988,0.717161,-0.267880,0.783461,0.940894,0.131928,1.141258,0.760456,-0.377740,science


In [6]:
def encode_tag(data):
    dic = {
        'society': 0,
        'politics': 1,
        'economics': 2,
        'science': 3,
        'books': 4
    }
    new_tags = []
    for tag in data['tag']:
        new_tags.append(dic[tag])
    data['tag'] = new_tags

In [7]:
encode_tag(scaled_data_titles)
encode_tag(scaled_data_texts)

In [8]:
scaled_data_titles.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,tag
0,1.982788,0.63628,0.026205,-0.428302,0.80611,0.888753,-0.201266,-0.300143,1.516755,-1.695836,...,-0.384019,0.451372,0.143713,0.63974,1.180286,-0.398132,-0.734662,-1.755183,1.785207,0
1,-0.308032,0.178448,-0.578047,0.626585,-0.39908,-0.137572,-0.427407,0.73702,0.634622,1.077456,...,1.202575,0.852453,-0.781315,-1.706979,0.431412,0.679543,0.179937,1.56296,1.246345,0
2,1.036312,-0.618237,0.757178,0.199037,-1.895047,1.105581,-0.267929,-0.452879,-0.038079,-1.298838,...,-0.83576,2.246387,-0.999044,0.241686,1.946924,-0.055742,-0.247282,1.241116,0.79751,0
3,-0.593587,0.111944,1.032377,1.098603,-0.897947,-1.040369,1.02264,-1.185127,-1.237272,-1.624552,...,-1.391719,0.759933,-0.637234,-0.395446,1.167238,-1.146398,0.504509,-1.403,0.008811,0
4,-0.736228,0.762001,-1.717402,1.079271,-0.367102,-1.474535,-0.164572,0.317099,-0.656236,1.037916,...,-0.641369,0.269537,-0.918724,1.950812,-1.488772,-1.730112,-1.159047,-0.537253,1.700184,0
5,-0.224977,1.693298,0.473479,-0.954427,0.37701,-1.578833,2.125805,-3.194222,0.424827,-0.975309,...,-1.356036,-0.708303,-1.371023,0.028115,-0.036139,0.108983,0.231719,-0.189435,-1.702182,1
6,1.574279,-0.379246,0.565063,0.086522,-0.538655,-0.409614,-1.290103,1.963522,-0.471619,-0.451271,...,0.571814,2.130367,0.379167,-1.005335,1.584527,-0.99851,-0.297637,0.429823,2.185497,1
7,-0.745556,1.381918,-1.321507,2.028954,0.797043,0.502394,-0.935365,2.779975,0.156413,0.634446,...,-1.006352,0.387942,-0.069132,0.974073,-0.769685,-2.206356,-1.088934,0.563868,0.053441,1
8,-0.219547,-0.107227,-0.18334,0.736846,-0.986749,-0.181782,-1.475464,-0.246313,0.413363,-0.110747,...,0.319769,0.895669,1.007312,0.157346,0.592778,0.171054,-1.016979,0.462089,0.266908,0
9,-0.765229,0.537559,0.56872,-0.384725,0.776401,1.380154,1.650151,-0.12534,0.491573,0.057383,...,-0.211182,-0.50568,1.74333,-0.111853,0.92134,-0.324186,-0.861971,-0.511207,0.699923,1


In [9]:
X, y = scaled_data_titles.drop('tag', axis = 'columns'), scaled_data_titles['tag']
X_t, y_t = scaled_data_texts.drop('tag', axis = 'columns'), scaled_data_texts['tag']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_t, y_t, test_size=0.25)

# Title

## KNN

In [11]:
param_distributions = {'n_neighbors': range(10, 501, 10)}
f1 = make_scorer(f1_score, average='weighted')  # for scoring param
rand_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                              param_distributions=param_distributions,
                              cv=3,
                              verbose=3,
                              n_iter = 50,
                              n_jobs=4,
                              scoring=f1)
rand_knn.fit(X_train, y_train)
y_pred = rand_knn.predict(X_test)
y_prob = rand_knn.predict_proba(X_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  6.5min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:  7.8min finished


In [12]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6293034427542034
Recall: 0.6293034427542034
Precision: 0.4824442837709277
F1: 0.5163532957693111
ROC AUC: 0.5215788338967055
Confusion matrix:
 [[771  30   0   0   0]
 [302  15   0   0   0]
 [ 58   6   0   0   0]
 [ 60   5   0   0   0]
 [  2   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.65      0.96      0.77       801
           1       0.27      0.05      0.08       317
           2       0.00      0.00      0.00        64
           3       0.00      0.00      0.00        65
           4       0.00      0.00      0.00         2

    accuracy                           0.63      1249
   macro avg       0.18      0.20      0.17      1249
weighted avg       0.48      0.63      0.52      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1))
param_grid = {
    'n_estimators': range(10, 301, 20),
    'learning_rate': [0.1, 0.25, 0.5, 0.75, 1]
}

randdAdaBoost = RandomizedSearchCV(estimator=classifier,
                                   param_distributions=param_grid,
                                   cv=2,
                                   n_iter=50,
                                   verbose=3,
                                   scoring=f1,
                                   n_jobs=4)

randdAdaBoost.fit(X_train, y_train)
y_pred = randdAdaBoost.predict(X_test)
y_prob = randdAdaBoost.predict_proba(X_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 14.5min finished


In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6309047237790232
Recall: 0.6309047237790232
Precision: 0.4818294133337264
F1: 0.5098530163339777
ROC AUC: 0.5307793300137135
Confusion matrix:
 [[778  23   0   0   0]
 [307  10   0   0   0]
 [ 63   1   0   0   0]
 [ 63   2   0   0   0]
 [  2   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.64      0.97      0.77       801
           1       0.28      0.03      0.06       317
           2       0.00      0.00      0.00        64
           3       0.00      0.00      0.00        65
           4       0.00      0.00      0.00         2

    accuracy                           0.63      1249
   macro avg       0.18      0.20      0.17      1249
weighted avg       0.48      0.63      0.51      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [16]:
from sklearn.svm import SVC

In [21]:
svcclassifier = SVC(probability=True)
param_grid = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C' : [0.1, 1, 10, 100, 1000],
    'gamma' : [1, 0.1, 0.01, 0.001, 0.0001],
    'degree' : [2,3,4]
}

rand_svc = RandomizedSearchCV(estimator=svcclassifier,
                              param_distributions=param_grid,
                              scoring=f1,
                              n_jobs=4,
                              cv=2,
                              verbose=3,
                              refit=True)

rand_svc.fit(X_train, y_train)

y_pred = rand_svc.predict(X_test)
y_proba = rand_svc.predict_proba(X_test)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:  8.3min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:  8.3min finished


In [22]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5804643714971978
Recall: 0.5804643714971978
Precision: 0.47505080093130203
F1: 0.514246108799376
ROC AUC: 0.5307793300137135
Confusion matrix:
 [[681 112   5   3   0]
 [270  44   1   2   0]
 [ 56   8   0   0   0]
 [ 54  11   0   0   0]
 [  1   0   1   0   0]]
              precision    recall  f1-score   support

           0       0.64      0.85      0.73       801
           1       0.25      0.14      0.18       317
           2       0.00      0.00      0.00        64
           3       0.00      0.00      0.00        65
           4       0.00      0.00      0.00         2

    accuracy                           0.58      1249
   macro avg       0.18      0.20      0.18      1249
weighted avg       0.48      0.58      0.51      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## LightGBM

In [24]:
from lightgbm import LGBMClassifier

In [25]:
gbm = LGBMClassifier()
param_grid = {
    'n_estimators': range(10, 301, 20),
    'learning_rate': [0.1, 0.25, 0.5, 0.75, 1]
}
randgbm = RandomizedSearchCV(estimator=gbm,
                             param_distributions=param_grid,
                             cv=2,
                             verbose=3,
                             n_iter=50,
                             scoring=f1,
                             n_jobs=4)
randgbm.fit(X_train, y_train)
y_pred = randgbm.predict(X_test)
y_prob = randgbm.predict_proba(X_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  7.9min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 28.0min finished


In [29]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision:", precision_score(y_test, y_pred, average = 'weighted'))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test, y_prob, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6172938350680545
Recall: 0.6172938350680545
Precision: 0.4909999886228291
F1: 0.5181056565054468
ROC AUC: 0.5352592299273472
Confusion matrix:
 [[748  45   6   2   0]
 [291  23   1   2   0]
 [ 62   2   0   0   0]
 [ 61   4   0   0   0]
 [  2   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.64      0.93      0.76       801
           1       0.31      0.07      0.12       317
           2       0.00      0.00      0.00        64
           3       0.00      0.00      0.00        65
           4       0.00      0.00      0.00         2

    accuracy                           0.62      1249
   macro avg       0.19      0.20      0.18      1249
weighted avg       0.49      0.62      0.52      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Texts

## KNN

In [30]:
param_distributions = {'n_neighbors': range(10, 501, 10)}
f1 = make_scorer(f1_score, average='weighted')  # for scoring param
rand_knn_t = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                                param_distributions=param_distributions,
                                cv=3,
                                verbose=3,
                                n_iter=50,
                                n_jobs=-1,
                                scoring=f1)
rand_knn_t.fit(X_train_t, y_train_t)
y_pred_t = rand_knn_t.predict(X_test_t)
y_prob_t = rand_knn_t.predict_proba(X_test_t)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.9min finished


In [31]:
print("Accuracy:", accuracy_score(y_test_t, y_pred_t))
print("Recall:", recall_score(y_test_t, y_pred_t, average = 'weighted'))
print("Precision:", precision_score(y_test_t, y_pred_t, average = 'weighted'))
print("F1:", f1_score(y_test_t, y_pred_t, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test_t, y_prob_t, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test_t, y_pred_t))
print(classification_report(y_test_t, y_pred_t))

Accuracy: 0.6036829463570856
Recall: 0.6036829463570856
Precision: 0.4671615367776791
F1: 0.5033427184926162
ROC AUC: 0.520161300171576
Confusion matrix:
 [[730  56   1   0   0]
 [301  24   0   1   0]
 [ 60   5   0   0   0]
 [ 66   4   0   0   0]
 [  0   1   0   0   0]]
              precision    recall  f1-score   support

           0       0.63      0.93      0.75       787
           1       0.27      0.07      0.12       326
           2       0.00      0.00      0.00        65
           3       0.00      0.00      0.00        70
           4       0.00      0.00      0.00         1

    accuracy                           0.60      1249
   macro avg       0.18      0.20      0.17      1249
weighted avg       0.47      0.60      0.50      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## AdaBoost

In [32]:
classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1))
param_grid = {
    'n_estimators': range(10, 301, 20),
    'learning_rate': [0.1, 0.25, 0.5, 0.75, 1]
}

randdAdaBoost_t = RandomizedSearchCV(estimator=classifier,
                                   param_distributions=param_grid,
                                   cv=2,
                                   n_iter=50,
                                   verbose=3,
                                   scoring=f1,
                                   n_jobs=-1)

randdAdaBoost_t.fit(X_train_t, y_train_t)
y_pred_t = randdAdaBoost_t.predict(X_test_t)
y_prob_t = randdAdaBoost_t.predict_proba(X_test_t)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.7min finished


In [33]:
print("Accuracy:", accuracy_score(y_test_t, y_pred_t))
print("Recall:", recall_score(y_test_t, y_pred_t, average = 'weighted'))
print("Precision:", precision_score(y_test_t, y_pred_t, average = 'weighted'))
print("F1:", f1_score(y_test_t, y_pred_t, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test_t, y_prob_t, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test_t, y_pred_t))
print(classification_report(y_test_t, y_pred_t))

Accuracy: 0.6236989591673339
Recall: 0.6236989591673339
Precision: 0.39584764585862237
F1: 0.4843128572912598
ROC AUC: 0.5024652256468304
Confusion matrix:
 [[779   3   4   0   1]
 [325   0   1   0   0]
 [ 65   0   0   0   0]
 [ 70   0   0   0   0]
 [  1   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.63      0.99      0.77       787
           1       0.00      0.00      0.00       326
           2       0.00      0.00      0.00        65
           3       0.00      0.00      0.00        70
           4       0.00      0.00      0.00         1

    accuracy                           0.62      1249
   macro avg       0.13      0.20      0.15      1249
weighted avg       0.40      0.62      0.48      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [34]:
svcclassifier = SVC(probability=True)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4, 5, 6, 7]
}

rand_svc_t = RandomizedSearchCV(estimator=svcclassifier,
                              param_distributions=param_grid,
                              scoring=f1,
                              n_jobs=-1,
                              cv=2,
                              verbose=1)

rand_svc_t.fit(X_train_t, y_train_t)

y_pred_t = rand_svc_t.predict(X_test_t)
y_proba_t = rand_svc_t.predict_proba(X_test_t)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.4min finished


In [35]:
print("Accuracy:", accuracy_score(y_test_t, y_pred_t))
print("Recall:", recall_score(y_test_t, y_pred_t, average = 'weighted'))
print("Precision:", precision_score(y_test_t, y_pred_t, average = 'weighted'))
print("F1:", f1_score(y_test_t, y_pred_t, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test_t, y_prob_t, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test_t, y_pred_t))
print(classification_report(y_test_t, y_pred_t))

Accuracy: 0.6188951160928743
Recall: 0.6188951160928743
Precision: 0.519177394951231
F1: 0.5208617646432911
ROC AUC: 0.5024652256468304
Confusion matrix:
 [[743  29  15   0   0]
 [292  29   5   0   0]
 [ 60   4   1   0   0]
 [ 63   5   2   0   0]
 [  1   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.64      0.94      0.76       787
           1       0.43      0.09      0.15       326
           2       0.04      0.02      0.02        65
           3       0.00      0.00      0.00        70
           4       0.00      0.00      0.00         1

    accuracy                           0.62      1249
   macro avg       0.22      0.21      0.19      1249
weighted avg       0.52      0.62      0.52      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## LightGBM

In [36]:
gbm = LGBMClassifier()
param_grid = {
    'n_estimators': range(10, 301, 20),
    'learning_rate': [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
}
randgbm_t = RandomizedSearchCV(estimator=gbm,
                               param_distributions=param_grid,
                               cv=2,
                               verbose=3,
                               n_iter=50,
                               scoring=f1,
                               n_jobs=-1)
randgbm_t.fit(X_train_t, y_train_t)
y_pred_t = randgbm_t.predict(X_test_t)
y_prob_t = randgbm_t.predict_proba(X_test_t)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 32.4min finished


In [37]:
print("Accuracy:", accuracy_score(y_test_t, y_pred_t))
print("Recall:", recall_score(y_test_t, y_pred_t, average = 'weighted'))
print("Precision:", precision_score(y_test_t, y_pred_t, average = 'weighted'))
print("F1:", f1_score(y_test_t, y_pred_t, average='weighted'))
print("ROC AUC:", roc_auc_score(y_test_t, y_prob_t, average = 'weighted', multi_class='ovo'))
print("Confusion matrix:\n", confusion_matrix(y_test_t, y_pred_t))
print(classification_report(y_test_t, y_pred_t))

Accuracy: 0.6060848678943155
Recall: 0.6060848678943155
Precision: 0.46069559315092445
F1: 0.49275869469075384
ROC AUC: 0.539903013435937
Confusion matrix:
 [[745  35   4   3   0]
 [311  11   2   2   0]
 [ 62   3   0   0   0]
 [ 66   3   0   1   0]
 [  1   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.63      0.95      0.76       787
           1       0.21      0.03      0.06       326
           2       0.00      0.00      0.00        65
           3       0.17      0.01      0.03        70
           4       0.00      0.00      0.00         1

    accuracy                           0.61      1249
   macro avg       0.20      0.20      0.17      1249
weighted avg       0.46      0.61      0.49      1249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
