In [1]:
import pandas as pd
import numpy as np
import os
import dill as pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from scipy.stats import loguniform
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

In [2]:
def load_sentence_embeddings(model='DistilBERT', features_path='features/', filename='sentence_embeddings'):
    if model == 'DistilBERT':
        tst_df = pd.concat([pickle.load(open(os.path.join(features_path, file), 'rb')) for file in os.listdir(features_path) if filename in file])
        return tst_df

In [3]:
tst_df = load_sentence_embeddings(features_path="features_250/")

In [4]:
tst_df.shape

(22332, 2)

In [5]:
tst_df.head()

Unnamed: 0,sentence_embeddings,label
0,"[-0.185443714261055, -0.11448108404874802, -0....",0
1,"[-0.3724660873413086, 0.04101637750864029, -0....",0
2,"[-0.41084980964660645, -0.1713167279958725, -0...",0
3,"[-0.14235153794288635, 0.19862940907478333, -0...",0
4,"[-0.47683459520339966, -0.040994927287101746, ...",0


In [6]:
tst_df.label.value_counts()

0    14888
1     7444
Name: label, dtype: int64

In [7]:
features = np.array(tst_df.sentence_embeddings.tolist())

In [8]:
labels = tst_df['label']

In [9]:
model = LogisticRegression()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [10]:
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 100)

In [11]:
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

In [12]:
# execute search
result = search.fit(features, labels)

7080 fits failed out of a total of 15000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1590 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------

In [13]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.9741925564325463
Best Hyperparameters: {'C': 4.0024482044283785, 'penalty': 'l2', 'solver': 'liblinear'}


In [14]:
# execute search
result = search.fit(features, labels)

KeyboardInterrupt: 

In [None]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
model1 = LogisticRegression()
cv1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [None]:
space1 = dict()
space1['solver'] = ['liblinear']
space1['penalty'] = ['none', 'l2']
space1['class_weight'] = ["balanced"]
space1['C'] = loguniform(1e-5, 100)

In [None]:
search1 = RandomizedSearchCV(model1, space1, n_iter=100, scoring='f1', n_jobs=-1, cv=cv1, random_state=1)

In [None]:
# execute search
result1 = search1.fit(features, labels)

In [None]:
# summarize result
print('Best Score: %s' % result1.best_score_)
print('Best Hyperparameters: %s' % result1.best_params_)

In [9]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [10]:
best_lr_clf = LogisticRegression(C=4, class_weight='balanced', penalty='l2', solver='liblinear')

In [11]:
best_lr_clf.fit(train_features, train_labels)

In [12]:
predictions = best_lr_clf.predict(test_features)
probs = best_lr_clf.predict_proba(test_features)[:, 1]

In [13]:
result_table = [["Classifier", "F1", "Accuracy", "AUC"]]

In [14]:
result_table.append([
    "LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [15]:
result_table

[['Classifier', 'F1', 'Accuracy', 'AUC'],
 ["LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",
  0.956,
  0.971,
  0.996]]

In [16]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+--------------------------------------------------------------------------------------+-------+------------+-------+
| Classifier                                                                           |    F1 |   Accuracy |   AUC |
| LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear') | 0.956 |      0.971 | 0.996 |
+--------------------------------------------------------------------------------------+-------+------------+-------+


# KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
knn_model = KNeighborsClassifier(n_jobs=1)

In [16]:
knn_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [17]:
knn_space = dict()
knn_space['weights'] = ['distance', 'uniform']
knn_space['n_neighbors'] = list(range(5, 50, 5))
knn_space['algorithm'] = ['auto', 'ball_tree', 'kd_tree']
knn_space['p'] = [1, 2]

In [18]:
knn_search = RandomizedSearchCV(knn_model, knn_space, n_iter=100, scoring='f1', n_jobs=24, cv=knn_cv, random_state=1)

In [19]:
# execute search
knn_result = knn_search.fit(features, labels)

In [20]:
# summarize result
print('Best Score: %s' % knn_result.best_score_)
print('Best Hyperparameters: %s' % knn_result.best_params_)

Best Score: 0.8777423494343544
Best Hyperparameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 20, 'algorithm': 'kd_tree'}


In [23]:
best_knn_est = knn_result.best_estimator_

In [24]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [25]:
best_knn_est.fit(train_features, train_labels)

In [27]:
predictions = best_knn_est.predict(test_features)
probs = best_knn_est.predict_proba(test_features)[:, 1]

In [38]:
params_str = ', '.join([f"{k}:{v}" for k,v in best_knn_est.get_params().items()])
params_str

'algorithm:kd_tree, leaf_size:30, metric:minkowski, metric_params:None, n_jobs:1, n_neighbors:20, p:2, weights:distance'

In [39]:
result_table.append([
    f"KNeighborsClassifier({params_str})",
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

NameError: name 'result_table' is not defined

In [19]:
best_knn_clf = KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)

In [20]:
best_knn_clf.fit(train_features, train_labels)

In [21]:
predictions = best_knn_clf.predict(test_features)
probs = best_knn_clf.predict_proba(test_features)[:, 1]

In [34]:
result_table = [["F1", "Accuracy", "AUC"]]

In [22]:
result_table.append([
    "KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)",
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [23]:
result_table

[['Classifier', 'F1', 'Accuracy', 'AUC'],
 ["LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",
  0.956,
  0.971,
  0.996],
 ["KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)",
  0.866,
  0.911,
  0.967]]

In [24]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+-------------------------------------------------------------------------------------------------+-------+------------+-------+
| Classifier                                                                                      |    F1 |   Accuracy |   AUC |
| LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')            | 0.956 |      0.971 | 0.996 |
+-------------------------------------------------------------------------------------------------+-------+------------+-------+
| KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1) | 0.866 |      0.911 | 0.967 |
+-------------------------------------------------------------------------------------------------+-------+------------+-------+


# DT

In [25]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [24]:
dt_space = dict()
dt_space['random_state'] = [1]
dt_space['criterion'] = ['gini', 'entropy']
dt_space['max_depth'] = [50, 100, 500, 1000]
dt_space['splitter'] = ['best', 'random']
dt_space['class_weight'] = [None, 'balanced']
dt_space['max_features'] = [None, 'auto', 'sqrt', 'log2']

In [25]:
dt_search = RandomizedSearchCV(dt_model, dt_space, n_iter=100, scoring='f1', n_jobs=-1, cv=dt_cv, random_state=1)

In [26]:
# execute search
dt_result = dt_search.fit(features, labels)

In [27]:
# summarize result
print('Best Score: %s' % dt_result.best_score_)
print('Best Hyperparameters: %s' % dt_result.best_params_)

Best Score: 0.7903791474939794
Best Hyperparameters: {'splitter': 'best', 'random_state': 1, 'max_features': None, 'max_depth': 100, 'criterion': 'entropy', 'class_weight': 'balanced'}


In [26]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [27]:
best_dt_clf = DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1,
                                     max_features=None, max_depth=100, class_weight='balanced')

In [28]:
best_dt_clf.fit(train_features, train_labels)

In [29]:
predictions = best_dt_clf.predict(test_features)
probs = best_dt_clf.predict_proba(test_features)[:, 1]

In [42]:
result_table = [["F1", "Accuracy", "AUC"]]

In [30]:
result_table.append([
    "DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1, max_features=None, max_depth=100, class_weight='balanced')",
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [31]:
result_table

[['Classifier', 'F1', 'Accuracy', 'AUC'],
 ["LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",
  0.956,
  0.971,
  0.996],
 ["KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)",
  0.866,
  0.911,
  0.967],
 ["DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1, max_features=None, max_depth=100, class_weight='balanced')",
  0.722,
  0.819,
  0.793]]

In [32]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+-----------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------+
| Classifier                                                                                                                              |    F1 |   Accuracy |   AUC |
| LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')                                                    | 0.956 |      0.971 | 0.996 |
+-----------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------+
| KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)                                         | 0.866 |      0.911 | 0.967 |
+-----------------------------------------------------------------------------------------------------------------------------------------+-------+--------

In [33]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.DataFrame(result_table[1:], columns=['Classifier', 'F1', 'Accuracy', 'AUC'])

Unnamed: 0,Classifier,F1,Accuracy,AUC
0,"LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",0.956,0.971,0.996
1,"KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)",0.866,0.911,0.967
2,"DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1, max_features=None, max_depth=100, class_weight='balanced')",0.722,0.819,0.793


# more iter on dt

In [46]:
from sklearn.tree import DecisionTreeClassifier

dt_more_model = DecisionTreeClassifier()
dt_more_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [47]:
dt_more_space = dict()
dt_more_space['random_state'] = [1]
dt_more_space['criterion'] = ['gini', 'entropy']
dt_more_space['max_depth'] = list(range(1, 101))
dt_more_space['splitter'] = ['best', 'random']
dt_more_space['class_weight'] = [None, 'balanced']
dt_more_space['max_features'] = [None, 'auto', 'sqrt', 'log2']

In [48]:
dt_more_search = RandomizedSearchCV(dt_more_model, dt_more_space, 
                                    n_iter=500, scoring='f1', n_jobs=-1, cv=dt_more_cv, random_state=1)

In [49]:
# execute search
dt_more_result = dt_more_search.fit(features, labels)

270 fits failed out of a total of 15000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 889, in fit
    super().fit(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 177, in fit
    self._validate_params()
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\Anaconda3\lib\site

In [50]:
# summarize result
print('Best Score: %s' % dt_more_result.best_score_)
print('Best Hyperparameters: %s' % dt_more_result.best_params_)

Best Score: 0.796373652263697
Best Hyperparameters: {'splitter': 'best', 'random_state': 1, 'max_features': None, 'max_depth': 10, 'criterion': 'gini', 'class_weight': 'balanced'}


In [54]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [55]:
best_dt_more_clf = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1,
                                     max_features=None, max_depth=10, class_weight='balanced')

In [56]:
best_dt_more_clf.fit(train_features, train_labels)

In [57]:
predictions = best_dt_more_clf.predict(test_features)
probs = best_dt_more_clf.predict_proba(test_features)[:, 1]

In [55]:
result_table = [["F1", "Accuracy", "AUC"]]

In [58]:
result_table.append([
    "DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1, max_features=None, max_depth=10, class_weight='balanced')",
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [59]:
result_table

[['Classifier', 'F1', 'Accuracy', 'AUC'],
 ["LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",
  0.956,
  0.971,
  0.996],
 ["KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)",
  0.866,
  0.911,
  0.967],
 ["DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1, max_features=None, max_depth=100, class_weight='balanced')",
  0.722,
  0.819,
  0.793],
 ["DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1, max_features=None, max_depth=10, class_weight='balanced')",
  0.865,
  0.91,
  0.969],
 ["DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1, max_features=None, max_depth=10, class_weight='balanced')",
  0.762,
  0.827,
  0.811]]

# RF

In [60]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_jobs=25)

In [10]:
rf_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [19]:
rf_space = dict()
rf_space['random_state'] = [1]
rf_space['criterion'] = ['gini', 'entropy']
rf_space['max_depth'] = list(range(1, 101, 10))
rf_space['n_estimators'] = list(range(1, 100, 10))
rf_space['class_weight'] = [None, 'balanced', 'balanced_subsample']
rf_space['max_features'] = [None, 'auto', 'sqrt', 'log2']

rf_search = RandomizedSearchCV(rf_model, rf_space, 
                                    n_iter=10, scoring='f1', cv=rf_cv, random_state=1)

In [20]:
from datetime import datetime

In [21]:
start = datetime.now()
rf_result = rf_search.fit(features, labels)
end = datetime.now()
print(end - start)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


2:48:44.272427


In [23]:
end, start

(datetime.datetime(2023, 10, 9, 13, 19, 3, 260869),
 datetime.datetime(2023, 10, 9, 10, 30, 18, 988442))

In [24]:
# summarize result
print('Best Score: %s' % rf_result.best_score_)
print('Best Hyperparameters: %s' % rf_result.best_params_)

Best Score: 0.9085199276170173
Best Hyperparameters: {'random_state': 1, 'n_estimators': 81, 'max_features': 'log2', 'max_depth': 11, 'criterion': 'entropy', 'class_weight': 'balanced'}


In [61]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [62]:
rf_clf = RandomForestClassifier(criterion='entropy', random_state=1,
                                n_estimators=81, max_features='log2', max_depth=11, class_weight='balanced')

In [63]:
rf_clf.fit(train_features, train_labels)

In [64]:
predictions = rf_clf.predict(test_features)
probs = rf_clf.predict_proba(test_features)[:, 1]

In [15]:
result_table = [["F1", "Accuracy", "AUC"]]

In [65]:
result_table.append([
    "RandomForestClassifier(criterion='entropy', random_state=1, n_estimators=81, max_features='log2', max_depth=11, class_weight='balanced')",
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [66]:
result_table

[['Classifier', 'F1', 'Accuracy', 'AUC'],
 ["LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",
  0.956,
  0.971,
  0.996],
 ["KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)",
  0.866,
  0.911,
  0.967],
 ["DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1, max_features=None, max_depth=100, class_weight='balanced')",
  0.722,
  0.819,
  0.793],
 ["DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1, max_features=None, max_depth=10, class_weight='balanced')",
  0.865,
  0.91,
  0.969],
 ["DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1, max_features=None, max_depth=10, class_weight='balanced')",
  0.762,
  0.827,
  0.811],
 ["RandomForestClassifier(criterion='entropy', random_state=1, n_estimators=81, max_features='log2', max_depth=11, class_weight='balanced')",
  0.873,
  0.919,
  0.972]]

In [67]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------+
| Classifier                                                                                                                               |    F1 |   Accuracy |   AUC |
| LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')                                                     | 0.956 |      0.971 | 0.996 |
+------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------+
| KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)                                          | 0.866 |      0.911 | 0.967 |
+------------------------------------------------------------------------------------------------------------------------------------------+-------+--

In [69]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
results_df = pd.DataFrame(result_table[1:], columns=['Classifier', 'F1', 'Accuracy', 'AUC'])
results_df

Unnamed: 0,Classifier,F1,Accuracy,AUC
0,"LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')",0.956,0.971,0.996
1,"KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)",0.866,0.911,0.967
2,"DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1, max_features=None, max_depth=100, class_weight='balanced')",0.722,0.819,0.793
3,"DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1, max_features=None, max_depth=10, class_weight='balanced')",0.865,0.91,0.969
4,"DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1, max_features=None, max_depth=10, class_weight='balanced')",0.762,0.827,0.811
5,"RandomForestClassifier(criterion='entropy', random_state=1, n_estimators=81, max_features='log2', max_depth=11, class_weight='balanced')",0.873,0.919,0.972
