In [1]:
import pandas as pd
import numpy as np
import os
import dill as pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from scipy.stats import loguniform
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

In [2]:
def load_sentence_embeddings(model='DistilBERT', features_path='features/', filename='sentence_embeddings'):
    if model == 'DistilBERT':
        tst_df = pd.concat([pickle.load(open(os.path.join(features_path, file), 'rb')) for file in os.listdir(features_path) if filename in file])
        return tst_df

In [3]:
tst_df = load_sentence_embeddings()

In [4]:
tst_df.shape

(22332, 2)

In [5]:
tst_df.head()

Unnamed: 0,sentence_embeddings,label
0,"[-0.21086546778678894, -0.005486507900059223, ...",0
1,"[-0.3340323865413666, 0.1280461549758911, -0.6...",0
2,"[-0.4694300889968872, -0.14065003395080566, -0...",0
3,"[-0.19011789560317993, 0.16007745265960693, -0...",0
4,"[-0.5151359438896179, -0.09672432392835617, -0...",0


In [6]:
tst_df.label.value_counts()

0    14888
1     7444
Name: label, dtype: int64

In [7]:
features = np.array(tst_df.sentence_embeddings.tolist())

In [8]:
labels = tst_df['label']

In [9]:
model = LogisticRegression()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [10]:
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 100)

In [11]:
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

In [12]:
# execute search
result = search.fit(features, labels)

7080 fits failed out of a total of 15000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1590 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------

In [14]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.9837901085732407
Best Hyperparameters: {'C': 2.7687341029930432, 'penalty': 'l1', 'solver': 'liblinear'}


In [12]:
# execute search
result = search.fit(features, labels)

7080 fits failed out of a total of 15000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1590 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------

In [14]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.9837901085732407
Best Hyperparameters: {'C': 2.7687341029930432, 'penalty': 'l1', 'solver': 'liblinear'}


In [18]:
model1 = LogisticRegression()
cv1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [17]:
space1 = dict()
space1['solver'] = ['liblinear']
space1['penalty'] = ['none', 'l2']
space1['class_weight'] = ["balanced"]
space1['C'] = loguniform(1e-5, 100)

In [19]:
search1 = RandomizedSearchCV(model1, space1, n_iter=100, scoring='f1', n_jobs=-1, cv=cv1, random_state=1)

In [20]:
# execute search
result1 = search1.fit(features, labels)

1380 fits failed out of a total of 3000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1380 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 71, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported fo

In [22]:
# summarize result
print('Best Score: %s' % result1.best_score_)
print('Best Hyperparameters: %s' % result1.best_params_)

Best Score: 0.9747971413558439
Best Hyperparameters: {'C': 4.908680809441754, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}


In [23]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [26]:
best_lr_clf = LogisticRegression(C=4.9, class_weight='balanced', penalty='l2', solver='liblinear')

In [28]:
best_lr_clf.fit(train_features, train_labels)

In [29]:
predictions = best_lr_clf.predict(test_features)
probs = best_lr_clf.predict_proba(test_features)[:, 1]

In [30]:
result_table = [["F1", "Accuracy", "AUC"]]

In [31]:
result_table.append([
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [32]:
result_table

[['F1', 'Accuracy', 'AUC'], [0.972, 0.981, 0.998]]

In [33]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+-------+------------+-------+
|    F1 |   Accuracy |   AUC |
| 0.972 |      0.981 | 0.998 |
+-------+------------+-------+


# KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
knn_model = KNeighborsClassifier(n_jobs=-1)

In [11]:
knn_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [12]:
knn_space = dict()
knn_space['weights'] = ['distance', 'uniform']
knn_space['n_neighbors'] = list(range(5, 50, 5))
knn_space['algorithm'] = ['auto', 'ball_tree', 'kd_tree']
knn_space['p'] = [1, 2]

In [13]:
knn_search = RandomizedSearchCV(knn_model, knn_space, n_iter=100, scoring='f1', n_jobs=-1, cv=knn_cv, random_state=1)

In [14]:
# execute search
knn_result = knn_search.fit(features, labels)

In [16]:
# summarize result
print('Best Score: %s' % knn_result.best_score_)
print('Best Hyperparameters: %s' % knn_result.best_params_)

Best Score: 0.9108119417459216
Best Hyperparameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 20, 'algorithm': 'ball_tree'}


In [30]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [31]:
best_knn_clf = KNeighborsClassifier(p=1, weights='distance', n_neighbors=20, algorithm='ball_tree', n_jobs=-1)

In [32]:
best_knn_clf.fit(train_features, train_labels)

In [33]:
predictions = best_knn_clf.predict(test_features)
probs = best_knn_clf.predict_proba(test_features)[:, 1]

In [34]:
result_table = [["F1", "Accuracy", "AUC"]]

In [35]:
result_table.append([
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [36]:
result_table

[['F1', 'Accuracy', 'AUC'], [0.907, 0.936, 0.981]]

In [37]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+-------+------------+-------+
|    F1 |   Accuracy |   AUC |
| 0.907 |      0.936 | 0.981 |
+-------+------------+-------+


# DT

In [17]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [24]:
dt_space = dict()
dt_space['random_state'] = [1]
dt_space['criterion'] = ['gini', 'entropy']
dt_space['max_depth'] = [50, 100, 500, 1000]
dt_space['splitter'] = ['best', 'random']
dt_space['class_weight'] = [None, 'balanced']
dt_space['max_features'] = [None, 'auto', 'sqrt', 'log2']

In [25]:
dt_search = RandomizedSearchCV(dt_model, dt_space, n_iter=100, scoring='f1', n_jobs=-1, cv=dt_cv, random_state=1)

In [26]:
# execute search
dt_result = dt_search.fit(features, labels)

In [27]:
# summarize result
print('Best Score: %s' % dt_result.best_score_)
print('Best Hyperparameters: %s' % dt_result.best_params_)

Best Score: 0.7903791474939794
Best Hyperparameters: {'splitter': 'best', 'random_state': 1, 'max_features': None, 'max_depth': 100, 'criterion': 'entropy', 'class_weight': 'balanced'}


In [38]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [39]:
best_dt_clf = DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=1,
                                     max_features=None, max_depth=100, class_weight='balanced')

In [40]:
best_dt_clf.fit(train_features, train_labels)

In [41]:
predictions = best_dt_clf.predict(test_features)
probs = best_dt_clf.predict_proba(test_features)[:, 1]

In [42]:
result_table = [["F1", "Accuracy", "AUC"]]

In [43]:
result_table.append([
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [44]:
result_table

[['F1', 'Accuracy', 'AUC'], [0.783, 0.857, 0.84]]

In [45]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+-------+------------+-------+
|    F1 |   Accuracy |   AUC |
| 0.783 |      0.857 |  0.84 |
+-------+------------+-------+


# more iter on dt

In [46]:
from sklearn.tree import DecisionTreeClassifier

dt_more_model = DecisionTreeClassifier()
dt_more_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [47]:
dt_more_space = dict()
dt_more_space['random_state'] = [1]
dt_more_space['criterion'] = ['gini', 'entropy']
dt_more_space['max_depth'] = list(range(1, 101))
dt_more_space['splitter'] = ['best', 'random']
dt_more_space['class_weight'] = [None, 'balanced']
dt_more_space['max_features'] = [None, 'auto', 'sqrt', 'log2']

In [48]:
dt_more_search = RandomizedSearchCV(dt_more_model, dt_more_space, 
                                    n_iter=500, scoring='f1', n_jobs=-1, cv=dt_more_cv, random_state=1)

In [49]:
# execute search
dt_more_result = dt_more_search.fit(features, labels)

270 fits failed out of a total of 15000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 889, in fit
    super().fit(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 177, in fit
    self._validate_params()
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\Anaconda3\lib\site

In [50]:
# summarize result
print('Best Score: %s' % dt_more_result.best_score_)
print('Best Hyperparameters: %s' % dt_more_result.best_params_)

Best Score: 0.796373652263697
Best Hyperparameters: {'splitter': 'best', 'random_state': 1, 'max_features': None, 'max_depth': 10, 'criterion': 'gini', 'class_weight': 'balanced'}


In [51]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [52]:
best_dt_more_clf = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1,
                                     max_features=None, max_depth=10, class_weight='balanced')

In [53]:
best_dt_more_clf.fit(train_features, train_labels)

In [54]:
predictions = best_dt_more_clf.predict(test_features)
probs = best_dt_more_clf.predict_proba(test_features)[:, 1]

In [55]:
result_table = [["F1", "Accuracy", "AUC"]]

In [56]:
result_table.append([
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [57]:
result_table

[['F1', 'Accuracy', 'AUC'], [0.786, 0.852, 0.823]]

In [58]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+-------+------------+-------+
|    F1 |   Accuracy |   AUC |
| 0.786 |      0.852 | 0.823 |
+-------+------------+-------+


# RF

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_jobs=-1)

In [10]:
rf_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [11]:
rf_space = dict()
rf_space['random_state'] = [1]
rf_space['criterion'] = ['gini', 'entropy']
rf_space['max_depth'] = list(range(1, 101))
rf_space['n_estimators'] = list(range(1, 100))
rf_space['class_weight'] = [None, 'balanced', 'balanced_subsample']
rf_space['max_features'] = [None, 'auto', 'sqrt', 'log2']

rf_search = RandomizedSearchCV(rf_model, rf_space, 
                                    n_iter=100, scoring='f1', n_jobs=-1, cv=rf_cv, random_state=1)

In [None]:
rf_result = rf_search.fit(features, labels)

In [None]:
# summarize result
print('Best Score: %s' % rf_result.best_score_)
print('Best Hyperparameters: %s' % rf_result.best_params_)

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [52]:
rf_clf = RandomForestClassifier(criterion='gini', splitter='best', random_state=1,
                                     max_features=None, max_depth=10, class_weight='balanced')

In [53]:
rf_clf.fit(train_features, train_labels)

In [54]:
predictions = rf_clf.predict(test_features)
probs = rf_clf.predict_proba(test_features)[:, 1]

In [55]:
result_table = [["F1", "Accuracy", "AUC"]]

In [56]:
result_table.append([
    round(f1_score(test_labels, predictions), 3),
    round(accuracy_score(test_labels, predictions), 3),
    round(roc_auc_score(test_labels, probs), 3),
])

In [57]:
result_table

[['F1', 'Accuracy', 'AUC'], [0.786, 0.852, 0.823]]

In [58]:
from tabulate import tabulate
print(tabulate(result_table, headers="firstrow", tablefmt="grid"))

+-------+------------+-------+
|    F1 |   Accuracy |   AUC |
| 0.786 |      0.852 | 0.823 |
+-------+------------+-------+
