In [7]:
## Import libs
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.base import TransformerMixin

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
SEED=42

## God Class Evaluation

In [3]:
data = pd.read_csv('gc.csv', dtype={col: np.float32 for col in ['lcc', 'lcom*']})
data.dropna(inplace=True)
data.head()

Unnamed: 0,dit,fanin,fanout,lcc,lcom*,loc,noc,rfc,icq,nof,nom,wmc,gc
0,4,1,6,0.0,0.592593,39,0,12,0,3,9,10,0
1,4,2,7,0.0,0.0,18,0,6,0,0,4,6,0
2,1,0,0,0.0,1.0,56,0,0,3,4,4,4,0
3,1,1,0,0.0,0.0,4,0,0,0,0,2,2,0
4,3,3,5,0.0,0.0,16,0,7,0,0,4,4,0


In [4]:
X = data.drop(['gc'], axis=1)
y = data.gc

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=SEED)

### Resample the data

In [6]:
smote = SMOTE(sampling_strategy=0.2)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

### Naive Bayes

In [9]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', GaussianNB())
])

In [10]:
pipeline.fit(X_train_resampled, y_train_resampled)

  f = msb / msw


In [12]:
y_pred = pipeline.predict(X_test)

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      6599
           1       0.45      0.82      0.58       224

    accuracy                           0.96      6823
   macro avg       0.72      0.89      0.78      6823
weighted avg       0.98      0.96      0.97      6823



In [14]:
roc_auc_score(y_test, y_pred)

0.8935147100209989

### Decision Tree Evaluation

In [8]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', DecisionTreeClassifier(random_state=SEED))
])

In [9]:
hyperparameters ={
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': list(range(1, 11)),
    'classifier__min_samples_split': list(range(2, 21)),
    'classifier__min_samples_leaf': list(range(1, 11)),
    'classifier__max_features': ['sqrt', 'log2', None],
}

In [10]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED)

In [11]:
randomized_search.fit(X_train_resampled, y_train_resampled)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [12]:
randomized_search.best_params_

{'classifier__min_samples_split': 7,
 'classifier__min_samples_leaf': 6,
 'classifier__max_features': 'log2',
 'classifier__max_depth': 2,
 'classifier__criterion': 'gini'}

In [13]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [14]:
y_pred = best_model.predict(X_test)

In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      6599
           1       0.57      0.96      0.71       224

    accuracy                           0.97      6823
   macro avg       0.78      0.97      0.85      6823
weighted avg       0.98      0.97      0.98      6823



In [16]:
roc_auc_score(y_test, y_pred)

0.9654797534258436

### Random Forest Evaluation

In [19]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', RandomForestClassifier(random_state=SEED))
])

In [20]:
hyperparameters = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 5, 10, 15, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['auto', 'sqrt', 'log2']
}

In [21]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED)

In [22]:
randomized_search.fit(X_train_resampled, y_train_resampled)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  warn(


  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(
  f = msb / msw
  warn(


  f = msb / msw
  warn(
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [23]:
randomized_search.best_params_

{'classifier__n_estimators': 500,
 'classifier__min_samples_split': 5,
 'classifier__min_samples_leaf': 1,
 'classifier__max_features': 'log2',
 'classifier__max_depth': 10,
 'classifier__criterion': 'entropy'}

In [24]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [25]:
y_pred = best_model.predict(X_test)

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      6599
           1       0.58      0.94      0.72       224

    accuracy                           0.98      6823
   macro avg       0.79      0.96      0.85      6823
weighted avg       0.98      0.98      0.98      6823



In [27]:
roc_auc_score(y_test, y_pred)

0.9596167844695084

### SVM Evalutaion

In [28]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', SVC(random_state=SEED))
])

In [29]:
C = [0.1, 1, 10, 100, 1000]
gamma = [0.001, 0.01, 0.1, 1]
kernel = ['rbf']

hyperparameters = dict(
    classifier__C=C,
    classifier__gamma=gamma,
    classifier__kernel=kernel,
)

In [30]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [31]:
randomized_search.fit(X_train_resampled, y_train_resampled)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\U

  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklear

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\An

  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(
ValueError: Onl

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranki

In [32]:
randomized_search.best_params_

{'classifier__kernel': 'rbf', 'classifier__gamma': 0.001, 'classifier__C': 0.1}

In [33]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [34]:
y_pred = best_model.predict(X_test)

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6599
           1       0.00      0.00      0.00       224

    accuracy                           0.97      6823
   macro avg       0.48      0.50      0.49      6823
weighted avg       0.94      0.97      0.95      6823



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
roc_auc_score(y_test, y_pred)

0.5

### Gradient Machine Boosting Evaluation

In [37]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', GradientBoostingClassifier(random_state=SEED))
])

In [38]:
hyperparameters ={
    'classifier__n_estimators': [10, 100, 1000],
    'classifier__learning_rate': [0.001, 0.01, 0.1],
    'classifier__subsample': [0.5, 0.7, 1.0],
    'classifier__max_depth': [3, 7, 9],
}

In [39]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [40]:
randomized_search.fit(X_train_resampled, y_train_resampled)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\U

  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklear

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\An

  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(
ValueError: Onl

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranki

In [41]:
randomized_search.best_params_

{'classifier__subsample': 0.5,
 'classifier__n_estimators': 100,
 'classifier__max_depth': 3,
 'classifier__learning_rate': 0.01}

In [42]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [43]:
y_pred = best_model.predict(X_test)

In [44]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      6599
           1       0.59      0.94      0.72       224

    accuracy                           0.98      6823
   macro avg       0.79      0.96      0.85      6823
weighted avg       0.98      0.98      0.98      6823



In [45]:
roc_auc_score(y_test, y_pred)

0.9575361797242006

### Logistic Regression Evaluation

In [46]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', LogisticRegression(random_state=SEED))
])

In [47]:
hyperparameters ={
    'classifier__solver': ['newton-cg', 'liblinear'],
    'classifier__penalty': ['l2'],
    'classifier__C': [100, 10, 1.0, 0.1, 0.01],
}

In [48]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [49]:
randomized_search.fit(X_train_resampled, y_train_resampled)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\U

  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(
ValueError: Onl

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranki

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\U

  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranki

In [50]:
randomized_search.best_params_

{'classifier__solver': 'newton-cg',
 'classifier__penalty': 'l2',
 'classifier__C': 100}

In [51]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [52]:
y_pred = best_model.predict(X_test)

In [53]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      6599
           1       0.55      0.75      0.64       224

    accuracy                           0.97      6823
   macro avg       0.77      0.87      0.81      6823
weighted avg       0.98      0.97      0.97      6823



In [54]:
roc_auc_score(y_test, y_pred)

0.8668517821964367

### Multilayer Perceptron Evaluation

In [56]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', MLPClassifier(random_state=SEED, activation='logistic', solver='sgd'))
])

In [57]:
hyperparameters ={
    'classifier__hidden_layer_sizes': [(10,),(50,),(100,),(50, 50),(100, 50)],
    'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'classifier__learning_rate_init': [0.001, 0.01, 0.1, 0.5, 1.0],
    'classifier__batch_size': [1, 32, 64, 128, len(X_train_resampled)],
}

In [58]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [None]:
randomized_search.fit(X_train_resampled, y_train_resampled)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\U

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339

Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(
ValueError: Only one class pres

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\An

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranki

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\An

  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    r

In [60]:
randomized_search.best_params_

{'classifier__learning_rate_init': 0.1,
 'classifier__hidden_layer_sizes': (100, 50),
 'classifier__batch_size': 64,
 'classifier__alpha': 0.1}

In [61]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [62]:
y_pred = best_model.predict(X_test)

In [63]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6599
           1       0.00      0.00      0.00       224

    accuracy                           0.97      6823
   macro avg       0.48      0.50      0.49      6823
weighted avg       0.94      0.97      0.95      6823



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
roc_auc_score(y_test, y_pred)

0.5

### K Nearest Neighbor Evaluation

In [65]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', KNeighborsClassifier())
])

In [66]:
hyperparameters ={
    'classifier__n_neighbors': range(1, 21, 2),
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
}

In [67]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [68]:
randomized_search.fit(X_train_resampled, y_train_resampled)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\U

  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(
ValueError: Onl

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranki

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\An

Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(
ValueError: Only one class pres

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 399, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 572, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 339

  f = msb / msw


In [69]:
randomized_search.best_params_

{'classifier__weights': 'uniform',
 'classifier__n_neighbors': 1,
 'classifier__metric': 'euclidean'}

In [70]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [71]:
y_pred = best_model.predict(X_test)

In [72]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      6599
           1       0.60      0.84      0.70       224

    accuracy                           0.98      6823
   macro avg       0.80      0.91      0.85      6823
weighted avg       0.98      0.98      0.98      6823



In [73]:
roc_auc_score(y_test, y_pred)

0.9124796370662221

## Refused Bequest Evaluation

### Load data

In [15]:
data = pd.read_csv('rb.csv', dtype={col: np.float32 for col in ['lcc', 'lcom*']})
data.dropna(inplace=True)
data.head()

Unnamed: 0,dit,fanin,fanout,lcc,lcom*,loc,noc,rfc,icq,nof,nom,wmc,rb
0,4,1,6,0.0,0.592593,39,0,12,0,3,9,10,0
1,4,2,7,0.0,0.0,18,0,6,0,0,4,6,0
2,1,0,0,0.0,1.0,56,0,0,3,4,4,4,0
3,1,1,0,0.0,0.0,4,0,0,0,0,2,2,0
4,3,3,5,0.0,0.0,16,0,7,0,0,4,4,0


In [16]:
X = data.drop(['rb'], axis=1)
y = data.rb

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=SEED)

### Resample the data

In [18]:
smote = SMOTE(sampling_strategy=0.2)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

### Naive Bayes

In [19]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', GaussianNB())
])

In [20]:
pipeline.fit(X_train_resampled, y_train_resampled)

  f = msb / msw


In [21]:
y_pred = pipeline.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97      6793
           1       0.03      0.33      0.05        30

    accuracy                           0.95      6823
   macro avg       0.51      0.64      0.51      6823
weighted avg       0.99      0.95      0.97      6823



In [23]:
roc_auc_score(y_test, y_pred)

0.6409784582167918

### Decision Tree Evaluation

In [9]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', DecisionTreeClassifier(random_state=SEED))
])

In [10]:
hyperparameters ={
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': list(range(1, 11)),
    'classifier__min_samples_split': list(range(2, 21)),
    'classifier__min_samples_leaf': list(range(1, 11)),
    'classifier__max_features': ['sqrt', 'log2', None],
}

In [11]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED)

In [12]:
randomized_search.fit(X_train_resampled, y_train_resampled)

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encount

In [13]:
randomized_search.best_params_

{'classifier__min_samples_split': 13,
 'classifier__min_samples_leaf': 4,
 'classifier__max_features': None,
 'classifier__max_depth': 10,
 'classifier__criterion': 'gini'}

In [14]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [15]:
y_pred = best_model.predict(X_test)

In [16]:
f1_score(y_test, y_pred)

0.10289389067524114

In [17]:
roc_auc_score(y_test, y_pred)

0.747161293488395

### Random Forest Evaluation

In [19]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', RandomForestClassifier(random_state=SEED))
])

In [20]:
hyperparameters = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 5, 10, 15, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['auto', 'sqrt', 'log2']
}

In [21]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED)

In [22]:
randomized_search.fit(X_train_resampled, y_train_resampled)

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encount

Features [0] are constant.
invalid value encountered in true_divide
`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.
Features [0] are constant.
invalid value encountered in true_divide
`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.
Features [0] are constant.
invalid value encountered in true_divide
`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.
Features [0] are constant.
invalid

In [23]:
randomized_search.best_params_

{'classifier__n_estimators': 400,
 'classifier__min_samples_split': 5,
 'classifier__min_samples_leaf': 2,
 'classifier__max_features': 'auto',
 'classifier__max_depth': 20,
 'classifier__criterion': 'entropy'}

In [24]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [25]:
y_pred = best_model.predict(X_test)

In [26]:
f1_score(y_test, y_pred)

0.09022556390977443

In [27]:
roc_auc_score(y_test, y_pred)

0.5928602973649345

### SVM Evalutaion

In [9]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', SVC(random_state=SEED))
])

In [10]:
C = [0.1, 1, 10, 100, 1000]
gamma = [0.001, 0.01, 0.1, 1]
kernel = ['rbf']

hyperparameters = dict(
    classifier__C=C,
    classifier__gamma=gamma,
    classifier__kernel=kernel,
)

In [11]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [12]:
randomized_search.fit(X_train_resampled, y_train_resampled)

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", li

In [14]:
randomized_search.best_params_

{'classifier__kernel': 'rbf', 'classifier__gamma': 0.001, 'classifier__C': 0.1}

In [15]:
randomized_search.best_score_
best_model = randomized_search.best_estimator_

In [16]:
y_pred = best_model.predict(X_test)

In [18]:
f1_score(y_test, y_pred)

0.0

In [17]:
roc_auc_score(y_test, y_pred)

0.5

### Gradient Machine Boosting Evaluation

In [29]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', GradientBoostingClassifier(random_state=SEED))
])

In [30]:
hyperparameters ={
    'classifier__n_estimators': [10, 100, 1000],
    'classifier__learning_rate': [0.001, 0.01, 0.1],
    'classifier__subsample': [0.5, 0.7, 1.0],
    'classifier__max_depth': [3, 7, 9],
}

In [31]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [32]:
randomized_search.fit(X_train_resampled, y_train_resampled)

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", li

In [33]:
randomized_search.best_params_

{'classifier__subsample': 0.5,
 'classifier__n_estimators': 100,
 'classifier__max_depth': 3,
 'classifier__learning_rate': 0.01}

In [34]:
randomized_search.best_score_

nan

In [35]:
best_model = randomized_search.best_estimator_

In [36]:
y_pred = best_model.predict(X_test)

In [37]:
f1_score(y_test, y_pred)

0.09665427509293681

In [38]:
roc_auc_score(y_test, y_pred)

0.7000318955787821

### Logistic Regression Evaluation

In [55]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', LogisticRegression(random_state=SEED))
])

In [62]:
hyperparameters ={
    'classifier__solver': ['newton-cg', 'liblinear'],
    'classifier__penalty': ['l2'],
    'classifier__C': [100, 10, 1.0, 0.1, 0.01],
}

In [63]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [64]:
randomized_search.fit(X_train_resampled, y_train_resampled)

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\

In [65]:
randomized_search.best_params_

{'classifier__solver': 'newton-cg',
 'classifier__penalty': 'l2',
 'classifier__C': 100}

In [66]:
randomized_search.best_score_

nan

In [67]:
best_model = randomized_search.best_estimator_

In [68]:
y_pred = best_model.predict(X_test)

In [69]:
f1_score(y_test, y_pred)

0.055045871559633024

In [70]:
roc_auc_score(y_test, y_pred)

0.5866038569115265

### Multilayer Perceptron Evaluation

In [40]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', MLPClassifier(random_state=SEED, activation='logistic', solver='sgd'))
])

In [41]:
hyperparameters ={
    'classifier__hidden_layer_sizes': [(10,),(50,),(100,),(50, 50),(100, 50)],
    'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'classifier__learning_rate_init': [0.001, 0.01, 0.1, 0.5, 1.0],
    'classifier__batch_size': [1, 32, 64, 128, len(X_train_resampled)],
}

In [42]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [43]:
randomized_search.fit(X_train_resampled, y_train_resampled)

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\

Features [0] are constant.
invalid value encountered in true_divide
Got `batch_size` less than 1 or larger than sample size. It is going to be clipped
Features [0] are constant.
invalid value encountered in true_divide
Got `batch_size` less than 1 or larger than sample size. It is going to be clipped
Features [0] are constant.
invalid value encountered in true_divide
Got `batch_size` less than 1 or larger than sample size. It is going to be clipped
Features [0] are constant.
invalid value encountered in true_divide
Got `batch_size` less than 1 or larger than sample size. It is going to be clipped
Features [0] are constant.
invalid value encountered in true_divide
Got `batch_size` less than 1 or larger than sample size. It is going to be clipped
Features [0] are constant.
invalid value encountered in true_divide
Got `batch_size` less than 1 or larger than sample size. It is going to be clipped
Features [0] are constant.
invalid value encountered in true_divide
Got `batch_size` less than

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", li

In [44]:
randomized_search.best_params_

{'classifier__learning_rate_init': 0.1,
 'classifier__hidden_layer_sizes': (100, 50),
 'classifier__batch_size': 64,
 'classifier__alpha': 0.1}

In [45]:
randomized_search.best_score_

nan

In [46]:
best_model = randomized_search.best_estimator_

In [47]:
y_pred = best_model.predict(X_test)

In [48]:
roc_auc_score(y_test, y_pred)

0.5

In [49]:
f1_score(y_test, y_pred)

0.0

### K Nearest Neighbor Evaluation

In [74]:
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('minmaxscaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', KNeighborsClassifier())
])

In [75]:
hyperparameters ={
    'classifier__n_neighbors': range(1, 21, 2),
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
}

In [76]:
randomized_search = RandomizedSearchCV(pipeline,
                                       param_distributions=hyperparameters,
                                       cv=KFold(n_splits=10),
                                       random_state=SEED,
                                      scoring='roc_auc')

In [77]:
randomized_search.fit(X_train_resampled, y_train_resampled)

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\

Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Features [0] are constant.
invalid value encountered in true_divide
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Henrique\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", li

In [78]:
randomized_search.best_params_

{'classifier__weights': 'uniform',
 'classifier__n_neighbors': 1,
 'classifier__metric': 'euclidean'}

In [79]:
randomized_search.best_score_

nan

In [80]:
best_model = randomized_search.best_estimator_

In [81]:
y_pred = best_model.predict(X_test)

In [82]:
roc_auc_score(y_test, y_pred)

0.6215565042445653

In [83]:
f1_score(y_test, y_pred)

0.0808080808080808