# Load Datasets

In [210]:
import numpy as np
import pandas as pd
import os
import math

import sklearn.tree
import sklearn.ensemble

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

In [211]:
# Clone repository in order to get access locally to the datasets
!rm -rf .git README.md
!git clone -b playing-emsembled-methods https://github.com/sergio-gimenez/anomaly-4G-detection 

fatal: destination path 'anomaly-4G-detection' already exists and is not an empty directory.


In [212]:
train = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv', sep=';')
test = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls', sep=';' )

In [213]:
# Separate labels from data 
X = train.drop('Unusual', axis='columns')#.to_numpy()
y = train['Unusual']#.to_numpy()

# We split the data into training and validation subsets (80% and 20%) in
# order to validate our training
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.8,
                                                                random_state=1, stratify = y)
X_test = test
#X_train = X
#y_train = y
zeros = []
ones = []
for a in y_train:
  if a == 0:
    zeros.append(a)
  else:
    ones.append(1)
print(len(zeros), len(ones), len(ones)+ len(zeros), y_train.shape)

21377 8146 29523 (29523,)


In [214]:
#Refactor time feature to minuts and cellName to unique identifier 1:1
def getTimeInMinutes(x):
  hh, mm  = x.split(":")
  return int(hh)* 60 + int(mm)

def createCellNameDictionary(data):
  cellList = []
  for i in data["CellName"]:
    cellList.append(i)
  cellList = set(cellList)
  cellDict = {}
  for idx, value in enumerate(cellList):
    cellDict[value]=idx
  return cellDict

def refactorFeaturesDataframe(data):
  #data["Time"] = data["Time"].apply(lambda x: getTimeInMinutes(x))
  data["TimeCos"] = data["Time"].apply(lambda x: math.cos(getTimeInMinutes(x)*math.pi/(12*60)))
  data["TimeSin"] = data["Time"].apply(lambda x: math.sin(getTimeInMinutes(x)*math.pi/(12*60)))
  del data["Time"]

  cellNameDict = createCellNameDictionary(data);
  data["CellName"] = data["CellName"].apply(lambda x: cellNameDict[x])
  print(data.head())
  return data


In [215]:
#Refactoring data from features to useful values
X_train = refactorFeaturesDataframe(X_train).to_numpy()
y_train = y_train.to_numpy()


X_validation = refactorFeaturesDataframe(X_validation).to_numpy()
y_validation = y_validation.to_numpy()

X_test = refactorFeaturesDataframe(test).to_numpy()

       CellName  PRBUsageUL  PRBUsageDL  ...  maxUE_UL+DL   TimeCos   TimeSin
14974        19      22.837       2.728  ...           12 -0.991445  0.130526
5318         27       7.377       1.011  ...            7 -0.442289  0.896873
31338        20       0.101       0.808  ...            4  0.442289  0.896873
22493        24       2.425       9.701  ...           10 -0.831470  0.555570
27949        15      22.938       3.335  ...           12 -0.923880  0.382683

[5 rows x 14 columns]
       CellName  PRBUsageUL  PRBUsageDL  ...  maxUE_UL+DL   TimeCos   TimeSin
11936        17      0.2020      0.5050  ...            5 -0.555570  0.831470
23788        15      4.0413      1.2222  ...            7  0.500000  0.866025
29595         0      0.2020      0.8080  ...            3 -0.130526  0.991445
29759         9      3.3674      0.5049  ...            6  0.831470  0.555570
33580        16     12.4290      1.1120  ...            7  0.321439 -0.946930

[5 rows x 14 columns]
   CellName  PRBUs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

# Solving the Classification Problem

In [242]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from joblib import dump, load
from google.colab import files
from scipy.stats import uniform, randint

#if performVoting == 'y':
  
try:
  clf_GS = load('anomaly-4G-detection/voting_model.joblib') 

except:
  voting_clf = VotingClassifier( estimators=[ 
      ('xgb', XGBClassifier(random_state=1)),
      ('dt', DecisionTreeClassifier(random_state=1)),
      ('knn', KNeighborsClassifier())
      ], voting='soft')

  def xgb_f1(y, t, threshold=0.5):
    t = t.get_label()
    y_bin = (y > threshold).astype(int) # works for both type(y) == <class 'numpy.ndarray'> and type(y) == <class 'pandas.core.series.Series'>
    return 'f1',f1_score(t,y_bin)

  pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                         ('voting_clf', XGBClassifier(random_state=1,
                                                      #scale_pos_weight=7,
                                                      #colsample_bytree= 0.053381469489678104,
                                                      #eta= 0.20289460663803338,
                                                      #gamma= 0.88723107873764,
                                                      #learning_rate= 0.15455380920536027,
                                                      #max_depth= 26,
                                                      #min_child_weight= 1,
                                                      #n_estimators= 565,
                                                      #subsample= 0.9738168894035317
                                                      ))])
  
  parameters = {
   'voting_clf__eta'    : uniform(0.17, 0.25),
   "voting_clf__colsample_bytree": uniform(0.5, 0.99),
   "voting_clf__min_child_weight": randint(1, 5),
   "voting_clf__gamma": uniform(0.35, 0.6),
   "voting_clf__learning_rate": uniform(0.1, 0.2), # default 0.1 
   "voting_clf__max_depth": randint(20, 30), # default 3
   "voting_clf__n_estimators": randint(500, 1000), # default 100
   "voting_clf__subsample": uniform(0.9, 0.99),
   "voting_clf__reg_alpha": [0, 0.001, 0.005, 0.01, 0.05],
   "voting_clf__reg_lambda": [1e-5, 1e-2, 0.1, 1, 100],
   "voting_clf__scale_pos_weight": randint(6, 9)
  }

  # grid_params = {
  # "voting_clf__colsample_bytree": [0.1, 0.2],
  # "voting_clf__min_child_weight": [1],
  # "voting_clf__gamma": [0.35, 0.4, 0.5],
  # "voting_clf__learning_rate": [0.15, 0.2, 0.3], # default 0.1 
  # "voting_clf__max_depth": [15, 19, 21], # default 3
  # "voting_clf__n_estimators": [600, 700, 800], # default 100
  # "voting_clf__subsample": [0.932]
  # }

  # parameters = {'voting_clf__colsample_bytree': 0.053381469489678104,
  # 'voting_clf__eta': 0.20289460663803338,
  # 'voting_clf__gamma': 0.88723107873764,
  # 'voting_clf__learning_rate': 0.15455380920536027,
  # 'voting_clf__max_depth': 26,
  # 'voting_clf__min_child_weight': 1,
  # 'voting_clf__n_estimators': 565,
  # 'voting_clf__subsample': 0.9738168894035317}

  #clf_GS = GridSearchCV(estimator=pipe, param_grid=grid_params, n_jobs=10, verbose=1, cv=3 )
  clf_GS = RandomizedSearchCV(estimator=pipe, param_distributions=parameters, n_jobs=10, verbose=1, cv=3, n_iter= 500)
  clf_GS.fit(X_train, y_train)

  #Save the model in a file and download locally.
  dump(clf_GS, 'voting_model.joblib')
  files.download('voting_model.joblib') 

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    5.9s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  6.4min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 17.0min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 30.8min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed: 39.5min
[Parallel(n_jobs=10)]: Done 1500 out of 1500 | elapsed: 55.6min finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [243]:
pred_train = clf_GS.predict(X_train)
pred_val = clf_GS.predict(X_validation)

In [244]:
print("TRAINING\n" + classification_report(y_train, pred_train))
print("\nTESTING\n" + classification_report(y_validation, pred_val))

TRAINING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21377
           1       1.00      1.00      1.00      8146

    accuracy                           1.00     29523
   macro avg       1.00      1.00      1.00     29523
weighted avg       1.00      1.00      1.00     29523


TESTING
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5344
           1       0.99      0.98      0.98      2037

    accuracy                           0.99      7381
   macro avg       0.99      0.99      0.99      7381
weighted avg       0.99      0.99      0.99      7381



In [245]:
train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.008806 
test confusion matrix:
[[5327   17]
 [  48 1989]]


In [246]:
from sklearn.feature_selection import SelectFromModel
#print(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
#print(clf_GS.best_estimator_.named_steps["voting_clf"].get_booster().get_fscore())
best_params = {k: [v] for k, v in clf_GS.best_params_.items()}
thresholds = np.sort(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
print('Thresholds',thresholds, '\n')
print('Best Params', best_params)

i = 0
current_error = 100.0
best_th = 0
for thresh in thresholds:
  new_X_train = X_train
  selection = SelectFromModel(clf_GS.best_estimator_.named_steps["voting_clf"], threshold=thresh, prefit=True)
  select_X_train = selection.transform(new_X_train)

  selection_model = RandomizedSearchCV(estimator=pipe, param_distributions=best_params, n_jobs=10, verbose=1, cv=3, n_iter= 1)
  selection_model.fit(select_X_train, y_train)
  #dump(clf_GS, '%dselected_feature_model.joblib' % i)
  #files.download('%dselected_feature_model.joblib' % i) 
  pred_train = selection_model.predict(select_X_train)
  print(X_validation.shape, X_train.shape)
  select_X_val = selection.transform(X_validation)
  pred_val = selection_model.predict(select_X_val)
  
  i = i + 1

  train_error = 1. - accuracy_score(y_train, pred_train)
  train_cmat = confusion_matrix(y_train, pred_train)
  val_error = 1. - accuracy_score(y_validation, pred_val)
  val_cmat = confusion_matrix(y_validation, pred_val)

  print("Threshold: %f \n" % thresh)
  print('train error: %f ' % train_error)
  print('train confusion matrix:')
  print(train_cmat)
  print('test error: %f ' % val_error)
  print('test confusion matrix:')
  print(val_cmat)
  if val_error < current_error:
    current_error = val_error
    best_th = thresh

print('Best th:', best_th)
print('current error :', best_th)

Thresholds [0.01247077 0.01711424 0.0241311  0.02551129 0.02926498 0.03122059
 0.03454302 0.03709779 0.04886748 0.05350886 0.06710359 0.07660205
 0.13638878 0.40617552] 

Best Params {'voting_clf__colsample_bytree': [0.5679080291606311], 'voting_clf__eta': [0.25669249083545453], 'voting_clf__gamma': [0.3664509736859072], 'voting_clf__learning_rate': [0.14474841574499428], 'voting_clf__max_depth': [20], 'voting_clf__min_child_weight': [2], 'voting_clf__n_estimators': [704], 'voting_clf__reg_alpha': [0], 'voting_clf__reg_lambda': [0.1], 'voting_clf__scale_pos_weight': [8], 'voting_clf__subsample': [0.9858685418924441]}
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   59.6s finished


(7381, 14) (29523, 14)
Threshold: 0.012471 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.008806 
test confusion matrix:
[[5327   17]
 [  48 1989]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   58.5s finished


(7381, 14) (29523, 14)
Threshold: 0.017114 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.008129 
test confusion matrix:
[[5324   20]
 [  40 1997]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   52.5s finished


(7381, 14) (29523, 14)
Threshold: 0.024131 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.007587 
test confusion matrix:
[[5328   16]
 [  40 1997]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   51.0s finished


(7381, 14) (29523, 14)
Threshold: 0.025511 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.007858 
test confusion matrix:
[[5327   17]
 [  41 1996]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   47.2s finished


(7381, 14) (29523, 14)
Threshold: 0.029265 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.007316 
test confusion matrix:
[[5330   14]
 [  40 1997]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   47.9s finished


(7381, 14) (29523, 14)
Threshold: 0.031221 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.007723 
test confusion matrix:
[[5329   15]
 [  42 1995]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   43.3s finished


(7381, 14) (29523, 14)
Threshold: 0.034543 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.005690 
test confusion matrix:
[[5332   12]
 [  30 2007]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   38.2s finished


(7381, 14) (29523, 14)
Threshold: 0.037098 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.005284 
test confusion matrix:
[[5333   11]
 [  28 2009]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   38.8s finished


(7381, 14) (29523, 14)
Threshold: 0.048867 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.006232 
test confusion matrix:
[[5335    9]
 [  37 2000]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   33.4s finished


(7381, 14) (29523, 14)
Threshold: 0.053509 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.004606 
test confusion matrix:
[[5339    5]
 [  29 2008]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   34.9s finished


(7381, 14) (29523, 14)
Threshold: 0.067104 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.004877 
test confusion matrix:
[[5336    8]
 [  28 2009]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   28.1s finished


(7381, 14) (29523, 14)
Threshold: 0.076602 

train error: 0.003658 
train confusion matrix:
[[21269   108]
 [    0  8146]]
test error: 0.006639 
test confusion matrix:
[[5315   29]
 [  20 2017]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   27.3s finished


(7381, 14) (29523, 14)
Threshold: 0.136389 

train error: 0.005724 
train confusion matrix:
[[21211   166]
 [    3  8143]]
test error: 0.007045 
test confusion matrix:
[[5307   37]
 [  15 2022]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   24.1s finished


(7381, 14) (29523, 14)
Threshold: 0.406176 

train error: 0.017884 
train confusion matrix:
[[21194   183]
 [  345  7801]]
test error: 0.020458 
test confusion matrix:
[[5300   44]
 [ 107 1930]]
Best th: 0.053508863
current error : 0.053508863


In [247]:
from sklearn.feature_selection import SelectFromModel
#print(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
#print(clf_GS.best_estimator_.named_steps["voting_clf"].get_booster().get_fscore())

thresholds = np.sort(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
print(thresholds)
i = 0
new_X_train = X_train
selection = SelectFromModel(clf_GS.best_estimator_.named_steps["voting_clf"], threshold=best_th , prefit=True)
select_X_train = selection.transform(new_X_train)

selection_model = RandomizedSearchCV(estimator=pipe, param_distributions=best_params, n_jobs=10, verbose=1, cv=3, n_iter= 1)
selection_model.fit(select_X_train, y_train)
dump(clf_GS, '%dselected_feature_model.joblib' % i)
files.download('%dselected_feature_model.joblib' % i) 
pred_train = selection_model.predict(select_X_train)
select_X_val = selection.transform(X_validation)
pred_val = selection_model.predict(select_X_val)

i = i + 1

train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print("Threshold: %f \n" % thresh)
print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

[0.01247077 0.01711424 0.0241311  0.02551129 0.02926498 0.03122059
 0.03454302 0.03709779 0.04886748 0.05350886 0.06710359 0.07660205
 0.13638878 0.40617552]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   33.5s finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Threshold: 0.406176 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.004606 
test confusion matrix:
[[5339    5]
 [  29 2008]]


In [248]:
#pred_test = clf_GS.predict(X_test)
select_X_test = selection.transform(X_test)
pred_test = selection_model.predict(select_X_test)
#files.download('anomaly-4G-detection/predictions.csv')

# Submission Formatting

In [249]:
%%shell
# Create submission file if it does not exists
file=predictions.csv
if [ ! -e "$file" ] ; then
    touch anomaly-4G-detection/"$file"
fi



In [250]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

Unnamed: 0,Id,Label
0,1,1
1,2,0
2,3,0
3,4,0
4,5,1
...,...,...
9153,9154,0
9154,9155,1
9155,9156,0
9156,9157,0


In [251]:
#!rm anomaly-4G-detection/predictions.csv

In [252]:
clf_GS.best_params_
#print(clf_GS.grid_scores_)

{'voting_clf__colsample_bytree': 0.5679080291606311,
 'voting_clf__eta': 0.25669249083545453,
 'voting_clf__gamma': 0.3664509736859072,
 'voting_clf__learning_rate': 0.14474841574499428,
 'voting_clf__max_depth': 20,
 'voting_clf__min_child_weight': 2,
 'voting_clf__n_estimators': 704,
 'voting_clf__reg_alpha': 0,
 'voting_clf__reg_lambda': 0.1,
 'voting_clf__scale_pos_weight': 8,
 'voting_clf__subsample': 0.9858685418924441}