## Data Modeling
***

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline as ImPipeline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, log_loss, roc_auc_score, make_scorer, fbeta_score
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, train_test_split, RepeatedStratifiedKFold, StratifiedKFold, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, log_loss, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, train_test_split, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

df = pd.read_csv('data/Cardiovascular_Diseases_Risk_Prediction_Dataset.csv')
ordinal = ['General_Health', 'Checkup', 'Age_Category']
numeric = list(df.select_dtypes(exclude=object).columns)
categorical = list(df.select_dtypes(object).columns)
for i in ordinal:
    categorical.remove(i)
categorical.remove('Heart_Disease')

# Splitting the training from the validation data
# Making sure the split is stratefied given teh imbalance of our target variable
y = df['Heart_Disease']
X = df.drop('Heart_Disease',axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=12, stratify= y)

# Transforming the target variable into 1's and 0's
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)


# Listing categories in order for each ordinal variable.
health = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
check = [
    'Never', '5 or more years ago', 'Within the past 5 years',
    'Within the past 2 years', 'Within the past year'
]
age = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
    '60-64', '65-69', '70-74', '75-79', '80+'
]

# Instantiating an OrdinalEncoder transformer to encode ordinal variables. 
oe = OrdinalEncoder(categories=[health, check, age])


# Instantiating a OneHotEncoder transformer to be used on the categorical varaibles. 
ohe = OneHotEncoder()

# Creating a column transformer to be used in a pipeline
col_transformer = ColumnTransformer(transformers=[
    ('oe', OrdinalEncoder(categories=[health, check, age]), ordinal),
    ('ohe', OneHotEncoder(), categorical)
], remainder="passthrough")

def model_scores(model, X, y, model_list= [], cv= 5, model_name = ''):
    if cv>1:
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=12)
        scoring = {'f2': make_scorer(fbeta_score, beta=2),
                   'accuracy':'accuracy',
                   'precision': 'precision',
                   'recall': 'recall',
                   'roc_auc':'roc_auc',
                   'log_loss': 'neg_log_loss'}
        scores = cross_validate(model,
                               X,
                               y,
                               scoring = scoring,
                               cv=skf,
                               n_jobs=-1)
        f2 = round(scores['test_f2'].mean(),4)*100
        accuracy = round(scores['test_accuracy'].mean(),4)*100
        precision = round(scores['test_precision'].mean(),4)*100
        recall = round(scores['test_recall'].mean(),4)*100
        roc_auc = round(scores['test_roc_auc'].mean(),4)*100
        lg_loss = round(scores['test_log_loss'].mean(),4)*100
    else:
        y_preds = model.predict(X)
        f2 = round(fbeta_score(y,y_preds,beta=2),4)*100
        recall = round(recall_score(y,y_preds),4)*100
        accuracy = round(accuracy_score(y,y_preds),4)*100
        roc_auc = round(roc_auc_score(y,y_preds),4)*100
        precision = round(precision_score(y,y_preds, zero_division=0.0),4)*100
        lg_loss = round(log_loss(y,model.predict_proba(X)),4)*100
    model_list.append([model_name, f2, accuracy, precision, recall, roc_auc, lg_loss])
    df = pd.DataFrame(model_list, columns=['name', 'f2', 'accuracy', 'precision', 'recall',  'roc_auc', 'log_loss'])
    return model_list, df

In [2]:
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy=0.13194183512437063, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy=1, random_state=12)),
                         ('xgb', XGBClassifier(random_state=12))])
smote_under = pipe.fit(X_train, y_train)
ml_xgb, df_xgb = model_scores(smote_under, X_train, y_train, model_list = [], model_name= "SMOTE Under")
df_xgb

Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,SMOTE Under,49.18,77.54,22.16,70.76,82.61,-42.91


In [3]:
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(random_state=12)),
                       ('rus',RandomUnderSampler(random_state=12)),
                         ('xgb', XGBClassifier(random_state=12))])

params = {'sm__sampling_strategy': np.linspace(0,1,101)}

gs = GridSearchCV(pipe, param_grid= params, scoring='recall', n_jobs=-1, cv=3, verbose= 5)

g_search = gs.fit(X_train, y_train)
gs_model = g_search.best_estimator_
ml, df1 = model_scores('XGBClassifier', gs_model, X_train, y_train, model_list= [], notes = 'Best Estimator from GS')
print(g_search.best_params_)
df1


510 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
402 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ronlo\anaconda3\envs\ac-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ronlo\anaconda3\envs\ac-env\Lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ronlo\anaconda3\envs\ac-env\Lib\site-packages\imblearn\pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^

TypeError: model_scores() got multiple values for argument 'model_list'

In [8]:
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(random_state=12)),
                       ('rus',RandomUnderSampler(random_state=12)),
                         ('xgb', XGBClassifier(random_state=12))])

params = {'sm__sampling_strategy': np.linspace(0,0.49,50),
          'rus__sampling_strategy': np.linspace(0.5,1,50)}

rs = RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=300, scoring='recall', n_jobs=-1, cv=3, random_state=12)
rand_search2 = rs.fit(X_train, y_train)
rand_model2 = rand_search2.best_estimator_
ml_xgb, df_xgb = model_scores(rand_model2, X_train, y_train, model_list=[ ], model_name= "SMOTE Under_rand")
print(rand_search.best_params_)
df_xgb

159 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
138 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ronlo\anaconda3\envs\ac-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ronlo\anaconda3\envs\ac-env\Lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ronlo\anaconda3\envs\ac-env\Lib\site-packages\imblearn\pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^

{'sm__sampling_strategy': 0.09, 'rus__sampling_strategy': 0.85}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,SMOTE Under_rand,49.39,73.65,20.27,77.06,82.46,-49.52


In [5]:
ml_xgb, df_xgb = model_scores(rand_model, X_train, y_train , model_name= "SMOTE Under_rand")
df_xgb
#{'sm__sampling_strategy': 0.09, 'rus__sampling_strategy': 0.85}

Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,SMOTE Under_rand,49.27,74.37,20.55,75.72,82.4,-48.11


In [12]:
# Randomize the Weighted
pipe = Pipeline(steps=[('ct', col_transformer),
                       ('xbg', XGBClassifier(random_state=12))])

params = {
    'xbg__scale_pos_weight': np.linspace(0,1,25)
}

gs = GridSearchCV(pipe, param_grid= params, scoring='recall', n_jobs=-1, cv=3, verbose= 5)

g_search = gs.fit(X_train, y_train)
gs_model = g_search.best_estimator_
ml_xgb, df_xgb = model_scores(gs_model, X_train, y_train, model_list=[], model_name= "Xboost weight")
print(g_search.best_params_)
df_xgb

Fitting 3 folds for each of 25 candidates, totalling 75 fits
{'xbg__scale_pos_weight': 1.0}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Xboost weight,6.22,91.89,48.57,5.1,82.99,-22.42


In [17]:
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12))])
X_res, y_res = pipe.fit_resample(X_train,y_train)
pd.Series(y_res).value_counts()

0    22543
1    19162
dtype: int64

In [20]:
# Basic
pipe = Pipeline(steps=[('ct', col_transformer),
                       ('xgb', XGBClassifier(random_state=12))])
model = pipe.fit(X_train, y_train)
ml, df_xgb = model_scores(model, X_train, y_train, model_list = [], model_name= "Basic XGB")

# Weighted
pipe = Pipeline(steps=[('ct', col_transformer),
                       ('xgb', XGBClassifier(random_state=12,scale_pos_weight=283883/24971))])
model = pipe.fit(X_train, y_train)
ml, df_xgb = model_scores(model, X_train, y_train, model_list = ml, model_name= "Weighted XGB")

# Basic - Resampled
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12))])
model = pipe.fit(X_train,y_train)
ml, df_xgb = model_scores(model, X_train, y_train, model_list=ml, model_name= "Basic XGB Resampled")

# Weighted - Resampled
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12, score_pos_weight = 22543/19162))])
model = pipe.fit(X_train,y_train)
ml, df_xgb = model_scores(model, X_train, y_train, model_list=ml, model_name= "Weighted XGB Resampled")
df_xgb

Parameters: { "score_pos_weight" } are not used.



[['Basic XGB', 6.22, 91.89, 48.57, 5.1, 82.99, -22.42],
 ['Weighted XGB',
  49.27,
  74.52,
  20.62,
  75.49,
  82.41000000000001,
  -47.160000000000004],
 ['Basic XGB Resampled',
  49.27,
  74.37,
  20.549999999999997,
  75.72,
  82.39999999999999,
  -48.11],
 ['Weighted XGB Resampled',
  49.27,
  74.37,
  20.549999999999997,
  75.72,
  82.39999999999999,
  -48.11]]

In [21]:
df_xgb

Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Basic XGB,6.22,91.89,48.57,5.1,82.99,-22.42
1,Weighted XGB,49.27,74.52,20.62,75.49,82.41,-47.16
2,Basic XGB Resampled,49.27,74.37,20.55,75.72,82.4,-48.11
3,Weighted XGB Resampled,49.27,74.37,20.55,75.72,82.4,-48.11


In [23]:
# Randomize 1
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12))])

params = {
    'xgb__n_estimators': np.linspace(50,1000,100, dtype=int),
    'xgb__max_depth': range(1,15),
    'xgb__eta': [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]}

rs = RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=500, scoring='recall', n_jobs=-1, cv=3, random_state=12)
rand_search = rs.fit(X_train, y_train)
rand_model = rand_search.best_estimator_
ml, df1 = model_scores(rand_model, X_train, y_train, model_list= [], model_name = 'Best Estimator from RS')
print(rand_search.best_params_)
df1

{'xgb__n_estimators': 251, 'xgb__max_depth': 1, 'xgb__eta': 0.001}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Best Estimator from RS,41.25,56.37,13.74,82.94,76.93,-65.09


In [25]:
# Randomize 2
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12))])

params = {
    'xgb__n_estimators': range(200,300,50),
    'xgb__max_depth': [1],
    'xgb__eta': [0.001],
    'xgb__colsample_bytree': np.linspace(0,1,50),
    'xgb__min_child_weight': range(1,10),
    'xgb__gamma': [0, 0.1, 1, 10, 100, 1000],
    'xgb__reg_alpha': [0, 0.01, 0.1, 1, 10],
    'xgb__reg_lambda':[0, 0.01, 0.1, 1, 10]
}

rs = RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=500, scoring='recall', n_jobs=-1, cv=3, random_state=12)
rand_search2 = rs.fit(X_train, y_train)
rand_model2 = rand_search2.best_estimator_
ml, df1 = model_scores(rand_model2, X_train, y_train, model_list= [], model_name = 'Best Estimator from RS')
print(rand_search2.best_params_)
df1

{'xgb__reg_lambda': 0, 'xgb__reg_alpha': 0.01, 'xgb__n_estimators': 200, 'xgb__min_child_weight': 6, 'xgb__max_depth': 1, 'xgb__gamma': 1, 'xgb__eta': 0.001, 'xgb__colsample_bytree': 1.0}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Best Estimator from RS,41.18,54.85,13.53,84.61,71.0,-65.81


In [26]:
# Randomize 3
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12))])

params = {
    'xgb__n_estimators': range(50,1000,50),
    'xgb__max_depth': range(1,15),
    'xgb__eta': [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'xgb__colsample_bytree': np.linspace(0,1,50),
    'xgb__min_child_weight': range(1,10),
    'xgb__gamma': [0, 0.1, 1, 10, 100, 1000],
    'xgb__reg_alpha': [0, 0.01, 0.1, 1, 10],
    'xgb__reg_lambda':[0, 0.01, 0.1, 1, 10]
}

rs = RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=500, scoring='recall', n_jobs=-1, cv=3, random_state=12)
rand_search3 = rs.fit(X_train, y_train)
rand_model3 = rand_search3.best_estimator_
ml, df1 = model_scores(rand_model3, X_train, y_train, model_list= [], model_name = 'Best Estimator from RS')
print(rand_search3.best_params_)
df1

{'xgb__reg_lambda': 0.01, 'xgb__reg_alpha': 10, 'xgb__n_estimators': 200, 'xgb__min_child_weight': 7, 'xgb__max_depth': 1, 'xgb__gamma': 100, 'xgb__eta': 0.001, 'xgb__colsample_bytree': 0.8571428571428571}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Best Estimator from RS,41.65,55.07,13.67,85.5,78.35,-65.84


In [28]:
# Randomize 1.2 Find best initial parameters for F2
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12))])

params = {
    'xgb__n_estimators': np.linspace(50,1000,100, dtype=int),
    'xgb__max_depth': range(1,15),
    'xgb__eta': [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]}
ftwo_scorer = make_scorer(fbeta_score, beta=2)
rs = RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=500, scoring=ftwo_scorer, n_jobs=-1, cv=3, random_state=12)
rand_search = rs.fit(X_train, y_train)
rand_model = rand_search.best_estimator_
ml, df1 = model_scores(rand_model, X_train, y_train, model_list= [], model_name = 'Best Estimator from RS')
print(rand_search.best_params_)
df1

#{'xgb__n_estimators': 501, 'xgb__max_depth': 2, 'xgb__eta': 0.05}

{'xgb__n_estimators': 501, 'xgb__max_depth': 2, 'xgb__eta': 0.05}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Best Estimator from RS,50.65,75.54,21.53,76.54,83.55,-46.49


In [33]:
# Randomize 2.2 use previous parameters to find best other parameter for f2
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12))])

params = {
    'xgb__n_estimators': np.linspace(450,550,50, dtype= int),
    'xgb__max_depth': [1,2,3],
    'xgb__eta': np.linspace(0,0.1,50),
    'xgb__colsample_bytree': np.linspace(0,1,50),
    'xgb__min_child_weight': range(1,10),
    'xgb__gamma': [0, 0.1, 1, 10, 100, 1000],
    'xgb__reg_alpha': [0, 0.01, 0.1, 1, 10],
    'xgb__reg_lambda':[0, 0.01, 0.1, 1, 10]
}

rs = RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=500, scoring=ftwo_scorer, n_jobs=-1, cv=3, random_state=12)
rand_search2 = rs.fit(X_train, y_train)
rand_model2 = rand_search2.best_estimator_
ml, df1 = model_scores(rand_model2, X_train, y_train, model_list= [], model_name = 'Best Estimator from RS')
print(rand_search2.best_params_)
df1

{'xgb__reg_lambda': 10, 'xgb__reg_alpha': 1, 'xgb__n_estimators': 503, 'xgb__min_child_weight': 6, 'xgb__max_depth': 2, 'xgb__gamma': 1, 'xgb__eta': 0.0816326530612245, 'xgb__colsample_bytree': 0.2857142857142857}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Best Estimator from RS,50.63,75.67,21.58,76.3,83.56,-46.55


In [2]:
# Randomize 3.2
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12))])

params = {
    'xgb__n_estimators': range(50,1000,50),
    'xgb__max_depth': range(1,15),
    'xgb__eta': [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'xgb__colsample_bytree': np.linspace(0,1,50),
    'xgb__min_child_weight': range(1,10),
    'xgb__gamma': [0, 0.1, 1, 10, 100, 1000],
    'xgb__reg_alpha': [0, 0.01, 0.1, 1, 10],
    'xgb__reg_lambda':[0, 0.01, 0.1, 1, 10]
}
ftwo_scorer = make_scorer(fbeta_score, beta=2)
rs = RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=500, scoring=ftwo_scorer, n_jobs=-1, cv=3, random_state=12)
rand_search3 = rs.fit(X_train, y_train)
rand_model3 = rand_search3.best_estimator_
ml, df1 = model_scores(rand_model3, X_train, y_train, model_list= [], model_name = 'Best Estimator from RS')
print(rand_search3.best_params_)
df1

{'xgb__reg_lambda': 0, 'xgb__reg_alpha': 0.01, 'xgb__n_estimators': 250, 'xgb__min_child_weight': 2, 'xgb__max_depth': 2, 'xgb__gamma': 0.1, 'xgb__eta': 0.05, 'xgb__colsample_bytree': 0.5102040816326531}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,Best Estimator from RS,50.6,75.7,21.59,76.19,83.55,-46.51


In [3]:
pipe = Pipeline(steps=[('ct', col_transformer),
                       ('xbg', XGBClassifier(random_state=12, tree_method='hist', scale_pos_weight= 11.368645877829987))])

params = {
    'xbg__n_estimators': range(50,1000,50),
    'xbg__max_depth': range(1,15),
    'xbg__eta': [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'xbg__colsample_bytree': np.linspace(0,1,50),
    'xbg__min_child_weight': range(1,10),
    'xbg__gamma': [0, 0.1, 1, 10, 100, 1000], 
    'xbg__reg_alpha': [0, 0.01, 0.1, 1, 10],
    'xbg__reg_lambda':[0, 0.01, 0.1, 1, 10]
} 

rs= RandomizedSearchCV(estimator= pipe, param_distributions= params,
                       n_iter=300, scoring='recall', n_jobs=-1, cv=3, random_state=12,verbose=3)
search_4 = rs.fit(X_train, y_train)
model_4 = search_4.best_estimator_
ml, df1 = model_scores(model_4, X_train, y_train, model_list= [], model_name = 'recall, weighted')
print(rand_search3.best_params_)
df1

Fitting 3 folds for each of 300 candidates, totalling 900 fits
{'xgb__reg_lambda': 0, 'xgb__reg_alpha': 0.01, 'xgb__n_estimators': 250, 'xgb__min_child_weight': 2, 'xgb__max_depth': 2, 'xgb__gamma': 0.1, 'xgb__eta': 0.05, 'xgb__colsample_bytree': 0.5102040816326531}


Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,"recall, weighted",42.6,56.28,14.08,86.34,79.69,-62.21


In [4]:
pipe = ImPipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('sm',SMOTE(sampling_strategy = 0.09, random_state=12)),
                       ('rus',RandomUnderSampler(sampling_strategy = 0.85,random_state=12)),
                       ('xgb', XGBClassifier(random_state=12, reg_lambda= 0.01, reg_alpha= 10, n_estimators= 200, 
                                             min_child_weight= 7, max_depth= 1, gamma= 100, eta= 0.001, 
                                             xgb__colsample_bytree= 0.8571428571428571))])
sampler_xgb = pipe.fit(X_train, y_train)

ml_f, df_f = model_scores(sampler_xgb, X_train, y_train, model_list= [], cv= 20, model_name = 'sampler_xgb')

Parameters: { "xgb__colsample_bytree" } are not used.



In [5]:
ml_f, df_f = model_scores(model_4, X_train, y_train, model_list= ml_f, cv= 20, model_name = 'weighted_xgb')

In [6]:
 df_f

Unnamed: 0,name,f2,accuracy,precision,recall,roc_auc,log_loss
0,sampler_xgb,41.09,53.3,13.3,86.29,70.58,-65.81
1,weighted_xgb,42.6,56.28,14.08,86.34,79.73,-62.21
