In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer


In [2]:
# diabetes_data = pd.read_csv("cleaned_diabetes_data.csv")
diabetes_data = pd.read_pickle("cleaned_diabetes_data.pkl")


In [3]:
diabetes_data.dtypes

race                            category
gender                          category
age                                int64
admission_type_id                  int64
discharge_disposition_id           int64
admission_source_id                int64
time_in_hospital                   int64
num_lab_procedures                 int64
num_procedures                     int64
num_medications                    int64
number_outpatient                  int64
number_emergency                   int64
number_inpatient                   int64
diag_1                          category
diag_2                          category
diag_3                          category
number_diagnoses                   int64
max_glu_serum                   category
A1Cresult                       category
metformin                       category
repaglinide                     category
nateglinide                     category
chlorpropamide                  category
glimepiride                     category
acetohexamide   

In [4]:
diabetes_data.head()
# diabetes_data = diabetes_data.drop(columns=["A1Cresult","max_glu_serum",'metformin',
#  'repaglinide',
#  'nateglinide',
#  'chlorpropamide',
#  'glimepiride',
#  'acetohexamide',
#  'glipizide',
#  'glyburide',
#  'tolbutamide',
#  'pioglitazone',
#  'rosiglitazone',
#  'acarbose',
#  'miglitol',
#  'troglitazone',
#  'tolazamide',
#  'examide',
#  'citoglipton',
#  'insulin',
#  'glyburide-metformin',
#  'glipizide-metformin',
#  'glimepiride-pioglitazone',
#  'metformin-rosiglitazone',
#  'metformin-pioglitazone', ])


# diabetes_data["emergency_inpatient_outpatient"] = (diabetes_data['number_emergency'] +
#                                         diabetes_data['number_inpatient'] +
#                                         diabetes_data['number_outpatient'])
# diabetes_data = diabetes_data.drop(columns=['number_emergency', 'number_inpatient', 'number_outpatient'])
diabetes_data.drop(columns=["diag_1", "diag_2", "diag_3"], inplace=True)

In [5]:
target_data = diabetes_data["readmitted"]
# label_encoder = LabelEncoder()
# target_data = label_encoder.fit_transform(target_data)

Readmittance within 30 days (<30) is replaced with 1 and the values "No" and ">30" is replaced with 0

# We are creating a Binary classification of readmittance value. 


In [6]:




target_data = target_data.replace({'<30': 1, '>30':0, "NO":0})


In [7]:
target_data.unique()

[0, 1]
Categories (2, int64): [1, 0]

In [8]:
features_diabetes_data = diabetes_data.drop(columns="readmitted")
features_diabetes_data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,diabetic_specific_diagnosis,time_spent_hospital_with_med,age_group
1,Caucasian,Female,15,1,1,7,3,59,0,18,...,No,No,No,No,No,Ch,Yes,1,54,young
2,AfricanAmerican,Female,25,1,1,7,2,11,5,13,...,No,No,No,No,No,No,Yes,0,26,young
3,Caucasian,Male,35,1,1,7,2,44,1,16,...,No,No,No,No,No,Ch,Yes,1,32,middle_age
4,Caucasian,Male,45,1,1,7,1,51,0,8,...,No,No,No,No,No,Ch,Yes,0,8,middle_age
5,Caucasian,Male,55,2,1,2,3,31,6,16,...,No,No,No,No,No,No,Yes,0,48,middle_age


In [9]:
#Using the train test split from Sklearn. note that I am using startify because I want the proportions
#of the target to be balanced

In [10]:
x_train, x_test, y_train, y_test = train_test_split(features_diabetes_data,target_data , train_size=.8,random_state=42, stratify=target_data)

In [11]:
# insanity test for confirming the split
print(f"x_train length  is {len(x_train)} which  should be equal to x_train of length {len(y_train)}")

x_train length  is 78500 which  should be equal to x_train of length 78500


In [12]:
x_train.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,diabetic_specific_diagnosis,time_spent_hospital_with_med,age_group
21855,Caucasian,Female,85,3,3,1,10,39,1,14,...,No,No,No,No,No,No,Yes,1,140,Seniors
44806,Caucasian,Female,75,1,1,7,12,41,2,23,...,No,No,No,No,No,Ch,Yes,1,276,Seniors
77799,Caucasian,Female,75,3,1,1,9,52,2,21,...,No,No,No,No,No,Ch,Yes,1,189,Seniors
65752,Caucasian,Male,65,2,1,7,7,40,3,9,...,No,No,No,No,No,No,No,1,63,Seniors
26763,Caucasian,Female,85,1,1,6,4,40,0,12,...,No,No,No,No,No,No,No,1,48,Seniors


In [13]:
y_train.head()

21855    0
44806    0
77799    0
65752    0
26763    0
Name: readmitted, dtype: category
Categories (2, int64): [1, 0]

In [14]:
categorical_features = [feature for feature in x_train.columns[x_train.dtypes=="category"]]
numerical_features = [feature for feature in diabetes_data.columns[diabetes_data.dtypes=="int64"]]

In [15]:
categorical_features

['race',
 'gender',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'age_group']

In [16]:
numerical_features

['age',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'diabetic_specific_diagnosis',
 'time_spent_hospital_with_med']

In [17]:
print(f"Length of {len(categorical_features)} + {len(numerical_features)} = {len(diabetes_data.columns)}")

Length of 30 + 14 = 45


# Numerical and categorical pipeline


In [18]:
log_transformer = FunctionTransformer(np.log1p)
num_pipeline = Pipeline(steps= [
                                ("imputer", SimpleImputer(strategy="mean")),
                                ("log_transform", log_transformer),
                                ("scaler", StandardScaler())
                        
])
cat_pipeline = Pipeline(steps=[
                                ("imputer",SimpleImputer(strategy="most_frequent")),
                                ("one_hot_encode", OneHotEncoder(drop="first", sparse_output=False,handle_unknown='ignore'))
])


# Column transfer to combine pipeline

In [19]:
preprocessor = ColumnTransformer(
                transformers = [ ("number", num_pipeline, numerical_features),
                                ("category", cat_pipeline, categorical_features)
                               ]
                    
)

# bulild a model using Logistic regression

In [20]:
log_reg_pipeline = Pipeline(steps=[("preprocessor", preprocessor),("classifier", LogisticRegression(max_iter=2000))
                                  ])


Now let us fit the pipeline on the data

# i first tried max iter = 1000 but it error out because the iteration was too small to reach a solution so
#I doubled to iteration to 2000

In [21]:
log_reg_pipeline.fit(x_train, y_train)

In [22]:
readmitanc_predict = log_reg_pipeline.predict(x_test)



In [23]:
print(f"Report {classification_report(y_test, readmitanc_predict)}")

Report               precision    recall  f1-score   support

           0       0.89      1.00      0.94     17410
           1       0.25      0.00      0.00      2215

    accuracy                           0.89     19625
   macro avg       0.57      0.50      0.47     19625
weighted avg       0.82      0.89      0.83     19625



In [24]:
print(f"Accuracy is {accuracy_score(y_test, readmitanc_predict)}")

Accuracy is 0.8870318471337579


 There is a huge imbalance in between the two claases. As you can see the model is good at predicting No
readmissions but it terms of predicting readmittance withing 30 days, it does a terrible job. it only predicts 50
percent of readmissions correctly. The recall is worst with predicting 1 percent of the actual readmission.
Therefore, we need to properly balance the classes with SMOT technique.


In [25]:
from imblearn.over_sampling import SMOTE
# sckitlearn pipeline wont work with smot so I am importing Imbpipeline from imbLearn
from imblearn.pipeline import Pipeline as ImbPipeline

In [26]:
y_test[45:55]

92765    0
14837    0
76552    0
76591    0
6077     0
56295    0
43       0
57021    0
67320    0
612      0
Name: readmitted, dtype: category
Categories (2, int64): [1, 0]

# Lets use Random Forest

In [27]:

random_forest_pipeline = ImbPipeline(steps=[("preprocessor", preprocessor),
                                         ("smote", SMOTE(random_state=42, k_neighbors=3)),
                                         ("classifier", RandomForestClassifier(random_state=42))
                                  ])
random_forest_pipeline.fit(x_train, y_train)


In [28]:
# Now predicting
readmitanc_predict_rf = random_forest_pipeline.predict(x_test)



In [29]:
print(f"Accuracy is {accuracy_score(y_test, readmitanc_predict_rf)}")

Accuracy is 0.8853503184713376


In [30]:
print(f"Report {classification_report(y_test, readmitanc_predict_rf)}")

Report               precision    recall  f1-score   support

           0       0.89      1.00      0.94     17410
           1       0.35      0.02      0.03      2215

    accuracy                           0.89     19625
   macro avg       0.62      0.51      0.49     19625
weighted avg       0.83      0.89      0.84     19625



#using class_weight parameter to give more wieght to the minority class and see the difference

In [31]:
rand_forest_cw_included = RandomForestClassifier(class_weight="balanced",random_state=42)
random_forest_pipeline = ImbPipeline(steps=[("preprocessor", preprocessor),
                                         ("smote", SMOTE(random_state=42)),
                                         ("classifier", rand_forest_cw_included)])


In [32]:
random_forest_pipeline.fit(x_train,y_train)
predict_random_forest = random_forest_pipeline.predict(x_test)



In [33]:
print(f"Accuracy is {accuracy_score(y_test, predict_random_forest)}")

Accuracy is 0.8851974522292994


In [34]:
print(f"Report {classification_report(y_test, predict_random_forest)}")

Report               precision    recall  f1-score   support

           0       0.89      0.99      0.94     17410
           1       0.37      0.02      0.04      2215

    accuracy                           0.89     19625
   macro avg       0.63      0.51      0.49     19625
weighted avg       0.83      0.89      0.84     19625



In [35]:
# XGBClassifier boost Pipeline
xgb_pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42))
])



In [36]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "classifier__max_depth": [3, 4, 5, 6],
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    "classifier__n_estimators": [300, 400, 500, 600]
}


grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    cv=5,                   
    scoring="f1",           
    n_jobs=-1,             
    verbose=1
)


In [37]:
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [38]:
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation F1-score:", grid_search.best_score_)

Best parameters found: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 300}
Best cross-validation F1-score: 0.21758864867248695


In [39]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)




In [40]:
print(f"The accuracy score is  {accuracy_score(y_test, y_pred)}")
print(f" {classification_report(y_test, y_pred)}")


The accuracy score is  0.8353630573248407
               precision    recall  f1-score   support

           0       0.90      0.91      0.91     17410
           1       0.24      0.21      0.22      2215

    accuracy                           0.84     19625
   macro avg       0.57      0.56      0.57     19625
weighted avg       0.83      0.84      0.83     19625



For the Numerical Pipeline, I am going to replace logit transformer with FunctionTransformer
and standard Scaler with Robust scaler because the score of 0.24 precision and 0.21 recall for the minority 
class is not effective.

Creating pipeline with robust scaler and Function Transformer

In [88]:
from sklearn.preprocessing import FunctionTransformer, RobustScaler
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
#     ("log_transform", FunctionTransformer(np.log1p, validate=False)),
    ("scaler", RobustScaler())
])

Also creating a new category pipeline to handle uknow with "infrequent_if_exist" due to warning messages
while running grid search

In [89]:
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    #changing handle_unknown to "ignore"
    ("one_hot_encode", OneHotEncoder(drop="first", sparse_output=False, handle_unknown='ignore'))
])


In [90]:
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, numerical_features),
    ("cat", cat_pipeline, categorical_features)
])

In [91]:
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

In [92]:

xgb_pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
#     ("smot", SMOTE(sampling_strategy=0.3 , random_state=42)),
    ("classifier", XGBClassifier( 
        objective="binary:logistic",
        scale_pos_weight=scale_pos_weight,
        use_label_encoder=False, 
#         eval_metric="mlogloss",
        eval_metric="logloss",
        random_state=42,
        reg_lambda=1.0, 
        reg_alpha=0.1
    ))
])

In [93]:
# param_grid = {
#     "classifier__max_depth": [3, 5],
#     "classifier__learning_rate": [0.01, 0.1],
#     "classifier__n_estimators": [200, 300],
#     "classifier__subsample": [0.8, 1.0],
#     "classifier__colsample_bytree": [0.8, 1.0]
# }

In [94]:
param_dist = {
    "classifier__max_depth": [3, 5, 7],
    "classifier__learning_rate": [0.01, 0.05],
    "classifier__n_estimators": [100, 200],
    "classifier__subsample": [0.8, 1.0],
    "classifier__colsample_bytree": [0.8, 1.0]
}

In [95]:
# from sklearn.model_selection import StratifiedKFold
# grid_search = GridSearchCV(
#     estimator=xgb_pipeline,
#     param_grid=param_grid,
#     cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
#     scoring="recall",  
#     n_jobs=-1,
#     verbose=1
# )

In [96]:
random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=10,  # Only try 10 random combinations
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),  # Reduced to 3 folds
    scoring="recall",
    n_jobs=-1,
    verbose=1,
    random_state=42
)

In [97]:
# grid_search.fit(x_train, y_train)

In [98]:
random_search.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [99]:
# best_model = grid_search.best_estimator_

In [100]:
best_model = random_search.best_estimator_

In [101]:
y_pred = best_model.predict(x_test)



In [102]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.56      0.70     17410
           1       0.16      0.67      0.26      2215

    accuracy                           0.57     19625
   macro avg       0.55      0.61      0.48     19625
weighted avg       0.84      0.57      0.65     19625



In [103]:
print(f"The accuracy score is  {accuracy_score(y_test, y_pred)}")

The accuracy score is  0.5725859872611465
