In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
import optuna

In [2]:
# ! pip install xgboost
# ! pip install optuna
# ! pip install lightgbm

## Read Data

In [3]:
train_df = pd.read_csv(os.path.join('data', 'train.csv'))
label_df = pd.read_csv(os.path.join('data', 'train_churn_labels.csv'))

train_fnl_df = pd.concat([train_df, label_df], axis=1)

In [4]:
train_fnl_df.shape

(50000, 231)

In [5]:
train_fnl_df.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,Label
0,,,,,,1526.0,7.0,,,,...,fXVEsaq,jySVZNlOJy,,,xb3V,RAYp,F2FyR07IdsN7I,,,-1
1,,,,,,525.0,0.0,,,,...,2Kb5FSF,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,,1
2,,,,,,5236.0,7.0,,,,...,NKv4yOc,jySVZNlOJy,,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,,-1
3,,,,,,,0.0,,,,...,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,,-1
4,,,,,,1029.0,7.0,,,,...,1J2cvxe,LM8l689qOp,,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,,-1


In [6]:
train_fnl_df["Label"] = train_fnl_df["Label"].replace({-1: 0, 1: 1})

In [7]:
train_fnl_df["Label"].value_counts(normalize=True)

Label
0    0.92656
1    0.07344
Name: proportion, dtype: float64

## Drop columns cols having null values greater than 50%

In [8]:
def missing_values(df):
    """
    Returns a DataFrame with the count and percentage of missing values in each column.
    """
    total = df.isnull().sum()
    percent = (total / len(df)) * 100

    mis_val_tbl = pd.concat([total, percent], axis=1)
    mis_val_tbl.columns = ['Missing Values', 'Percentage']

    mis_val_tbl = mis_val_tbl[mis_val_tbl.iloc[:,1]!=0].sort_values('Percentage', ascending=False)
    return mis_val_tbl.reset_index()

In [9]:
miss_df = missing_values(train_fnl_df)

In [10]:
drop_cols = miss_df[miss_df["Percentage"]>50]["index"].values

In [11]:
train_fnl_df.drop(columns=drop_cols, inplace=True)

In [12]:
train_fnl_df.shape

(50000, 70)

In [13]:
X = train_fnl_df.drop(columns=["Label"])
y = train_fnl_df["Label"]

In [14]:
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(include="object").columns.tolist()

## Feature Eng using sklearn pipeline

In [15]:
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        # If input is ndarray, use feature_names_in_ if set, else fallback to range
        if isinstance(X, np.ndarray):
            if self.feature_names_in_ is not None:
                X = pd.DataFrame(X, columns=self.feature_names_in_)
            else:
                X = pd.DataFrame(X)
        self.feature_names_in_ = list(getattr(X, 'columns', []))
        for col in X.columns:
            le = LabelEncoder()
            le.fit(X[col].astype(str))
            self.encoders[col] = le
        return self

    def transform(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_names_in_)
        X_encoded = X.copy()
        for col in X.columns:
            X_encoded[col] = self.encoders[col].transform(X_encoded[col].astype(str))
        return X_encoded

    def get_feature_names_out(self, input_features=None):
        if self.feature_names_in_ is not None:
            return list(self.feature_names_in_)
        elif input_features is not None:
            return list(input_features)
        else:
            return None



In [16]:
class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.7):
        self.threshold = threshold
        self.to_drop_ = None
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        if isinstance(X, np.ndarray):
            if self.feature_names_in_ is not None:
                X = pd.DataFrame(X, columns=self.feature_names_in_)
            else:
                X = pd.DataFrame(X)
        self.feature_names_in_ = list(getattr(X, 'columns', []))
        corr_matrix = X.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        return self

    def transform(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_names_in_)
        return X.drop(columns=self.to_drop_, errors='ignore')

    def get_feature_names_out(self, input_features=None):
        if self.feature_names_in_ is None:
            return None
        return [col for col in self.feature_names_in_ if col not in (self.to_drop_ or [])]

In [17]:
preprocessor = ColumnTransformer([
    ("num_imputer", SimpleImputer(strategy="mean"), num_cols),
    ("cat_imputer", SimpleImputer(strategy="most_frequent"), cat_cols),
], remainder="drop")

In [18]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

# Numeric pipeline: impute, scale, then drop correlated features
num_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
    DropCorrelatedFeatures(threshold=0.7)
 )

# Categorical pipeline: impute then label encode (use MultiColumnLabelEncoder directly)
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    MultiColumnLabelEncoder()
 )

# ColumnTransformer to apply pipelines
preprocessor = make_column_transformer(
    (num_pipeline, num_cols),
    (cat_pipeline, cat_cols),
    remainder="drop"
 )

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(random_state=42))
 ])

# Ensure all pipeline steps output pandas DataFrames
pipeline.set_output(transform="pandas")

In [19]:
pipeline.fit(X, y)

In [20]:
# Get numeric feature names after DropCorrelatedFeatures
num_feature_names = pipeline.named_steps['preprocessor'].transformers_[0][1].named_steps['dropcorrelatedfeatures'].get_feature_names_out()

# Get categorical feature names after MultiColumnLabelEncoder
cat_feature_names = pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['multicolumnlabelencoder'].get_feature_names_out()

# Combine feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names)


In [21]:
all_feature_names

['Var6',
 'Var7',
 'Var13',
 'Var21',
 'Var24',
 'Var28',
 'Var35',
 'Var38',
 'Var44',
 'Var57',
 'Var65',
 'Var72',
 'Var73',
 'Var74',
 'Var76',
 'Var78',
 'Var81',
 'Var94',
 'Var113',
 'Var125',
 'Var126',
 'Var133',
 'Var134',
 'Var140',
 'Var143',
 'Var144',
 'Var149',
 'Var153',
 'Var163',
 'Var173',
 'Var181',
 'Var192',
 'Var193',
 'Var195',
 'Var196',
 'Var197',
 'Var198',
 'Var199',
 'Var202',
 'Var203',
 'Var204',
 'Var205',
 'Var206',
 'Var207',
 'Var208',
 'Var210',
 'Var211',
 'Var212',
 'Var216',
 'Var217',
 'Var218',
 'Var219',
 'Var220',
 'Var221',
 'Var222',
 'Var223',
 'Var226',
 'Var227',
 'Var228']

## Get Feature Importance

In [22]:
importances = pipeline.named_steps["rf"].feature_importances_

feature_importance_df = pd.DataFrame({
    "feature": all_feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

feature_importance_df

Unnamed: 0,feature,importance
18,Var113,0.040832
9,Var57,0.036914
37,Var199,0.036065
38,Var202,0.035203
49,Var217,0.034256
54,Var222,0.033885
36,Var198,0.033775
52,Var220,0.033613
20,Var126,0.032581
31,Var192,0.032443


## Select Features having 80% importance

In [23]:
# Calculate cumulative importance
feature_importance_df = feature_importance_df.reset_index(drop=True)
feature_importance_df['cumulative_importance'] = feature_importance_df['importance'].cumsum()

# Select features that capture 80% of the importance
selected_features = feature_importance_df[feature_importance_df['cumulative_importance'] <= 0.8]['feature'].tolist()
# If not enough features, include the next one to cross 80%
if len(selected_features) < len(feature_importance_df):
    selected_features.append(feature_importance_df.iloc[len(selected_features)]['feature'])


In [24]:
len(selected_features)

28

## Modeling

In [25]:
# Get the transformed features (as a pandas DataFrame)
X_transformed = pipeline.named_steps["preprocessor"].transform(X)

# Remove pipeline prefixes from column names if present
def strip_pipeline_prefix(col):
    if isinstance(col, str) and "__" in col:
        return col.split("__", 1)[-1]
    return col

X_transformed.columns = [strip_pipeline_prefix(col) for col in X_transformed.columns]

X_transformed_df = X_transformed
X_transformed_df

Unnamed: 0,Var6,Var7,Var13,Var21,Var24,Var28,Var35,Var38,Var44,Var57,...,Var217,Var218,Var219,Var220,Var221,Var222,Var223,Var226,Var227,Var228
0,0.078791,3.193535e-02,-0.404348,4.302500e-01,1.033675,-6.200792e-01,-2.522251e-01,-0.902022,-0.107944,0.278694,...,12236,1,11,94,4,2964,3,22,2,8
1,-0.316420,-1.141517e+00,-0.474163,-1.247134e-01,-0.273110,1.380519e+00,-2.522251e-01,0.765547,-0.107944,0.935760,...,9781,1,11,10,4,125,0,14,2,8
2,1.543559,3.193535e-02,-0.131163,1.832657e+00,2.340460,-4.737905e-02,-2.522251e-01,1.157425,-0.107944,1.523967,...,8473,0,11,1293,0,1610,3,10,0,25
3,0.000000,-1.141517e+00,-0.474163,5.328719e-17,0.000000,-2.166113e+00,-2.522251e-01,-0.903273,-0.107944,-0.752303,...,9161,0,11,1444,4,817,0,7,2,8
4,-0.117433,3.193535e-02,0.746067,-3.197006e-01,-0.055313,-2.622486e-01,-2.522251e-01,-0.903273,-0.107944,0.513428,...,3834,0,11,1731,4,65,0,7,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-0.382749,-1.141517e+00,-0.474163,-1.922090e-01,-0.273110,6.802669e-01,-2.522251e-01,1.212945,-0.107944,-0.372362,...,7428,1,11,257,4,969,0,3,2,8
49996,-0.098087,-1.141517e+00,0.563943,2.727603e-01,-0.273110,-6.200792e-01,-2.522251e-01,-0.903273,-0.107944,-1.440056,...,1059,0,11,1512,4,1125,0,7,2,2
49997,0.584550,3.193535e-02,0.079797,6.252371e-01,-0.055313,-6.200792e-01,-2.522251e-01,-0.888490,-0.107944,1.511313,...,3040,0,11,1041,4,2834,0,14,2,14
49998,0.000000,-1.488907e-16,0.000000,5.328719e-17,0.000000,3.041315e-16,-3.906559e-17,0.000000,0.000000,1.505408,...,1874,0,11,837,4,142,0,7,2,8


In [26]:
X_transformed_df

Unnamed: 0,Var6,Var7,Var13,Var21,Var24,Var28,Var35,Var38,Var44,Var57,...,Var217,Var218,Var219,Var220,Var221,Var222,Var223,Var226,Var227,Var228
0,0.078791,3.193535e-02,-0.404348,4.302500e-01,1.033675,-6.200792e-01,-2.522251e-01,-0.902022,-0.107944,0.278694,...,12236,1,11,94,4,2964,3,22,2,8
1,-0.316420,-1.141517e+00,-0.474163,-1.247134e-01,-0.273110,1.380519e+00,-2.522251e-01,0.765547,-0.107944,0.935760,...,9781,1,11,10,4,125,0,14,2,8
2,1.543559,3.193535e-02,-0.131163,1.832657e+00,2.340460,-4.737905e-02,-2.522251e-01,1.157425,-0.107944,1.523967,...,8473,0,11,1293,0,1610,3,10,0,25
3,0.000000,-1.141517e+00,-0.474163,5.328719e-17,0.000000,-2.166113e+00,-2.522251e-01,-0.903273,-0.107944,-0.752303,...,9161,0,11,1444,4,817,0,7,2,8
4,-0.117433,3.193535e-02,0.746067,-3.197006e-01,-0.055313,-2.622486e-01,-2.522251e-01,-0.903273,-0.107944,0.513428,...,3834,0,11,1731,4,65,0,7,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-0.382749,-1.141517e+00,-0.474163,-1.922090e-01,-0.273110,6.802669e-01,-2.522251e-01,1.212945,-0.107944,-0.372362,...,7428,1,11,257,4,969,0,3,2,8
49996,-0.098087,-1.141517e+00,0.563943,2.727603e-01,-0.273110,-6.200792e-01,-2.522251e-01,-0.903273,-0.107944,-1.440056,...,1059,0,11,1512,4,1125,0,7,2,2
49997,0.584550,3.193535e-02,0.079797,6.252371e-01,-0.055313,-6.200792e-01,-2.522251e-01,-0.888490,-0.107944,1.511313,...,3040,0,11,1041,4,2834,0,14,2,14
49998,0.000000,-1.488907e-16,0.000000,5.328719e-17,0.000000,3.041315e-16,-3.906559e-17,0.000000,0.000000,1.505408,...,1874,0,11,837,4,142,0,7,2,8


In [27]:
selected_features

['Var113',
 'Var57',
 'Var199',
 'Var202',
 'Var217',
 'Var222',
 'Var198',
 'Var220',
 'Var126',
 'Var192',
 'Var204',
 'Var81',
 'Var153',
 'Var6',
 'Var197',
 'Var216',
 'Var133',
 'Var73',
 'Var21',
 'Var134',
 'Var28',
 'Var38',
 'Var76',
 'Var94',
 'Var226',
 'Var125',
 'Var163',
 'Var13']

In [28]:
X_selected = X_transformed_df[selected_features]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42)

In [30]:
print(f"Shape of training data: {X_train.shape},\nShape of test data: {X_test.shape}")

Shape of training data: (35000, 28),
Shape of test data: (15000, 28)


## XG Boost Model

In [31]:
# Fit XGBoost
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Train Prediction

In [32]:
y_train_pred = xgb_model.predict(X_train)

# Classification report
print("Classification Report:\n", classification_report(y_train, y_train_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     32434
           1       1.00      0.50      0.67      2566

    accuracy                           0.96     35000
   macro avg       0.98      0.75      0.82     35000
weighted avg       0.96      0.96      0.96     35000



## Test Prediction

In [33]:
y_test_pred = xgb_model.predict(X_test)
y_test_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Classification report
print("Classification Report:\n", classification_report(y_test, y_test_pred))

# AUC-ROC score
auc = roc_auc_score(y_test, y_test_pred_proba)
print(f"AUC-ROC Score: {auc:.4f}")

Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     13894
           1       0.39      0.03      0.05      1106

    accuracy                           0.93     15000
   macro avg       0.66      0.51      0.51     15000
weighted avg       0.89      0.93      0.89     15000

AUC-ROC Score: 0.6737


## Hyper-param Tuning

In [34]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print('Best trial:')
print(study.best_trial.params)

# Train final model with best params
best_params = study.best_trial.params
best_params['random_state'] = 42
best_params['use_label_encoder'] = False
best_params['eval_metric'] = 'logloss'

[I 2025-07-15 23:37:00,161] A new study created in memory with name: no-name-c0f42d36-2d23-43aa-84ad-0fe1b28dfca3
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-15 23:37:01,158] Trial 0 finished with value: 0.7032047215666226 and parameters: {'n_estimators': 141, 'max_depth': 6, 'learning_rate': 0.1794771830847651, 'subsample': 0.6390778408722642, 'colsample_bytree': 0.7758005419768825, 'gamma': 4.024553112107021, 'reg_alpha': 1.6630340234170682, 'reg_lambda': 2.8122181407651374}. Best is trial 0 with value: 0.7032047215666226.
[I 2025-07-15 23:37:01,158] Trial 0 finished with value: 0.7032047215666226 and parameters: {'n_estimators': 141, 'max_depth': 6, 'learning_rate': 0.1794771830847651, 'subsample': 0.6390778408722642, 'colsample_bytree': 0.7758005419768825, 'gamma': 4.024553112107021, 'reg_alpha': 1.6630340234170682, 'reg_lambda

Best trial:
{'n_estimators': 273, 'max_depth': 3, 'learning_rate': 0.06309350559680499, 'subsample': 0.8663343470444552, 'colsample_bytree': 0.8357769173807776, 'gamma': 1.0216922285082934, 'reg_alpha': 2.618458383109329, 'reg_lambda': 3.797118344624291}


## Get the best params using Optuna

In [35]:
xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Test Prediction after XG-Boost Hyper-param

In [36]:
# Evaluate
y_pred = xgb_best.predict(X_test)
y_pred_proba = xgb_best.predict_proba(X_test)[:, 1]
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     13894
           1       1.00      0.00      0.00      1106

    accuracy                           0.93     15000
   macro avg       0.96      0.50      0.48     15000
weighted avg       0.93      0.93      0.89     15000



## Apply SMOTE

In [37]:
# Apply SMOTE to balance the training data
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Retrain XGBoost with best params on resampled data
xgb_best_smote = XGBClassifier(**best_params)
xgb_best_smote.fit(X_train_res, y_train_res)


y_pred = xgb_best_smote.predict(X_test)
y_pred_proba = xgb_best_smote.predict_proba(X_test)[:, 1]
print("Classification Report after SMOTE:\n", classification_report(y_test, y_pred))
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC Score after SMOTE: {auc:.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report after SMOTE:
               precision    recall  f1-score   support

           0       0.93      0.90      0.92     13894
           1       0.12      0.17      0.14      1106

    accuracy                           0.85     15000
   macro avg       0.53      0.54      0.53     15000
weighted avg       0.87      0.85      0.86     15000

AUC-ROC Score after SMOTE: 0.6233


## Apply Class Weights

In [38]:
# Compute class weights for XGBoost
from collections import Counter

# Calculate scale_pos_weight for XGBoost (ratio of negative to positive class)
class_counts = Counter(y_train_res)
scale_pos_weight = class_counts[0] / class_counts[1] if class_counts[1] != 0 else 1

# Add scale_pos_weight to best_params
tuned_params = best_params.copy()
tuned_params['scale_pos_weight'] = scale_pos_weight

# Retrain XGBoost with class weights
xgb_best_weighted = XGBClassifier(**tuned_params)
xgb_best_weighted.fit(X_train_res, y_train_res)

# Evaluate
y_pred = xgb_best_weighted.predict(X_test)
y_pred_proba = xgb_best_weighted.predict_proba(X_test)[:, 1]
print("Classification Report with class weights:\n", classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report with class weights:
               precision    recall  f1-score   support

           0       0.93      0.90      0.92     13894
           1       0.12      0.17      0.14      1106

    accuracy                           0.85     15000
   macro avg       0.53      0.54      0.53     15000
weighted avg       0.87      0.85      0.86     15000



## Test Prediction by varying thershold

In [39]:
# Tune classification threshold for best F1-score on test set
import numpy as np
from sklearn.metrics import f1_score

probas = xgb_best_weighted.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.1, 0.91, 0.01)
f1_scores = [f1_score(y_test, (probas >= t).astype(int)) for t in thresholds]
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f'Best threshold: {best_threshold:.2f}, Best F1-score: {best_f1:.4f}')

# Classification report at best threshold
y_pred_best = (probas >= best_threshold).astype(int)
print('Classification Report at best threshold:')
print(classification_report(y_test, y_pred_best))

Best threshold: 0.30, Best F1-score: 0.1785
Classification Report at best threshold:
              precision    recall  f1-score   support

           0       0.94      0.72      0.82     13894
           1       0.11      0.44      0.18      1106

    accuracy                           0.70     15000
   macro avg       0.53      0.58      0.50     15000
weighted avg       0.88      0.70      0.77     15000



## Apply Light gbm

In [40]:
# Train and evaluate LightGBM on resampled data
# !pip install lightgbm
import lightgbm as lgb

# Compute class weights for LightGBM
from collections import Counter
class_counts = Counter(y_train_res)
class_weight = {0: class_counts[1]/class_counts[0], 1: 1.0} if class_counts[0] != 0 else {0: 1.0, 1: 1.0}

lgbm = lgb.LGBMClassifier(random_state=42, class_weight=class_weight)
lgbm.fit(X_train_res, y_train_res)

# Predict probabilities and tune threshold
probas = lgbm.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.1, 0.91, 0.01)
f1_scores = [f1_score(y_test, (probas >= t).astype(int)) for t in thresholds]
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f'Best threshold (LightGBM): {best_threshold:.2f}, Best F1-score: {best_f1:.4f}')

y_pred_best = (probas >= best_threshold).astype(int)
print('Classification Report (LightGBM) at best threshold:')
print(classification_report(y_test, y_pred_best))

[LightGBM] [Info] Number of positive: 32434, number of negative: 32434
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008786 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6712
[LightGBM] [Info] Number of data points in the train set: 64868, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best threshold (LightGBM): 0.21, Best F1-score: 0.1922
Classification Report (LightGBM) at best threshold:
              precision    recall  f1-score   support

           0       0.94      0.81      0.87     13894
           1       0.13      0.36      0.19      1106

    accuracy                           0.78     15000
   macro avg       0.54      0.59      0.53     15000
weighted avg       0.88      0.78      0.82     15000

Best threshold (LightGBM): 0.21, Best F1-score: 0.1922
Classification Report (LightGBM) at best threshold:
              precis