In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Hyper parameter Tuning

In [2]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/lending_club_loan_two.csv")

print('First five rows:\n',df.head(5))

First five rows:
    loan_amnt        term  int_rate  installment grade sub_grade  \
0    10000.0   36 months     11.44       329.48     B        B4   
1     8000.0   36 months     11.99       265.68     B        B5   
2    15600.0   36 months     10.49       506.97     B        B3   
3     7200.0   36 months      6.49       220.65     A        A2   
4    24375.0   60 months     17.27       609.33     C        C5   

                 emp_title emp_length home_ownership  annual_inc  ...  \
0                Marketing  10+ years           RENT    117000.0  ...   
1          Credit analyst     4 years       MORTGAGE     65000.0  ...   
2             Statistician   < 1 year           RENT     43057.0  ...   
3          Client Advocate    6 years           RENT     54000.0  ...   
4  Destiny Management Inc.    9 years       MORTGAGE     55000.0  ...   

  open_acc pub_rec revol_bal revol_util total_acc  initial_list_status  \
0     16.0     0.0   36369.0       41.8      25.0                 

In [3]:
#  HANDLE MISSING VALUES
# =======================
# Numerical: fill with median
num_cols = df.select_dtypes(include=["float64", "int64"]).columns
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Categorical: fill with mode
cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

print("\nAfter handling missing values:\n", df.isnull().sum().sum(), " missing values remain")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)



After handling missing values:
 0  missing values remain


In [4]:
#  Feature Engineering
# ----------------------------
# Convert term: "36 months" â†’ 36
df["term"] = df["term"].str.replace(" months", "").astype(int)

# Convert emp_length: "10+ years", "< 1 year", etc.
def clean_emp_length(x):
    if "<" in str(x):
        return 0
    elif "10+" in str(x):
        return 10
    elif x == "n/a":
        return 0
    else:
        try:
            return int(x.split()[0])
        except:
            return 0

df["emp_length"] = df["emp_length"].apply(clean_emp_length)


# Drop irrelevant / high-cardinality columns
df.drop(columns=["address", "emp_title", "title"], inplace=True, errors="ignore")

print("Feature engineering done")

Feature engineering done


In [5]:
#  Scale Numerical Features
# ----------------------------
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_cols = df.select_dtypes(include=["float64", "int64"]).columns

df[num_cols] = scaler.fit_transform(df[num_cols])
print(df)
print("Scaling done")

        loan_amnt      term  int_rate  installment grade sub_grade  \
0       -0.492243 -0.557975 -0.491799    -0.408291     B        B4   
1       -0.731551 -0.557975 -0.368816    -0.662750     B        B5   
2        0.177819 -0.557975 -0.704225     0.299609     B        B3   
3       -0.827274 -0.557975 -1.598649    -0.842348     A        A2   
4        1.227783  1.792196  0.811824     0.707861     C        C5   
...           ...       ...       ...          ...   ...       ...   
396025  -0.492243  1.792196 -0.592422    -0.855390     B        B4   
396026   0.823951 -0.557975 -0.301734     1.071164     C        C1   
396027  -1.090513 -0.557975 -0.816028    -1.078979     B        B1   
396028   0.823951  1.792196  0.373556     0.283855     C        C2   
396029  -1.449475 -0.557975 -0.006574    -1.451256     C        C2   

        emp_length home_ownership  annual_inc verification_status  ...  \
0         1.058091           RENT    0.694330        Not Verified  ...   
1        -0

In [6]:
# Encoding target colum

df['loan_status'] = df['loan_status'].map({'Fully Paid':0, 'Charged Off':1})

In [7]:
# Encode Categorical Features (One-Hot)
df = pd.get_dummies(df, drop_first=True)
print("âœ… One-hot encoding done. New shape:", df.shape)

âœ… One-hot encoding done. New shape: (396030, 875)


In [8]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ----------------------------
#  Train-Test Split
# ----------------------------
# Target column = loan_status (assume itâ€™s binary: Fully Paid / Charged Off etc.)
X = df.drop("loan_status", axis=1)
y = df["loan_status"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
#base model
base_model = lgb.LGBMClassifier(objective='binary', metric='auc', random_state=42)


In [25]:
#grid search parameters
param_grid = {
    'num_leaves': [31, 50],
    'max_depth': [5, 10],
    'learning_rate': [ 0.05, 0.1],
    'n_estimators': [100, 200],
    'min_data_in_leaf': [20, 40]
}



In [20]:
#random search parameters
param_dist = {
    'num_leaves': np.arange(20, 100, 10),
    'max_depth': np.arange(3, 15, 2),
    'learning_rate': np.linspace(0.01, 0.2, 10),
    'n_estimators': np.arange(100, 500, 50),
    'min_data_in_leaf': np.arange(10, 100, 10)
}


In [26]:
#grid search
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_grid = grid_search.best_estimator_


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[LightGBM] [Info] Number of positive: 62138, number of negative: 254686
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3275
[LightGBM] [Info] Number of data points in the train set: 316824, number of used features: 648
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.196128 -> initscore=-1.410674
[LightGBM] [Info] Start training from score -1.410674


In [24]:
#random search
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=5,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)
best_random = random_search.best_estimator_


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Number of positive: 62138, number of negative: 254686
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3155
[LightGBM] [Info] Number of data points in the train set: 316824, number of used features: 588
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.196128 -> initscore=-1.410674
[LightGBM] [Info] Start training from score -1.410674


In [27]:
#evaluate
# Predict probabilities
y_prob_grid = best_grid.predict_proba(X_test)[:, 1]
y_prob_random = best_random.predict_proba(X_test)[:, 1]

# AUC Scores
auc_grid = roc_auc_score(y_test, y_prob_grid)
auc_random = roc_auc_score(y_test, y_prob_random)

print("âœ… Best Parameters (Grid Search):", grid_search.best_params_)
print("âœ… Best Parameters (Random Search):", random_search.best_params_)
print("ðŸŽ¯ AUC (Grid Search):", round(auc_grid, 4))
print("ðŸŽ¯ AUC (Random Search):", round(auc_random, 4))


âœ… Best Parameters (Grid Search): {'learning_rate': 0.05, 'max_depth': 10, 'min_data_in_leaf': 40, 'n_estimators': 200, 'num_leaves': 50}
âœ… Best Parameters (Random Search): {'num_leaves': np.int64(30), 'n_estimators': np.int64(450), 'min_data_in_leaf': np.int64(80), 'max_depth': np.int64(11), 'learning_rate': np.float64(0.03111111111111111)}
ðŸŽ¯ AUC (Grid Search): 0.7288
ðŸŽ¯ AUC (Random Search): 0.7296
