In [14]:
import warnings

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


In [15]:
warnings.filterwarnings("ignore")

train_df = pd.read_csv('data/loan_sanction_train.csv')

# The original test data doesn't contain the Loan_Status field
# Nevertheless loading it to construct a test set for another algorithm
test_df = pd.read_csv('data/loan_sanction_test.csv')

for df in [train_df, test_df]:
    # Convert categorical variables into numeric
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
    df['Education'] = df['Education'].map({'Graduate': 1, 'Not Graduate': 0})
    df['Self_Employed'] = df['Self_Employed'].map({'Yes': 1, 'No': 0})
    df['Property_Area'] = df['Property_Area'].map({'Urban': 2, 'Semiurban': 1, 'Rural': 0})
    df['Dependents'] = df['Dependents'].replace('3+', 3)

    # Fill missing values. Do it after converting categorical values into numeric
    df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace=True)
    df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace=True)
    df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)
    df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
    df['Married'].fillna(df['Married'].mode()[0], inplace=True)
    df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
    df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)

    # Create extra features that can be useful and meaningful
    df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    df['LoanIncomeRatio'] = df['LoanAmount'] / df['TotalIncome']

# Convert the output variable into numeric
train_df['Loan_Status'] = train_df['Loan_Status'].map({'Y': 1, 'N': 0})
X = train_df.drop(columns=['Loan_ID', 'Loan_Status'])
y = train_df['Loan_Status']

# Carry out train/test split from the given training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)
print(lr_accuracy)

              precision    recall  f1-score   support

           0       0.90      0.42      0.57        43
           1       0.76      0.97      0.85        80

    accuracy                           0.78       123
   macro avg       0.83      0.70      0.71       123
weighted avg       0.81      0.78      0.75       123

0.7804878048780488


In [17]:
def optimize_logistic_regression(C, max_iter):
    model = LogisticRegression(
        C=C,
        max_iter=int(max_iter),
        random_state=42,
        solver='liblinear'  # You can change the solver if needed
    )
    return cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()


In [18]:
param_space = {
    'C': (0.01, 10),  # Regularization strength
    'max_iter': (100, 1000)  # Number of iterations
}


In [19]:
optimizer = BayesianOptimization(
    f=optimize_logistic_regression,
    pbounds=param_space,
    random_state=42
)

# Start the optimization process
optimizer.maximize(init_points=5, n_iter=25)

|   iter    |  target   |     C     | max_iter  |
-------------------------------------------------
| [39m1        [39m | [39m0.8146   [39m | [39m3.752    [39m | [39m955.6    [39m |
| [39m2        [39m | [39m0.8126   [39m | [39m7.323    [39m | [39m638.8    [39m |
| [39m3        [39m | [39m0.8126   [39m | [39m1.569    [39m | [39m240.4    [39m |
| [39m4        [39m | [39m0.8105   [39m | [39m0.5903   [39m | [39m879.6    [39m |
| [39m5        [39m | [39m0.8105   [39m | [39m6.015    [39m | [39m737.3    [39m |
| [39m6        [39m | [39m0.8105   [39m | [39m3.059    [39m | [39m956.3    [39m |
| [39m7        [39m | [39m0.8126   [39m | [39m1.378    [39m | [39m240.5    [39m |
| [39m8        [39m | [39m0.8105   [39m | [39m4.033    [39m | [39m955.4    [39m |
| [39m9        [39m | [39m0.8126   [39m | [39m3.672    [39m | [39m955.7    [39m |
| [39m10       [39m | [39m0.8126   [39m | [39m4.44     [39m | [39m820.0    [39m |


In [20]:
best_params = optimizer.max['params']
best_model = LogisticRegression(
    C=best_params['C'],
    max_iter=int(best_params['max_iter']),
    random_state=42,
    solver='liblinear'
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Logistic Regression Accuracy: {accuracy}")


Optimized Logistic Regression Accuracy: 0.7886178861788617
