## Importing Essential Libraries

In [None]:
# Numerical and Data Manipulation 
import numpy as np
import pandas as pd 

# Data Visualization
import matplotlib.pyplot as plt 
import seaborn as sns

# Scikit-learn Pipeline and Preprocessing 
from sklearn.pipeline import Pipeline            
from sklearn.compose import ColumnTransformer         

#Evaluation Metrics
from sklearn.metrics import (
    confusion_matrix,
    classification_report, 
    accuracy_score,
    auc,
    roc_curve
)

from sklearn.model_selection import (
    train_test_split,            # To split data into training and test sets
    GridSearchCV,
    learning_curve
)

# Models
from sklearn.linear_model import LogisticRegression      
from sklearn.ensemble import RandomForestClassifier

# Data Preprocessing 
from sklearn.impute import SimpleImputer                 # To fill in missing values
from sklearn.preprocessing import (
    StandardScaler,              # To standardize features (zero mean, unit variance)
    OneHotEncoder                # To encode categorical variables as binary vectors
)

import warnings
warnings.filterwarnings("ignore")  # Ignoring warning messages


## Reading the data

In [None]:
# Load the raw loan records; the path is relative to the notebook's working directory.
DATA_PATH = "loan_data.csv"
df = pd.read_csv(DATA_PATH)

## Dataset Attributes:

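The dataset is described as containing the following attributes. **Note:** the code below references columns such as `person_gender`, `person_education`, `person_home_ownership`, `loan_intent`, `previous_loan_defaults_on_file`, and `loan_status`, none of which appear in this list — verify the list against `df.columns` before relying on it.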

Loan_ID – Unique identifier for each loan application

Gender – Gender of the applicant (Male/Female)

Married – Marital status of the applicant (Yes/No)

Dependents – Number of dependents (e.g., 0, 1, 2, 3+)

Education – Education level (Graduate/Not Graduate)

Self_Employed – Employment status (Yes/No)

ApplicantIncome – Monthly income of the applicant (numeric)

CoapplicantIncome – Monthly income of the co-applicant (numeric)

LoanAmount – Loan amount requested (in thousands)

Loan_Amount_Term – Duration of the loan in days

Credit_History – Credit history (1 = Good, 0 = Bad)

Property_Area – Area type where the property is located (Urban, Semiurban, Rural)



## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first eight rows; the bare last expression gets the rich table display.
print("Looking at the dataset:")
df.head(n=8)

In [None]:
# Report (rows, columns). The original format string carried a stray ":" after the
# newline, which was printed in front of the tuple.
print(f"dataset shape:\n{df.shape}")

In [None]:
# Separate features from the target by column NAME rather than by position.
# Later cells refer to the target explicitly as "loan_status", so selecting it
# positionally (last column) would silently pick the wrong target if the column
# order of the CSV ever changed.
X = df.loc[:, df.columns != "loan_status"]
y = df["loan_status"]

In [None]:
# Count how many rows fall into each target class.
target_counts = y.value_counts()
print("Target distribution:\n", target_counts)

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# embedding it in an f-string printed the summary first and then a literal
# "None" under the heading. Print the heading, then call info() on its own.
print("dataset information:")
df.info()

In [None]:
# Number of null entries in each column.
missing_per_column = df.isna().sum()
print(f"Missing values:\n{missing_per_column}")

In [None]:
# Summary statistics of the feature frame (describe() covers numeric columns by default).
print("Description of Numerical Features:")
numeric_summary = X.describe()
numeric_summary

In [None]:
print("Creating the Correlation Heatmap:")

# Pairwise correlations between the numeric columns only.
corr = df.corr(numeric_only=True)

# Draw the annotated matrix on an explicit Axes instead of the implicit current one.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features")
fig.tight_layout()
plt.show()

In [None]:
# Categorical columns whose distribution is plotted against the target.
categorical_features = [
    "person_gender",
    "person_education",
    "person_home_ownership",
    "loan_intent",
    "previous_loan_defaults_on_file",
]

# Size the grid from the number of features instead of hard-coding 3 rows, so
# adding a feature can never index past the last axis. For the five features
# above this yields the same 3x2 grid and (16, 14) figure as before.
n_cols = 2
n_rows = max(1, (len(categorical_features) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(16, 14 * n_rows / 3), squeeze=False
)
axes = axes.flatten()

# One count plot per feature, bars split by loan status.
for ax, feature in zip(axes, categorical_features):
    sns.countplot(data=df, x=feature, hue="loan_status", ax=ax)
    ax.set_title(f"Loan Status by {feature}")
    ax.tick_params(axis="x", rotation=45)

# Remove EVERY panel that did not receive a plot. The previous odd/even check
# removed only the last axis and left empty panels for other feature counts.
for unused_ax in axes[len(categorical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## Preprocessing and Pipeline Definition

In [None]:
# Build the column lists from X (the frame the pipeline is actually fitted on)
# rather than from df. The target is excluded by name in both lists, so it can
# neither leak into the categorical list nor trigger a ValueError from
# list.remove() when it is not numeric.
categorical_features = [
    col for col in X.columns if col != "loan_status" and X[col].dtype == "object"
]  # Object-typed (string) feature columns
numerical_features = [
    col for col in X.columns if col != "loan_status" and X[col].dtype != "object"
]  # All remaining feature columns

print(f"Categorical columns:\n {categorical_features}")
print(f"Numerical columns:\n {numerical_features}")

In [None]:
# Frequency of each value in the gender column.
gender_counts = df["person_gender"].value_counts()
gender_counts

In [None]:
# Numeric columns: fill gaps with the column mean, then standardize
# (zero mean, unit variance).
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. Categories not seen during fit are ignored at transform time instead
# of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",  # Columns in neither list are discarded
    # True prefixes every output feature name with its transformer name
    # ("num__...", "cat__..."), which keeps names unique across transformers.
    # It does NOT produce un-prefixed names; that would require False.
    verbose_feature_names_out=True,
)

## Splitting the Data into Training and Test Sets

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# equal in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=616,
    stratify=y,
)

# Report the size of each partition.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_test shape:  {y_test.shape}")

In [None]:
# Full modelling pipeline: preprocessing followed by a classifier. The
# LogisticRegression here is only a placeholder; the grid search swaps the
# "classifier" step for each candidate estimator.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
]
pipe = Pipeline(pipeline_steps)

In [None]:
# Search space: each dict swaps a different estimator into the "classifier"
# step and lists the hyper-parameters to try for it. Both estimators are
# seeded (same seed as the train/test split) because the saga solver and the
# random forest are stochastic; without a seed the search result changes from
# run to run.
param_grid = [
    {
        "classifier": [
            LogisticRegression(solver="saga", max_iter=1000, random_state=616)
        ],
        # C is the INVERSE regularization strength (smaller = stronger
        # regularization); it is not a learning rate.
        "classifier__C": [0.01, 0.1, 1, 10],
        "classifier__penalty": ["l1", "l2"],  # Lasso and Ridge regularizers
    },
    {
        "classifier": [RandomForestClassifier(random_state=616)],
        "classifier__n_estimators": [50, 100],
        "classifier__max_depth": [5, 10],  # Shallow trees to reduce overfitting
        "classifier__max_features": ["sqrt"],
        "classifier__class_weight": ["balanced"],
    },
]


In [None]:
# Exhaustive search over both candidate estimators (logistic regression and
# random forest), scored by accuracy under 5-fold cross-validation and run on
# all available cores.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Run the search on the training partition only.
grid.fit(X_train, y_train)


In [None]:
# Winning pipeline from the search (refit on the full training partition).
best_model = grid.best_estimator_

print("Best parameters:\n", grid.best_params_)
print("Best model:\n", best_model)
print("Best cross-validation score:\n", grid.best_score_)

# Predictions on the held-out test partition, used by the evaluation below.
y_pred = best_model.predict(X_test)


## Final Model Evaluation and Visualization

In [None]:
# Headline metrics on the held-out test partition.
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print("Classification Report:\n", report)

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


## Learning Curve

In [None]:
# Refit the best pipeline on ten growing fractions (10% to 100%) of the
# training partition, scoring each with 5-fold cross-validated accuracy.
train_sizes, train_scores, val_scores = learning_curve(
    estimator=best_model,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Average across the CV folds (axis 1) to get one score per training size.
train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_sizes, train_mean, label='Training Accuracy', marker='o')
ax.plot(train_sizes, val_mean, label='Validation Accuracy', marker='o')
ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid(True)
plt.show()