## Assignment for 3.3 Supervised Learning

Author: Derek ENG assisted by Microsoft Copilot
Jupyter Environment: ml

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [3]:
# Load the dataset
loan_data = pd.read_csv('https://raw.githubusercontent.com/prasertcbs/basic-dataset/refs/heads/master/Loan-Approval-Prediction.csv')

In [4]:
loan_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
loan_data.info() # Check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
loan_data.describe() # Statistical summary

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [8]:
loan_data.isna().sum() # Check for missing values

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## Data Loading and Exploration

## Data preprocessing (missing values, encoding, scaling)

## Feature selection


```python
# Split features and target
X = loan_data.drop('Loan_Status', axis=1)
y = loan_data['Loan_Status']
```

In [9]:
# Preprocessing, feature selection, training and evaluation (complete)
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Load and encode target
df = pd.read_csv('https://raw.githubusercontent.com/prasertcbs/basic-dataset/refs/heads/master/Loan-Approval-Prediction.csv')
df['Loan_Status'] = df['Loan_Status'].map({'Y':1,'N':0})
X = df.drop('Loan_Status', axis=1); y = df['Loan_Status']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Column lists
num_cols = X.select_dtypes(['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(['object']).columns.tolist()

# Transformers
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))])
preproc = ColumnTransformer([('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)])

# Pipelines with selector
selector = SelectKBest(mutual_info_classif, k=10)
pipe_lr = Pipeline([('pre', preproc), ('sel', selector), ('clf', LogisticRegression(max_iter=1000))])
pipe_knn = Pipeline([('pre', preproc), ('sel', selector), ('clf', KNeighborsClassifier())])

# Grid search
grid_lr = GridSearchCV(pipe_lr, {'sel__k':[8,10,12], 'clf__C':[0.01,0.1,1,10]}, cv=5, scoring='roc_auc', n_jobs=-1)
grid_knn = GridSearchCV(pipe_knn, {'sel__k':[8,10,12], 'clf__n_neighbors':[3,5,7], 'clf__weights':['uniform','distance']}, cv=5, scoring='roc_auc', n_jobs=-1)

grid_lr.fit(X_train, y_train)
grid_knn.fit(X_train, y_train)

def eval_model(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1] if hasattr(model, 'predict_proba') else None
    print(f"--- {name} ---")
    print("Accuracy", accuracy_score(y_test,y_pred))
    print("Precision", precision_score(y_test,y_pred, zero_division=0))
    print("Recall", recall_score(y_test,y_pred, zero_division=0))
    print("F1", f1_score(y_test,y_pred, zero_division=0))
    print("ROC AUC", roc_auc_score(y_test,y_proba) if y_proba is not None else 'N/A')
    print(classification_report(y_test,y_pred))
    print(confusion_matrix(y_test,y_pred))

eval_model(grid_lr.best_estimator_, X_test, y_test, 'Logistic Regression')
eval_model(grid_knn.best_estimator_, X_test, y_test, 'KNN')



--- Logistic Regression ---
Accuracy 0.8617886178861789
Precision 0.84
Recall 0.9882352941176471
F1 0.9081081081081082
ROC AUC 0.7521671826625387
              precision    recall  f1-score   support

           0       0.96      0.58      0.72        38
           1       0.84      0.99      0.91        85

    accuracy                           0.86       123
   macro avg       0.90      0.78      0.81       123
weighted avg       0.88      0.86      0.85       123

[[22 16]
 [ 1 84]]
--- KNN ---
Accuracy 0.8536585365853658
Precision 0.8316831683168316
Recall 0.9882352941176471
F1 0.903225806451613
ROC AUC 0.7809597523219814
              precision    recall  f1-score   support

           0       0.95      0.55      0.70        38
           1       0.83      0.99      0.90        85

    accuracy                           0.85       123
   macro avg       0.89      0.77      0.80       123
weighted avg       0.87      0.85      0.84       123

[[21 17]
 [ 1 84]]


## Model training (Logistic Regression and KNN) with hyperparameter tuning

## Model evaluation (accuracy, precision, recall, F1, ROC AUC) and ROC curve