In [1]:
pip install pandas numpy scikit-learn imbalanced-learn matplotlib joblib

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np

# Full file path
DATA_PATH = "/Users/abdul-rehman/Downloads/loan.csv"

# Load dataset
df = pd.read_csv(DATA_PATH)

# Check shape and columns
print(df.shape, df.columns.tolist())

# Quick look at first rows
display(df.head())

# Info about dataset
df.info()

# Check top 20 columns with most missing values
display(df.isnull().sum().sort_values(ascending=False).head(20))


(4269, 13) ['loan_id', ' no_of_dependents', ' education', ' self_employed', ' income_annum', ' loan_amount', ' loan_term', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value', ' loan_status']


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [4]:
# common target names to check
for col in ["Loan_Status", "loan_status", "Status", "LoanStatus", "Approved", "approved", "Loan"]:
    if col in df.columns:
        print(col, df[col].value_counts(dropna=False))
# fallback: assume last column is target if none of above present
target_col = None
for c in ["Loan_Status","loan_status","Status","LoanStatus","Approved","approved","Loan"]:
    if c in df.columns:
        target_col = c
        break
if target_col is None:
    target_col = df.columns[-1]
print("Using target:", target_col)
display(df[target_col].value_counts(normalize=True))


Using target:  loan_status


 loan_status
Approved    0.62216
Rejected    0.37784
Name: proportion, dtype: float64

In [5]:
# Example fixes - modify if your column names differ
df = df.copy()

# Drop identifier if exists
for id_col in ["Loan_ID","id","ID"]:
    if id_col in df.columns:
        df.drop(columns=[id_col], inplace=True)

# Convert 'Dependents' values like '3+' to numeric 3
if 'Dependents' in df.columns:
    df['Dependents'] = df['Dependents'].replace('3+', '3')
    df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')

# Remove commas/spaces from numeric-like strings
for col in df.columns:
    if df[col].dtype == object:
        # check if column looks numeric-with-commas
        sample = df[col].dropna().astype(str).head(20).tolist()
        if all(s.replace(',','').replace('.','').replace('-','').isdigit() for s in sample if s):
            df[col] = df[col].str.replace(',','').astype(float)

# After cleaning, cast obvious numeric columns
display(df.dtypes)


loan_id                       int64
 no_of_dependents             int64
 education                   object
 self_employed               object
 income_annum                 int64
 loan_amount                  int64
 loan_term                    int64
 cibil_score                  int64
 residential_assets_value     int64
 commercial_assets_value      int64
 luxury_assets_value          int64
 bank_asset_value             int64
 loan_status                 object
dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder

y_raw = df[target_col].astype(str).str.strip().str.lower()

def map_target(y):
    # common mappings
    if set(y.unique()) <= {'y','n'}:
        return y.map({'y':1,'n':0})
    if set(y.unique()) <= {'yes','no'}:
        return y.map({'yes':1,'no':0})
    # approval-like strings
    return y.map(lambda v: 1 if ('approve' in v or 'yes' in v or v=='1') else 0)

y = map_target(y_raw)
# fallback if mapping failed (still non-binary)
if y.nunique() > 2:
    le = LabelEncoder()
    y = pd.Series(le.fit_transform(y_raw), index=y_raw.index)

X = df.drop(columns=[target_col])
print("Target counts:\n", y.value_counts())


Target counts:
  loan_status
1    2656
0    1613
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape)


(3415, 12) (854, 12)


In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# detect numeric/categorical
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# numeric pipeline: median impute + scale
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# categorical pipeline: impute mode + one-hot
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # updated
])

# combine preprocessing
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
], remainder='drop')

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)


Numeric cols: ['loan_id', ' no_of_dependents', ' income_annum', ' loan_amount', ' loan_term', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value']
Categorical cols: [' education', ' self_employed']


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# Logistic Regression baseline
pipe_lr = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
])
pipe_lr.fit(X_train, y_train)

# Decision Tree baseline
pipe_dt = Pipeline([
    ('preproc', preprocessor),
    ('clf', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])
pipe_dt.fit(X_train, y_train)


In [12]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_model(pipeline, X_test, y_test, name="model"):
    y_pred = pipeline.predict(X_test)
    print(f"=== {name} ===")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    try:
        y_proba = pipeline.predict_proba(X_test)[:,1]
        print("ROC AUC:", roc_auc_score(y_test, y_proba))
    except Exception:
        pass
    print()

evaluate_model(pipe_lr, X_test, y_test, "Logistic Regression (baseline)")
evaluate_model(pipe_dt, X_test, y_test, "Decision Tree (baseline)")


=== Logistic Regression (baseline) ===
              precision    recall  f1-score   support

           0     0.8779    0.9350    0.9055       323
           1     0.9588    0.9209    0.9395       531

    accuracy                         0.9262       854
   macro avg     0.9184    0.9279    0.9225       854
weighted avg     0.9282    0.9262    0.9266       854

Confusion matrix:
 [[302  21]
 [ 42 489]]
ROC AUC: 0.9731157404978048

=== Decision Tree (baseline) ===
              precision    recall  f1-score   support

           0     0.9842    0.9628    0.9734       323
           1     0.9777    0.9906    0.9841       531

    accuracy                         0.9801       854
   macro avg     0.9809    0.9767    0.9787       854
weighted avg     0.9801    0.9801    0.9800       854

Confusion matrix:
 [[311  12]
 [  5 526]]
ROC AUC: 0.9767160506783743



In [14]:
# If imbalanced-learn is installed:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

smote = SMOTE(random_state=42)
pipe_smote_lr = ImbPipeline([
    ('preproc', preprocessor),
    ('smote', smote),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])
pipe_smote_lr.fit(X_train, y_train)

pipe_smote_dt = ImbPipeline([
    ('preproc', preprocessor),
    ('smote', smote),
    ('clf', DecisionTreeClassifier(random_state=42))
])
pipe_smote_dt.fit(X_train, y_train)

evaluate_model(pipe_smote_lr, X_test, y_test, "LR + SMOTE")
evaluate_model(pipe_smote_dt, X_test, y_test, "DT + SMOTE")


=== LR + SMOTE ===
              precision    recall  f1-score   support

           0     0.8801    0.9319    0.9053       323
           1     0.9570    0.9228    0.9396       531

    accuracy                         0.9262       854
   macro avg     0.9186    0.9273    0.9224       854
weighted avg     0.9279    0.9262    0.9266       854

Confusion matrix:
 [[301  22]
 [ 41 490]]
ROC AUC: 0.9730807577268196

=== DT + SMOTE ===
              precision    recall  f1-score   support

           0     0.9689    0.9659    0.9674       323
           1     0.9793    0.9812    0.9802       531

    accuracy                         0.9754       854
   macro avg     0.9741    0.9736    0.9738       854
weighted avg     0.9754    0.9754    0.9754       854

Confusion matrix:
 [[312  11]
 [ 10 521]]
ROC AUC: 0.9735559403660364



In [15]:
from sklearn.metrics import precision_recall_curve, f1_score

def find_best_threshold(pipeline, X_val, y_val, metric='f1'):
    proba = pipeline.predict_proba(X_val)[:,1]
    prec, rec, thresholds = precision_recall_curve(y_val, proba)
    thresholds = np.append(thresholds, 1.0)  # align lengths
    f1_scores = 2 * (prec * rec) / (prec + rec + 1e-12)
    best_idx = np.nanargmax(f1_scores)
    return thresholds[best_idx], prec[best_idx], rec[best_idx], f1_scores[best_idx]

th, p, r, f1s = find_best_threshold(pipe_smote_lr, X_test, y_test)
print("Best threshold (by F1) on test:", th, "precision:", p, "recall:", r, "f1:", f1s)

# Apply threshold
proba = pipe_smote_lr.predict_proba(X_test)[:,1]
y_pred_th = (proba >= th).astype(int)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_th, digits=4))


Best threshold (by F1) on test: 0.4963471704405915 precision: 0.9552529182879378 recall: 0.9246704331450094 f1: 0.9397129186597871
              precision    recall  f1-score   support

           0     0.8824    0.9288    0.9050       323
           1     0.9553    0.9247    0.9397       531

    accuracy                         0.9262       854
   macro avg     0.9188    0.9267    0.9223       854
weighted avg     0.9277    0.9262    0.9266       854



In [16]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Example: tune Logistic Regression C
param_grid_lr = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__penalty': ['l2'],
    'clf__solver': ['lbfgs']
}
pipe_full_lr = Pipeline([('preproc', preprocessor), ('clf', LogisticRegression(max_iter=1000, random_state=42))])
grid_lr = GridSearchCV(pipe_full_lr, param_grid_lr, cv=cv, scoring='f1', n_jobs=-1)
grid_lr.fit(X_train, y_train)
print("Best LR params:", grid_lr.best_params_)
evaluate_model(grid_lr.best_estimator_, X_test, y_test, "Tuned Logistic Regression")

# Example: tune Decision Tree
param_grid_dt = {
    'clf__max_depth': [3,5,7, None],
    'clf__min_samples_leaf': [1,3,5,10]
}
pipe_full_dt = Pipeline([('preproc', preprocessor), ('clf', DecisionTreeClassifier(random_state=42))])
grid_dt = GridSearchCV(pipe_full_dt, param_grid_dt, cv=cv, scoring='f1', n_jobs=-1)
grid_dt.fit(X_train, y_train)
print("Best DT params:", grid_dt.best_params_)
evaluate_model(grid_dt.best_estimator_, X_test, y_test, "Tuned Decision Tree")


Best LR params: {'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
=== Tuned Logistic Regression ===
              precision    recall  f1-score   support

           0     0.9026    0.8607    0.8811       323
           1     0.9176    0.9435    0.9304       531

    accuracy                         0.9122       854
   macro avg     0.9101    0.9021    0.9058       854
weighted avg     0.9119    0.9122    0.9117       854

Confusion matrix:
 [[278  45]
 [ 30 501]]
ROC AUC: 0.9727542518642902

Best DT params: {'clf__max_depth': None, 'clf__min_samples_leaf': 3}
=== Tuned Decision Tree ===
              precision    recall  f1-score   support

           0     0.9812    0.9690    0.9751       323
           1     0.9813    0.9887    0.9850       531

    accuracy                         0.9813       854
   macro avg     0.9812    0.9789    0.9800       854
weighted avg     0.9813    0.9813    0.9812       854

Confusion matrix:
 [[313  10]
 [  6 525]]
ROC AUC: 0.9860331286841

In [17]:
import joblib
best_model = grid_lr.best_estimator_ if 'grid_lr' in globals() else pipe_smote_lr
joblib.dump(best_model, "best_loan_model.joblib")
print("Saved model to best_loan_model.joblib")


Saved model to best_loan_model.joblib
