In [1]:
import pandas as pd


In [2]:
# Load the raw loan-approval records and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="loan_approval.csv")
df.head(n=5)

Unnamed: 0,name,city,income,credit_score,loan_amount,years_employed,points,loan_approved
0,Allison Hill,East Jill,113810,389,39698,27,50.0,False
1,Brandon Hall,New Jamesside,44592,729,15446,28,55.0,False
2,Rhonda Smith,Lake Roberto,33278,584,11189,13,45.0,False
3,Gabrielle Davis,West Melanieview,127196,344,48823,29,50.0,False
4,Valerie Gray,Mariastad,66048,496,47174,4,25.0,False


In [3]:
# Dataset dimensions as (n_rows, n_columns).
df.shape

(2000, 8)

In [4]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            2000 non-null   object 
 1   city            2000 non-null   object 
 2   income          2000 non-null   int64  
 3   credit_score    2000 non-null   int64  
 4   loan_amount     2000 non-null   int64  
 5   years_employed  2000 non-null   int64  
 6   points          2000 non-null   float64
 7   loan_approved   2000 non-null   bool   
dtypes: bool(1), float64(1), int64(4), object(2)
memory usage: 111.5+ KB


In [5]:
# Count missing values in each column.
df.isna().sum()

name              0
city              0
income            0
credit_score      0
loan_amount       0
years_employed    0
points            0
loan_approved     0
dtype: int64

In [6]:
# Feature matrix: every column except the target. The target is excluded by
# name (the same way `y` is selected) rather than by position, so this keeps
# working if the CSV's column order ever changes.
X = df.drop(columns=["loan_approved"])

In [7]:
# Display the feature frame (the repr is truncated to the first and last rows).
X

Unnamed: 0,name,city,income,credit_score,loan_amount,years_employed,points
0,Allison Hill,East Jill,113810,389,39698,27,50.0
1,Brandon Hall,New Jamesside,44592,729,15446,28,55.0
2,Rhonda Smith,Lake Roberto,33278,584,11189,13,45.0
3,Gabrielle Davis,West Melanieview,127196,344,48823,29,50.0
4,Valerie Gray,Mariastad,66048,496,47174,4,25.0
...,...,...,...,...,...,...,...
1995,James Schaefer,Robertton,92163,770,12251,13,85.0
1996,Diana Lin,New Frank,38799,635,48259,17,40.0
1997,Brandon Meyer,East Haley,41957,763,16752,5,60.0
1998,Jason Price,Adamland,139022,360,24031,35,55.0


In [8]:
# Target vector: the boolean approval flag.
y = df.loc[:, "loan_approved"]

In [9]:
# Inspect the target series.
y

0       False
1       False
2       False
3       False
4       False
        ...  
1995     True
1996    False
1997     True
1998    False
1999    False
Name: loan_approved, Length: 2000, dtype: bool

In [10]:
# Guard: training a classifier needs at least two distinct (non-null) target values.
n_unique = len(y.dropna().unique())
has_multiple_classes = n_unique > 1
if not has_multiple_classes:
    raise ValueError("Target has only one class; cannot train classifier.")
print("Number of unique target classes:", n_unique)

Number of unique target classes: 2


In [11]:
# Encode target to numeric labels
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

# NOTE(review): `y_new` is produced here, but the train/test split and the
# grid search in the later cells are fed the raw `y`; confirm which of the two
# is meant to be the training target.
if y.dtype == object or y.dtype.name == 'category' or y.dtype == bool:
    # Label-like target (strings, categories, booleans): encode with LabelEncoder.
    le = LabelEncoder()
    # Substitute the "Missing" placeholder only when something is actually
    # missing, and go through object dtype first: calling fillna("Missing")
    # directly on a categorical Series raises because "Missing" is not one of
    # its categories.
    y_filled = y.astype(object).fillna("Missing") if y.isna().any() else y
    y_new = le.fit_transform(y_filled)
    print("Target classes:", list(le.classes_))
else:
    # numeric but small unique -> map to 0..k-1 (rank of each distinct value)
    uniq = sorted(y.dropna().unique())
    mapping = {v: i for i, v in enumerate(uniq)}
    # Rows with a missing target are labelled -1 rather than dropped.
    y_new = y.map(mapping).fillna(-1).astype(int).values
    print("Numeric mapping for target:", mapping)


Target classes: [np.False_, np.True_]


In [12]:
# Drop ID-like columns
import re
import numpy as np


def _is_id_like(column_name) -> bool:
    """Return True when a column label looks like a row identifier.

    A label qualifies when `id` appears as a whole word token (e.g. `id`,
    `customer_id`, `customerId`, `Loan ID`) or when the label is exactly
    `index` or `serial` (case-insensitive). Matching whole tokens instead of
    a bare substring keeps columns such as `paid`, `valid_from` or `width`
    from being dropped by accident. Run-together lowercase names such as
    `customerid` are not matched.
    """
    label = str(column_name)  # tolerate non-string column labels
    # Insert a separator at camelCase boundaries before lowercasing,
    # so "customerId" becomes "customer_id".
    snake = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', label).lower()
    tokens = [tok for tok in re.split(r'[^a-z0-9]+', snake) if tok]
    return 'id' in tokens or label.lower() in ('index', 'serial')


id_cols = [c for c in X.columns if _is_id_like(c)]
if id_cols:
    print("Dropping ID columns:", id_cols)
    X = X.drop(columns=id_cols)

# Split the remaining features by dtype so each group gets its own preprocessing.
# NOTE(review): `name` and `city` end up in the categorical group; every value
# in the preview rows is distinct, so they may behave like identifiers rather
# than features -- consider checking their cardinality before encoding them.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)

Numeric cols: ['income', 'credit_score', 'loan_amount', 'years_employed', 'points']
Categorical cols: ['name', 'city']


In [13]:
# Preprocessing pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array; unknown categories are set to be ignored.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its transformer; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

In [14]:
# Train/test split (stratify if possible)

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Stratification only makes sense when the target has more than one class.
if len(np.unique(y)) > 1:
    stratify_y = y
else:
    stratify_y = None

# Hold out 20% of the rows for final evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_y,
)

In [15]:
# Pipelines
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# One pipeline per candidate model: the shared preprocessor ('pre') followed by
# the classifier ('clf'). The step names are what the grid keys below refer to.
pipelines = {
    key: Pipeline([('pre', preprocessor), ('clf', estimator)])
    for key, estimator in (
        ('knn', KNeighborsClassifier()),
        ('rf', RandomForestClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(random_state=42)),
    )
}

# Hyper-parameter grids, keyed like `pipelines`; each entry targets the 'clf' step.
param_grids = {
    'knn': dict(
        clf__n_neighbors=[3, 5, 7, 9],
        clf__weights=['uniform', 'distance'],
        clf__p=[1, 2],
    ),
    'rf': dict(
        clf__n_estimators=[100, 200],
        clf__max_depth=[None, 5, 10],
        clf__min_samples_split=[2, 5],
    ),
    'dt': dict(
        clf__max_depth=[None, 5, 10],
        clf__min_samples_split=[2, 5],
        clf__criterion=['gini', 'entropy'],
    ),
}

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# 5-fold stratified CV with a fixed shuffle seed, shared by every grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Decide once whether the problem is binary; both the CV scoring metric and the
# test-set F1 averaging mode are derived from this single flag so they cannot
# drift apart (and the class count is not recomputed on every iteration).
is_binary = len(np.unique(y)) == 2
scoring = 'f1' if is_binary else 'accuracy'
f1_average = 'binary' if is_binary else 'weighted'

results = {}      # model name -> CV score, best params and test metrics
best_models = {}  # model name -> best pipeline found by the grid search

# Iterate over the pipelines dict itself rather than a second hard-coded name list.
for name in pipelines:
    print("\n--- GridSearch for", name, "---")
    grid = GridSearchCV(pipelines[name], param_grid=param_grids[name], cv=cv, scoring=scoring, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    print("Best params:", grid.best_params_)
    print(f"Best CV {scoring}:", grid.best_score_)

    # Evaluate the tuned model on the held-out test split.
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=f1_average)
    print(f"Test acc: {acc:.4f}, test f1: {f1:.4f}")
    print("Classification report:")
    print(classification_report(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    results[name] = {'best_score_cv': grid.best_score_, 'best_params': grid.best_params_, 'test_accuracy': acc, 'test_f1': f1}
    best_models[name] = grid.best_estimator_



--- GridSearch for knn ---
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'clf__n_neighbors': 5, 'clf__p': 1, 'clf__weights': 'uniform'}
Best CV f1: 0.9872789936649005
Test acc: 1.0000, test f1: 1.0000
Classification report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       224
        True       1.00      1.00      1.00       176

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

Confusion matrix:
 [[224   0]
 [  0 176]]

--- GridSearch for rf ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params: {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}
Best CV f1: 1.0
Test acc: 1.0000, test f1: 1.0000
Classification report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       224
        True  

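“We trained three models (KNN, Decision Tree, Random Forest), tuning each with GridSearchCV and 5-fold stratified cross-validation.
All three reached 100% accuracy on the held-out test set, which indicates that the classes are highly separable and that the features strongly determine the loan decision. A perfect score also warrants a check for target leakage — for example, whether `points` directly encodes the approval rule.
Among them, Random Forest was selected as the final model for its stability and robustness, and because it is a widely used standard for tabular data.”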

In [17]:
# Summary
# One row per model, ranked by best cross-validated score. The sort is stable,
# so models that tie on CV score keep the order in which they were evaluated
# and the selected model is deterministic instead of depending on how the
# sort algorithm happens to break ties.
results_df = pd.DataFrame(results).T.sort_values(by='best_score_cv', ascending=False, kind='stable')
print("\nResults summary:\n", results_df)

# The top row after sorting is the selected model.
best_name = results_df.index[0]
print("\nSelected best model:", best_name)
best_model = best_models[best_name]


Results summary:
     best_score_cv                                        best_params  \
rf            1.0  {'clf__max_depth': None, 'clf__min_samples_spl...   
dt            1.0  {'clf__criterion': 'gini', 'clf__max_depth': N...   
knn      0.987279  {'clf__n_neighbors': 5, 'clf__p': 1, 'clf__wei...   

    test_accuracy test_f1  
rf            1.0     1.0  
dt            1.0     1.0  
knn           1.0     1.0  

Selected best model: rf


In [18]:
# saving the model
# Persist the selected pipeline (preprocessing + tuned classifier) to disk.
# NOTE(review): joblib files are pickle-based as far as I know -- only load
# this artifact from a trusted location.
import joblib
joblib.dump(best_model, "loan_approval.joblib")

['loan_approval.joblib']