In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
!pip install scikit-learn==1.3.2



In [None]:
d = pd.read_csv('/content/loan_approval_dataset.csv')
d.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [None]:
d.shape

(4269, 13)

In [None]:
d.drop(['loan_id'], axis = 1, inplace = True)

In [None]:
d.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [None]:
d.dtypes

Unnamed: 0,0
no_of_dependents,int64
education,object
self_employed,object
income_annum,int64
loan_amount,int64
loan_term,int64
cibil_score,int64
residential_assets_value,int64
commercial_assets_value,int64
luxury_assets_value,int64


In [None]:
d.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split
X = d.drop([' loan_status'], axis = 1)
y = d[' loan_status']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [None]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape, X.shape , y.shape

((2860, 11), (1409, 11), (2860,), (1409,), (4269, 11), (4269,))

In [None]:
numerical_cols = [i for i in X if X[i].dtype != 'object']
categorical_cols = [i for i in X.columns if i not in numerical_cols]

In [None]:
numerical_cols, categorical_cols

([' no_of_dependents',
  ' income_annum',
  ' loan_amount',
  ' loan_term',
  ' cibil_score',
  ' residential_assets_value',
  ' commercial_assets_value',
  ' luxury_assets_value',
  ' bank_asset_value'],
 [' education', ' self_employed'])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
y_train = y_train.replace({' Rejected': 0, ' Approved': 1})
y_test = y_test.replace({' Rejected': 0, ' Approved': 1})

  y_train = y_train.replace({' Rejected': 0, ' Approved': 1})
  y_test = y_test.replace({' Rejected': 0, ' Approved': 1})


In [None]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 0.0255500354861604


In [None]:
my_pipeline

In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
roc_auc = roc_auc_score(y_test, preds)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)


Accuracy: 0.9744499645138396
Precision: 0.9754189944134078
Recall: 0.9842164599774521
F1-score: 0.9797979797979798
ROC AUC: 0.9710354330538601


In [None]:
preds

array([0, 1, 0, ..., 1, 0, 1])

In [None]:

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation of CV Accuracy:", cv_scores.std())

if cv_scores.mean() < accuracy:
    print("Possible overfitting detected. Model performs better on training data than cross-validation.")
else:
    print("Model is likely not overfitting.")


Cross-Validation Scores: [0.98009368 0.98126464 0.98711944 0.97775176 0.97772567]
Mean CV Accuracy: 0.9807910364576327
Standard Deviation of CV Accuracy: 0.003446532829318028
Model is likely not overfitting.


In [None]:

sample_data = X_test.sample(5)

sample_predictions = my_pipeline.predict(sample_data)
print("Sample Input Data:")
print(sample_data)
print("\nSample Predictions:")
print(sample_predictions)


Sample Input Data:
      no_of_dependents      education self_employed  income_annum  \
1027                 1   Not Graduate            No       9900000   
4007                 4       Graduate            No       3700000   
274                  3       Graduate           Yes       1700000   
500                  2       Graduate            No        200000   
2651                 2       Graduate            No       3800000   

      loan_amount  loan_term  cibil_score  residential_assets_value  \
1027     38700000          6          670                  19400000   
4007      7600000         14          794                   9500000   
274       4900000         16          591                   4700000   
500        700000          8          851                    400000   
2651      9400000         16          383                   8300000   

      commercial_assets_value  luxury_assets_value  bank_asset_value  
1027                 12600000             35800000           6500000

In [None]:
lel = [i for i in sample_data.iloc[0]]

In [None]:
print(sample_data.iloc[0].to_dict())

{' no_of_dependents': 1, ' education': ' Not Graduate', ' self_employed': ' No', ' income_annum': 9900000, ' loan_amount': 38700000, ' loan_term': 6, ' cibil_score': 670, ' residential_assets_value': 19400000, ' commercial_assets_value': 12600000, ' luxury_assets_value': 35800000, ' bank_asset_value': 6500000}


In [None]:
for i in lel:
    print(i)

1
 Not Graduate
 No
9900000
38700000
6
670
19400000
12600000
35800000
6500000


In [None]:

import pickle

filename = 'american-loan-approval.pkl'
pickle.dump(my_pipeline, open(filename, 'wb'))


In [None]:

import sklearn

print(sklearn.__version__)


1.3.2
