In [1]:
import pandas as pd

In [2]:
# Load the loan-approval dataset from a CSV file in the working directory
data_loan = pd.read_csv('loan_approval_dataset.csv')

# Preview the first five rows
data_loan.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
# Investigating the data: dtypes and non-null counts for every column
data_loan.info()
# Per-column count of missing values
print("A list with all the null values of the dataset", data_loan.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB
A list with all

In [4]:
# Checking the column names. The output shows that every name except
# 'loan_id' starts with a space, so later cells must include that space.
data_loan.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [5]:
# Keeping only the columns provided in the exercise.
# The explicit .copy() makes `data` an independent frame, so assigning into it
# later does not raise SettingWithCopyWarning and can never touch `data_loan`.
data = data_loan[[
    ' income_annum',
    ' cibil_score',
    ' loan_amount',
    ' loan_term',
    ' self_employed',
    ' loan_status',
]].copy()

In [6]:
# Preview the reduced frame to confirm the column selection
data.head()

Unnamed: 0,income_annum,cibil_score,loan_amount,loan_term,self_employed,loan_status
0,9600000,778,29900000,12,No,Approved
1,4100000,417,12200000,8,Yes,Rejected
2,9100000,506,29700000,20,No,Rejected
3,8200000,467,30700000,8,No,Rejected
4,9800000,382,24200000,20,Yes,Rejected


In [7]:
# Strip the stray whitespace around the loan_status and self_employed values.
# .assign builds a new frame instead of writing into a slice of `data_loan`,
# which avoids SettingWithCopyWarning. The column NAMES keep their leading
# space on purpose, because the following cells address them that way.
data = data.assign(**{
    ' loan_status': data[' loan_status'].str.strip(),
    ' self_employed': data[' self_employed'].str.strip(),
})

# Creating dummies for the self_employed column. dtype=bool is requested
# explicitly because the standardization helper selects these columns by
# their bool dtype; an integer dummy would not be passed through as binary.
data = pd.get_dummies(data, columns=[' self_employed'], dtype=bool)

# Inspect the new data
data.head()

Unnamed: 0,income_annum,cibil_score,loan_amount,loan_term,loan_status,self_employed_No,self_employed_Yes
0,9600000,778,29900000,12,Approved,True,False
1,4100000,417,12200000,8,Rejected,False,True
2,9100000,506,29700000,20,Rejected,True,False
3,8200000,467,30700000,8,Rejected,True,False
4,9800000,382,24200000,20,Rejected,False,True


In [8]:
# Independent variables: everything except the target and the "No" dummy
# (the "Yes" dummy alone already carries the self-employed flag).
X = data.drop(columns=[' loan_status', ' self_employed_No'])
# Dependent variable: the loan_status labels as a flat 1-D array
y = data[' loan_status'].to_numpy()

In [9]:
# Split the data into a train and a test set: 20% of the rows are held out
# for testing, and random_state is fixed at 42 for the split.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# First we will create the KNN algorithm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Standardization function will standardize the numeric data in order to make better predictions
def standardization(Xtrain, Xtest):
    """Scale the numeric columns of a train/test pair and re-attach boolean columns.

    The scaler is fitted on ``Xtrain`` only and then applied to ``Xtest``.
    Boolean columns are passed through unchanged. Columns that are neither
    numeric nor boolean are not part of the result.

    Parameters
    ----------
    Xtrain : pandas.DataFrame
        Training features.
    Xtest : pandas.DataFrame
        Test features; expected to have the same columns and dtypes as ``Xtrain``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_test_scaled)``. Both frames get a fresh 0..n-1
        index, hold the scaled numeric columns first and the boolean columns
        after them, and have string column names.

    Notes
    -----
    Prints the per-column null count of the scaled training frame as a sanity
    check.
    """
    scaler = StandardScaler()

    # 'number' picks up every numeric dtype rather than only int64/float64,
    # so e.g. int32 or float32 columns are scaled instead of silently dropped.
    train_numeric = Xtrain.select_dtypes(include='number')
    test_numeric = Xtest.select_dtypes(include='number')

    # Fit on the training data only; the test data reuses the training scale.
    train_scaled = pd.DataFrame(scaler.fit_transform(train_numeric), columns=train_numeric.columns)
    test_scaled = pd.DataFrame(scaler.transform(test_numeric), columns=test_numeric.columns)

    # Impute missing values in BOTH frames with the training means, so that no
    # statistic computed on the test set leaks into the features.
    train_means = train_scaled.mean()
    train_scaled = train_scaled.fillna(train_means)
    test_scaled = test_scaled.fillna(train_means)

    # Binary columns are not scaled; only their index is reset so that they
    # line up row-by-row with the freshly built scaled frames.
    train_binary = Xtrain.select_dtypes(include='bool').reset_index(drop=True)
    test_binary = Xtest.select_dtypes(include='bool').reset_index(drop=True)

    # Combine scaled numerical and binary features
    X_train_scaled = pd.concat([train_scaled, train_binary], axis=1)
    X_test_scaled = pd.concat([test_scaled, test_binary], axis=1)

    # Make sure no null value is left after scaling/combining
    print(X_train_scaled.isnull().sum())

    # Convert column names to strings
    X_train_scaled.columns = X_train_scaled.columns.astype(str)
    X_test_scaled.columns = X_test_scaled.columns.astype(str)

    return X_train_scaled, X_test_scaled


# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Print and return accuracy plus macro-averaged precision, recall and F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Label used in the printed header.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order.
    """
    print(f"Evaluation metrics for {model_name}:")

    # Each metric is computed once and reused for both the report and the
    # return value.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("-" * 30)
    return accuracy, precision, recall, f1
    
    
# Creating a knn-classifier function which finds the n_neighbors value that maximizes the accuracy of the model
def knn_class(xtrain, xtest, ytrain, ytest, k_values=range(1, 8)):
    """Fit one KNN classifier per candidate k and keep the most accurate one.

    The features are standardized first via ``standardization``. For every k
    the accuracy on ``(xtest, ytest)`` is printed.

    Parameters
    ----------
    xtrain, xtest : pandas.DataFrame
        Training and evaluation features.
    ytrain, ytest : array-like
        Training and evaluation labels.
    k_values : iterable of int, optional
        Candidate ``n_neighbors`` values; defaults to 1..7.

    Returns
    -------
    tuple
        ``(best_k, max_accuracy, max_pred)``: the winning k, its accuracy and
        its predictions for ``xtest``. If no candidate reaches an accuracy
        above 0, the initial values ``(None, 0, 0)`` are returned.

    Notes
    -----
    NOTE(review): k is chosen on the same data that is later used to report
    the model's metrics, so those metrics are optimistic; a separate
    validation split or cross-validation would avoid that.
    """
    max_accuracy = 0
    best_k = None
    max_pred = 0
    x_train_scaled, x_test_scaled = standardization(Xtrain=xtrain, Xtest=xtest)
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train_scaled, ytrain)

        y_pred = knn.predict(x_test_scaled)
        # Score against the labels passed in, not the notebook-level y_test.
        accuracy = accuracy_score(ytest, y_pred)
        print(f"Accuracy for {k} is: {accuracy}")
        # Update max_accuracy and best_k if current accuracy is higher
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            best_k = k
            max_pred = y_pred

    # Print accuracy for informational purposes (optional)
    print(f"For {best_k} neighbors the model has the best accuracy, with : {max_accuracy}")
    return best_k, max_accuracy, max_pred

# Calling the knn_class function to build the KNN model; it returns the best
# number of neighbors, the matching accuracy and the matching predictions.
best_n , accuracy_sc, knn_pred  = knn_class(xtrain=X_train, xtest=X_test, ytrain=y_train, ytest=y_test)

income_annum         0
cibil_score          0
loan_amount          0
loan_term            0
self_employed_Yes    0
dtype: int64
Accuracy for 1 is: 0.9238875878220141
Accuracy for 2 is: 0.9285714285714286
Accuracy for 3 is: 0.9402810304449649
Accuracy for 4 is: 0.9414519906323185
Accuracy for 5 is: 0.9355971896955504
Accuracy for 6 is: 0.949648711943794
Accuracy for 7 is: 0.9402810304449649
For 6 neighbors the model has the best accuracy, with : 0.949648711943794


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier on the raw (unscaled) features.
# random_state is fixed so that re-running the cell rebuilds the same tree and
# reports the same accuracy; without it the result can vary between runs.
d_tree = DecisionTreeClassifier(random_state=42)
d_tree.fit(X_train, y_train)

# Predictions
tree_pred = d_tree.predict(X_test)

print(accuracy_score(y_test, tree_pred))

0.9800936768149883


In [12]:
# Checking the decision tree accuracy using the standardized X values.
# random_state is fixed so that re-running the cell rebuilds the same tree;
# otherwise run-to-run randomness could be mistaken for an effect of scaling.
d_tree_stand = DecisionTreeClassifier(random_state=42)
X_tree_train, X_tree_test = standardization(Xtrain=X_train, Xtest=X_test)
d_tree_stand.fit(X_tree_train, y_train)

# Predictions with standardized values
tree_stand_pred = d_tree_stand.predict(X_tree_test)

print(accuracy_score(y_test, tree_stand_pred))

income_annum         0
cibil_score          0
loan_amount          0
loan_term            0
self_employed_Yes    0
dtype: int64
0.9824355971896955


In [14]:
# Calling the evaluate_model function and calculate the accuracy, precision, recall score and f1 score value for all models
accuracy_knn, precision_knn, recall_knn, f1_knn = evaluate_model(y_test, knn_pred, "KNN")
accuracy_tree, precision_tree, recall_tree, f1_tree = evaluate_model(y_test, tree_pred, "Decision Tree")
accuracy_tree_s, precision_tree_s, recall_tree_s, f1_tree_s = evaluate_model(y_test, tree_stand_pred, "Decision Tree Standardize")

# Feature importance from the Decision Tree fitted on the standardized values.
# The names are taken from X_tree_train, the frame the tree was fitted on:
# feature_importances_ follows that frame's column order, and standardization
# places numeric columns before boolean ones, which need not match X.
feature_importances = pd.DataFrame({
    'Feature': X_tree_train.columns,
    'Importance': d_tree_stand.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("Feature Importances from Decision Tree:")
print(feature_importances)
print("-" * 30)
# After the descending sort, the first row is the most important feature and
# the last row the least important one.
first_feature, first_importance = feature_importances.iloc[0]
print(f"Most Important Feature: {first_feature.strip()} with Importance: {first_importance:.4f}")
last_feature, last_importance = feature_importances.iloc[-1]
print(f"Least Important Feature: {last_feature.strip()} with Importance: {last_importance:.4f}")

Evaluation metrics for KNN:
Accuracy: 0.9496
Precision: 0.9469
Recall: 0.9452
F1-Score: 0.9460
------------------------------
Evaluation metrics for Decision Tree:
Accuracy: 0.9801
Precision: 0.9784
Recall: 0.9790
F1-Score: 0.9787
------------------------------
Evaluation metrics for Decision Tree Standardize:
Accuracy: 0.9824
Precision: 0.9804
Recall: 0.9822
F1-Score: 0.9812
------------------------------
Feature Importances from Decision Tree:
              Feature  Importance
1         cibil_score    0.829847
3           loan_term    0.082307
2         loan_amount    0.049305
0        income_annum    0.036520
4   self_employed_Yes    0.002021
------------------------------
Most Important Feature: cibil_score with Importance: 0.8298
Least Important Feature: self_employed_Yes with Importance: 0.0020
