In [12]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', 5000)
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import train_test_split

In [None]:
df_raw = pd.read_excel('../data/loan_eligibility.xlsx')

In [None]:
train = df_raw.copy()
train.info()

In [11]:
X = train
y = X.pop('Loan_Status')

### Preprocessing

In [None]:
def drop_cols_with_missing_vals(df, percentage = 50):
    percent_missing = train.isnull().sum() * 100 / len(train)
    missing_value_df = pd.DataFrame({'column_name': train.columns, 'percent_missing': percent_missing})
    cols_to_drop = missing_value_df[missing_value_df['percent_missing'] > percentage]
    cols_to_drop = cols_to_drop['column_name'].tolist()
    
    return df.drop(cols_to_drop, axis=1)

In [None]:
X = drop_cols_with_missing_vals(X, 50)
X = X.drop(['Loan ID', 'Customer ID'], axis=1)

In [None]:
num_cols = X._get_numeric_data().columns.to_list()
cat_cols = X.select_dtypes('object').columns

#### Categorical Missing Values

In [5]:
X.head()

Unnamed: 0,Loan_ID,Customer_ID,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6,1,228190,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35,0,229976,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9,0,256329,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15,0,253460,427174.0,0.0,0.0


In [7]:
for col in cat_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

NameError: name 'cat_cols' is not defined

#### Numerical Missing Values

In [6]:
for col in num_cols:
    X[col] = X[col].fillna(X[col].mean())

NameError: name 'num_cols' is not defined

#### Encoding Categoricals

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

oh_encoder = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown = 'ignore'), cat_cols)], remainder='passthrough')
X = oh_encoder.fit_transform(X)


In [12]:
y = y.replace({'Fully Paid': 1, 'Charged Off': 0})

### Model Training

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [10]:
data = {
    "Current_Loan_Amount": 450000,
    "Term": "Short Term",
    "Credit_Score": 700,
    "Annual_Income": 100000,
    "Years_in_current_job": "8 years",
    "Home_Ownership": "Home Mortgage",
    "Purpose": "Home Improvements",
    "Monthly_Debt": 5200.25,
    "Years_of_Credit_History": 17.2,
    "Months_since_last_delinquent": 5,
    "Number_of_Open_Accounts": 6,
    "Number_of_Credit_Problems": 1,
    "Current_Credit_Balance": 228100,
    "Maximum_Open_Credit": 416746,
    "Bankruptcies": 1,
    "Tax_Liens": 0
}

In [11]:
data

{'Current_Loan_Amount': 450000,
 'Term': 'Short Term',
 'Credit_Score': 700,
 'Annual_Income': 100000,
 'Years_in_current_job': '8 years',
 'Home_Ownership': 'Home Mortgage',
 'Purpose': 'Home Improvements',
 'Monthly_Debt': 5200.25,
 'Years_of_Credit_History': 17.2,
 'Months_since_last_delinquent': 5,
 'Number_of_Open_Accounts': 6,
 'Number_of_Credit_Problems': 1,
 'Current_Credit_Balance': 228100,
 'Maximum_Open_Credit': 416746,
 'Bankruptcies': 1,
 'Tax_Liens': 0}

In [31]:
#y_pred = classifier.predict([445412,Short Term,709.0,1167493.0,8,years,Home Mortgage,Home Improvements,5214.74,17.2,6,1,228190,416746.0,1.0,0.0])
y_pred = classifier('C:/Users/Souran/Downloads/dsp-waqar-alvi/dsp-waqar-alvi/models/model_reg_multi.joblib')

NameError: name 'classifier' is not defined

In [19]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 1602  4058]
 [ 1162 18178]]
              precision    recall  f1-score   support

           0       0.58      0.28      0.38      5660
           1       0.82      0.94      0.87     19340

    accuracy                           0.79     25000
   macro avg       0.70      0.61      0.63     25000
weighted avg       0.76      0.79      0.76     25000



In [20]:
print('Recall Score for testing data is',recall_score(y_test,y_pred))
print('Precision Score for testing data is',precision_score(y_test,y_pred))
print('F1 Score for testing data is',f1_score(y_test,y_pred))
print('Accuracy Score for testing data is',accuracy_score(y_test,y_pred))

Recall Score for testing data is 0.9399172699069287
Precision Score for testing data is 0.8175031480482101
F1 Score for testing data is 0.8744467962285934
Accuracy Score for testing data is 0.7912
