In [None]:
# import libraries and packages 
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from scipy.stats import uniform, randint
from sklearn import model_selection, linear_model, metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedShuffleSplit, KFold
from sklearn.metrics import auc, accuracy_score, confusion_matrix, roc_auc_score, classification_report

import xgboost as xgb
import seaborn as sns

In [None]:
# Load the raw credit-risk dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("credit_risk_dataset.csv")

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Work on a copy so the frame loaded into `df` is left unmodified.
copy_df = df.copy()

In [None]:
# Names of the object-dtype (string) columns of the working frame.
object_columns = copy_df.select_dtypes(include="object").columns.tolist()
object_columns

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file']

In [None]:
# Display the working frame for a visual check of its rows and columns.
copy_df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [None]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
copy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [None]:
# Feature matrix: every column except the target.
x = copy_df.drop(columns=["loan_status"])
# Target vector: the loan status label.
y = copy_df["loan_status"]

In [None]:
# Print dtypes and non-null counts of the feature matrix (target column already removed).
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_percent_income         32581 non-null  float64
 9   cb_person_default_on_file   32581 non-null  object 
 10  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 2.7+ MB


In [None]:
class CustomImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected DataFrame columns with a per-column statistic.

    Parameters
    ----------
    columns : str or sequence of str
        Name(s) of the column(s) to impute.
    stragedy : str or sequence of str
        Imputation strategy for each column: ``"mean"``, ``"median"`` or
        ``"most_frequent"``. Must have the same type as ``columns``; when both
        are sequences they are paired position by position. The parameter
        keeps its original spelling so the constructor signature is unchanged.

    Attributes
    ----------
    fill_values_ : dict
        Mapping ``column -> fill value`` learned by :meth:`fit`.
    """

    def __init__(self, columns, stragedy):
        self.stragedy = stragedy
        self.columns = columns

        assert type(columns) == type(stragedy), "Columns and stragedy must in same type"

    def _column_strategy_pairs(self):
        """Return the ``(column, strategy)`` pairs described by the constructor arguments."""
        if isinstance(self.columns, str):
            return [(self.columns, self.stragedy)]
        columns, strategies = list(self.columns), list(self.stragedy)
        if len(columns) != len(strategies):
            raise ValueError(
                f"Got {len(columns)} columns but {len(strategies)} strategies"
            )
        return list(zip(columns, strategies))

    def fit(self, X, y=None):
        """Learn one fill value per configured column from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every configured column.
        y : ignored
            Accepted so the estimator can be used inside a scikit-learn pipeline.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a strategy is unknown, or ``"most_frequent"`` is requested for
            a column that has no non-missing values.
        """
        fill_values = {}
        for column, strategy in self._column_strategy_pairs():
            values = X[column]
            if strategy == "mean":
                fill_values[column] = values.mean()
            elif strategy == "median":
                fill_values[column] = values.median()
            elif strategy == "most_frequent":
                # Series.mode() ignores missing values and may return several
                # modes; take the first one for a deterministic choice.
                modes = values.mode()
                if modes.empty:
                    raise ValueError(
                        f"Column {column!r} has no non-missing values to take a mode from"
                    )
                fill_values[column] = modes.iloc[0]
            else:
                raise ValueError(
                    f"Unknown strategy {strategy!r} for column {column!r}"
                )
        self.fill_values_ = fill_values
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in the configured columns filled.

        Columns that were not configured are passed through unchanged.
        """
        return X.fillna(value=self.fill_values_)
        

            


0        1
1        0
2        1
3        1
4        1
        ..
32576    0
32577    0
32578    1
32579    0
32580    0
Name: loan_status, Length: 32581, dtype: int64

In [None]:
class ExperimentTransformers(BaseEstimator, TransformerMixin):
    """Scaffold for an experimental transformer; currently a pass-through.

    The constructor arguments are stored unchanged so the estimator can be
    cloned and inspected through ``get_params``. No transformation logic is
    implemented yet.

    Parameters
    ----------
    columns
        Columns the transformer is meant to operate on (not used yet).
    stragedy
        Strategy selector (not used yet). The original spelling is kept so
        the constructor signature is unchanged.
    """

    def __init__(self, columns, stragedy):
        self.columns = columns
        self.stragedy = stragedy

    def fit(self, x, y=None):
        """Nothing to learn yet; return ``self`` so calls can be chained."""
        return self

    def transform(self, x):
        """Return ``x`` unchanged."""
        # TODO: implement the experimental transformation.
        return x
         

In [None]:
# Names of the object-dtype (string) feature columns that still need encoding.
object_columns = x.select_dtypes(include="object").columns.tolist()

In [None]:
# Replace every categorical string with a numeric code, one code per distinct category.
enc = OrdinalEncoder()
x[object_columns] = enc.fit_transform(x[object_columns])

In [None]:
# Imputer that replaces NaN entries with the mean of their column.
imp_mean = SimpleImputer(strategy="mean", missing_values=np.nan)

In [None]:
# The two feature columns whose non-null count in `x.info()` is below the row count.
missing_columns = ["person_emp_length", "loan_int_rate"]

In [None]:
# Learn the column means and fill the gaps in one step.
# Note: the means are computed from every row of `x`, not from a training subset.
x[missing_columns] = imp_mean.fit_transform(x[missing_columns])

In [None]:
# Display the feature matrix after encoding and imputation.
x

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,3.0,123.0,4.0,3.0,35000,16.02,0.59,1.0,3
1,21,9600,2.0,5.0,1.0,1.0,1000,11.14,0.10,0.0,2
2,25,9600,0.0,1.0,3.0,2.0,5500,12.87,0.57,0.0,3
3,23,65500,3.0,4.0,3.0,2.0,35000,15.23,0.53,0.0,2
4,24,54400,3.0,8.0,3.0,2.0,35000,14.27,0.55,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,0.0,1.0,4.0,2.0,5800,13.16,0.11,0.0,30
32577,54,120000,0.0,4.0,4.0,0.0,17625,7.49,0.15,0.0,19
32578,65,76000,3.0,3.0,2.0,1.0,35000,10.99,0.46,0.0,28
32579,56,150000,0.0,5.0,4.0,1.0,15000,11.48,0.10,0.0,26


In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline linear classifier; max_iter caps the number of solver iterations.
# NOTE(review): no feature scaling is applied in the cells shown here — confirm the solver converges within this cap.
logistic_regression = LogisticRegression(max_iter=100)

In [None]:
# Train the logistic-regression baseline on the training split.
logistic_regression.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Hard class labels for the held-out rows.
y_pred = logistic_regression.predict(x_test)
# Per-class probabilities for the same rows (one column per class).
y_pred_proba = logistic_regression.predict_proba(x_test)

In [None]:
# Class probabilities and predicted label for the first test row.
y_pred_proba[0], y_pred[0]

(array([0.51826045, 0.48173955]), 0)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true classes, columns the predicted ones.
# Passing the arguments the other way round transposes the matrix and swaps
# false positives with false negatives.
confusion_matrix(y_test, y_pred)

array([[7491, 1823],
       [ 122,  339]])

In [None]:
# Follow the (y_true, y_pred) argument order of the scikit-learn metrics API.
# Accuracy is symmetric in its two arguments, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.8010230179028133

In [None]:
# Seed the tree so the fitted model and the metrics derived from it are reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Predict test labels with the decision tree.
# This rebinds `y_pred`, replacing the predictions previously stored under that name.
y_pred = clf.predict(x_test)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true classes, columns the predicted ones.
# Passing the arguments the other way round transposes the matrix and swaps
# false positives with false negatives.
confusion_matrix(y_test, y_pred)

array([[7019,  514],
       [ 594, 1648]])

In [None]:
# Follow the (y_true, y_pred) argument order of the scikit-learn metrics API.
# Accuracy is symmetric in its two arguments, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.8866496163682864