In [1]:
#data analysis
import pandas as pd

#visualize data
import plotly.express as pe

#split data
from sklearn.model_selection import train_test_split

#model
from sklearn.tree import DecisionTreeClassifier

#evaluate
from sklearn.metrics import accuracy_score

#preprocessing

from sklearn.preprocessing import StandardScaler, LabelEncoder


from hyperopt import hp,tpe,fmin,Trials,STATUS_OK,space_eval

from hyperopt.early_stop import no_progress_loss


### objective: predict whether a loan application is to be accepted or rejected

## step 1: Source the data

In [2]:
path = r"C:\Users\harsh\Desktop\NPCI-Python-ML\datasets\Balanced_credit_Risk.txt"

balanced_credit_df = pd.read_csv(path)

balanced_credit_df = balanced_credit_df.drop(columns=["Unnamed: 0", "index"])

balanced_credit_df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...
12401,30,102540,MORTGAGE,6.0,HOMEIMPROVEMENT,A,1500,7.90,0,0.01,N,5
12402,24,60000,RENT,0.0,PERSONAL,B,12000,12.21,0,0.20,N,2
12403,22,40000,RENT,0.0,EDUCATION,C,6000,12.87,0,0.15,Y,3
12404,22,50000,RENT,2.0,PERSONAL,C,8000,13.16,0,0.16,Y,2


### step 2: Data exploration & pre-processing

In [3]:
print(f"Shape of my table is: {balanced_credit_df.shape}")
print(f"List of columns is: {balanced_credit_df.columns}")
print(f"Current index is: {balanced_credit_df.index}")


Shape of my table is: (12406, 12)
List of columns is: Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')
Current index is: RangeIndex(start=0, stop=12406, step=1)


In [4]:
print(f"Number of unique values per column:\n{balanced_credit_df.nunique()}")

Number of unique values per column:
person_age                      50
person_income                 2184
person_home_ownership            4
person_emp_length               33
loan_intent                      6
loan_grade                       7
loan_amnt                      576
loan_int_rate                  340
loan_status                      2
loan_percent_income             77
cb_person_default_on_file        2
cb_person_cred_hist_length      29
dtype: int64


In [5]:
print(f"Info of the table: {balanced_credit_df.info()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12406 entries, 0 to 12405
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  12406 non-null  int64  
 1   person_income               12406 non-null  int64  
 2   person_home_ownership       12406 non-null  object 
 3   person_emp_length           12406 non-null  float64
 4   loan_intent                 12406 non-null  object 
 5   loan_grade                  12406 non-null  object 
 6   loan_amnt                   12406 non-null  int64  
 7   loan_int_rate               12406 non-null  float64
 8   loan_status                 12406 non-null  int64  
 9   loan_percent_income         12406 non-null  float64
 10  cb_person_default_on_file   12406 non-null  object 
 11  cb_person_cred_hist_length  12406 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 1.1+ MB
Info of the table: None


In [6]:
categorical_features = ["person_home_ownership", "loan_intent", "loan_grade", "cb_person_default_on_file"]
real_value_features = [col for col in balanced_credit_df.columns if col not in categorical_features and col != "loan_status"]


In [7]:
balanced_credit_df[       real_value_features    ].describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
count,12406.0,12406.0,12406.0,12406.0,12406.0,12406.0,12406.0
mean,27.59447,60404.5,4.555054,10187.790182,11.784964,0.197573,5.74472
std,6.247178,44547.57,4.230786,6692.722063,3.391123,0.121986,4.081106
min,20.0,4000.0,0.0,800.0,5.42,0.0,2.0
25%,23.0,34615.5,2.0,5000.0,8.94,0.1,3.0
50%,26.0,50915.0,4.0,8800.0,11.83,0.17,4.0
75%,30.0,73000.0,7.0,14000.0,14.42,0.28,8.0
max,144.0,1362000.0,123.0,35000.0,23.22,0.83,30.0



v1 : 40 : 5 units below average

average : 45 0 units differing from average


v2 : 510 : 465 units above average

In [8]:
for col in real_value_features:
    display(     pe.box(y=col, data_frame=balanced_credit_df)     )

In [9]:
balanced_credit_df[       categorical_features      ].describe(  include="object"  )

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
count,12406,12406,12406,12406
unique,4,6,7,2
top,RENT,MEDICAL,B,N
freq,7299,2480,3550,9585


In [10]:
for col in categorical_features:
    
 
    fig = pe.histogram(
        x=col,  #data to be used on x axis
        data_frame=balanced_credit_df, #source of data
        barmode="stack",  #style of histogram
        color=balanced_credit_df[col] #coloring criteria (based on values of the current column)
    )

    
    display( fig )

In [11]:
balanced_credit_df.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

### scaling and encoding

In [12]:
sc = StandardScaler()

le = LabelEncoder()

#apply standard scalar and overwrite original unscaled real column values
balanced_credit_df[  real_value_features  ]  =  sc.fit_transform(  balanced_credit_df[real_value_features]  )

#for any col that is in the list of categorical_features
for col in categorical_features:

    #apply encoding technique on the values of the column & overwrite original values in the table
    balanced_credit_df[col] = le.fit_transform(balanced_credit_df[col])



In [13]:
balanced_credit_df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,-0.895556,-0.031529,3,27.997097,4,3,3.707491,1.248910,1,3.217111,1,-0.672570
1,-0.415320,-1.140501,0,-0.840316,3,2,-0.700459,0.319977,1,3.053151,0,-0.672570
2,-0.735477,0.114388,3,-0.131199,3,2,3.707491,1.015939,1,2.725232,0,-0.917612
3,-0.575398,-0.134794,3,0.814290,3,2,3.707491,0.732836,1,2.889191,1,-0.427529
4,-1.055634,-1.133766,2,-0.603944,5,0,-1.148725,-1.369797,1,0.429796,0,-0.917612
...,...,...,...,...,...,...,...,...,...,...,...,...
12401,0.385074,0.945892,0,0.341545,2,0,-1.298147,-1.145674,0,-1.537720,0,-0.182487
12402,-0.575398,-0.009081,3,-1.076688,4,1,0.270784,0.125343,0,0.019897,0,-0.917612
12403,-0.895556,-0.458057,3,-1.076688,1,2,-0.625748,0.319977,0,-0.390002,1,-0.672570
12404,-0.895556,-0.233569,3,-0.603944,4,2,-0.326904,0.405497,0,-0.308022,1,-0.917612


In [14]:
balanced_credit_df[    ["loan_status"]    ].value_counts(normalize=True) *100

loan_status
0              50.0
1              50.0
dtype: float64

## step 3: selection of features of target!

In [15]:
features = categorical_features + real_value_features

target = "loan_status"

### step 4: split the data

In [16]:
x_train, x_test, y_train, y_test=train_test_split( 
    balanced_credit_df[features], #feature column values from the data frame
    balanced_credit_df[target],  #target column values from the data frame
    test_size=0.2,  #size for testing data
    stratify=balanced_credit_df[target], #IN WHAT RATIO SAMPLES MUST BE STRATIFIED?
    random_state=42   #seed value for randomization 
    )


print(f"split in training data:  {y_train.value_counts()} ")
print(f"split in testing data:  {y_test.value_counts()} ")

split in training data:  1    4962
0    4962
Name: loan_status, dtype: int64 
split in testing data:  0    1241
1    1241
Name: loan_status, dtype: int64 


In [17]:
#keys of this dictionary are names of parameters
space ={
    "max_features" : hp.choice("feature choice", ["sqrt", "log2"]),
    'max_depth' : hp.choice(  'depth parameter',[1,2,3,4,5,6,7,8]  ), #choose some number between 1 to 8
    'criterion' : hp.choice(  'criteria parameter' ,   ['gini',"entropy", 'log_loss'] ),
    'splitter' : hp.choice("splitter choice", ["best", "random"]),
    # 'min_impurity_decrease' : hp.uniform("impurity factor", 0, 0.02)
}

In [18]:
scores=[]
def objective(space):
    model = DecisionTreeClassifier(**space)

    model.fit(x_train, y_train)

    pred = model.predict(x_test)

    accuracy = accuracy_score(y_test,pred) 


    scores.append(accuracy)
    return {'loss': -accuracy,'status':STATUS_OK}

### step 5 : train the model

In [19]:
trials = Trials() #create a database (in-memory)

      #fmin is the function to be used for minimum optimization
best = fmin(fn=objective, #function
            space=space, #search space parameters
            algo=tpe.suggest, #next combination has to be picked intelligently(bayesian optimization)
            max_evals=125,
            early_stop_fn=no_progress_loss(  iteration_stop_count=25, percent_increase=0.001 ), 
            trials=trials)

print (space_eval(space, best))

 25%|██▍       | 31/125 [00:00<00:01, 73.53trial/s, best loss: -0.8352135374697824]
{'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'splitter': 'best'}


In [20]:
for entry in trials:
    print(entry)

{'state': 2, 'tid': 0, 'spec': None, 'result': {'loss': -0.7933118452860596, 'status': 'ok'}, 'misc': {'tid': 0, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'criteria parameter': [0], 'depth parameter': [0], 'feature choice': [0], 'splitter choice': [0]}, 'vals': {'criteria parameter': [1], 'depth parameter': [6], 'feature choice': [0], 'splitter choice': [0]}}, 'exp_key': None, 'owner': None, 'version': 0, 'book_time': datetime.datetime(2023, 3, 14, 5, 6, 12, 974000), 'refresh_time': datetime.datetime(2023, 3, 14, 5, 6, 12, 992000)}
{'state': 2, 'tid': 1, 'spec': None, 'result': {'loss': -0.7288477034649476, 'status': 'ok'}, 'misc': {'tid': 1, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'criteria parameter': [1], 'depth parameter': [1], 'feature choice': [1], 'splitter choice': [1]}, 'vals': {'criteria parameter': [1], 'depth parameter': [6], 'feature choice': [1], 'splitter choice': [1]}}, 'exp_key': None, 'owner': None, '