Import Libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
df.dtypes

id                        int64
annual_income           float64
debt_to_income_ratio    float64
credit_score              int64
loan_amount             float64
interest_rate           float64
gender                   object
marital_status           object
education_level          object
employment_status        object
loan_purpose             object
grade_subgrade           object
loan_paid_back          float64
dtype: object

In [5]:
df.isnull().sum()

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0
mean,296996.5,48212.202976,0.120696,680.916009,15020.297629,12.356345,0.79882
std,171471.442235,26711.942078,0.068573,55.424956,6926.530568,2.008959,0.400883
min,0.0,6002.43,0.011,395.0,500.09,3.2,0.0
25%,148498.25,27934.4,0.072,646.0,10279.62,10.99,1.0
50%,296996.5,46557.68,0.096,682.0,15000.22,12.37,1.0
75%,445494.75,60981.32,0.156,719.0,18858.58,13.68,1.0
max,593993.0,393381.74,0.627,849.0,48959.95,20.99,1.0


### 📊 Histograms for Numerical Features
Visualizes distributions to detect skewness and outliers.

In [ ]:
df.hist(figsize=(12,8))
plt.tight_layout()
plt.show()

In [7]:
df = df.drop('id' , axis = 1)

In [8]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)

In [9]:
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [10]:
df.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,29367.99,0.084,736,2528.42,13.67,female,single,high_school,self-employed,other,c3,1.0
1,22108.02,0.166,636,4593.1,12.92,male,married,master's,employed,debt_consolidation,d3,0.0
2,49566.2,0.097,694,17005.15,9.76,male,single,high_school,employed,debt_consolidation,c5,1.0
3,46858.25,0.065,533,4682.48,16.1,female,single,high_school,employed,debt_consolidation,f1,1.0
4,25496.7,0.053,665,12184.43,10.21,male,married,high_school,employed,other,d1,1.0


In [11]:
df.nunique()


annual_income           119728
debt_to_income_ratio       526
credit_score               399
loan_amount             111570
interest_rate             1454
gender                       3
marital_status               4
education_level              5
employment_status            5
loan_purpose                 8
grade_subgrade              30
loan_paid_back               2
dtype: int64

In [12]:
df.shape

(593994, 12)

In [13]:
df_grouped = df.groupby('education_level')['education_level'].count()

df_grouped

education_level
bachelor's     279606
high_school    183592
master's        93097
other           26677
phd             11022
Name: education_level, dtype: int64

In [14]:
df_grouped = df.groupby('loan_purpose')['loan_purpose'].count()

df_grouped

loan_purpose
business               35303
car                    58108
debt_consolidation    324695
education              36641
home                   44118
medical                22806
other                  63874
vacation                8449
Name: loan_purpose, dtype: int64

In [15]:
df_grouped = df.groupby('grade_subgrade')['grade_subgrade'].count()

df_grouped

grade_subgrade
a1     1600
a2     2018
a3     2066
a4     1701
a5     2471
b1    14344
b2    15167
b3    13926
b4    13877
b5    13937
c1    53363
c2    54443
c3    58695
c4    55957
c5    53317
d1    37029
d2    34432
d3    36694
d4    35097
d5    32101
e1     6891
e2     6372
e3     7075
e4     8036
e5     6084
f1     5534
f2     5203
f3     5082
f4     5535
f5     5947
Name: grade_subgrade, dtype: int64

In [16]:
df_grouped = df.groupby('employment_status')['employment_status'].count()

df_grouped

employment_status
employed         450645
retired           16453
self-employed     52480
student           11931
unemployed        62485
Name: employment_status, dtype: int64

### 🔥 Correlation Heatmap
Shows relationships among numerical variables.

In [ ]:
plt.figure(figsize=(10,8))
cor=df.corr(numeric_only=True)
sns.heatmap(cor, annot=False)
plt.show()

In [17]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
#df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

#df_train = df_train.reset_index(drop=True)
#df_val = df_val.reset_index(drop=True)
#df_test = df_test.reset_index(drop=True)
#y_train = df_train.loan_paid_back.values
#y_val = df_val.loan_paid_back.values
#y_test = df_test.loan_paid_back.values

#del df_train['loan_paid_back']
#del df_val['loan_paid_back']
#del df_test['loan_paid_back']

In [18]:
#len(df_train), len(df_val), len(df_test)


In [19]:
numerical_columns

['annual_income',
 'debt_to_income_ratio',
 'credit_score',
 'loan_amount',
 'interest_rate',
 'loan_paid_back']

In [20]:
categorical_columns

['gender',
 'marital_status',
 'education_level',
 'employment_status',
 'loan_purpose',
 'grade_subgrade']

In [21]:
df_grouped = df_full_train.groupby('gender')['loan_paid_back'].mean()

df_grouped

gender
female    0.801796
male      0.795958
other     0.797635
Name: loan_paid_back, dtype: float64

In [22]:
df_grouped = df_full_train.groupby('marital_status')['loan_paid_back'].mean()

df_grouped

marital_status
divorced    0.797757
married     0.799305
single      0.798968
widowed     0.789344
Name: loan_paid_back, dtype: float64

In [23]:
df_grouped = df_full_train.groupby('education_level')['loan_paid_back'].mean()

df_grouped

education_level
bachelor's     0.789271
high_school    0.809445
master's       0.803191
other          0.801152
phd            0.830204
Name: loan_paid_back, dtype: float64

In [24]:
df_grouped = df_full_train.groupby('employment_status')['loan_paid_back'].mean()

df_grouped

employment_status
employed         0.894139
retired          0.997432
self-employed    0.898833
student          0.262556
unemployed       0.077512
Name: loan_paid_back, dtype: float64

In [25]:
df_grouped = df_full_train.groupby('loan_purpose')['loan_paid_back'].mean()

df_grouped

loan_purpose
business              0.813167
car                   0.799674
debt_consolidation    0.797505
education             0.777561
home                  0.822732
medical               0.776356
other                 0.802105
vacation              0.797554
Name: loan_paid_back, dtype: float64

In [26]:
df_grouped = df_full_train.groupby('grade_subgrade')['loan_paid_back'].mean()

df_grouped

grade_subgrade
a1    0.951969
a2    0.952584
a3    0.956231
a4    0.952518
a5    0.945729
b1    0.915150
b2    0.938617
b3    0.938836
b4    0.932075
b5    0.934884
c1    0.860000
c2    0.851475
c3    0.835574
c4    0.843796
c5    0.846599
d1    0.732926
d2    0.722333
d3    0.696985
d4    0.715412
d5    0.713410
e1    0.650136
e2    0.660066
e3    0.641326
e4    0.649395
e5    0.667836
f1    0.618833
f2    0.619904
f3    0.598776
f4    0.642275
f5    0.637018
Name: loan_paid_back, dtype: float64

In [27]:
global_loan_paid_back = df_full_train['loan_paid_back'].mean()

global_loan_paid_back

np.float64(0.7989751575668936)

79% of people have paid back the loan - higly imbalanced data set

In [28]:
for c in categorical_columns:
    print(c)
    df_group = df_full_train.groupby(c).loan_paid_back.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_loan_paid_back 
    df_group['risk'] = df_group['mean'] / global_loan_paid_back 
    display(df_group)
    print()
    print()

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.801796,244723,0.002821,1.003531
male,0.795958,227512,-0.003017,0.996224
other,0.797635,2960,-0.00134,0.998323




marital_status


Unnamed: 0_level_0,mean,count,diff,risk
marital_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
divorced,0.797757,17034,-0.001218,0.998476
married,0.799305,221635,0.00033,1.000413
single,0.798968,231252,-7e-06,0.999991
widowed,0.789344,5274,-0.009631,0.987946




education_level


Unnamed: 0_level_0,mean,count,diff,risk
education_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bachelor's,0.789271,223809,-0.009704,0.987855
high_school,0.809445,146598,0.01047,1.013104
master's,0.803191,74590,0.004216,1.005276
other,0.801152,21358,0.002177,1.002724
phd,0.830204,8840,0.031228,1.039086




employment_status


Unnamed: 0_level_0,mean,count,diff,risk
employment_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
employed,0.894139,360613,0.095163,1.119107
retired,0.997432,13239,0.198457,1.248389
self-employed,0.898833,41891,0.099858,1.124982
student,0.262556,9537,-0.536419,0.328616
unemployed,0.077512,49915,-0.721463,0.097014




loan_purpose


Unnamed: 0_level_0,mean,count,diff,risk
loan_purpose,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
business,0.813167,28282,0.014192,1.017763
car,0.799674,46594,0.000699,1.000874
debt_consolidation,0.797505,259651,-0.00147,0.99816
education,0.777561,29271,-0.021414,0.973198
home,0.822732,35184,0.023757,1.029734
medical,0.776356,18212,-0.022619,0.97169
other,0.802105,51214,0.00313,1.003917
vacation,0.797554,6787,-0.001421,0.998221




grade_subgrade


Unnamed: 0_level_0,mean,count,diff,risk
grade_subgrade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a1,0.951969,1270,0.152993,1.191487
a2,0.952584,1645,0.153608,1.192257
a3,0.956231,1645,0.157256,1.196822
a4,0.952518,1390,0.153543,1.192175
a5,0.945729,1990,0.146753,1.183677
b1,0.91515,11432,0.116175,1.145405
b2,0.938617,12137,0.139642,1.174777
b3,0.938836,11085,0.139861,1.175051
b4,0.932075,11130,0.1331,1.166589
b5,0.934884,11180,0.135909,1.170104






In [29]:
from sklearn.metrics import mutual_info_score


In [30]:


def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.loan_paid_back)



In [31]:
mi = df_full_train[categorical_columns].apply(mutual_info_churn_score)
print(mi.sort_values(ascending=False))


employment_status    0.175917
grade_subgrade       0.026748
loan_purpose         0.000325
education_level      0.000311
gender               0.000026
marital_status       0.000003
dtype: float64


In [32]:
print(mi.sort_values(ascending=False))

employment_status    0.175917
grade_subgrade       0.026748
loan_purpose         0.000325
education_level      0.000311
gender               0.000026
marital_status       0.000003
dtype: float64


Based on mutual information score we will drop gender , marital status fields

In [33]:
df_full_train = df_full_train.drop(['gender', 'marital_status'], axis = 1)

In [34]:
df_full_train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
105200,47868.15,0.07,773,28644.19,10.16,high_school,employed,car,b2,1.0
436067,35444.49,0.153,693,27775.31,16.14,high_school,employed,debt_consolidation,c5,1.0
385322,92914.49,0.14,715,30784.02,12.35,high_school,employed,car,c4,1.0
464782,43761.69,0.156,723,8181.09,10.13,phd,employed,other,c4,1.0
468954,63671.48,0.149,675,17626.06,12.6,bachelor's,employed,debt_consolidation,c3,1.0


In [35]:
df_test = df_test.drop(['gender', 'marital_status'], axis = 1)

In [36]:
df_full_train['education_level'] = df_full_train['education_level'].replace("master's", "masters")

In [37]:
df_full_train['education_level'] = df_full_train['education_level'].replace("bachelor's", "bachelors")

In [38]:
df_test['education_level'] = df_test['education_level'].replace("master's", "masters")
df_test['education_level'] = df_test['education_level'].replace("bachelor's", "bachelors")


In [39]:
from sklearn.preprocessing import OrdinalEncoder


In [40]:


    # Define the order of categories
    category_order = [['high_school', 'other', 'bachelors', 'masters', 'phd']]
    edu_encoder = OrdinalEncoder(categories=category_order)
    df_full_train['education_encoded'] = edu_encoder.fit_transform(df_full_train[['education_level']])
    df_test['education_encoded'] = edu_encoder.transform(df_test[['education_level']])
    #print(df_full_train.head())

In [41]:
df_full_train.tail(10)

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back,education_encoded
413825,24274.33,0.088,669,25019.89,12.83,high_school,employed,car,d2,1.0,0.0
229520,53789.17,0.12,750,16591.53,9.92,bachelors,employed,home,b3,1.0,2.0
21440,69500.13,0.044,666,12409.13,15.0,high_school,employed,debt_consolidation,d5,1.0,0.0
117583,29096.46,0.138,708,23198.65,13.17,bachelors,self-employed,other,c3,1.0,2.0
73349,64435.76,0.078,709,27563.86,10.52,high_school,employed,education,c2,1.0,0.0
371403,51957.35,0.086,681,12452.0,14.96,bachelors,employed,debt_consolidation,c4,1.0,2.0
491263,20264.17,0.16,652,18207.08,12.17,high_school,self-employed,debt_consolidation,d4,1.0,0.0
470924,25609.7,0.053,664,1418.39,12.66,high_school,employed,debt_consolidation,d4,1.0,0.0
491755,39711.94,0.262,638,9121.82,11.2,bachelors,employed,home,d3,1.0,2.0
128037,23983.29,0.091,569,10790.12,15.44,masters,self-employed,debt_consolidation,f4,0.0,3.0


In [42]:
df_full_train = df_full_train.drop(['education_level','loan_purpose'], axis=1)

In [43]:
df_test = df_test.drop(['education_level','loan_purpose'], axis=1)

In [44]:
df_full_train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,employment_status,grade_subgrade,loan_paid_back,education_encoded
105200,47868.15,0.07,773,28644.19,10.16,employed,b2,1.0,0.0
436067,35444.49,0.153,693,27775.31,16.14,employed,c5,1.0,0.0
385322,92914.49,0.14,715,30784.02,12.35,employed,c4,1.0,0.0
464782,43761.69,0.156,723,8181.09,10.13,employed,c4,1.0,4.0
468954,63671.48,0.149,675,17626.06,12.6,employed,c3,1.0,2.0


In [45]:

df_test.head()


Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,employment_status,grade_subgrade,loan_paid_back,education_encoded
566215,52233.83,0.125,663,7454.22,14.89,employed,d2,1.0,4.0
123502,29247.24,0.127,639,20391.28,12.29,unemployed,d2,0.0,0.0
367747,13802.88,0.164,736,17984.01,11.0,self-employed,c5,1.0,2.0
359153,48758.31,0.09,707,10209.48,10.17,employed,c2,1.0,2.0
440880,49816.42,0.09,665,18123.14,16.06,employed,d4,1.0,0.0


In [46]:
category_order = [['f5', 'f4', 'f3', 'f2', 'f1', 'e5', 'e4','e3','e2','e1', 'd5', 'd4','d3','d2','d1', 'c5', 'c4','c3','c2','c1', 'b5', 'b4','b3','b2','b1',  'a5', 'a4','a3','a2','a1']]
grade_encoder = OrdinalEncoder(categories=category_order)
df_full_train['grade_code'] = grade_encoder.fit_transform(df_full_train[['grade_subgrade']])
df_full_train  = df_full_train.drop(['grade_subgrade'], axis = 1)
df_test['grade_code'] = grade_encoder.transform(df_test[['grade_subgrade']])
df_test  = df_test.drop(['grade_subgrade'], axis = 1)

df_full_train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,employment_status,loan_paid_back,education_encoded,grade_code
105200,47868.15,0.07,773,28644.19,10.16,employed,1.0,0.0,23.0
436067,35444.49,0.153,693,27775.31,16.14,employed,1.0,0.0,15.0
385322,92914.49,0.14,715,30784.02,12.35,employed,1.0,0.0,16.0
464782,43761.69,0.156,723,8181.09,10.13,employed,1.0,4.0,16.0
468954,63671.48,0.149,675,17626.06,12.6,employed,1.0,2.0,17.0


In [47]:
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)
numerical_columns

['annual_income',
 'debt_to_income_ratio',
 'credit_score',
 'loan_amount',
 'interest_rate',
 'loan_paid_back']

In [48]:
df_full_train[numerical_columns].corrwith(df_full_train.loan_paid_back).abs()


annual_income           0.006148
debt_to_income_ratio    0.335758
credit_score            0.234319
loan_amount             0.003521
interest_rate           0.130789
loan_paid_back          1.000000
dtype: float64

In [49]:
def analyzenumeric_cols(numerical_columns): 
    for n in numerical_columns:
        n_mean = df_full_train[n].mean()
        n_below_mean = df_full_train[df_full_train[n] <= n_mean].loan_paid_back.mean()
        n_above_mean = df_full_train[df_full_train[n] > n_mean].loan_paid_back.mean()
        print("For numeric column ", n, "loan paid back % below mean is ", n_below_mean)
        print("For numeric column ", n, "loan paid back % above mean is ", n_above_mean)

In [50]:
analyzenumeric_cols(numerical_columns)

For numeric column  annual_income loan paid back % below mean is  0.7961039885476562
For numeric column  annual_income loan paid back % above mean is  0.8022409039182083
For numeric column  debt_to_income_ratio loan paid back % below mean is  0.8815228762320478
For numeric column  debt_to_income_ratio loan paid back % above mean is  0.658643848225548
For numeric column  credit_score loan paid back % below mean is  0.715795716179446
For numeric column  credit_score loan paid back % above mean is  0.8765592779836253
For numeric column  loan_amount loan paid back % below mean is  0.801466426873593
For numeric column  loan_amount loan paid back % above mean is  0.7964640659860727
For numeric column  interest_rate loan paid back % below mean is  0.8421851547196052
For numeric column  interest_rate loan paid back % above mean is  0.7563975617194975
For numeric column  loan_paid_back loan paid back % below mean is  0.0
For numeric column  loan_paid_back loan paid back % above mean is  1.0


So it shows that higher credit score implies person will pay back loan and  lower credit score implies person will not payback loan
So it shows that higher debttoincome ratio implies person will not pay back loan and  lower debttoincomee implies person will  payback loan
For lower interest_rate loan paid back percent is higher than higher interest rate

The other columns like Annual Income/Loan Amount does not really impact the Loan Pay Back intention




In [51]:
#This is also corroborated by Correlation table
df_full_train[numerical_columns].corrwith(df_full_train.loan_paid_back).abs().sort_values(ascending=False)



loan_paid_back          1.000000
debt_to_income_ratio    0.335758
credit_score            0.234319
interest_rate           0.130789
annual_income           0.006148
loan_amount             0.003521
dtype: float64

In [52]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.loan_paid_back.values
y_val = df_val.loan_paid_back.values
y_test = df_test.loan_paid_back.values

del df_train['loan_paid_back']
del df_val['loan_paid_back']
del df_test['loan_paid_back']

In [53]:
df_train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,employment_status,education_encoded,grade_code
0,60737.14,0.204,784,8243.98,9.97,employed,0.0,20.0
1,52493.49,0.076,689,14924.53,13.91,self-employed,3.0,15.0
2,30887.15,0.079,611,2274.98,14.52,employed,0.0,12.0
3,48362.84,0.053,715,25863.66,11.74,employed,0.0,19.0
4,74599.93,0.152,563,9995.24,14.64,employed,0.0,2.0


In [54]:
categorical_columns = list(df_train.dtypes[df_train.dtypes == 'object'].index)

categorical_columns

['employment_status']

In [55]:
numerical_columns = list(df_train.dtypes[df_train.dtypes != 'object'].index)

numerical_columns

['annual_income',
 'debt_to_income_ratio',
 'credit_score',
 'loan_amount',
 'interest_rate',
 'education_encoded',
 'grade_code']

In [56]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [57]:
from sklearn.linear_model import LogisticRegression


In [None]:
model = LogisticRegression(solver='lbfgs')
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, y_train)

In [None]:
model.intercept_[0]


In [None]:
model.coef_[0].round(3)


In [None]:
y_pred = model.predict_proba(X_val)[:, 1]


In [None]:
loan_decision = (y_pred >= 0.5)

In [None]:
(y_val == loan_decision).mean()


In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_val, y_pred)

In [None]:
###LogisticRegression Model is giving Accuracy of 86% and roc_auc_score of 0.8108

In [None]:
dicts_test = df_test[categorical_columns + numerical_columns].to_dict(orient='records')
X_test = dv.transform(dicts_test)
y_pred = model.predict_proba(X_test)[:, 1]


In [None]:
test_loan_decision  = (y_pred >= 0.5)

In [None]:
(y_test == test_loan_decision).mean()

In [None]:
numerical_columns

In [None]:
categorical_columns

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)


In [None]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

In [None]:

depths = [1, 2, 3, 4, 5, 6, 10, 15, 20, None]

for depth in depths: 
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X_train, y_train)
    
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    
    print('%4s -> %.3f' % (depth, auc))

In [None]:
scores = []
for s in [1, 5, 10, 15, 20, 500, 100, 200]:
    dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=s)
    dt.fit(X_train, y_train)

    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
        
    scores.append((10, s, auc))

In [None]:
columns = ['max_depth', 'min_samples_leaf', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
df_scores_pivot = df_scores.pivot(index='min_samples_leaf', columns=['max_depth'], values=['auc'])
df_scores_pivot.round(3)

In [None]:
# lets considder min_sample_leaves = 100

In [None]:
dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100)
dt.fit(X_train, y_train)
y_pred = dt.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
print("Auc with Decision tree of depth 10 and min sample leaves = 100 is", auc)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
scores = []

for n in [10,50,100,200]:
    rf = RandomForestClassifier(n_estimators=n, random_state=1, n_jobs = -1, max_depth = 10,min_samples_leaf = 100 )
    rf.fit(X_train, y_train)

    y_pred = rf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    
    scores.append((n, auc))

In [None]:
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'auc'])

plt.plot(df_scores.n_estimators, df_scores.auc)

plt.show()

In [None]:
df_scores

In [104]:
n_estimators = [10,50,100]
max_features = ['auto', 'sqrt']
max_depth = [5,6,10]
max_depth.append(None)

min_samples_leaf = [20,100,200]


params_grid = {'n_estimators': n_estimators, 'max_features': max_features,
                   'max_depth': max_depth, 
                   'min_samples_leaf': min_samples_leaf}


model_rf = RandomForestClassifier(random_state=42)

model_cv = GridSearchCV(model_rf, params_grid, scoring="roc_auc", cv=3, verbose=1, n_jobs=-1)
model_cv.fit(X_train, y_train)
best_params = model_cv.best_params_
print(f"Best parameters: {best_params}")
model_rf = RandomForestClassifier(**best_params)
model_rf.fit(X_train, y_train)
    

Fitting 3 folds for each of 72 candidates, totalling 216 fits


108 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
46 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Meera\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Meera\anaconda3\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\Meera\anaconda3\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        self._parameter_constraints,

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'n_estimators': 100}


In [105]:
pred = model_rf.predict_proba(X_val)[:, 1]

In [107]:
auc = roc_auc_score(y_val, pred)

print(auc)

0.9141356222176091
