# Weight of Evidence and Information Value 

In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Generate Data 

In [99]:
d = {
    "Age": np.random.randint(18, 70, 1000),
    "Status": np.random.choice(["Good", "Bad"], size=1000, p=[0.7, 0.3])
}

In [100]:
data = pd.DataFrame(data=d)

In [101]:
data

Unnamed: 0,Age,Status
0,52,Good
1,63,Bad
2,46,Good
3,44,Good
4,69,Bad
...,...,...
995,39,Good
996,39,Bad
997,45,Good
998,54,Good


In [102]:
data["Status"].value_counts()

Good    688
Bad     312
Name: Status, dtype: int64

### Binning Data into 5 groups/bins 

In [103]:
data["Age Binning"] = pd.qcut(data["Age"], q=5)

In [104]:
data["Age Binning"].value_counts()

(27.8, 38.0]      212
(50.0, 60.0]      211
(17.999, 27.8]    200
(38.0, 50.0]      192
(60.0, 69.0]      185
Name: Age Binning, dtype: int64

In [105]:
data.groupby(["Age Binning", "Status"]).size()

Age Binning     Status
(17.999, 27.8]  Bad        69
                Good      131
(27.8, 38.0]    Bad        66
                Good      146
(38.0, 50.0]    Bad        57
                Good      135
(50.0, 60.0]    Bad        71
                Good      140
(60.0, 69.0]    Bad        49
                Good      136
dtype: int64

Gimana cara nge convert nya jadi DataFrame? menggunakan crosstab ternyata wkwk

In [106]:
new_data = pd.crosstab(data["Age Binning"], data["Status"])

In [107]:
new_data

Status,Bad,Good
Age Binning,Unnamed: 1_level_1,Unnamed: 2_level_1
"(17.999, 27.8]",69,131
"(27.8, 38.0]",66,146
"(38.0, 50.0]",57,135
"(50.0, 60.0]",71,140
"(60.0, 69.0]",49,136


Ambil distribusi atau proporsi % data nya

In [108]:
new_data["Good Distribution"] = round(new_data["Good"]/np.sum(new_data["Good"]), 2)

In [109]:
new_data["Bad Distribution"] = round(new_data["Bad"]/np.sum(new_data["Bad"]), 2)

In [110]:
new_data

Status,Bad,Good,Good Distribution,Bad Distribution
Age Binning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(17.999, 27.8]",69,131,0.19,0.22
"(27.8, 38.0]",66,146,0.21,0.21
"(38.0, 50.0]",57,135,0.2,0.18
"(50.0, 60.0]",71,140,0.2,0.23
"(60.0, 69.0]",49,136,0.2,0.16


In [81]:
np.sum(new_data["Good"])

710

In [82]:
np.sum(new_data["Bad"])

290

In [115]:
np.log(new_data["Good Distribution"]) - np.log(new_data["Bad Distribution"])

Age Binning
(17.999, 27.8]   -0.146603
(27.8, 38.0]      0.000000
(38.0, 50.0]      0.105361
(50.0, 60.0]     -0.139762
(60.0, 69.0]      0.223144
dtype: float64

### WOE Formula

$$WOE=ln(\text{Distr Good}) - ln(\text{Distr Bad})$$

In [116]:
round(np.log(new_data["Good Distribution"]) - np.log(new_data["Bad Distribution"]), 2)

Age Binning
(17.999, 27.8]   -0.15
(27.8, 38.0]      0.00
(38.0, 50.0]      0.11
(50.0, 60.0]     -0.14
(60.0, 69.0]      0.22
dtype: float64

In [117]:
new_data["WOE"] = round(np.log(new_data["Good Distribution"]) - np.log(new_data["Bad Distribution"]), 2)

In [118]:
new_data

Status,Bad,Good,Good Distribution,Bad Distribution,WOE
Age Binning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(17.999, 27.8]",69,131,0.19,0.22,-0.15
"(27.8, 38.0]",66,146,0.21,0.21,0.0
"(38.0, 50.0]",57,135,0.2,0.18,0.11
"(50.0, 60.0]",71,140,0.2,0.23,-0.14
"(60.0, 69.0]",49,136,0.2,0.16,0.22


#### IV Formula 
$$IV=WOE_i*(\text{Distr Good} - \text{Distr Bad})$$

In [124]:
new_data["IV"] = new_data["WOE"] * (new_data["Good Distribution"] - new_data["Bad Distribution"])

In [125]:
new_data["IV"]

Age Binning
(17.999, 27.8]    0.0045
(27.8, 38.0]      0.0000
(38.0, 50.0]      0.0022
(50.0, 60.0]      0.0042
(60.0, 69.0]      0.0088
Name: IV, dtype: float64