In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

In [2]:
mortgage = pd.read_csv("mortgage.csv")

In [3]:
mortgage.head()

Unnamed: 0,id,time,orig_time,first_time,mat_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,...,REtype_SF_orig_time,investor_orig_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time,default_time,payoff_time,status_time
0,1,25,-7,25,113,41303.42,24.498336,9.2,226.29,2.899137,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
1,1,26,-7,25,113,41061.95,24.483867,9.2,225.1,2.151365,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
2,1,27,-7,25,113,40804.42,24.626795,9.2,222.39,2.361722,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
3,1,28,-7,25,113,40483.89,24.735883,9.2,219.67,1.229172,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
4,1,29,-7,25,113,40367.06,24.925476,9.2,217.37,1.692969,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0


In [4]:
mortgage.tail()

Unnamed: 0,id,time,orig_time,first_time,mat_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,...,REtype_SF_orig_time,investor_orig_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time,default_time,payoff_time,status_time
622484,50000,56,16,52,177,517107.42,57.659403,2.664,181.43,1.717053,...,0,0,664000.0,653,80.0,1.75,167.91,0,0,0
622485,50000,57,16,52,177,512274.57,55.359916,2.652,187.2,2.556052,...,0,0,664000.0,653,80.0,1.75,167.91,0,0,0
622486,50000,58,16,52,177,507396.72,54.492206,2.644,188.37,2.868594,...,0,0,664000.0,653,80.0,1.75,167.91,0,0,0
622487,50000,59,16,52,177,502478.87,54.171106,2.638,187.65,2.443648,...,0,0,664000.0,653,80.0,1.75,167.91,0,0,0
622488,50000,60,16,52,177,497521.35,53.023479,2.635,189.82,2.836358,...,0,0,664000.0,653,80.0,1.75,167.91,0,0,0


In [5]:
print('The shape of the mortgage dataset is:', mortgage.shape)

The shape of the mortgage dataset is: (622489, 23)


In [6]:
mortgage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622489 entries, 0 to 622488
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       622489 non-null  int64  
 1   time                     622489 non-null  int64  
 2   orig_time                622489 non-null  int64  
 3   first_time               622489 non-null  int64  
 4   mat_time                 622489 non-null  int64  
 5   balance_time             622489 non-null  float64
 6   LTV_time                 622219 non-null  float64
 7   interest_rate_time       622489 non-null  float64
 8   hpi_time                 622489 non-null  float64
 9   gdp_time                 622489 non-null  float64
 10  uer_time                 622489 non-null  float64
 11  REtype_CO_orig_time      622489 non-null  int64  
 12  REtype_PU_orig_time      622489 non-null  int64  
 13  REtype_SF_orig_time      622489 non-null  int64  
 14  inve

In [7]:
missing_counts = mortgage.isnull().sum()

print("Missing Counts:")
print(missing_counts)

Missing Counts:
id                           0
time                         0
orig_time                    0
first_time                   0
mat_time                     0
balance_time                 0
LTV_time                   270
interest_rate_time           0
hpi_time                     0
gdp_time                     0
uer_time                     0
REtype_CO_orig_time          0
REtype_PU_orig_time          0
REtype_SF_orig_time          0
investor_orig_time           0
balance_orig_time            0
FICO_orig_time               0
LTV_orig_time                0
Interest_Rate_orig_time      0
hpi_orig_time                0
default_time                 0
payoff_time                  0
status_time                  0
dtype: int64


In [8]:
distinct_counts = mortgage.nunique()

print("Distinct Counts:")
print(distinct_counts)

Distinct Counts:
id                          50000
time                           60
orig_time                      98
first_time                     50
mat_time                      202
balance_time               482832
LTV_time                   567800
interest_rate_time           5539
hpi_time                       60
gdp_time                       60
uer_time                       36
REtype_CO_orig_time             2
REtype_PU_orig_time             2
REtype_SF_orig_time             2
investor_orig_time              2
balance_orig_time           10694
FICO_orig_time                397
LTV_orig_time                 552
Interest_Rate_orig_time      1825
hpi_orig_time                  97
default_time                    2
payoff_time                     2
status_time                     3
dtype: int64


#### Function to plot variables

In [9]:
def plot_hist(data, title = '', x_label = '', y_label = '', bins = 10):
    plt.hist(data, bins = bins, edgecolor = "black")
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

### Do not run below unless needed

In [None]:
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
# Load Data
df = mortgage
# Build App
app = JupyterDash(__name__)
app.layout = html.Div(children = [
    html.Div([
    html.H1("Mortgage Dataset - Variables"),
        html.Div(children='''Dashboard: Plot'''),
    dcc.Graph(id='graph'),
    html.Label([
        "variables",
        dcc.Dropdown(
            id='var-dropdown', clearable=False,
            value='id', options=
                list(df.columns[0:len(df.columns)-1]))
    ]),
])
])

@app.callback(
Output('graph','figure'),
    [Input('var-dropdown', 'value')])

def update_graph(variables):
    return px.histogram(
        df[variables] ,nbins = 30, title=variables)
# Run app and display result inline in the notebook
app.run_server(mode='inline', debug=False)

### Feature Engineering

### Weight of Evidence

In [10]:
def woe_binning(df, feature, target, bins = 5):
    #woe_df = pd.DataFrame(columns=['Bin', 'Event', 'Non-Event', 'Total', 'Event Rate', 'Non-Event Rate', 'WoE'])
    bin_no = []
    event = []
    non_event = []
    total = []
    event_rate_list = []
    non_event_rate_list = []
    woe_list = []
    
    total_event = df[target].sum()
    total_non_event = df.shape[0] - total_event

    bins_cut = pd.cut(df[feature], bins=bins, precision=2)
    
    for bin_label, group in df.groupby(bins_cut):
        event_count = group[target].sum()
        non_event_count = group.shape[0] - event_count

        event_rate = (event_count + 1) / (total_event + 2)
        non_event_rate = (non_event_count + 1) / (total_non_event + 2)

        woe = (np.log(event_rate / non_event_rate))

        bin_no.append(str(bin_label))
        event.append(event_count)
        non_event.append(non_event_count)
        total.append(event_count + non_event_count)
        event_rate_list.append(event_rate)
        non_event_rate_list.append(non_event_rate)
        woe_list.append(woe)
    
    if bins == 4:
        woe_df = pd.DataFrame(
                {'Variable': feature,
                 'Group': [1,2,3,4],
                'Bin': bin_no, 
                'Event': event,
                'Non_Event': non_event,
                'Total': total,
                'Event Rate': event_rate_list,
                'Non-Event Rate': non_event_rate_list,
                'WoE': woe_list
                })
    else:        
                woe_df = pd.DataFrame(
                {'Variable': feature,
                 'Group': [1,2],
                'Bin': bin_no, 
                'Event': event,
                'Non_Event': non_event,
                'Total': total,
                'Event Rate': event_rate_list,
                'Non-Event Rate': non_event_rate_list,
                'WoE': woe_list
                })

    return woe_df

In [11]:
woe_result = woe_binning(mortgage, 'LTV_time', 'default_time', bins=4)
print(woe_result)

   Variable  Group               Bin  Event  Non_Event   Total  Event Rate  \
0  LTV_time      1    (-0.8, 200.88]  15153     606996  622149    0.999604   
1  LTV_time      2  (200.88, 401.76]      0         32      32    0.000066   
2  LTV_time      3  (401.76, 602.64]      0          2       2    0.000066   
3  LTV_time      4  (602.64, 803.51]      0         36      36    0.000066   

   Non-Event Rate       WoE  
0        0.999447  0.000158  
1        0.000054  0.193909  
2        0.000005  2.591805  
3        0.000061  0.079499  


In [12]:
woe_result

Unnamed: 0,Variable,Group,Bin,Event,Non_Event,Total,Event Rate,Non-Event Rate,WoE
0,LTV_time,1,"(-0.8, 200.88]",15153,606996,622149,0.999604,0.999447,0.000158
1,LTV_time,2,"(200.88, 401.76]",0,32,32,6.6e-05,5.4e-05,0.193909
2,LTV_time,3,"(401.76, 602.64]",0,2,2,6.6e-05,5e-06,2.591805
3,LTV_time,4,"(602.64, 803.51]",0,36,36,6.6e-05,6.1e-05,0.079499


#### Some WoE bins do not include any events and the minimum amount of observatiosn required...therefore I have deicded to remove outliers from features

In [13]:
def remove_outliers(feature):
    Q1 = feature.quantile(0.25)
    Q3 = feature.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - (1.5*IQR)
    upper = Q3 + (1.5*IQR)
    
    return feature[(feature >= lower) & (feature <= upper)]
    
#    upper_array = np.where(data[feature] >= upper)[0]
#    lower_array = np.where(data[feature] <= lower)[0]
    
#    data.drop(index = upper_array, inplace=True)
#    data.drop(index = lower_array, inplace=True)
    
#    return data

#### Continuous variables

In [14]:
numeric_features = ['balance_time', 'LTV_time', 'interest_rate_time', 'hpi_time', 'gdp_time', 'uer_time', 'balance_orig_time'
                   ,'FICO_orig_time', 'LTV_orig_time', 'Interest_Rate_orig_time', 'hpi_orig_time']

In [15]:
data = mortgage

In [16]:
data[numeric_features] = data[numeric_features].apply(remove_outliers)

In [17]:
print('The shape of the mortgage dataset after removing outliers is:', data.shape)

The shape of the mortgage dataset after removing outliers is: (622489, 23)


In [18]:
woe_result_balance = woe_binning(data, 'balance_time', 'default_time', bins=4)
woe_result_ltv=woe_binning(data, 'LTV_time','default_time', bins = 4)
woe_result_interest = woe_binning(data, 'interest_rate_time','default_time', bins = 4)
woe_result_hpi=woe_binning(data, 'hpi_time', 'default_time', bins=4)
woe_result_gdp=woe_binning(data, 'gdp_time', 'default_time', bins=4)
woe_result_uer=woe_binning(data, 'uer_time', 'default_time', bins=4)
woe_result_balance_orig_time = woe_binning(data, 'balance_orig_time', 'default_time', bins=4)
woe_result_FICO_orig_time = woe_binning(data, 'FICO_orig_time', 'default_time', bins=4)
woe_result_LTV_orig_time = woe_binning(data, 'LTV_orig_time', 'default_time', bins=4)
woe_result_Interest_Rate_orig_time = woe_binning(data, 'Interest_Rate_orig_time', 'default_time', bins=4)
woe_result_hpi_orig_time = woe_binning(data, 'hpi_orig_time', 'default_time', bins=4)

In [19]:
woe_df = pd.concat([woe_result_balance
,woe_result_ltv
,woe_result_interest
,woe_result_hpi
,woe_result_gdp
,woe_result_uer
,woe_result_balance_orig_time
,woe_result_FICO_orig_time
,woe_result_LTV_orig_time
,woe_result_Interest_Rate_orig_time
,woe_result_hpi_orig_time
], ignore_index=True)
woe_df.head()

Unnamed: 0,Variable,Group,Bin,Event,Non_Event,Total,Event Rate,Non-Event Rate,WoE
0,balance_time,1,"(-690.68, 172670.22]",6309,292165,298474,0.416227,0.481064,-0.14477
1,balance_time,2,"(172670.22, 345340.45]",5223,168302,173525,0.344591,0.277118,0.217914
2,balance_time,3,"(345340.45, 518010.68]",2390,89268,91658,0.157718,0.146985,0.070474
3,balance_time,4,"(518010.68, 690680.9]",848,37619,38467,0.056003,0.061943,-0.100815
4,LTV_time,1,"(16.69, 50.35]",402,51038,51440,0.026583,0.084038,-1.150992


#### Binary Variables

In [20]:
binary_variables = ['REtype_CO_orig_time', 'REtype_PU_orig_time', 'REtype_SF_orig_time', 'investor_orig_time']

In [21]:
woe_result_REtype_CO_orig_time = woe_binning(data, 'REtype_CO_orig_time', 'default_time', bins=2)
woe_result_REtype_PU_orig_time = woe_binning(data, 'REtype_PU_orig_time', 'default_time', bins=2)
woe_result_REtype_SF_orig_time = woe_binning(data, 'REtype_SF_orig_time', 'default_time', bins=2)
woe_result_investor_orig_time = woe_binning(data, 'investor_orig_time', 'default_time', bins=2)

In [22]:
woe_df_full = pd.concat([woe_df,
                        woe_result_REtype_CO_orig_time,
                        woe_result_REtype_PU_orig_time,
                        woe_result_REtype_SF_orig_time,
                        woe_result_investor_orig_time], ignore_index=True)
woe_df_full.tail()

Unnamed: 0,Variable,Group,Bin,Event,Non_Event,Total,Event Rate,Non-Event Rate,WoE
47,REtype_PU_orig_time,2,"(0.5, 1.0]",1853,75855,77708,0.122296,0.1249,-0.021074
48,REtype_SF_orig_time,1,"(-0.001, 0.5]",5714,235770,241484,0.376979,0.388207,-0.02935
49,REtype_SF_orig_time,2,"(0.5, 1.0]",9444,371561,381005,0.623021,0.611793,0.018187
50,investor_orig_time,1,"(-0.001, 0.5]",13196,523249,536445,0.870515,0.861554,0.010347
51,investor_orig_time,2,"(0.5, 1.0]",1962,84082,86044,0.129485,0.138446,-0.066914


In [23]:
input_string = woe_df_full['Bin']

In [24]:
lower_bound = []
for x in range(len(input_string)):
    test = input_string[x]
    start_index = test.find('(')
    end_index = test.find(',')
    if start_index != -1 and end_index != -1:
        features_string = test[start_index + 1:end_index].strip()
        features_list = features_string.split(', ')
        lower_bound.append(features_list)
    else:
        lower_bound.append("NA")

In [25]:
lower_bound

[['-690.68'],
 ['172670.22'],
 ['345340.45'],
 ['518010.68'],
 ['16.69'],
 ['50.35'],
 ['83.87'],
 ['117.4'],
 ['2.31'],
 ['4.54'],
 ['6.76'],
 ['8.99'],
 ['107.71'],
 ['137.44'],
 ['167.06'],
 ['196.68'],
 ['-0.32'],
 ['0.84'],
 ['2.0'],
 ['3.16'],
 ['3.79'],
 ['5.35'],
 ['6.9'],
 ['8.45'],
 ['-717.5'],
 ['179375.0'],
 ['358750.0'],
 ['538125.0'],
 ['471.63'],
 ['564.0'],
 ['656.0'],
 ['748.0'],
 ['67.48'],
 ['72.5'],
 ['77.5'],
 ['82.5'],
 ['1.34'],
 ['3.8'],
 ['6.25'],
 ['8.69'],
 ['116.58'],
 ['144.09'],
 ['171.49'],
 ['198.89'],
 ['-0.001'],
 ['0.5'],
 ['-0.001'],
 ['0.5'],
 ['-0.001'],
 ['0.5'],
 ['-0.001'],
 ['0.5']]

In [26]:
lower_bound_float = []

for x in range(len(lower_bound)):
    lower_bound_float.append(float(lower_bound[x][0]))

In [27]:
upper_bound = []
for x in range(len(input_string)):
    test = input_string[x]
    start_index = test.find(',')
    end_index = test.find(']')
    if start_index != -1 and end_index != -1:
        features_string = test[start_index + 1:end_index].strip()
        features_list = features_string.split(', ')
        upper_bound.append(features_list)
    else:
        upper_bound.append("NA")

In [28]:
upper_bound

[['172670.22'],
 ['345340.45'],
 ['518010.68'],
 ['690680.9'],
 ['50.35'],
 ['83.87'],
 ['117.4'],
 ['150.92'],
 ['4.54'],
 ['6.76'],
 ['8.99'],
 ['11.21'],
 ['137.44'],
 ['167.06'],
 ['196.68'],
 ['226.29'],
 ['0.84'],
 ['2.0'],
 ['3.16'],
 ['4.32'],
 ['5.35'],
 ['6.9'],
 ['8.45'],
 ['10.0'],
 ['179375.0'],
 ['358750.0'],
 ['538125.0'],
 ['717500.0'],
 ['564.0'],
 ['656.0'],
 ['748.0'],
 ['840.0'],
 ['72.5'],
 ['77.5'],
 ['82.5'],
 ['87.5'],
 ['3.8'],
 ['6.25'],
 ['8.69'],
 ['11.14'],
 ['144.09'],
 ['171.49'],
 ['198.89'],
 ['226.29'],
 ['0.5'],
 ['1.0'],
 ['0.5'],
 ['1.0'],
 ['0.5'],
 ['1.0'],
 ['0.5'],
 ['1.0']]

In [29]:
upper_bound_float = []

for x in range(len(upper_bound)):
    upper_bound_float.append(float(upper_bound[x][0]))

In [30]:
woe_df_full['lower_bound'] = lower_bound_float
woe_df_full['upper_bound'] = upper_bound_float

In [31]:
woe_df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Variable        52 non-null     object 
 1   Group           52 non-null     int64  
 2   Bin             52 non-null     object 
 3   Event           52 non-null     int64  
 4   Non_Event       52 non-null     int64  
 5   Total           52 non-null     int64  
 6   Event Rate      52 non-null     float64
 7   Non-Event Rate  52 non-null     float64
 8   WoE             52 non-null     float64
 9   lower_bound     52 non-null     float64
 10  upper_bound     52 non-null     float64
dtypes: float64(5), int64(4), object(2)
memory usage: 4.6+ KB


In [32]:
woe_df_full

Unnamed: 0,Variable,Group,Bin,Event,Non_Event,Total,Event Rate,Non-Event Rate,WoE,lower_bound,upper_bound
0,balance_time,1,"(-690.68, 172670.22]",6309,292165,298474,0.416227,0.481064,-0.14477,-690.68,172670.22
1,balance_time,2,"(172670.22, 345340.45]",5223,168302,173525,0.344591,0.277118,0.217914,172670.22,345340.45
2,balance_time,3,"(345340.45, 518010.68]",2390,89268,91658,0.157718,0.146985,0.070474,345340.45,518010.68
3,balance_time,4,"(518010.68, 690680.9]",848,37619,38467,0.056003,0.061943,-0.100815,518010.68,690680.9
4,LTV_time,1,"(16.69, 50.35]",402,51038,51440,0.026583,0.084038,-1.150992,16.69,50.35
5,LTV_time,2,"(50.35, 83.87]",3684,271860,275544,0.243074,0.447631,-0.610604,50.35,83.87
6,LTV_time,3,"(83.87, 117.4]",8656,237677,246333,0.571042,0.391347,0.377868,83.87,117.4
7,LTV_time,4,"(117.4, 150.92]",2364,42099,44463,0.156003,0.069319,0.811147,117.4,150.92
8,interest_rate_time,1,"(2.31, 4.54]",999,64507,65506,0.065963,0.106215,-0.476372,2.31,4.54
9,interest_rate_time,2,"(4.54, 6.76]",3509,249510,253019,0.23153,0.410831,-0.57347,4.54,6.76


In [33]:
woe_df_to_apply = woe_df_full[['Variable', 'Group', 'WoE', 'lower_bound', 'upper_bound']]
woe_df_to_apply

Unnamed: 0,Variable,Group,WoE,lower_bound,upper_bound
0,balance_time,1,-0.14477,-690.68,172670.22
1,balance_time,2,0.217914,172670.22,345340.45
2,balance_time,3,0.070474,345340.45,518010.68
3,balance_time,4,-0.100815,518010.68,690680.9
4,LTV_time,1,-1.150992,16.69,50.35
5,LTV_time,2,-0.610604,50.35,83.87
6,LTV_time,3,0.377868,83.87,117.4
7,LTV_time,4,0.811147,117.4,150.92
8,interest_rate_time,1,-0.476372,2.31,4.54
9,interest_rate_time,2,-0.57347,4.54,6.76


In [34]:
data.head()

Unnamed: 0,id,time,orig_time,first_time,mat_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,...,REtype_SF_orig_time,investor_orig_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time,default_time,payoff_time,status_time
0,1,25,-7,25,113,41303.42,24.498336,9.2,226.29,2.899137,...,1,0,45000.0,715.0,69.4,9.2,,0,0,0
1,1,26,-7,25,113,41061.95,24.483867,9.2,225.1,2.151365,...,1,0,45000.0,715.0,69.4,9.2,,0,0,0
2,1,27,-7,25,113,40804.42,24.626795,9.2,222.39,2.361722,...,1,0,45000.0,715.0,69.4,9.2,,0,0,0
3,1,28,-7,25,113,40483.89,24.735883,9.2,219.67,1.229172,...,1,0,45000.0,715.0,69.4,9.2,,0,0,0
4,1,29,-7,25,113,40367.06,24.925476,9.2,217.37,1.692969,...,1,0,45000.0,715.0,69.4,9.2,,0,0,0


In [186]:
index_df = data[['id', 'time', 'default_time']]

In [37]:
#def check_row_for_number(woe_df, number):
 #   return woe_df['lower_bound'] <= number < woe_df['upper_bound']

#def woe_apply(input_df, woe_map, feature):
 #   feat_empty_list = []
    
  #  feat_list = input_df[feature]
    
   # feat_woe_map = woe_map[woe_map['Variable'] == feature]
    
   # for number in feat_list:
    #    feat_woe_map[feature] = feat_woe_map.apply(lambda woe_df: check_row_for_number(woe_df,number),axis = 1)
     #   row_in_range = feat_woe_map.loc[feat_woe_map[feature], 'WoE'].tolist()
      #  feat_empty_list.append(row_in_range)
    #return feat_empty_list

In [45]:
#def woe_apply(input_df, woe_map, feature):
    # Extract the relevant WOE mapping for the given feature
 #   feat_woe_map = woe_map[woe_map['Variable'] == feature][['lower_bound', 'upper_bound', 'WoE']]

    # Create a new column to store the WOE values
  #  woe_column_name = f'{feature}_WoE'
   # input_df[woe_column_name] = 0  # Initialize with default value

    # Iterate through each row in the input DataFrame
    #for index, row in input_df.iterrows():
     #   number = row[feature]

        # Find the corresponding WOE value for the number
      #  matched_row = feat_woe_map.apply(lambda woe_row: check_row_for_number(woe_row, number), axis=1)
       # woe_value = feat_woe_map.loc[matched_row, 'WoE'].values

        #if woe_value.size > 0:
         #   input_df.at[index, woe_column_name] = woe_value[0]

    #return input_df

In [182]:
def woe_apply(input_df, woe_map, feature):
    # Extract the relevant WOE mapping for the given feature
    feat_woe_map = woe_map[woe_map['Variable'] == feature][['lower_bound', 'upper_bound', 'WoE']]
    lower_bounds = feat_woe_map['lower_bound'].values
    upper_bounds = feat_woe_map['upper_bound'].values
    woe_values = feat_woe_map['WoE'].values

    # Extract NumPy arrays from the DataFrame
    numbers = input_df[feature].values
    
    # Find the corresponding WOE value for each number
    matched_rows = (numbers[:, None] > lower_bounds) & (numbers[:, None] <= upper_bounds)
    woe_vals = []
    for x in range(len(matched_rows)):
        if np.size(matched_rows[x]) == 0:
            woe_vals.append(0)
        else:
            woe_vals.append(woe_values[matched_rows[x]])

    woe_values = [item for arr in woe_vals for item in (arr if arr.size > 0 else [arr])]
    #input_df[woe_column_name] = np.where(matched_rows.any(axis=1), matched_woe_values[:, 0], np.nan)

    return woe_values

#### Binary Variables

In [183]:
REtype_CO_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'REtype_CO_orig_time') 
REtype_PU_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'REtype_PU_orig_time')
REtype_SF_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'REtype_SF_orig_time')
investor_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'investor_orig_time')

#### Continuous Variables

In [184]:
balance_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'balance_time') 
LTV_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'LTV_time')
interest_rate_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'interest_rate_time')
hpi_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'hpi_time')
gdp_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'gdp_time')
uer_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'uer_time')
balance_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'balance_orig_time')
FICO_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'FICO_orig_time')
LTV_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'LTV_orig_time')
Interest_Rate_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'Interest_Rate_orig_time')
hpi_orig_time = woe_apply(input_df = data, woe_map = woe_df_to_apply, feature = 'hpi_orig_time')

In [187]:
index_df

Unnamed: 0,id,time,default_time
0,1,25,0
1,1,26,0
2,1,27,0
3,1,28,0
4,1,29,0
...,...,...,...
622484,50000,56,0
622485,50000,57,0
622486,50000,58,0
622487,50000,59,0


#### Create Modelling Dataset

In [188]:
len(data)

622489

In [189]:
index_df['REtype_CO_orig_time'] = REtype_CO_orig_time
index_df['REtype_PU_orig_time'] = REtype_PU_orig_time
index_df['REtype_SF_orig_time'] = REtype_SF_orig_time
index_df['investor_orig_time'] = investor_orig_time
index_df['balance_time'] = balance_time
index_df['LTV_time'] = LTV_time
index_df['interest_rate_time'] = interest_rate_time
index_df['hpi_time'] = hpi_time
index_df['gdp_time'] = gdp_time
index_df['uer_time'] = uer_time
index_df['balance_orig_time'] = balance_orig_time
index_df['FICO_orig_time'] = FICO_orig_time
index_df['LTV_orig_time'] = LTV_orig_time
index_df['Interest_Rate_orig_time'] = Interest_Rate_orig_time
index_df['hpi_orig_time'] = hpi_orig_time

In [195]:
index_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622489 entries, 0 to 622488
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       622489 non-null  int64  
 1   time                     622489 non-null  int64  
 2   default_time             622489 non-null  int64  
 3   REtype_CO_orig_time      622489 non-null  float64
 4   REtype_PU_orig_time      622489 non-null  float64
 5   REtype_SF_orig_time      622489 non-null  float64
 6   investor_orig_time       622489 non-null  float64
 7   balance_time             622489 non-null  object 
 8   LTV_time                 622489 non-null  object 
 9   interest_rate_time       622489 non-null  object 
 10  hpi_time                 622489 non-null  float64
 11  gdp_time                 622489 non-null  object 
 12  uer_time                 622489 non-null  float64
 13  balance_orig_time        622489 non-null  object 
 14  FICO

In [265]:
balance = []
ltv = []
interest_rate = []
gdp = []
fico = []
bal = []
ltv_orig = []
ir = []
hpi = []

In [266]:
for x in range(len(index_df['balance_time'])):
    if np.size(index_df['balance_time'][x]) == 0:
        balance.append(0)
    else:
        balance.append(index_df['balance_time'][x])
    
for x in range(len(index_df['LTV_time'])):
    if np.size(index_df['LTV_time'][x]) == 0:
        ltv.append(0)
    else:
        ltv.append(index_df['LTV_time'][x])
    
for x in range(len(index_df['interest_rate_time'])):
    if np.size(index_df['interest_rate_time'][x]) == 0:
        interest_rate.append(0)
    else:
        interest_rate.append(index_df['interest_rate_time'][x])
    
for x in range(len(index_df['gdp_time'])):
    if np.size(index_df['gdp_time'][x]) == 0:
        gdp.append(0)
    else:
        gdp.append(index_df['gdp_time'][x])
    
for x in range(len(index_df['FICO_orig_time'])):
    if np.size(index_df['FICO_orig_time'][x]) == 0:
        fico.append(0)
    else:
        fico.append(index_df['FICO_orig_time'][x])
    
for x in range(len(index_df['balance_orig_time'])):
    if np.size(index_df['balance_orig_time'][x]) == 0:
        bal.append(0)
    else:
        bal.append(index_df['balance_orig_time'][x])
    
for x in range(len(index_df['LTV_orig_time'])):
    if np.size(index_df['LTV_orig_time'][x]) == 0:
        ltv_orig.append(0)
    else:
        ltv_orig.append(index_df['LTV_orig_time'][x])
    
for x in range(len(index_df['Interest_Rate_orig_time'])):
    if np.size(index_df['Interest_Rate_orig_time'][x]) == 0:
        ir.append(0)
    else:
        ir.append(index_df['Interest_Rate_orig_time'][x])
    
for x in range(len(index_df['hpi_orig_time'])):
    if np.size(index_df['hpi_orig_time'][x]) == 0:
        hpi.append(0)
    else:
        hpi.append(index_df['hpi_orig_time'][x])

In [267]:
index_df['balance_time'] = balance
index_df['LTV_time'] = ltv
index_df['interest_rate_time'] = interest_rate
index_df['gdp_time'] = gdp
index_df['balance_orig_time'] = bal
index_df['FICO_orig_time'] = fico
index_df['LTV_orig_time'] = ltv_orig
index_df['Interest_Rate_orig_time'] = ir
index_df['hpi_orig_time'] = hpi

In [268]:
index_df.head()

Unnamed: 0,id,time,default_time,REtype_CO_orig_time,REtype_PU_orig_time,REtype_SF_orig_time,investor_orig_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,uer_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time
0,1,25,0,5.8e-05,0.002972,0.018187,0.010347,-0.14477,-1.150992,0.64186,-0.333145,-0.38957,-0.118037,-0.108957,-0.098985,-0.316658,0.433394,0.0
1,1,26,0,5.8e-05,0.002972,0.018187,0.010347,-0.14477,-1.150992,0.64186,-0.333145,-0.38957,-0.118037,-0.108957,-0.098985,-0.316658,0.433394,0.0
2,1,27,0,5.8e-05,0.002972,0.018187,0.010347,-0.14477,-1.150992,0.64186,-0.333145,-0.38957,-0.118037,-0.108957,-0.098985,-0.316658,0.433394,0.0
3,1,28,0,5.8e-05,0.002972,0.018187,0.010347,-0.14477,-1.150992,0.64186,-0.333145,0.013494,-0.118037,-0.108957,-0.098985,-0.316658,0.433394,0.0
4,1,29,0,5.8e-05,0.002972,0.018187,0.010347,-0.14477,-1.150992,0.64186,-0.333145,0.013494,-0.118037,-0.108957,-0.098985,-0.316658,0.433394,0.0


In [269]:
index_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622489 entries, 0 to 622488
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       622489 non-null  int64  
 1   time                     622489 non-null  int64  
 2   default_time             622489 non-null  int64  
 3   REtype_CO_orig_time      622489 non-null  float64
 4   REtype_PU_orig_time      622489 non-null  float64
 5   REtype_SF_orig_time      622489 non-null  float64
 6   investor_orig_time       622489 non-null  float64
 7   balance_time             622489 non-null  float64
 8   LTV_time                 622489 non-null  float64
 9   interest_rate_time       622489 non-null  float64
 10  hpi_time                 622489 non-null  float64
 11  gdp_time                 622489 non-null  float64
 12  uer_time                 622489 non-null  float64
 13  balance_orig_time        622489 non-null  float64
 14  FICO

### Logistic Regression

In [270]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [362]:
X = index_df.iloc[:,3:]
y = index_df['default_time']

In [292]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [293]:
model = LogisticRegression()

model.fit(X_train, y_train)

In [294]:
y_pred = model.predict(X_test)

In [295]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9754641306152174
Confusion Matrix:
 [[182165      0]
 [  4582      0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99    182165
           1       0.00      0.00      0.00      4582

    accuracy                           0.98    186747
   macro avg       0.49      0.50      0.49    186747
weighted avg       0.95      0.98      0.96    186747



In [363]:
import statsmodels.api as sm

In [388]:
X = index_df.iloc[:,3:]
y = index_df['default_time']

In [389]:
X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [390]:
# Create a logistic regression model
model = sm.Logit(y_train, X_train)

# Fit the model
result = model.fit()

# Print the summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.106810
         Iterations 15
                           Logit Regression Results                           
Dep. Variable:           default_time   No. Observations:               435742
Model:                          Logit   Df Residuals:                   435726
Method:                           MLE   Df Model:                           15
Date:                Sun, 28 Jan 2024   Pseudo R-squ.:                 0.06493
Time:                        19:41:20   Log-Likelihood:                -46541.
converged:                       True   LL-Null:                       -49773.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                      -3.6572      0.012   -309.655      0.000      -3.680  

In [391]:
X_train = X_train.drop('REtype_SF_orig_time',axis=1)

In [392]:
# Create a logistic regression model
model = sm.Logit(y_train, X_train)

# Fit the model
result = model.fit()

# Print the summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.106810
         Iterations 14
                           Logit Regression Results                           
Dep. Variable:           default_time   No. Observations:               435742
Model:                          Logit   Df Residuals:                   435727
Method:                           MLE   Df Model:                           14
Date:                Sun, 28 Jan 2024   Pseudo R-squ.:                 0.06492
Time:                        19:41:32   Log-Likelihood:                -46542.
converged:                       True   LL-Null:                       -49773.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                      -3.6572      0.012   -309.656      0.000      -3.680  

In [393]:
X_train = X_train.drop('REtype_PU_orig_time',axis=1)

In [394]:
# Create a logistic regression model
model = sm.Logit(y_train, X_train)

# Fit the model
result = model.fit()

# Print the summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.106813
         Iterations 14
                           Logit Regression Results                           
Dep. Variable:           default_time   No. Observations:               435742
Model:                          Logit   Df Residuals:                   435728
Method:                           MLE   Df Model:                           13
Date:                Sun, 28 Jan 2024   Pseudo R-squ.:                 0.06490
Time:                        19:41:43   Log-Likelihood:                -46543.
converged:                       True   LL-Null:                       -49773.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                      -3.6576      0.012   -309.737      0.000      -3.681  

In [395]:
X_train = X_train.drop('REtype_CO_orig_time',axis=1)

In [396]:
# Create a logistic regression model
model = sm.Logit(y_train, X_train)

# Fit the model
result = model.fit()

# Print the summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.106816
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:           default_time   No. Observations:               435742
Model:                          Logit   Df Residuals:                   435729
Method:                           MLE   Df Model:                           12
Date:                Sun, 28 Jan 2024   Pseudo R-squ.:                 0.06487
Time:                        19:41:58   Log-Likelihood:                -46544.
converged:                       True   LL-Null:                       -49773.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                      -3.6577      0.012   -309.762      0.000      -3.681   

In [397]:
X_test

Unnamed: 0,const,REtype_CO_orig_time,REtype_PU_orig_time,REtype_SF_orig_time,investor_orig_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,uer_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time
401369,1.0,0.000058,-0.021074,-0.029350,0.010347,-0.144770,0.377868,-0.573470,-0.333145,-0.389570,-0.118037,-0.108957,-0.098985,0.136887,-0.895207,0.253288
309608,1.0,-0.000804,0.002972,-0.029350,0.010347,-0.144770,-0.610604,0.326919,-0.333145,0.013494,-0.118037,-0.108957,0.294872,0.136887,0.000000,0.253288
440122,1.0,0.000058,0.002972,0.018187,0.010347,-0.144770,-0.610604,0.326919,-0.060287,0.491559,-0.118037,-0.108957,-0.098985,0.136887,0.163445,-0.950487
411135,1.0,0.000058,-0.021074,-0.029350,0.010347,0.070474,0.377868,-0.476372,0.273474,0.013494,0.305998,0.003948,-0.098985,0.136887,-0.895207,0.253288
366835,1.0,0.000058,0.002972,0.018187,0.010347,-0.144770,0.377868,0.326919,-0.060287,0.491559,-0.118037,-0.108957,-0.098985,0.136887,0.163445,0.253288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133841,1.0,0.000058,0.002972,-0.029350,-0.066914,-0.144770,0.377868,0.326919,0.273474,0.013494,0.305998,-0.108957,-0.098985,0.136887,0.163445,0.253288
616737,1.0,0.000058,0.002972,0.018187,0.010347,0.217914,0.377868,0.326919,-0.060287,-0.389570,-0.129218,0.228378,0.294872,0.227232,0.000000,0.253288
272555,1.0,0.000058,-0.021074,-0.029350,-0.066914,-0.144770,0.377868,0.326919,0.273474,0.000000,-0.078812,-0.108957,-0.098985,0.136887,0.163445,0.253288
84980,1.0,0.000058,0.002972,0.018187,0.010347,-0.144770,-0.610604,0.641860,-0.061019,0.491559,-0.118037,-0.108957,-0.098985,0.000000,0.433394,0.000000


In [398]:
X_test = X_test.drop(['REtype_CO_orig_time', 'REtype_PU_orig_time', 'REtype_SF_orig_time'], axis = 1)

In [399]:
X_train.shape

(435742, 13)

In [400]:
X_test.shape

(186747, 13)

In [402]:
len(X_train.values[0])

13

In [403]:
len(X_test.values[0])

13

In [404]:
X_train

Unnamed: 0,const,investor_orig_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,uer_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time
564015,1.0,0.010347,0.000000,-0.610604,-0.573470,-0.333145,0.013494,-0.118037,0.000000,-0.098985,-0.316658,-0.895207,0.253288
604528,1.0,0.010347,-0.144770,-1.150992,0.641860,-0.060287,-0.389570,-0.129218,-0.108957,0.379538,0.000000,0.433394,0.253288
314717,1.0,0.010347,0.217914,0.377868,0.326919,-0.060287,0.491559,-0.129218,0.228378,0.294872,0.136887,0.000000,0.253288
52221,1.0,0.010347,0.217914,-0.610604,0.326919,-0.060287,0.000000,-0.129218,0.228378,0.294872,0.000000,0.163445,-0.417865
271712,1.0,0.010347,-0.144770,-0.610604,-0.573470,-0.333145,0.013494,-0.118037,-0.108957,-0.098985,-0.167005,0.163445,0.253288
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,1.0,0.010347,-0.144770,-1.150992,0.000000,-0.333145,-0.389570,-0.118037,-0.108957,0.294872,0.000000,0.000000,-0.417865
259178,1.0,0.010347,-0.144770,-0.610604,-0.573470,-0.333145,0.013494,-0.118037,-0.108957,-0.098985,0.136887,-0.895207,-0.729070
365838,1.0,0.010347,-0.144770,0.377868,-0.476372,-0.060287,-0.389570,-0.129218,0.228378,0.294872,0.136887,-0.105519,0.253288
131932,1.0,-0.066914,-0.144770,-0.610604,0.326919,-0.333145,-0.389570,-0.118037,-0.108957,-0.098985,0.136887,0.163445,0.253288


In [405]:
X_test

Unnamed: 0,const,investor_orig_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,uer_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time
401369,1.0,0.010347,-0.144770,0.377868,-0.573470,-0.333145,-0.389570,-0.118037,-0.108957,-0.098985,0.136887,-0.895207,0.253288
309608,1.0,0.010347,-0.144770,-0.610604,0.326919,-0.333145,0.013494,-0.118037,-0.108957,0.294872,0.136887,0.000000,0.253288
440122,1.0,0.010347,-0.144770,-0.610604,0.326919,-0.060287,0.491559,-0.118037,-0.108957,-0.098985,0.136887,0.163445,-0.950487
411135,1.0,0.010347,0.070474,0.377868,-0.476372,0.273474,0.013494,0.305998,0.003948,-0.098985,0.136887,-0.895207,0.253288
366835,1.0,0.010347,-0.144770,0.377868,0.326919,-0.060287,0.491559,-0.118037,-0.108957,-0.098985,0.136887,0.163445,0.253288
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133841,1.0,-0.066914,-0.144770,0.377868,0.326919,0.273474,0.013494,0.305998,-0.108957,-0.098985,0.136887,0.163445,0.253288
616737,1.0,0.010347,0.217914,0.377868,0.326919,-0.060287,-0.389570,-0.129218,0.228378,0.294872,0.227232,0.000000,0.253288
272555,1.0,-0.066914,-0.144770,0.377868,0.326919,0.273474,0.000000,-0.078812,-0.108957,-0.098985,0.136887,0.163445,0.253288
84980,1.0,0.010347,-0.144770,-0.610604,0.641860,-0.061019,0.491559,-0.118037,-0.108957,-0.098985,0.000000,0.433394,0.000000


In [407]:
model.predict(X_test)

ValueError: shapes (435742,13) and (186747,13) not aligned: 13 (dim 1) != 186747 (dim 0)

In [347]:
predictions = model.predict(new_X)

ValueError: shapes (435742,13) and (186747,13) not aligned: 13 (dim 1) != 186747 (dim 0)