# Import Packages & Data

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scorecardpy as sc

In [None]:
# Read the two raw extracts used throughout the notebook.
application = pd.read_csv('data/IS453 Group Assignment - Application Data.csv')
bureau = pd.read_csv('data/IS453 Group Assignment - Bureau Data.csv')

# Shapes before any filtering.
print(application.shape)
print(bureau.shape)

# Keep applicants that do not own a car, then remove the car-related columns
# and CODE_GENDER (described in the original notes as useless / potentially
# problematic).
application = (
    application
    .loc[application['FLAG_OWN_CAR'] == 'N']
    .drop(columns=['FLAG_OWN_CAR', 'OWN_CAR_AGE', 'CODE_GENDER'])
)

# Keep bureau records with CREDIT_CURRENCY = 'currency 1'; the column is
# constant after the filter, so it is removed as well.
bureau = (
    bureau
    .loc[bureau['CREDIT_CURRENCY'] == 'currency 1']
    .drop(columns=['CREDIT_CURRENCY'])
)

# Shapes after filtering.
print(application.shape)
print(bureau.shape)

### Identify and drop invalid outliers

In [None]:
# Floor for a plausible DAYS_CREDIT_UPDATE: 80 years expressed in days
# (80 * 365 = 29,200), negated because the column is compared as "days
# before"; anything below -29,200 is treated as invalid.
max_days_credit_update = -(80*365)

# Build the outlier mask once and reuse it for inspection and removal.
invalid_credit_update = bureau['DAYS_CREDIT_UPDATE'] < max_days_credit_update

# Render the offending rows explicitly: a bare expression in the middle of a
# cell is evaluated but never shown.
display(bureau[invalid_credit_update])

# Row count before and after removing the invalid rows.
print(bureau.shape[0])
bureau.drop(bureau[invalid_credit_update].index, inplace = True)
print(bureau.shape[0])

In [None]:
# Print the largest and smallest DAYS_CREDIT_UPDATE values currently in bureau.
print(bureau['DAYS_CREDIT_UPDATE'].max())
print(bureau['DAYS_CREDIT_UPDATE'].min())

In [None]:
# Remove bureau rows whose AMT_CREDIT_SUM_LIMIT is negative, recording the
# row count on either side of the removal.
initial = bureau.shape[0]
negative_limit_index = bureau.index[bureau['AMT_CREDIT_SUM_LIMIT'] < 0]
bureau.drop(index=negative_limit_index, inplace=True)
final = bureau.shape[0]

print(initial)
print(final)

In [None]:
# Print the smallest AMT_CREDIT_SUM_LIMIT currently in bureau.
print(bureau['AMT_CREDIT_SUM_LIMIT'].min())

### Missing values in Bureau data

In [None]:
# Share of missing values per bureau column, sorted largest first and
# formatted as whole percentages.
pd.Series((bureau.isnull().sum().sort_values(ascending=False)/bureau.shape[0])).map("{0:.0%}".format)

In [None]:
# Drop columns with high percentage of missing values
# (not idempotent: re-running this cell raises because the columns are gone)
bureau = bureau.drop(columns=['AMT_ANNUITY', 'AMT_CREDIT_MAX_OVERDUE'])


In [None]:
# Drop two more columns with high percentage of missing values
# (not idempotent: re-running this cell raises because the columns are gone)
bureau = bureau.drop(columns=['DAYS_ENDDATE_FACT', 'AMT_CREDIT_SUM_LIMIT'])


In [None]:
# Fill missing AMT_CREDIT_SUM_DEBT values with 0.
# The filled column is assigned back onto `bureau` explicitly: calling
# fillna(inplace=True) on a column pulled out of the frame is chained
# assignment, which pandas does not guarantee will reach the parent frame
# (and which never does under copy-on-write).
bureau['AMT_CREDIT_SUM_DEBT'] = bureau['AMT_CREDIT_SUM_DEBT'].fillna(value = 0)
amt_credit_sum_debt = bureau['AMT_CREDIT_SUM_DEBT']

# Value distribution and remaining missing count (expected to be 0).
print(amt_credit_sum_debt.value_counts())
print("n/a       ", amt_credit_sum_debt.isna().sum())

In [None]:
# Fill missing DAYS_CREDIT_ENDDATE values with 0.
# Assigned back onto `bureau` explicitly, because fillna(inplace=True) on an
# extracted column is chained assignment and is not guaranteed to update the
# parent frame (it never does under copy-on-write).
bureau['DAYS_CREDIT_ENDDATE'] = bureau['DAYS_CREDIT_ENDDATE'].fillna(value = 0)
days_credit_enddate = bureau['DAYS_CREDIT_ENDDATE']

# Value distribution and remaining missing count (expected to be 0).
print(days_credit_enddate.value_counts())
print("n/a       ", days_credit_enddate.isna().sum())

In [None]:
# Column dtypes and non-null counts of the bureau frame at this point.
bureau.info()

In [None]:
# Re-check the share of missing values per bureau column (largest first).
pd.Series((bureau.isnull().sum().sort_values(ascending=False)/bureau.shape[0])).map("{0:.0%}".format)

### Missing values in Application data

In [None]:
# Set display option to show all rows
# (this is a global pandas option: it stays in effect for every later cell)
pd.set_option('display.max_rows', None)

# Print the percentage of missing values for each column, largest first
print(pd.Series((application.isnull().sum().sort_values(ascending=False) / application.shape[0])).map("{0:.0%}".format))

In [None]:
# Calculate the fraction of missing values for each column
missing_percentages = application.isnull().sum() / application.shape[0]

# Drop columns whose missing fraction exceeds 0.34 (more than 34% missing).
# NOTE(review): this comment previously said 65%, which does not match the
# 0.34 threshold used by the code - confirm which cut-off is intended.
columns_to_drop = missing_percentages[missing_percentages > 0.34].index
application.drop(columns=columns_to_drop, inplace=True)

# Print the percentage of missing values for the remaining columns, then the
# new shape. The display option below was already set in an earlier cell.
pd.set_option('display.max_rows', None)
print(pd.Series((application.isnull().sum().sort_values(ascending=False) / application.shape[0])).map("{0:.0%}".format))
print(application.shape)


In [None]:
# The six AMT_REQ_CREDIT_BUREAU_* counters, listed from the longest window
# to the shortest.
columns_to_combine = [
    f'AMT_REQ_CREDIT_BUREAU_{window}'
    for window in ('YEAR', 'QRT', 'MON', 'WEEK', 'DAY', 'HOUR')
]

# Collapse the six counters into a single row-wise total, then discard the
# originals. DataFrame.sum skips NaN by default, so a row where all six
# values are missing gets a total of 0 rather than NaN.
bureau_enquiries = application.loc[:, columns_to_combine]
application['AMT_REQ_CREDIT_BUREAU_TOTAL'] = bureau_enquiries.sum(axis=1)
application.drop(columns=columns_to_combine, inplace=True)
application.head()

In [None]:
# Share of missing values per remaining application column, largest first.
print(pd.Series((application.isnull().sum().sort_values(ascending=False) / application.shape[0])).map("{0:.0%}".format))

In [None]:
# Share of missing values per remaining bureau column, largest first.
print(pd.Series((bureau.isnull().sum().sort_values(ascending=False) / bureau.shape[0])).map("{0:.0%}".format))

In [None]:
# Names of the object / category typed columns in the application data.
application.select_dtypes(include=['object', 'category']).columns

In [None]:
# Names of the object / category typed columns in the bureau data.
bureau.select_dtypes(include=['object', 'category']).columns

# Prepare Data for Preliminary Screening by IV

### Merge datasets

In [None]:
# Flatten the loan-level bureau records into one row per SK_ID_CURR.
agg_bureau = bureau.copy()

# 1/0 indicator per credit status; summing an indicator per applicant later
# yields a count.
# NOTE(review): NUM_BADDEBT_CREDITS is computed here but is not part of the
# aggregation below - confirm whether it was meant to be included.
credit_status_flags = {
    'NUM_ACTIVE_CREDITS': 'Active',
    'NUM_CLOSED_CREDITS': 'Closed',
    'NUM_BADDEBT_CREDITS': 'Bad debt',
}
for flag_column, credit_status in credit_status_flags.items():
    agg_bureau[flag_column] = (agg_bureau['CREDIT_ACTIVE'] == credit_status).astype(int)

# 1/0 indicator per credit type (described in the original notes as the five
# most common types).
credit_type_flags = {
    'NUM_CONSUMER_CREDIT_LOANS': 'Consumer credit',
    'NUM_CREDIT_CARD_LOANS': 'Credit card',
    'NUM_CAR_LOANS': 'Car loan',
    'NUM_MORTGAGE_LOANS': 'Mortgage',
    'NUM_MICRO_LOANS': 'Microloan',
}
for flag_column, credit_type in credit_type_flags.items():
    agg_bureau[flag_column] = (agg_bureau['CREDIT_TYPE'] == credit_type).astype(int)

# DEBT_CREDIT_RATIO = AMT_CREDIT_SUM_DEBT / AMT_CREDIT_SUM
# A zero AMT_CREDIT_SUM with non-zero debt divides to +/-inf, and a single
# inf would turn the applicant's mean ratio into inf. Such ratios are
# undefined, so they are recorded as missing instead.
agg_bureau['DEBT_CREDIT_RATIO'] = (
    agg_bureau['AMT_CREDIT_SUM_DEBT'] / agg_bureau['AMT_CREDIT_SUM']
).replace([np.inf, -np.inf], np.nan)

# flatten bureau data
agg_bureau = agg_bureau.groupby('SK_ID_CURR').agg(
    NUM_PREV_LOANS = ('SK_ID_BUREAU', 'count'),
    NUM_ACTIVE_CREDITS = ('NUM_ACTIVE_CREDITS', 'sum'),
    NUM_CLOSED_CREDITS = ('NUM_CLOSED_CREDITS', 'sum'),
    NUM_CONSUMER_CREDIT_LOANS = ('NUM_CONSUMER_CREDIT_LOANS', 'sum'),
    NUM_CREDIT_CARD_LOANS = ('NUM_CREDIT_CARD_LOANS', 'sum'),
    NUM_CAR_LOANS = ('NUM_CAR_LOANS', 'sum'),
    NUM_MORTGAGE_LOANS = ('NUM_MORTGAGE_LOANS', 'sum'),
    NUM_MICRO_LOANS = ('NUM_MICRO_LOANS', 'sum'),
    DAYS_CREDIT_MIN = ('DAYS_CREDIT', 'min'), # capture the oldest credit application
    DAYS_CREDIT_MAX = ('DAYS_CREDIT', 'max'), # find most recent credit application
    DAYS_CREDIT_MEAN = ('DAYS_CREDIT', 'mean'), # avg days before current application, for prev credit lines
    DAYS_CREDIT_OVERDUE_MAX = ('CREDIT_DAY_OVERDUE', 'max'),
    DAYS_CREDIT_OVERDUE_MEAN = ('CREDIT_DAY_OVERDUE', 'mean'),
    DAYS_CREDIT_ENDDATE_MEAN = ('DAYS_CREDIT_ENDDATE', 'mean'),
    CNT_CREDIT_PROLONG_MAX = ('CNT_CREDIT_PROLONG', 'max'),
    CNT_CREDIT_PROLONG_MEAN = ('CNT_CREDIT_PROLONG', 'mean'),
    AMT_CREDIT_SUM_OVERDUE_SUM = ('AMT_CREDIT_SUM_OVERDUE', 'sum'),
    AMT_CREDIT_SUM_OVERDUE_MAX = ('AMT_CREDIT_SUM_OVERDUE', 'max'),
    DAYS_CREDIT_UPDATE_MEAN = ('DAYS_CREDIT_UPDATE', 'mean'),
    DEBT_CREDIT_RATIO_MEAN = ('DEBT_CREDIT_RATIO', 'mean'),
).reset_index()


In [None]:
# Shapes of the inputs, for comparison with the joined result.
for input_frame in (application, bureau, agg_bureau):
    print(input_frame.shape)

# Left join: every application row is kept; applicants without bureau
# history get NaN in the aggregated bureau columns.
merged = pd.merge(application, agg_bureau, how='left', on='SK_ID_CURR')
print(merged.shape)

### Dealing with missing values

In [None]:
# Working copy of the merged frame without its first column.
# NOTE(review): the column is removed by position; presumably it is the
# SK_ID_CURR identifier - confirm against the CSV layout.
cols = merged.columns[1:]
merged_working = merged.loc[:, cols].copy()

In [None]:
# ORGANIZATION_TYPE has too many distinct categories for WOE / IV to be
# calculated on it as-is, so related values are merged into broader groups.
# Each group lists the raw values it absorbs; group order and member order
# are kept so the flattened mapping is built in a fixed, readable sequence.
# NOTE(review): 'Transport' is both a group label and a raw value sent to
# 'Public Sector' - confirm a plain 'Transport' value exists in the data.
organization_groups = {
    'Business Entity': [f'Business Entity Type {n}' for n in (1, 2, 3)],
    'Trade': [f'Trade: type {n}' for n in (7, 3, 2, 6, 1, 4, 5)],
    'Transport': [f'Transport: type {n}' for n in (1, 2, 3, 4)],
    'Industry': [f'Industry: type {n}' for n in (3, 11, 9, 7, 1, 4, 5, 6, 2, 10, 12, 13, 8)],
    'Public Sector': ['Government', 'Transport', 'Military', 'Police'],
    'Hospitality': ['Hotel', 'Restaurant'],
    'Financial Services': ['Bank', 'Insurance'],
}

# Flatten to {raw value: group label}.
mapping = {
    raw_value: group_label
    for group_label, raw_values in organization_groups.items()
    for raw_value in raw_values
}

merged_working['ORGANIZATION_TYPE'] = merged_working['ORGANIZATION_TYPE'].replace(mapping)
print(merged_working['ORGANIZATION_TYPE'].value_counts())

#### Dropping Rows

In [None]:
# Share of rows that have more than 50 missing values: the mean of a boolean
# Series is the fraction of True entries.
pct_total_rows_missing = (merged_working.isnull().sum(axis = 1) > 50).mean()
print(f'Percent of total rows missing more than 50 values: {pct_total_rows_missing:.2%}')

In [None]:
# (rows, columns) of the working frame before rows are dropped.
merged_working.shape

In [None]:
# Remove rows with too many gaps. dropna(thresh=k) keeps a row only if it has
# at least k non-null values, so k = (number of columns - 50) keeps rows with
# at most 50 missing values.
rows_org = merged_working.shape[0]
print(f'Starting row count: {rows_org}')

min_non_null = merged_working.shape[1] - 50
merged_working = merged_working.dropna(thresh = min_non_null)

rows_kept = merged_working.shape[0]
print(f'Ending row count: {rows_kept}')
print(f'Percent dropped: {(1 - rows_kept / rows_org):.2%}')

#### Dropping Columns

In [None]:
# Share of missing values per column of the working frame, largest first.
pd.Series(merged_working.isnull().sum().sort_values(ascending=False)/merged_working.shape[0]).map("{0:.0%}".format)

In [None]:
# Fraction of missing values per column, largest first.
missing_data = merged_working.isnull().mean().sort_values(ascending=False)

# Columns missing in more than half of the rows; the values are formatted as
# whole-percentage strings for display, the index holds the column names.
missing_data_over_50 = missing_data[missing_data > 0.5].apply(lambda x: "{:.0%}".format(x))
# Rendered explicitly: a bare expression mid-cell would not be shown.
display(missing_data_over_50)

# Snapshot of the frame before the columns are removed.
merged_working_copy = merged_working.copy()

merged_working = merged_working.drop(columns=missing_data_over_50.index)
# Preview only; the full frame is too large to dump into the output.
merged_working.head()


In [None]:
# Per-column extremes of the numeric features, as a quick range check.
# The results get their own names: binding them to `max` / `min` would
# shadow the Python built-ins for every later cell of the notebook.
numeric_data = merged_working.select_dtypes(include='number')
numeric_max = numeric_data.max()
numeric_min = numeric_data.min()
print("Max \n", numeric_max)
print("Min \n", numeric_min)

### Preliminary screening: WOE binning and IV

In [None]:
# Bin every independent variable against TARGET. `bins` maps each variable
# name to a table holding that variable's bins with their WOE and IV.
bins = sc.woebin(merged_working, y='TARGET')

# variable name -> total IV
iv_dict = {}

for variable, bindetails in bins.items():
    # Flip the sign of the WOE column on the stored table.
    bindetails['woe'] = -bindetails['woe']
    # total_iv repeats the same value on every row; read the one labelled 0.
    iv_dict[variable] = bindetails['total_iv'][0]
    display(bindetails)

# Same entries, ordered from the highest IV to the lowest.
iv_dict_sorted = dict(sorted(iv_dict.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Print the IV of every variable, highest first, followed by the number of
# variables that were scored.
for variable, iv in iv_dict_sorted.items():
    print(f"{variable}: IV = {iv:.4f}")
print(len(iv_dict_sorted))

In [None]:
# Collect the variables whose IV is below 0.02.
columns_below_threshold = []
for screened_variable, screened_iv in iv_dict_sorted.items():
    if screened_iv < 0.02:
        columns_below_threshold.append(screened_variable)

print(f"Number of columns with IV below 0.02: {len(columns_below_threshold)}")

# Keep everything else in a new frame; merged_working itself is left as is.
iv_above_threshold = merged_working.drop(columns=columns_below_threshold)

print(f"Number of columns left after dropping: {iv_above_threshold.shape[1]}")


# Dealing with Rows with too many Missing Values

In [None]:
# Shape before this step; its row count is the baseline for the dropped
# share printed below. It has to be defined here: no earlier cell assigns it.
og_shape = iv_above_threshold.shape

# drop rows with more than 35 missing values per row
# (thresh = minimum number of non-null values a row needs in order to be kept)
for_analysis = iv_above_threshold.dropna(thresh = iv_above_threshold.shape[1] - 35)

print(f'Ending row count: {for_analysis.shape[0]}')
print(f'Percent dropped: {(1 - for_analysis.shape[0] / og_shape[0]):.2%}')

In [None]:
# Share of missing values per column of merged_working, largest first.
pd.Series(merged_working.isnull().sum().sort_values(ascending=False)/merged_working.shape[0]).map("{0:.0%}".format)

In [None]:
# Fraction of missing values per column, largest first.
missing_data = merged_working.isnull().mean().sort_values(ascending=False)

# Columns missing in more than half of the rows; values are formatted
# percentage strings, the index holds the column names.
missing_data_over_50 = missing_data[missing_data > 0.5].apply(lambda x: "{:.0%}".format(x))
display(missing_data_over_50)

# Snapshot of the frame before the columns are removed.
merged_working_copy = merged_working.copy()

# Drop by column name (.index). Passing the Series itself would hand drop()
# the percentage strings, which are not column labels.
merged_working = merged_working.drop(columns=missing_data_over_50.index)
merged_working.head()


# Uni & Bivariate Analysis

In [None]:
# Columns that survived the IV and missing-value screening.
print(for_analysis.columns.tolist())

In [None]:
def print_corr_matrix(data, threshold):
    """Draw a heatmap of the strong pairwise correlations in ``data``.

    Only columns of dtype float64 or int64 are considered. Correlations are
    rounded to 3 decimals, and every cell whose absolute value does not
    exceed ``threshold`` is blanked out (NaN) before plotting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are correlated with each other.
    threshold : float
        Exclusive lower bound on the absolute correlation to display.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    numeric_part = data.select_dtypes(include=['float64', 'int64'])
    corr = numeric_part.corr().round(3)

    # Boolean-frame indexing keeps the shape and masks the weak cells.
    strong = corr[corr.abs() > threshold]

    fig, ax = plt.subplots(figsize=(18,8))
    sns.heatmap(strong, xticklabels=strong.columns, yticklabels=strong.columns, annot=True, ax=ax)
    ax.grid(color='gray', linestyle='--', linewidth=0.5)


### Start Dropping Columns
- numerical columns: dropped by correlation, business logic, and IV
- categorical columns: check value counts and IV, drop if col is potentially problematic

In [None]:
# Household / building columns, judged irrelevant on business grounds.
# Eight measurements each come in an _AVG, _MODE and _MEDI variant; two
# further _MODE columns complete the list.
household_measurements = [
    'APARTMENTS', 'BASEMENTAREA', 'YEARS_BEGINEXPLUATATION', 'ELEVATORS',
    'ENTRANCES', 'FLOORSMAX', 'LIVINGAREA', 'NONLIVINGAREA',
]
to_drop = [
    f'{measurement}_{variant}'
    for variant in ('AVG', 'MODE', 'MEDI')
    for measurement in household_measurements
] + ['TOTALAREA_MODE', 'WALLSMATERIAL_MODE']

# drop() returns a new frame, so for_analysis itself is left untouched.
cleaning = for_analysis.drop(columns=to_drop)

In [None]:
# Create a new variable to capture the share of closed credits among closed
# and active credits. Both inputs are read from `cleaning` itself, so the
# cell does not depend on another frame staying row- and column-compatible.
# Applicants with neither closed nor active credits get NaN (0 / 0).
num_closed = cleaning['NUM_CLOSED_CREDITS']
num_active = cleaning['NUM_ACTIVE_CREDITS']
cleaning['PCT_CLOSED_CREDITS'] = num_closed / (num_closed + num_active)

# drop the 2 original columns
to_drop = [
    'NUM_CLOSED_CREDITS', 'NUM_ACTIVE_CREDITS'
]
cleaning = cleaning.drop(columns=to_drop)

In [None]:
# find high correlations: heatmap of pairs with |correlation| above 0.65
print_corr_matrix(cleaning, 0.65)

In [None]:
# AMT_CREDIT, AMT_ANNUITY and AMT_GOODS_PRICE are highly correlated.
to_drop = ['AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']

# Number of nulls in each of the three ...
temp = cleaning[to_drop]
print(temp.isna().sum())

# ... and their IVs, in the same order.
for amount_column in to_drop:
    print(round(iv_dict[amount_column],4))

# Decision: discard rows without AMT_GOODS_PRICE, express the credit amount
# relative to the goods price as a new variable, then remove the three
# original columns.
cleaning = cleaning.dropna(subset=['AMT_GOODS_PRICE'])
credit_to_goods_price = cleaning['AMT_CREDIT'] / cleaning['AMT_GOODS_PRICE']
cleaning['PCT_AMT_CREDIT_TO_GOODSPRICE'] = credit_to_goods_price
cleaning = cleaning.drop(columns=to_drop)

In [None]:
# Variables that showed up with high correlations; related ones share a line.
temp = [
    'FLAG_EMP_PHONE', 'DAYS_EMPLOYED',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'LIVE_CITY_NOT_WORK_CITY', 'REG_CITY_NOT_WORK_CITY',
    'DAYS_CREDIT_MIN', 'DAYS_CREDIT_MEAN', 'DAYS_CREDIT_MAX', 'DAYS_CREDIT_UPDATE_MEAN',
]

# Null count per variable, then each variable's IV.
null_counts = cleaning[temp].isna().sum()
print(null_counts)
for col in temp:
    rounded_iv = round(iv_dict[col],4)
    print(f"{col}: {rounded_iv}")

# Columns dropped after reviewing the printed null counts and IVs.
to_drop = [
    'FLAG_EMP_PHONE',
    'REGION_RATING_CLIENT',
    'LIVE_CITY_NOT_WORK_CITY',
    'DAYS_CREDIT_MIN',
    'DAYS_CREDIT_MAX',
    'DAYS_CREDIT_UPDATE_MEAN',
]
cleaning = cleaning.drop(columns=to_drop)

In [None]:
# drop some potentially biased / irrelevant categorical columns
    # the IVs of these columns were also checked (values noted per column)
to_drop = [
    'NAME_EDUCATION_TYPE', # may be biased, IV = 0.0403
    'NAME_FAMILY_STATUS', # may be biased, IV = 0.0227
    'HOUSETYPE_MODE', # may be biased, IV = 0.0232
    'EMERGENCYSTATE_MODE' # housing related, may be irrelevant, IV = 0.0252
]
cleaning = cleaning.drop(columns=to_drop)

In [None]:
# Remaining columns and how many there are.
print(cleaning.columns.tolist())
print(len(cleaning.columns))

### Look at Categorical Columns
- bin them using scorecard py 
- get the bins and map the values
    - to one hot encode after

In [None]:
# Object-typed (categorical) columns, their count, and the count of
# float64 / int64 columns.
print(cleaning.select_dtypes(include='object').columns.tolist())
print(len(cleaning.select_dtypes(include='object').columns))
print(len(cleaning.select_dtypes(include=['float64', 'int64']).columns))

# Working copy in which the categorical values are re-binned below.
temp = cleaning.copy()

In [None]:
# Bin the three remaining categorical variables against TARGET.
categorical_columns = ['NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE']
bins = sc.woebin(temp[['TARGET'] + categorical_columns], y='TARGET')

# Flip the sign of the WOE column on every bin table.
for bindetails in bins.values():
    bindetails['woe'] = -bindetails['woe']

##### Calculate WOE and IV manually

In [None]:
def woe_iv(data, variable_name, target='TARGET'):
    """Compute per-category WOE and the total IV of a categorical variable.

    Missing values of ``variable_name`` are grouped into a bin labelled
    'Missing'. For every bin the function counts rows, events (sum of the
    target column) and non-events, then derives

        WOE = ln(%_Non_Events / %_Events)
        IV  = (%_Non_Events - %_Events) * WOE

    A bin with no events or no non-events yields an infinite WOE / IV.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``variable_name`` and the target column. It is not
        modified.
    variable_name : str
        Name of the categorical column to evaluate.
    target : str, default 'TARGET'
        Name of the target column; its per-bin sum is used as the number of
        events, so it is expected to hold 0/1 values.

    Returns
    -------
    tuple
        ``(IV, df)`` where ``IV`` is the sum of the per-bin IV values and
        ``df`` has one row per bin (in order of first appearance) with the
        columns Bins, Count, Events, Non_Events, %_Events, %_Non_Events,
        WOE and IV.
    """
    working_data = data.copy()

    # Handle missing values. The filled column is assigned back explicitly:
    # fillna(inplace=True) on an extracted column is chained assignment and
    # may leave the NaNs in place, which would produce a NaN bin that has no
    # count or event total.
    working_data[variable_name] = working_data[variable_name].fillna('Missing')

    count_per_bin = working_data[variable_name].value_counts()
    events_per_bin = working_data.groupby(variable_name)[target].sum()

    # Create a new dataframe for the calculations
    df = pd.DataFrame()
    df['Bins'] = working_data[variable_name].unique()
    df['Count'] = df['Bins'].map(count_per_bin)
    df['Events'] = df['Bins'].map(events_per_bin)
    df['Non_Events'] = df['Count'] - df['Events']
    df['%_Events'] = df['Events'] / df['Events'].sum()
    df['%_Non_Events'] = df['Non_Events'] / df['Non_Events'].sum()
    df['WOE'] = np.log(df['%_Non_Events'] / df['%_Events'])
    df['IV'] = (df['%_Non_Events'] - df['%_Events']) * df['WOE']

    # skipna=False: an undefined bin must surface as NaN, not be ignored.
    IV = df['IV'].sum(skipna=False)

    return IV, df

##### Handle NAME_INCOME_TYPE variable

In [None]:
# Total IV and bin table produced by scorecardpy for NAME_INCOME_TYPE.
print('IV: ' + str(round(bins['NAME_INCOME_TYPE']['total_iv'][0], 4)))
display(bins['NAME_INCOME_TYPE'])

# print the bins
print(bins['NAME_INCOME_TYPE']['bin'].tolist())

# Per-category WOE / IV from the manual calculation, smallest IV first.
iv, df = woe_iv(temp, 'NAME_INCOME_TYPE')
df.sort_values(by='IV')

In [None]:
# The automatic breaks do not make sense for NAME_INCOME_TYPE, so the bins
# are given by hand; inside one bin the categories are joined by '%,%'.
breaks = {
    'NAME_INCOME_TYPE': [
        'Working%,%Maternity leave%,%Businessman%,%Student',
        'State servant%,%Unemployed%,%Commercial associate',
        'Pensioner',
    ]
}
bins2 = sc.woebin(temp[['TARGET', 'NAME_INCOME_TYPE']], y='TARGET', breaks_list=breaks)

# Flip the WOE sign and show each resulting bin table.
for bindetails in bins2.values():
    bindetails['woe'] = -bindetails['woe']
    display(bindetails)

In [None]:
# Custom bins for NAME_INCOME_TYPE: bin label -> raw categories it absorbs.
# 'Pensioner' is not remapped and keeps its own name.
income_type_bins = {
    'Working_Businessman_Maternity leave_Student': [
        'Businessman', 'Maternity leave', 'Student', 'Working',
    ],
    'State servant_Unemployed_Commercial associate': [
        'Commercial associate', 'State servant', 'Unemployed',
    ],
}
mapping = {
    raw_value: bin_label
    for bin_label, raw_values in income_type_bins.items()
    for raw_value in raw_values
}

# IVs of the individual bins, saved for later. The figures are hard-coded,
# so they need updating by hand whenever the upstream data or bins change.
iv_dict_categorical = {
    'NAME_INCOME_TYPE_Working_Businessman_Maternity leave_Student': 0.024634,
    'NAME_INCOME_TYPE_State servant_Unemployed_Commercial associate': 0.003169,
    'NAME_INCOME_TYPE_Pensioner': 0.039381,
}

# Write the binned labels into the working frame.
temp['NAME_INCOME_TYPE'] = temp['NAME_INCOME_TYPE'].replace(mapping)

##### Handle OCCUPATION_TYPE variable

In [None]:
# Total IV and bin table produced by scorecardpy for OCCUPATION_TYPE.
print('IV: ' + str(round(bins['OCCUPATION_TYPE']['total_iv'][0], 4)))
display(bins['OCCUPATION_TYPE'])

# print the bins
print(bins['OCCUPATION_TYPE']['bin'].tolist())

In [None]:
# Label missing occupations explicitly so they become their own dummy column
# later. The filled column is assigned back onto `temp`: fillna(inplace=True)
# on an extracted column is chained assignment and is not guaranteed to
# update the frame (it never does under copy-on-write).
temp['OCCUPATION_TYPE'] = temp['OCCUPATION_TYPE'].fillna('missing')

# Bin label -> occupations it absorbs ('missing' stays as its own value).
occupation_bins = {
    'bin1': [
        'Accountants', 'Core staff', 'Managers', 'IT staff',
        'High skill tech staff', 'HR staff', 'Private service staff',
        'Medicine staff', 'Secretaries', 'Realty agents',
    ],
    'bin2': ['Cleaning staff', 'Sales staff', 'Cooking staff'],
    'bin3': [
        'Security staff', 'Laborers', 'Waiters/barmen staff', 'Drivers',
        'Low-skill Laborers',
    ],
}
mapping = {
    occupation: bin_label
    for bin_label, occupations in occupation_bins.items()
    for occupation in occupations
}
temp['OCCUPATION_TYPE'] = temp['OCCUPATION_TYPE'].replace(mapping)

# save the IVs (hard-coded; update by hand if the upstream data or bins change)
iv_dict_categorical['OCCUPATION_TYPE_bin1'] = 0.014435
iv_dict_categorical['OCCUPATION_TYPE_bin2'] = 0.006753
iv_dict_categorical['OCCUPATION_TYPE_bin3'] = 0.045982
iv_dict_categorical['OCCUPATION_TYPE_missing'] = 0.023020

##### Handle ORGANIZATION_TYPE variable

In [None]:
# Total IV and bin table produced by scorecardpy for ORGANIZATION_TYPE.
print('IV: ' + str(round(bins['ORGANIZATION_TYPE']['total_iv'][0], 4)))
display(bins['ORGANIZATION_TYPE'])

# print the bins
print(bins['ORGANIZATION_TYPE']['bin'].tolist())

In [None]:
# Bin label -> (already grouped) organization types it absorbs.
organization_bins = {
    'bin1': [
        'University', 'Culture', 'Security Ministries', 'XNA',
        'Financial Services', 'School',
    ],
    'bin2': [
        'Services', 'Public Sector', 'Medicine', 'Religion', 'Emergency',
        'Kindergarten',
    ],
    'bin3': [
        'Other', 'Electricity', 'Telecom', 'Housing', 'Postal', 'Cleaning',
        'Trade', 'Industry', 'Legal Services', 'Business Entity', 'Mobile',
        'Advertising', 'Transport', 'Self-employed', 'Hospitality',
        'Agriculture', 'Security', 'Construction', 'Realtor',
    ],
}
mapping = {
    organization: bin_label
    for bin_label, organizations in organization_bins.items()
    for organization in organizations
}
temp['ORGANIZATION_TYPE'] = temp['ORGANIZATION_TYPE'].replace(mapping)

# save the IVs (hard-coded; update by hand if the upstream data or bins change)
iv_dict_categorical.update({
    'ORGANIZATION_TYPE_bin1': 0.047089,
    'ORGANIZATION_TYPE_bin2': 0.003997,
    'ORGANIZATION_TYPE_bin3': 0.024165,
})

##### Dummy encoding and then calculate correlation

In [None]:
# Dummy-encode every object-typed column; all levels are kept
# (drop_first=False) and the dummies are stored as integers.
cats = temp.select_dtypes(include='object').columns.tolist()
encoded = pd.get_dummies(temp, columns=cats, drop_first=False, dtype=int)
encoded.head()

In [None]:
# drop those with low IV, below 0.02
# (the keys of iv_dict_categorical are expected to match the dummy column
# names created above; a key without a matching column makes drop() raise)
print(iv_dict_categorical)
to_drop = [key for key, value in iv_dict_categorical.items() if value < 0.02]
encoded = encoded.drop(columns=to_drop)

In [None]:
# check correlation: heatmap of pairs with |correlation| above 0.65
print_corr_matrix(encoded, 0.65)

In [None]:
# check those with high correlation: print the IV of each candidate
print(iv_dict['DAYS_EMPLOYED'])
print(iv_dict_categorical['NAME_INCOME_TYPE_Pensioner'])
print(iv_dict_categorical['OCCUPATION_TYPE_missing'])
print(iv_dict_categorical['ORGANIZATION_TYPE_bin1'])
print(iv_dict_categorical['ORGANIZATION_TYPE_bin3'])

# based on this info, let's drop these columns (DAYS_EMPLOYED is kept)
to_drop = ['NAME_INCOME_TYPE_Pensioner', 'OCCUPATION_TYPE_missing', 'ORGANIZATION_TYPE_bin1', 'ORGANIZATION_TYPE_bin3']
encoded = encoded.drop(columns=to_drop)

# Drop more columns
- Used for scorecard tuning: the list of columns dropped in this section can be adjusted freely to experiment with different scorecards.

In [None]:
# Preview of the encoded frame before further columns are dropped.
encoded.head()

In [None]:
# Re-bin every remaining variable of the encoded frame against TARGET.
bins = sc.woebin(encoded, y='TARGET')

# Rebuild the IV lookup from scratch: variable name -> total IV (4 decimals).
iv_dict = {}

for variable, bindetails in bins.items():
    # Flip the sign of the WOE column on the stored table.
    bindetails['woe'] = -bindetails['woe']
    total_iv = bindetails['total_iv'][0]
    iv_dict[variable] = round(total_iv, 4)

In [None]:
# Order the IV lookup from the highest IV to the lowest.
iv_dict = dict(sorted(iv_dict.items(), key=lambda item: item[1], reverse=True))
print(f'Total number of rows: {len(encoded)}')

# For every scored variable: its IV plus the count and share of missing values.
for key, value in iv_dict.items():
    n_missing = encoded[key].isna().sum()
    print(f'{key}: IV = {value}')
    print(f'\tNumber of missing values: {n_missing}')
    print(f'\tPercentage of missing values: {round(n_missing / len(encoded),3)}')

print(encoded.columns.tolist())

In [None]:
# Heatmap of pairs with |correlation| above 0.5 (a looser cut-off than before).
print_corr_matrix(encoded, 0.5)

In [None]:
# Columns removed for scorecard tuning; the reason is noted next to each group.
to_drop = [
    'FLAG_DOCUMENT_12', 'NUM_CREDIT_CARD_LOANS', 'REG_CITY_NOT_LIVE_CITY', # iv below 0.04
    'FLAG_DOCUMENT_3', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'REG_CITY_NOT_WORK_CITY', # iv below 0.04
    'DAYS_BIRTH', # potentially biased + relatively high correlation
    'EXT_SOURCE_1', # many missing values (more than half the dataset)
]

encoded = encoded.drop(columns=to_drop)

In [None]:
# Display the final modelling frame (the notebook truncates long output).
encoded

# Bin WOEs to achieve monotonicity
- source for Binning function: https://github.com/jstephenj14/Monotonic-WOE-Binning-Algorithm/blob/master/monotonic_binning/monotonic_woe_binning.py
    - derives the bins where monotonicity is achieved
- implementation: https://lelesgaray.github.io/blog/scorecard/

In [None]:
# !pip install monotonic-binning

In [None]:
from monotonic_binning.monotonic_woe_binning import Binning 

# split data into test and train (70% train, fixed seed for reproducibility)
train, test = sc.split_df(encoded, 'TARGET', ratio=0.7, seed=42).values()

# Predictor names (every column except TARGET) and the training target.
# NOTE(review): y_var is not referenced anywhere else in the visible cells.
x_vars = train.drop(['TARGET'], axis=1).columns
y_var = train['TARGET']

def get_breaks_for_monotonicity(x_vars, y_var_name, data=None):
    """Derive monotonic-WOE break points for each predictor.

    One ``Binning`` object is fitted per predictor on the two-column frame
    ``data[[y_var_name, var]]``. The fitted object's ``bins`` are stored
    without their first and last element (presumably the outer edges of the
    binning - confirm against the monotonic_binning documentation).

    Parameters
    ----------
    x_vars : iterable of str
        Names of the predictor columns to bin.
    y_var_name : str
        Name of the target column.
    data : pandas.DataFrame, optional
        Frame to fit on. Defaults to the notebook-level ``train`` split,
        which is what this function always used before the parameter existed.

    Returns
    -------
    dict
        Maps each predictor name to its list of break points.
    """
    if data is None:
        # Backward-compatible default: the global training split.
        data = train

    bin_object = Binning(y_var_name, n_threshold=50, y_threshold=10, p_threshold=0.35, sign=False)
    breaks = {}
    for var in x_vars:
        bin_object.fit(data[[y_var_name, var]])
        breaks[var] = (bin_object.bins[1:-1].tolist())
        print(f'check: {var} fitting done')
    return breaks

# Break points per predictor, computed against the TARGET column.
new_breaks = get_breaks_for_monotonicity(x_vars, 'TARGET')

In [None]:
# Bin with the monotonic break points and plot the WOE per bin.
# NOTE(review): the bins are fitted on `encoded`, the frame that was split
# into train and test above, so the test rows also contribute to the WOE
# values - confirm this is intended.
bins = sc.woebin(encoded, y='TARGET', breaks_list=new_breaks, positive='bad|0') # change positive to reverse WOE sign
sc.woebin_plot(bins)

# Generate Scorecard

In [None]:
# import stuff
from sklearn import linear_model, metrics
from sklearn.linear_model import LogisticRegression
import pprint

In [None]:
# encode WOE values: replace the raw values of train and test with the WOE
# of the bin they fall into; warnings are silenced only inside this block
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    train_woe = sc.woebin_ply(train, bins)
    test_woe = sc.woebin_ply(test, bins)

In [None]:
# create the X, y parts of data for train and test
y_train = train_woe.loc[:, 'TARGET']
X_train = train_woe.loc[:, train_woe.columns != 'TARGET']
y_test = test_woe.loc[:, 'TARGET']
# Select the test predictors by the training column names. Reusing the
# boolean mask built from train_woe.columns on test_woe is only correct when
# both frames happen to have identical column order; selecting by name also
# guarantees the feature order the model is fitted with.
X_test = test_woe.loc[:, X_train.columns]

# create and fit model (class_weight='balanced' reweights the two classes
# inversely to their frequency)
lr = linear_model.LogisticRegression(class_weight='balanced')
lr.fit(X_train, y_train)

# find coefficients: intercept first, then one coefficient per feature
coeff = pd.Series(np.concatenate([lr.intercept_, lr.coef_[0]]),index = np.concatenate([['intercept'], lr.feature_names_in_]))
coeff

In [None]:
# Build the scorecard from the bins and the fitted model, using
# points0=600, odds0=1/20 and pdo=20 as scaling parameters; warnings are
# silenced only inside this block.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    card = sc.scorecard(bins, lr, X_train.columns, points0=600, odds0=1/20, pdo=20, basepoints_eq0=True)

pprint.pprint(card)

In [None]:
# print evaluation metrics of the model on the test split
# (lr.predict returns class labels, not probabilities)
y_pred = lr.predict(X_test)

print('Confusion matrix:')
print(metrics.confusion_matrix(y_test, y_pred))
print('PCC measures:')
print(metrics.classification_report(y_test, y_pred))

In [None]:
# performance roc: ROC evaluation on train and test, using the predicted
# probability of class 1 (second column of predict_proba)
train_pred = lr.predict_proba(X_train)[:, 1]
test_pred = lr.predict_proba(X_test)[:, 1]
train_perf = sc.perf_eva(y_train, train_pred, plot_type = ['roc'], title = 'train')
test_perf = sc.perf_eva(y_test, test_pred, plot_type = ['roc'], title = 'test')