## Read Data

In [15]:
import pandas as pd
fptrain = "../../../data/sba_7a_loans_train.parquet"
fptest = "../../../data/sba_7a_loans_test.parquet"
df_train = pd.read_parquet(fptrain)
df_test = pd.read_parquet(fptest)
df = pd.concat([df_train, df_test])

## Do the required preprocessing
Note: This step is **critical**

In [16]:
df["NaicsCode"] = df["NaicsCode"].apply(lambda x: x.replace(".0", ""))
df = df.drop(["BorrName"], axis=1)
dtypes_toset = {"BorrZip": 'str', "BankZip": "str", "BankFDICNumber": 'str',\
                "NaicsCode": 'str', "FranchiseCode": 'str', \
                "BusinessAge" : 'str', "LoanStatus": 'str'}

## Zip Code example

In [17]:
df["BorrZip"] = df["BorrZip"].apply(lambda x : str(x)[:3])
df["BankZip"] = df["BankZip"].apply(lambda x : str(x)[:3])

In [18]:

from optbinning import OptimalBinning

In [19]:
df

Unnamed: 0,BankFDICNumber,BankZip,BorrZip,NaicsCode,FranchiseCode,BusinessAge,LoanStatus,SBAGuaranteedApproval
0,Not Applicable,871,145,484121,Not Applicable,Change of Ownership,PIF,3525000.0
1,58665,284,654,531130,Not Applicable,Change of Ownership,PIF,654750.0
2,6560,432,774,449121,S0659,"Startup, Loan Funds will Open Business",PIF,150000.0
3,4767,802,852,423420,Not Applicable,Existing or more than 2 years old,PIF,3052500.0
4,33555,338,230,624410,Not Applicable,"Startup, Loan Funds will Open Business",PIF,187500.0
...,...,...,...,...,...,...,...,...
4588,4255,484,410,454110,Not Applicable,Existing or more than 2 years old,PIF,79600.0
4589,17308,968,968,812112,Not Applicable,"Startup, Loan Funds will Open Business",CHGOFF,15000.0
4590,6560,432,483,424990,Not Applicable,Existing or more than 2 years old,PIF,717750.0
4591,5304,542,530,312140,Not Applicable,Unanswered,PIF,175000.0


In [20]:
df["LoanStatus"] = df["LoanStatus"].apply(lambda x: 1.0 if x == "CHGOFF" else 0.0)

In [21]:
variable = "BorrZip"
x = df[variable].values
y = df["LoanStatus"].values

In [22]:

optb = OptimalBinning(name=variable, dtype="categorical", solver="cp")

In [23]:
optb.fit(x, y)

In [24]:
optb.status

'OPTIMAL'

## Borr Zip Code Splits (Optimal)

In [25]:
optb.splits

[array(['540', '580', '583', '584', '585', '586', '587', '575', '588',
        '591', '592', '593', '594', '595', '596', '590', '597', '573',
        '571', '548', '549', '550', '556', '557', '558', '572', '559',
        '561', '562', '563', '564', '566', '567', '560', '547', '598',
        '602', '632', '634', '636', '637', '638', '639', '629', '642',
        '644', '649', '651', '653', '654', '655', '643', '599', '628',
        '626', '607', '608', '609', '610', '611', '612', '627', '613',
        '617', '618', '619', '620', '623', '624', '616', '546', '545',
        '543', '455', '457', '464', '465', '466', '467', '451', '468',
        '470', '471', '472', '473', '474', '475', '469', '476', '449',
        '447', '409', '411', '414', '415', '421', '422', '448', '423',
        '425', '426', '427', '429', '434', '437', '424', '477', '478',
        '484', '512', '515', '516', '523', '524', '526', '508', '527',
        '534', '535', '537', '538', '997', '542', '528', '507', '506',
      

In [26]:
binning_table = optb.binning_table

In [27]:
binning_table.build()

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"[540, 580, 583, 584, 585, 586, 587, 575, 588, ...",5873,0.255782,5872,1,0.00017,5.643737,1.507405,0.090073
1,"[605, 841, 460, 410, 981, 461, 394, 402, 950, ...",1155,0.050303,1139,16,0.013853,1.231103,0.04532,0.005332
2,"[801, 660, 488, 727, 972, 971, 554, 917, 450, ...",1429,0.062236,1399,30,0.020994,0.808102,0.028605,0.003481
3,"[453, 320, 847, 577, 463, 921, 530, 553, 853, ...",1155,0.050303,1124,31,0.02684,0.556448,0.012184,0.001504
4,"[551, 834, 601, 640, 305, 980, 296, 846, 804, ...",1403,0.061104,1358,45,0.032074,0.372892,0.007195,0.000894
5,"[531, 630, 914, 752, 840, 919, 951, 307, 194, ...",2133,0.092897,2048,85,0.03985,0.147754,0.001897,0.000237
6,"[906, 370, 787, 161, 310, 500, 232, 974, 326, ...",1537,0.06694,1464,73,0.047495,-0.035746,8.7e-05,1.1e-05
7,"[327, 770, 215, 201, 302, 781, 648, 989, 894, ...",1194,0.052001,1127,67,0.056114,-0.211592,0.002565,0.00032
8,"[300, 993, 323, 895, 604, 502, 741, 481, 957, ...",1274,0.055485,1196,78,0.061224,-0.304185,0.005904,0.000735
9,"[213, 223, 233, 391, 303, 381, 803, 922, 337, ...",1947,0.084796,1806,141,0.072419,-0.484104,0.024852,0.003077
