# System Specifications Information

In [1]:
# Check Colab Pro / Local instance specs
# !df -h
# !cat /proc/cpuinfo
# !cat /proc/meminfo

# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

# Notebook Setup

In [2]:
import os
import re
import gc
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Connecting Google Drive as primary storage for data

In [3]:
# Mount Google Drive and confirm access permissions to permitaccess for data located on Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Specify project directory personal filepaths under Google Drive

user_proj_path = ''

proj_dir = '/content/drive/MyDrive/' + user_proj_path + 'plodi/'
proj_dir_data_raw = proj_dir + 'data/raw/'
proj_dir_data_proc = proj_dir + 'data/processed/'

dirs = [proj_dir, proj_dir_data_raw, proj_dir_data_proc]

for dir in dirs:
    print('-----------------------------------------------------------------------')
    print('Directory contents for ', dir)
    %ls {dir}

-----------------------------------------------------------------------
Directory contents for  /content/drive/MyDrive/plodi/
[0m[01;34mdata[0m/
-----------------------------------------------------------------------
Directory contents for  /content/drive/MyDrive/plodi/data/raw/
2-6%20digit_2017_Codes.xlsx      public_up_to_150k_11_230630.csv  public_up_to_150k_6_230630.csv
2-6%20digit_2022_Codes.xlsx      public_up_to_150k_12_230630.csv  public_up_to_150k_7_230630.csv
6-digit_2017_Codes.xlsx          public_up_to_150k_1_230630.csv   public_up_to_150k_8_230630.csv
6-digit_2022_Codes.xlsx          public_up_to_150k_2_230630.csv   public_up_to_150k_9_230630.csv
ppp-data-dictionary.xlsx         public_up_to_150k_3_230630.csv   sba_ppp_combined.csv
public_150k_plus_230630.csv      public_up_to_150k_4_230630.csv
public_up_to_150k_10_230630.csv  public_up_to_150k_5_230630.csv
-----------------------------------------------------------------------
Directory contents for  /content/drive/MyD

In [13]:
# drive.mount("/content/drive", force_remount=True) # Run if the Google drive needs to be remounted

Mounted at /content/drive


In [12]:
# Run after completing Notebook run to gracefully disconnect Gdrive
drive.flush_and_unmount()

In [None]:
#pd.set_option('display.max_columns', None)
#df_ppp_data.head(10)

# Downloading primary data sets

In [None]:
sba_ppp_dict_url = 'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/aab8e9f9-36d1-42e1-b3ba-e59c79f1d7f0/download/ppp-data-dictionary.xlsx'

# Dynamic time-limited URLs for loans requiring SBA.gov queries for project specified date ranges
sba_ppp_url_list = [
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/4b3c3e7a-1286-4883-b857-d37058f9693c/download/public_150k_plus_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/c95195f6-0af6-4b84-8c65-e7cd6b940cc2/download/public_up_to_150k_1_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/eaa51a51-ef19-4c22-affe-61ede7253c6f/download/public_up_to_150k_2_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/eaa51a51-ef19-4c22-affe-61ede7253c6f/download/public_up_to_150k_3_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/b4ec101e-ad78-4a25-a058-ab03b049766b/download/public_up_to_150k_4_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/ea284b67-d0b7-4e65-bc48-663e9bb6dac1/download/public_up_to_150k_5_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/ef56afe8-08f8-4bfa-8a58-29690f5baae0/download/public_up_to_150k_6_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/81f1e2be-28a2-4854-bfe1-1e0d408f9fd0/download/public_up_to_150k_7_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/6ada73a2-8176-4e71-8689-30490d9f8a2f/download/public_up_to_150k_8_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/985f0c28-e799-4940-94a9-96a7c5c604a6/download/public_up_to_150k_9_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/3487edaa-92b3-47f4-b147-06fd6d79f786/download/public_up_to_150k_10_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/3487edaa-92b3-47f4-b147-06fd6d79f786/download/public_up_to_150k_11_230630.csv',
    'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/3487edaa-92b3-47f4-b147-06fd6d79f786/download/public_up_to_150k_12_230630.csv',]


# Setting NAICS data URLs

naics_2017_2to6_digit_url = 'https://www.census.gov/naics/2017NAICS/2-6%20digit_2017_Codes.xlsx'
naics_2017_6_digit_url = 'https://www.census.gov/naics/2017NAICS/6-digit_2017_Codes.xlsx'
naics_2022_2to6_digit_url = 'https://www.census.gov/naics/2022NAICS/2-6%20digit_2022_Codes.xlsx'
naics_2022_6_digit_url = 'https://www.census.gov/naics/2022NAICS/6-digit_2022_Codes.xlsx'

%cd $proj_dir_data_raw # Set working directory for source data download

# Run first time to download SBA PPP data to Google Drive

!curl -O $sba_ppp_dict_url
for url in sba_ppp_url_list:
  !curl -O {url}

# Run first time to download NAICS codes to Google Drive

!curl -O {naics_2017_2to6_digit_url}
!curl -O {naics_2017_6_digit_url}
!curl -O {naics_2022_2to6_digit_url}
!curl -O {naics_2022_6_digit_url}

In [None]:
!ls

2-6%20digit_2017_Codes.xlsx  public_up_to_150k_10_230630.csv  public_up_to_150k_4_230630.csv
2-6%20digit_2022_Codes.xlsx  public_up_to_150k_11_230630.csv  public_up_to_150k_5_230630.csv
6-digit_2017_Codes.xlsx      public_up_to_150k_12_230630.csv  public_up_to_150k_6_230630.csv
6-digit_2022_Codes.xlsx      public_up_to_150k_1_230630.csv   public_up_to_150k_7_230630.csv
ppp-data-dictionary.xlsx     public_up_to_150k_2_230630.csv   public_up_to_150k_8_230630.csv
public_150k_plus_230630.csv  public_up_to_150k_3_230630.csv   public_up_to_150k_9_230630.csv


In [None]:
# Load individual SBA PPP files and export as one combined file
ppp_files_li = [filename for filename in os.listdir(proj_dir_data_raw) if filename.startswith('public_')]
df_ppp_data = pd.concat([pd.read_csv(proj_dir_data_raw+filename, header=0) for filename in ppp_files_li], ignore_index=True)

filename = 'sba_ppp_combined.csv'

# Only run once if need to combine individual SBA PPP files into one
df_ppp_data.to_csv(proj_dir_data_raw+filename, index=False)

  df_ppp_data = pd.concat([pd.read_csv(proj_dir_data_raw+filename, header=0) for filename in ppp_files_li], ignore_index=True)


In [None]:
!ls

2-6%20digit_2017_Codes.xlsx	 public_up_to_150k_11_230630.csv  public_up_to_150k_6_230630.csv
2-6%20digit_2022_Codes.xlsx	 public_up_to_150k_12_230630.csv  public_up_to_150k_7_230630.csv
6-digit_2017_Codes.xlsx		 public_up_to_150k_1_230630.csv   public_up_to_150k_8_230630.csv
6-digit_2022_Codes.xlsx		 public_up_to_150k_2_230630.csv   public_up_to_150k_9_230630.csv
ppp-data-dictionary.xlsx	 public_up_to_150k_3_230630.csv   sba_ppp_combined.csv
public_150k_plus_230630.csv	 public_up_to_150k_4_230630.csv
public_up_to_150k_10_230630.csv  public_up_to_150k_5_230630.csv


In [None]:
# Only run once if need to combine individual SBA PPP files into one
df_ppp_data.to_csv(proj_dir_data_raw+filename, index=False)

In [30]:
# Load SBA PPP Data Dictionary into data frame
filename = 'ppp-data-dictionary.xlsx'
df_ppp_dict = pd.read_excel(proj_dir_data_raw+filename, header=0)

In [34]:
print(df_ppp_dict.shape, '\n')
display(df_ppp_dict)

(53, 2) 



Unnamed: 0,Field Name,Field Description
0,LoanNumber,Loan Number (unique identifier)
1,DateApproved,Loan Funded Date
2,SBAOfficeCode,SBA Origination Office Code
3,ProcessingMethod,Loan Delivery Method (PPP for first draw; PPS ...
4,BorrowerName,Borrower Name
5,BorrowerAddress,Borrower Street Address
6,BorrowerCity,Borrower City
7,BorrowerState,Borrower State
8,BorrowerZip,Borrower Zip Code
9,LoanStatusDate,Loan Status Date\n- Loan Status Date is blank...


In [36]:
filename = 'sba_ppp_combined.csv'

# Load SBA PPP files into data frame from combined file
df_ppp_data = pd.read_csv(proj_dir_data_raw+filename, header=0)

  df_ppp_data = pd.read_csv(proj_dir_data_raw+filename, header=0)


In [37]:
df_ppp_data.describe

<bound method NDFrame.describe of           LoanNumber DateApproved  SBAOfficeCode ProcessingMethod  \
0         9595657403   05/20/2020         1086.0              PPP   
1         9608808102   07/28/2020         1086.0              PPP   
2         9611377305   05/02/2020         1086.0              PPP   
3         9620717204   04/28/2020         1086.0              PPP   
4         9645637309   05/02/2020         1086.0              PPP   
...              ...          ...            ...              ...   
11765547  4395967002   04/03/2020          897.0              PPP   
11765548  6985647108   04/14/2020          897.0              PPP   
11765549  7996438405   02/12/2021          897.0              PPS   
11765550  9054647103   04/15/2020          897.0              PPP   
11765551  9184687004   04/09/2020          897.0              PPP   

                                               BorrowerName  \
0                                 ALIVE & WELL HEALING ARTS   
1          

In [42]:
print('\n\nLoan Data # of NA by Feature \n', df_ppp_data.isnull().sum(axis = 0))



Loan Data # of NA by Feature 
 LoanNumber                            0
DateApproved                          0
SBAOfficeCode                        28
ProcessingMethod                      0
BorrowerName                         51
BorrowerAddress                     200
BorrowerCity                        185
BorrowerState                       165
BorrowerZip                         177
LoanStatusDate                   437122
LoanStatus                            0
Term                                  0
SBAGuarantyPercentage                 0
InitialApprovalAmount                 0
CurrentApprovalAmount                 0
UndisbursedAmount                  1062
FranchiseName                  11608197
ServicingLenderLocationID            28
ServicingLenderName                  28
ServicingLenderAddress               28
ServicingLenderCity                  28
ServicingLenderState                 28
ServicingLenderZip                   28
RuralUrbanIndicator                   0
Hubzone

In [43]:
df_ppp_data.nunique()

LoanNumber                     9065552
DateApproved                       246
SBAOfficeCode                       76
ProcessingMethod                     2
BorrowerName                   7054608
BorrowerAddress                7401361
BorrowerCity                     66601
BorrowerState                       58
BorrowerZip                    4025231
LoanStatusDate                    1005
LoanStatus                           3
Term                               136
SBAGuarantyPercentage                1
InitialApprovalAmount          1013948
CurrentApprovalAmount          1021708
UndisbursedAmount                   41
FranchiseName                     3467
ServicingLenderLocationID         5121
ServicingLenderName               4594
ServicingLenderAddress            4858
ServicingLenderCity               3040
ServicingLenderState                55
ServicingLenderZip                5079
RuralUrbanIndicator                  2
HubzoneIndicator                     2
LMIIndicator             

# Load Final Processed Data Set

In [None]:
# Set of cols to use on working modeling data file

usecols = ['LoanNumber',
 'Term',
 'InitialApprovalAmount',
 'CurrentApprovalAmount',
 'UndisbursedAmount',
 'JobsReported',
 'NAICSCode',
 'UTILITIES_PROCEED',
 'PAYROLL_PROCEED',
 'MORTGAGE_INTEREST_PROCEED',
 'RENT_PROCEED',
 'REFINANCE_EIDL_PROCEED',
 'HEALTH_CARE_PROCEED',
 'DEBT_INTEREST_PROCEED',
 'ForgivenessAmount',
 'ProcessingMethod_cat',
 'LoanStatus_cat',
 'RuralUrbanIndicator_cat',
 'HubzoneIndicator_cat',
 'LMIIndicator_cat',
 'Race_cat',
 'Ethnicity_cat',
 'BusinessType_cat',
 'Gender_cat',
 'Veteran_cat',
 'NonProfit_cat',
 'Verified_Address',
 'avg_employee_pay',
 'pay_ratio',
 'pay_ratio_binary',
 'pay_100k',
#  'Fraud',
 'Labelled_Loan_Binary']

dtype_dict = {'LoanNumber': np.uint,
 'Term': np.intc,
 'InitialApprovalAmount': np.single,
 'CurrentApprovalAmount': np.single,
 'UndisbursedAmount': np.single,
 'JobsReported': np.intc,
 'NAICSCode': np.intc,
 'UTILITIES_PROCEED': np.single,
 'PAYROLL_PROCEED': np.single,
 'MORTGAGE_INTEREST_PROCEED': np.single,
 'RENT_PROCEED': np.single,
 'REFINANCE_EIDL_PROCEED': np.single,
 'HEALTH_CARE_PROCEED': np.single,
 'DEBT_INTEREST_PROCEED': np.single,
 'ForgivenessAmount': np.single,
 'ProcessingMethod_cat': np.byte,
 'LoanStatus_cat': np.byte,
 'RuralUrbanIndicator_cat': np.byte,
 'HubzoneIndicator_cat': np.byte,
 'LMIIndicator_cat': np.byte,
 'Race_cat': np.byte,
 'Ethnicity_cat': np.byte,
 'BusinessType_cat': np.byte,
 'Gender_cat': np.byte,
 'Veteran_cat': np.byte,
 'NonProfit_cat': np.byte,
 'Verified_Address': np.byte,
 'avg_employee_pay': np.single,
 'pay_ratio': np.single,
 'pay_ratio_binary': np.byte,
 'pay_100k': np.byte,
#  'Fraud': np.byte,
 'Labelled_Loan_Binary': np.byte}

index_col = 'LoanNumber'

# filepath = 's3://sagemaker-us-west-1-945035589481/modelling_data.csv'
filename = 'modelling_data.csv'
filepath = proj_dir_data_proc+filename
df_data = pd.read_csv(filepath, header=0, index_col=index_col, usecols=usecols, dtype=dtype_dict)

In [None]:
df_data.columns.tolist()

['Term',
 'InitialApprovalAmount',
 'CurrentApprovalAmount',
 'UndisbursedAmount',
 'JobsReported',
 'NAICSCode',
 'UTILITIES_PROCEED',
 'PAYROLL_PROCEED',
 'MORTGAGE_INTEREST_PROCEED',
 'RENT_PROCEED',
 'REFINANCE_EIDL_PROCEED',
 'HEALTH_CARE_PROCEED',
 'DEBT_INTEREST_PROCEED',
 'ForgivenessAmount',
 'ProcessingMethod_cat',
 'LoanStatus_cat',
 'RuralUrbanIndicator_cat',
 'HubzoneIndicator_cat',
 'LMIIndicator_cat',
 'Race_cat',
 'Ethnicity_cat',
 'BusinessType_cat',
 'Gender_cat',
 'Veteran_cat',
 'NonProfit_cat',
 'Verified_Address',
 'avg_employee_pay',
 'pay_ratio',
 'pay_ratio_binary',
 'pay_100k',
 'Labelled_Loan_Binary']

In [None]:
print('\n\nLoan Number # of NA \n', df_data.isnull().sum(axis = 0))



Loan Number # of NA 
 Term                         0
InitialApprovalAmount        0
CurrentApprovalAmount        0
UndisbursedAmount            0
JobsReported                 0
NAICSCode                    0
UTILITIES_PROCEED            0
PAYROLL_PROCEED              0
MORTGAGE_INTEREST_PROCEED    0
RENT_PROCEED                 0
REFINANCE_EIDL_PROCEED       0
HEALTH_CARE_PROCEED          0
DEBT_INTEREST_PROCEED        0
ForgivenessAmount            0
ProcessingMethod_cat         0
LoanStatus_cat               0
RuralUrbanIndicator_cat      0
HubzoneIndicator_cat         0
LMIIndicator_cat             0
Race_cat                     0
Ethnicity_cat                0
BusinessType_cat             0
Gender_cat                   0
Veteran_cat                  0
NonProfit_cat                0
Verified_Address             0
avg_employee_pay             0
pay_ratio                    0
pay_ratio_binary             0
pay_100k                     0
Labelled_Loan_Binary         0
dtype: int64


In [None]:
np.isinf(df_data).values.sum()

0

In [None]:
df_data.head(5)

Unnamed: 0_level_0,Term,InitialApprovalAmount,CurrentApprovalAmount,UndisbursedAmount,JobsReported,NAICSCode,UTILITIES_PROCEED,PAYROLL_PROCEED,MORTGAGE_INTEREST_PROCEED,RENT_PROCEED,...,BusinessType_cat,Gender_cat,Veteran_cat,NonProfit_cat,Verified_Address,avg_employee_pay,pay_ratio,pay_ratio_binary,pay_100k,Labelled_Loan_Binary
LoanNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3383618704,60,13540.0,13540.0,0.0,1,51,1.0,13538.0,0.0,0.0,...,20,1,0,0,0,64.991997,0.909467,0,0,0
5776278404,60,6205.399902,6205.399902,0.0,1,62,0.0,6205.399902,0.0,0.0,...,18,2,1,0,1,29.785919,0.609532,0,0,0
5966978904,60,27900.0,27900.0,0.0,4,23,1.0,27898.0,0.0,0.0,...,21,2,1,0,1,33.48,0.570875,0,0,0
9647148304,60,68307.0,68307.0,0.0,7,33,1.0,68304.0,0.0,0.0,...,5,1,0,0,0,46.839085,1.0,0,0,0
3136537210,24,8700.0,8700.0,0.0,1,81,0.0,8700.0,0.0,0.0,...,5,2,1,0,0,41.759998,1.497966,0,0,0


In [None]:
df_data.describe

<bound method NDFrame.describe of             Term  InitialApprovalAmount  CurrentApprovalAmount  \
LoanNumber                                                       
3383618704    60           1.354000e+04           1.354000e+04   
5776278404    60           6.205400e+03           6.205400e+03   
5966978904    60           2.790000e+04           2.790000e+04   
9647148304    60           6.830700e+04           6.830700e+04   
3136537210    24           8.700000e+03           8.700000e+03   
...          ...                    ...                    ...   
7368587102    24           3.420000e+04           4.380000e+04   
5355747104    24           2.820000e+04           2.820000e+04   
6463348808    60           2.079100e+04           2.079100e+04   
6414248501    60           2.083250e+04           2.083250e+04   
6749737207    24           1.018847e+06           1.018847e+06   

            UndisbursedAmount  JobsReported  NAICSCode  UTILITIES_PROCEED  \
LoanNumber                    

In [None]:
print(df_data.min(axis=0))
print(df_data.max(axis=0))

Term                              0.0
InitialApprovalAmount       -199659.0
CurrentApprovalAmount             0.0
UndisbursedAmount                 0.0
JobsReported                     -6.0
NAICSCode                        10.0
UTILITIES_PROCEED                 0.0
PAYROLL_PROCEED                   0.0
MORTGAGE_INTEREST_PROCEED         0.0
RENT_PROCEED                      0.0
REFINANCE_EIDL_PROCEED            0.0
HEALTH_CARE_PROCEED               0.0
DEBT_INTEREST_PROCEED             0.0
ForgivenessAmount                 0.0
ProcessingMethod_cat              0.0
LoanStatus_cat                    0.0
RuralUrbanIndicator_cat           0.0
HubzoneIndicator_cat              0.0
LMIIndicator_cat                  0.0
Race_cat                          0.0
Ethnicity_cat                     0.0
BusinessType_cat                  0.0
Gender_cat                        0.0
Veteran_cat                       0.0
NonProfit_cat                     0.0
Verified_Address                  0.0
avg_employee

In [None]:
for label in df_data.columns.tolist():
  print(label, ': ', df_data[label].dtype)

Term :  int32
InitialApprovalAmount :  float32
CurrentApprovalAmount :  float32
UndisbursedAmount :  float32
JobsReported :  int32
NAICSCode :  int32
UTILITIES_PROCEED :  float32
PAYROLL_PROCEED :  float32
MORTGAGE_INTEREST_PROCEED :  float32
RENT_PROCEED :  float32
REFINANCE_EIDL_PROCEED :  float32
HEALTH_CARE_PROCEED :  float32
DEBT_INTEREST_PROCEED :  float32
ForgivenessAmount :  float32
ProcessingMethod_cat :  int8
LoanStatus_cat :  int8
RuralUrbanIndicator_cat :  int8
HubzoneIndicator_cat :  int8
LMIIndicator_cat :  int8
Race_cat :  int8
Ethnicity_cat :  int8
BusinessType_cat :  int8
Gender_cat :  int8
Veteran_cat :  int8
NonProfit_cat :  int8
Verified_Address :  int8
avg_employee_pay :  float32
pay_ratio :  float32
pay_ratio_binary :  int8
pay_100k :  int8
Labelled_Loan_Binary :  int8


# PyOD XGBOD

In [None]:
from pyod.models.xgbod import XGBOD

In [None]:
label_col = 'Labelled_Loan_Binary'
test_size = 0.1
train_size = 0.1
random_state = 21

clf_name = 'XGBOD'
contamination = 0.08

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_data.drop(['Labelled_Loan_Binary'], axis=1), df_data['Labelled_Loan_Binary'],
                                                    test_size=test_size, train_size=train_size, random_state=random_state, stratify=df_data['Labelled_Loan_Binary'].values)

In [None]:
clf = XGBOD(random_state=random_state, silent=0)
clf.fit(X_train,y_train)

Parameters: { "silent" } are not used.



XGBOD(base_score=0.5, booster='gbtree', colsample_bylevel=1,
   colsample_bytree=1,
   estimator_list=[KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
  radius=1.0), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=1, no...x_features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=21,
    verbose=0)],
   gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
   min_child_weight=1, n_estimators=100, n_jobs=1, nthread=None,
   objective='binary:logistic', random_state=21, reg_alpha=0, reg_lambda=1,
   scale_pos_weight=1, silent=0,
   standardization_flag_list=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, Fal

In [None]:
# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

In [None]:
np.sum(y_train)
np.sum(y_train_pred)

In [None]:
# evaluate and print the results
print('Training Data:')
evaluate_print(clf_name, y_train, y_train_scores)

Training Data:
XGBOD ROC:0.9994, precision @ rank n:0.9667


In [None]:
y_test_pred = clf.predict(X_test)

In [None]:
y_test_scores = clf.decision_function(X_test)

In [None]:
np.sum(y_test_pred)

0

In [None]:
out_results = X_test.copy(deep=True)

In [None]:
out_results['y_test_pred'] = y_test_pred.tolist()
out_results['y_test_scores'] = y_test_scores.tolist()

In [None]:
out_results.to_csv(proj_dir_data_proc+'test-run-data.csv')

In [None]:
def count_stat(vector):
    # Because it is '0' and '1', we can run a count statistic.
    unique, counts = np.unique(vector, return_counts=True)
    return dict(zip(unique, counts))

print("The training data:", count_stat(y_train_pred))
print("The test data:", count_stat(y_test_pred))

In [None]:
clf.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'estimator_list': [KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=1, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=3, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, 

# XGBOD Downsampled

In [None]:
label_col = 'Labelled_Loan_Binary'
test_size = 0.2
train_size = 0.8
random_state = 21

clf_name = 'XGBOD'
contamination = 0.08
n_jobs = -1

In [None]:
n_samples = math.ceil(
    df_data.loc[df_data['Labelled_Loan_Binary'] == 1].shape[0] / contamination)

df_downsampled = resample(
    df_data.loc[df_data['Labelled_Loan_Binary'] == 0], replace=False,
    n_samples=n_samples, random_state=random_state)

df_downsampled = pd.concat([df_downsampled, df_data.loc[df_data['Labelled_Loan_Binary'] == 1]], ignore_index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_downsampled.drop(['Labelled_Loan_Binary'], axis=1), df_downsampled['Labelled_Loan_Binary'],
                                                    test_size=test_size, train_size=train_size, random_state=random_state, stratify=df_downsampled['Labelled_Loan_Binary'].values)

In [None]:
clf = XGBOD(random_state=random_state, contamination=contamination, silent=False, n_jobs = n_jobs)
clf.fit(X_train,y_train)

Parameters: { "contamination", "silent" } are not used.



XGBOD(base_score=0.5, booster='gbtree', colsample_bylevel=1,
   colsample_bytree=1,
   estimator_list=[KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
  radius=1.0), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=1, no...x_features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=21,
    verbose=0)],
   gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
   min_child_weight=1, n_estimators=100, n_jobs=-1, nthread=None,
   objective='binary:logistic', random_state=21, reg_alpha=0, reg_lambda=1,
   scale_pos_weight=1, silent=False,
   standardization_flag_list=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False

In [None]:
# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# evaluate and print the results
print('Training Data:')
evaluate_print(clf_name, y_train, y_train_scores)

Training Data:
XGBOD ROC:0.9264, precision @ rank n:0.6246


In [None]:
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)
# evaluate and print the results
print('Training Data:')
evaluate_print(clf_name, y_test, y_test_scores)

Training Data:
XGBOD ROC:0.8335, precision @ rank n:0.5267


In [None]:
def count_stat(vector):
    # Because it is '0' and '1', we can run a count statistic.
    unique, counts = np.unique(vector, return_counts=True)
    return dict(zip(unique, counts))

print("The training data:", count_stat(y_train_pred))
print("The test data:", count_stat(y_test_pred))

The training data: {0: 7848, 1: 273}
The test data: {0: 1964, 1: 67}


In [None]:
print(classification_report(y_test, y_test_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9557    0.9979    0.9763      1881
           1     0.9403    0.4200    0.5806       150

    accuracy                         0.9552      2031
   macro avg     0.9480    0.7089    0.7785      2031
weighted avg     0.9546    0.9552    0.9471      2031



In [None]:
clf.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'estimator_list': [KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=1, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=3, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, 