# System Specifications Information

In [None]:
# Check Colab Pro / Local instance specs
# !df -h
# !cat /proc/cpuinfo
# !cat /proc/meminfo

# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

# Import Libraries

In [None]:
import os
import re
import gc
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import resample
%matplotlib inline

In [None]:
%pip install pyod

from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

Collecting pyod
  Downloading pyod-1.1.2.tar.gz (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-1.1.2-py3-none-any.whl size=190292 sha256=08bbafc3d1e6b72fa24ef925348396d116061271a2288baf25b480711a82887f
  Stored in directory: /root/.cache/pip/wheels/81/1b/61/aa85b78c3c0c8871f4231e3f4a03bb23cecb7db829498380ee
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.1.2


# Setting up Google Drive access if not running locally

In [None]:
# Mount Google Drive. Confirm access permissions to permit this notebook to access your Google Drive files
from google.colab import drive
drive.mount('/content/drive')

# change proj_gdrive_dir string to personal gdrive project code path

#for Mike
#proj_dir = '/content/drive/MyDrive/' + 'datasci-210-project/fraud-detect/'

proj_dir = '/content/drive/MyDrive/' + 'datasci-210/datasci-210-project/fraud-detect/'
proj_dir_data_raw = proj_dir + 'data/raw/' # Assign data folder under project code path. Create separately in Gdrive
proj_dir_data_proc = proj_dir + 'data/processed/'

dirs = ['proj_dir', 'proj_dir_data_raw', 'proj_dir_data_proc']

for dir in dirs:
    print('-----------------------------------------------------------------------')
    print('Directory contents for ', dir)
    !ls ${dir}

Mounted at /content/drive
-----------------------------------------------------------------------
Directory contents for  proj_dir
drive  sample_data
-----------------------------------------------------------------------
Directory contents for  proj_dir_data_raw
drive  sample_data
-----------------------------------------------------------------------
Directory contents for  proj_dir_data_proc
drive  sample_data


In [None]:
# drive.mount("/content/drive", force_remount=True) # Run if the Google drive needs to be remounted

In [None]:
# Run after completing Notebook run to gracefully disconnect Gdrive
# drive.flush_and_unmount()

In [None]:
#pd.set_option('display.max_columns', None)
#df_ppp_data.head(10)

# Load Final Processed Data Set

In [None]:
# Set of cols to use on working modeling data file

usecols = ['LoanNumber',
 'Term',
 'InitialApprovalAmount',
 'CurrentApprovalAmount',
 'UndisbursedAmount',
 'JobsReported',
 'NAICSCode',
 'UTILITIES_PROCEED',
 'PAYROLL_PROCEED',
 'MORTGAGE_INTEREST_PROCEED',
 'RENT_PROCEED',
 'REFINANCE_EIDL_PROCEED',
 'HEALTH_CARE_PROCEED',
 'DEBT_INTEREST_PROCEED',
 'ForgivenessAmount',
 'ProcessingMethod_cat',
 'LoanStatus_cat',
 'RuralUrbanIndicator_cat',
 'HubzoneIndicator_cat',
 'LMIIndicator_cat',
 'Race_cat',
 'Ethnicity_cat',
 'BusinessType_cat',
 'Gender_cat',
 'Veteran_cat',
 'NonProfit_cat',
 'Verified_Address',
 'avg_employee_pay',
 'pay_ratio',
 'pay_ratio_binary',
 'pay_100k',
#  'Fraud',
 'Labelled_Loan_Binary']

dtype_dict = {'LoanNumber': np.uint,
 'Term': np.intc,
 'InitialApprovalAmount': np.single,
 'CurrentApprovalAmount': np.single,
 'UndisbursedAmount': np.single,
 'JobsReported': np.intc,
 'NAICSCode': np.intc,
 'UTILITIES_PROCEED': np.single,
 'PAYROLL_PROCEED': np.single,
 'MORTGAGE_INTEREST_PROCEED': np.single,
 'RENT_PROCEED': np.single,
 'REFINANCE_EIDL_PROCEED': np.single,
 'HEALTH_CARE_PROCEED': np.single,
 'DEBT_INTEREST_PROCEED': np.single,
 'ForgivenessAmount': np.single,
 'ProcessingMethod_cat': np.byte,
 'LoanStatus_cat': np.byte,
 'RuralUrbanIndicator_cat': np.byte,
 'HubzoneIndicator_cat': np.byte,
 'LMIIndicator_cat': np.byte,
 'Race_cat': np.byte,
 'Ethnicity_cat': np.byte,
 'BusinessType_cat': np.byte,
 'Gender_cat': np.byte,
 'Veteran_cat': np.byte,
 'NonProfit_cat': np.byte,
 'Verified_Address': np.byte,
 'avg_employee_pay': np.single,
 'pay_ratio': np.single,
 'pay_ratio_binary': np.byte,
 'pay_100k': np.byte,
#  'Fraud': np.byte,
 'Labelled_Loan_Binary': np.byte}

index_col = 'LoanNumber'

# filepath = 's3://sagemaker-us-west-1-945035589481/modelling_data.csv'
filename = 'modelling_data.csv'
filepath = proj_dir_data_proc+filename
df_data = pd.read_csv(filepath, header=0, index_col=index_col, usecols=usecols, dtype=dtype_dict)

In [None]:
df_data.columns.tolist()

['Term',
 'InitialApprovalAmount',
 'CurrentApprovalAmount',
 'UndisbursedAmount',
 'JobsReported',
 'NAICSCode',
 'UTILITIES_PROCEED',
 'PAYROLL_PROCEED',
 'MORTGAGE_INTEREST_PROCEED',
 'RENT_PROCEED',
 'REFINANCE_EIDL_PROCEED',
 'HEALTH_CARE_PROCEED',
 'DEBT_INTEREST_PROCEED',
 'ForgivenessAmount',
 'ProcessingMethod_cat',
 'LoanStatus_cat',
 'RuralUrbanIndicator_cat',
 'HubzoneIndicator_cat',
 'LMIIndicator_cat',
 'Race_cat',
 'Ethnicity_cat',
 'BusinessType_cat',
 'Gender_cat',
 'Veteran_cat',
 'NonProfit_cat',
 'Verified_Address',
 'avg_employee_pay',
 'pay_ratio',
 'pay_ratio_binary',
 'pay_100k',
 'Labelled_Loan_Binary']

In [None]:
print('\n\nLoan Number # of NA \n', df_data.isnull().sum(axis = 0))



Loan Number # of NA 
 Term                         0
InitialApprovalAmount        0
CurrentApprovalAmount        0
UndisbursedAmount            0
JobsReported                 0
NAICSCode                    0
UTILITIES_PROCEED            0
PAYROLL_PROCEED              0
MORTGAGE_INTEREST_PROCEED    0
RENT_PROCEED                 0
REFINANCE_EIDL_PROCEED       0
HEALTH_CARE_PROCEED          0
DEBT_INTEREST_PROCEED        0
ForgivenessAmount            0
ProcessingMethod_cat         0
LoanStatus_cat               0
RuralUrbanIndicator_cat      0
HubzoneIndicator_cat         0
LMIIndicator_cat             0
Race_cat                     0
Ethnicity_cat                0
BusinessType_cat             0
Gender_cat                   0
Veteran_cat                  0
NonProfit_cat                0
Verified_Address             0
avg_employee_pay             0
pay_ratio                    0
pay_ratio_binary             0
pay_100k                     0
Labelled_Loan_Binary         0
dtype: int64


In [None]:
np.isinf(df_data).values.sum()

0

In [None]:
df_data.head(5)

Unnamed: 0_level_0,Term,InitialApprovalAmount,CurrentApprovalAmount,UndisbursedAmount,JobsReported,NAICSCode,UTILITIES_PROCEED,PAYROLL_PROCEED,MORTGAGE_INTEREST_PROCEED,RENT_PROCEED,...,BusinessType_cat,Gender_cat,Veteran_cat,NonProfit_cat,Verified_Address,avg_employee_pay,pay_ratio,pay_ratio_binary,pay_100k,Labelled_Loan_Binary
LoanNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3383618704,60,13540.0,13540.0,0.0,1,51,1.0,13538.0,0.0,0.0,...,20,1,0,0,0,64.991997,0.909467,0,0,0
5776278404,60,6205.399902,6205.399902,0.0,1,62,0.0,6205.399902,0.0,0.0,...,18,2,1,0,1,29.785919,0.609532,0,0,0
5966978904,60,27900.0,27900.0,0.0,4,23,1.0,27898.0,0.0,0.0,...,21,2,1,0,1,33.48,0.570875,0,0,0
9647148304,60,68307.0,68307.0,0.0,7,33,1.0,68304.0,0.0,0.0,...,5,1,0,0,0,46.839085,1.0,0,0,0
3136537210,24,8700.0,8700.0,0.0,1,81,0.0,8700.0,0.0,0.0,...,5,2,1,0,0,41.759998,1.497966,0,0,0


In [None]:
df_data.describe

<bound method NDFrame.describe of             Term  InitialApprovalAmount  CurrentApprovalAmount  \
LoanNumber                                                       
3383618704    60           1.354000e+04           1.354000e+04   
5776278404    60           6.205400e+03           6.205400e+03   
5966978904    60           2.790000e+04           2.790000e+04   
9647148304    60           6.830700e+04           6.830700e+04   
3136537210    24           8.700000e+03           8.700000e+03   
...          ...                    ...                    ...   
7368587102    24           3.420000e+04           4.380000e+04   
5355747104    24           2.820000e+04           2.820000e+04   
6463348808    60           2.079100e+04           2.079100e+04   
6414248501    60           2.083250e+04           2.083250e+04   
6749737207    24           1.018847e+06           1.018847e+06   

            UndisbursedAmount  JobsReported  NAICSCode  UTILITIES_PROCEED  \
LoanNumber                    

In [None]:
print(df_data.min(axis=0))
print(df_data.max(axis=0))

Term                              0.0
InitialApprovalAmount       -199659.0
CurrentApprovalAmount             0.0
UndisbursedAmount                 0.0
JobsReported                     -6.0
NAICSCode                        10.0
UTILITIES_PROCEED                 0.0
PAYROLL_PROCEED                   0.0
MORTGAGE_INTEREST_PROCEED         0.0
RENT_PROCEED                      0.0
REFINANCE_EIDL_PROCEED            0.0
HEALTH_CARE_PROCEED               0.0
DEBT_INTEREST_PROCEED             0.0
ForgivenessAmount                 0.0
ProcessingMethod_cat              0.0
LoanStatus_cat                    0.0
RuralUrbanIndicator_cat           0.0
HubzoneIndicator_cat              0.0
LMIIndicator_cat                  0.0
Race_cat                          0.0
Ethnicity_cat                     0.0
BusinessType_cat                  0.0
Gender_cat                        0.0
Veteran_cat                       0.0
NonProfit_cat                     0.0
Verified_Address                  0.0
avg_employee

In [None]:
for label in df_data.columns.tolist():
  print(label, ': ', df_data[label].dtype)

Term :  int32
InitialApprovalAmount :  float32
CurrentApprovalAmount :  float32
UndisbursedAmount :  float32
JobsReported :  int32
NAICSCode :  int32
UTILITIES_PROCEED :  float32
PAYROLL_PROCEED :  float32
MORTGAGE_INTEREST_PROCEED :  float32
RENT_PROCEED :  float32
REFINANCE_EIDL_PROCEED :  float32
HEALTH_CARE_PROCEED :  float32
DEBT_INTEREST_PROCEED :  float32
ForgivenessAmount :  float32
ProcessingMethod_cat :  int8
LoanStatus_cat :  int8
RuralUrbanIndicator_cat :  int8
HubzoneIndicator_cat :  int8
LMIIndicator_cat :  int8
Race_cat :  int8
Ethnicity_cat :  int8
BusinessType_cat :  int8
Gender_cat :  int8
Veteran_cat :  int8
NonProfit_cat :  int8
Verified_Address :  int8
avg_employee_pay :  float32
pay_ratio :  float32
pay_ratio_binary :  int8
pay_100k :  int8
Labelled_Loan_Binary :  int8


# PyOD XGBOD

In [None]:
from pyod.models.xgbod import XGBOD

In [None]:
label_col = 'Labelled_Loan_Binary'
test_size = 0.1
train_size = 0.1
random_state = 21

clf_name = 'XGBOD'
contamination = 0.08

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_data.drop(['Labelled_Loan_Binary'], axis=1), df_data['Labelled_Loan_Binary'],
                                                    test_size=test_size, train_size=train_size, random_state=random_state, stratify=df_data['Labelled_Loan_Binary'].values)

In [None]:
clf = XGBOD(random_state=random_state, silent=0)
clf.fit(X_train,y_train)

Parameters: { "silent" } are not used.



XGBOD(base_score=0.5, booster='gbtree', colsample_bylevel=1,
   colsample_bytree=1,
   estimator_list=[KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
  radius=1.0), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=1, no...x_features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=21,
    verbose=0)],
   gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
   min_child_weight=1, n_estimators=100, n_jobs=1, nthread=None,
   objective='binary:logistic', random_state=21, reg_alpha=0, reg_lambda=1,
   scale_pos_weight=1, silent=0,
   standardization_flag_list=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, Fal

In [None]:
# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

In [None]:
np.sum(y_train)
np.sum(y_train_pred)

In [None]:
# evaluate and print the results
print('Training Data:')
evaluate_print(clf_name, y_train, y_train_scores)

Training Data:
XGBOD ROC:0.9994, precision @ rank n:0.9667


In [None]:
y_test_pred = clf.predict(X_test)

In [None]:
y_test_scores = clf.decision_function(X_test)

In [None]:
np.sum(y_test_pred)

0

In [None]:
out_results = X_test.copy(deep=True)

In [None]:
out_results['y_test_pred'] = y_test_pred.tolist()
out_results['y_test_scores'] = y_test_scores.tolist()

In [None]:
out_results.to_csv(proj_dir_data_proc+'test-run-data.csv')

In [None]:
def count_stat(vector):
    # Because it is '0' and '1', we can run a count statistic.
    unique, counts = np.unique(vector, return_counts=True)
    return dict(zip(unique, counts))

print("The training data:", count_stat(y_train_pred))
print("The test data:", count_stat(y_test_pred))

In [None]:
clf.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'estimator_list': [KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=1, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=3, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, 

# XGBOD Downsampled

In [None]:
label_col = 'Labelled_Loan_Binary'
test_size = 0.2
train_size = 0.8
random_state = 21

clf_name = 'XGBOD'
contamination = 0.08
n_jobs = -1

In [None]:
n_samples = math.ceil(
    df_data.loc[df_data['Labelled_Loan_Binary'] == 1].shape[0] / contamination)

df_downsampled = resample(
    df_data.loc[df_data['Labelled_Loan_Binary'] == 0], replace=False,
    n_samples=n_samples, random_state=random_state)

df_downsampled = pd.concat([df_downsampled, df_data.loc[df_data['Labelled_Loan_Binary'] == 1]], ignore_index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_downsampled.drop(['Labelled_Loan_Binary'], axis=1), df_downsampled['Labelled_Loan_Binary'],
                                                    test_size=test_size, train_size=train_size, random_state=random_state, stratify=df_downsampled['Labelled_Loan_Binary'].values)

In [None]:
clf = XGBOD(random_state=random_state, contamination=contamination, silent=False, n_jobs = n_jobs)
clf.fit(X_train,y_train)

Parameters: { "contamination", "silent" } are not used.



XGBOD(base_score=0.5, booster='gbtree', colsample_bylevel=1,
   colsample_bytree=1,
   estimator_list=[KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
  radius=1.0), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=1, no...x_features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=21,
    verbose=0)],
   gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
   min_child_weight=1, n_estimators=100, n_jobs=-1, nthread=None,
   objective='binary:logistic', random_state=21, reg_alpha=0, reg_lambda=1,
   scale_pos_weight=1, silent=False,
   standardization_flag_list=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False

In [None]:
# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# evaluate and print the results
print('Training Data:')
evaluate_print(clf_name, y_train, y_train_scores)

Training Data:
XGBOD ROC:0.9264, precision @ rank n:0.6246


In [None]:
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)
# evaluate and print the results
print('Training Data:')
evaluate_print(clf_name, y_test, y_test_scores)

Training Data:
XGBOD ROC:0.8335, precision @ rank n:0.5267


In [None]:
def count_stat(vector):
    # Because it is '0' and '1', we can run a count statistic.
    unique, counts = np.unique(vector, return_counts=True)
    return dict(zip(unique, counts))

print("The training data:", count_stat(y_train_pred))
print("The test data:", count_stat(y_test_pred))

The training data: {0: 7848, 1: 273}
The test data: {0: 1964, 1: 67}


In [None]:
print(classification_report(y_test, y_test_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9557    0.9979    0.9763      1881
           1     0.9403    0.4200    0.5806       150

    accuracy                         0.9552      2031
   macro avg     0.9480    0.7089    0.7785      2031
weighted avg     0.9546    0.9552    0.9471      2031



In [None]:
clf.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'estimator_list': [KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=1, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=3, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, 