# System Specifications Information

In [None]:
# Check Colab Pro / Local instance specs
# !df -h
# !cat /proc/cpuinfo
# !cat /proc/meminfo

# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

# Import Libraries

In [2]:
import os
import re
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# %pip install deepod
# from deepod.models.tabular import
# from deepod.models.tabular import RoSAS

In [None]:
# Install PyOD into the kernel's environment before importing from it.
%pip install pyod

# evaluate_print is called further down to report detector scores;
# generate_data is imported but not referenced in the visible cells.
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

# Setting up local disk access and working directory

In [3]:
proj_dir = '~/' + 'plodi/'

proj_dir_data_raw = proj_dir + 'data/raw/' # Assign data folder under project code path. Create separately in Gdrive
proj_dir_data_proc = proj_dir + 'data/processed/'

dirs = ['proj_dir', 'proj_dir_data_raw', 'proj_dir_data_proc']

for dir in dirs:
    print('-----------------------------------------------------------------------')
    print('Directory contents for ', dir)
    !ls ${dir}

-----------------------------------------------------------------------
Directory contents for  proj_dir
'Combining loan and census data.ipynb'
 Copy_of_fraud_detect_model_sc_20231116.ipynb
 Copy_of_fraud_detect_model_sc_20231116.ipynb:Zone.Identifier
 borrower_name_processing_20231014.ipynb
 fraud_detect_eda.ipynb
 fraud_detect_model_isolationforest_20231019.ipynb
 fraud_detect_model_semisupervised_20231022.ipynb
-----------------------------------------------------------------------
Directory contents for  proj_dir_data_raw
'Combining loan and census data.ipynb'
 Copy_of_fraud_detect_model_sc_20231116.ipynb
 Copy_of_fraud_detect_model_sc_20231116.ipynb:Zone.Identifier
 borrower_name_processing_20231014.ipynb
 fraud_detect_eda.ipynb
 fraud_detect_model_isolationforest_20231019.ipynb
 fraud_detect_model_semisupervised_20231022.ipynb
-----------------------------------------------------------------------
Directory contents for  proj_dir_data_proc
'Combining loan and census data.ipynb'
 

# Setting up Google Drive access if not running locally

In [None]:
# Mount Google Drive. Confirm access permissions to permit this notebook to access your Google Drive files
from google.colab import drive
drive.mount('/content/drive')

# change proj_gdrive_dir string to personal gdrive project code path

#for Mike
#proj_dir = '/content/drive/MyDrive/' + 'datasci-210-project/fraud-detect/'

proj_dir = '/content/drive/MyDrive/' + 'datasci-210/datasci-210-project/fraud-detect/'
proj_dir_data_raw = proj_dir + 'data/raw/' # Assign data folder under project code path. Create separately in Gdrive
proj_dir_data_proc = proj_dir + 'data/processed/'
!ls $proj_dir
print('-----------------------------------------------------------------------')
!ls $proj_dir_data_raw
print('-----------------------------------------------------------------------')
!ls $proj_dir_data_proc

In [None]:
# drive.mount("/content/drive", force_remount=True) # Run if the Google drive needs to be remounted

In [None]:
# Run after completing Notebook run to gracefully disconnect Gdrive
# drive.flush_and_unmount()

In [None]:
#pd.set_option('display.max_columns', None)
#df_ppp_data.head(10)

# Load Preprocessed PPP Data

In [None]:
# Column -> dtype mapping for reading the PPP loan CSV. Identifier, date and
# categorical columns are read as strings; amount columns and Fraud as floats.
# Term, SBAGuarantyPercentage and JobsReported are not listed, so no dtype is
# forced for them.
dtype_dict = {
    **dict.fromkeys(
        [
            'LoanNumber',
            'DateApproved',
            'SBAOfficeCode',
            'ProcessingMethod',
            'BorrowerName',
            'BorrowerAddress',
            'BorrowerCity',
            'BorrowerState',
            'BorrowerZip',
            'LoanStatusDate',
            'LoanStatus',
            'FranchiseName',
            'ServicingLenderLocationID',
            'ServicingLenderName',
            'ServicingLenderAddress',
            'ServicingLenderCity',
            'ServicingLenderState',
            'ServicingLenderZip',
            'RuralUrbanIndicator',
            'HubzoneIndicator',
            'LMIIndicator',
            'BusinessAgeDescription',
            'ProjectCity',
            'ProjectCountyName',
            'ProjectState',
            'ProjectZip',
            'CD',
            'NAICSCode',
            'Race',
            'Ethnicity',
            'BusinessType',
            'OriginatingLenderLocationID',
            'OriginatingLender',
            'OriginatingLenderCity',
            'OriginatingLenderState',
            'Gender',
            'Veteran',
            'NonProfit',
            'ForgivenessDate',
            'USPSAPI_Result',
        ],
        'str',
    ),
    **dict.fromkeys(
        [
            'InitialApprovalAmount',
            'CurrentApprovalAmount',
            'UndisbursedAmount',
            'UTILITIES_PROCEED',
            'PAYROLL_PROCEED',
            'MORTGAGE_INTEREST_PROCEED',
            'RENT_PROCEED',
            'REFINANCE_EIDL_PROCEED',
            'HEALTH_CARE_PROCEED',
            'DEBT_INTEREST_PROCEED',
            'ForgivenessAmount',
            'Fraud',
        ],
        'float',
    ),
}

# Columns that read_csv should convert to datetimes.
parse_dates = ['DateApproved', 'LoanStatusDate', 'ForgivenessDate']

# Previous input file, kept for reference: 'sba_ppp_combined_cat.csv'
filename = 'final_data.csv'
df_labelled = pd.read_csv(
    f'{proj_dir_data_proc}{filename}',
    header=0,
    dtype=dtype_dict,
    parse_dates=parse_dates,
)
#df_labelled.columns

In [None]:
#df_labelled.info(verbose=True)
#df_labelled.dtypes

### Clean the USPS API Data


In [None]:
# Binary address flag: 1 when the USPS result is "True", 0 for
# "More Information Needed" and for every other or missing value.
df_labelled['Verified_Address'] = (
    df_labelled['USPSAPI_Result']
    .map({
        ' USPSAPI_Result: True': 1,
        ' USPSAPI_Result: More Information Needed to Validate Address': 0,
    })
    .fillna(0)
)
df_labelled['Verified_Address'].value_counts(dropna=False)

### Read in CBSA Data and Merge with Labelled Data

In [None]:
# First two characters of the NAICS code; used as a merge key further down.
df_labelled['NAICSCode_2digits'] = df_labelled['NAICSCode'].astype(str).str.slice(stop=2)

# Implied pay per reported job: approved amount / jobs, scaled by 1/1000 to
# line up with the "($1,000)" payroll column of the census data.
df_labelled['avg_employee_pay'] = (
    df_labelled['InitialApprovalAmount']
    .div(df_labelled['JobsReported'])
    .div(1000)
)
#df_labelled.head()

In [None]:
# Load the census workbook, then average the annual payroll figure for every
# (state, NAICS code) pair; the NAICS key is cast to str afterwards.
cbsa_data = pd.read_excel('/content/drive/MyDrive/datasci-210-project/US_Census_Data/US_Census_data.xlsx')
census_grouped = (
    cbsa_data
    .groupby(['State_AB', 'NAICS Code'])['Average annual payroll ($1,000)']
    .mean()
    .reset_index()
    .assign(**{'NAICS Code': lambda frame: frame['NAICS Code'].astype(str)})
)
#census_grouped.head(10)

In [None]:
# Preview the first rows of the raw census sheet.
cbsa_data.head()

In [None]:
# Left-join the census averages onto the loans by borrower state and 2-digit NAICS.
merged_df = pd.merge(
    df_labelled,
    census_grouped,
    how='left',
    left_on=['BorrowerState', 'NAICSCode_2digits'],
    right_on=['State_AB', 'NAICS Code'],
)
merged_df.head(10)

In [None]:
# Ratio of implied employee pay to the census average payroll, limited to [0, 10].
merged_df['pay_ratio'] = (
    merged_df['avg_employee_pay']
    .div(merged_df['Average annual payroll ($1,000)'])
    .clip(lower=0, upper=10)
)
merged_df['pay_ratio'].hist(bins=30, range=[-1, 2])
#merged_df.head()

In [None]:
#merged_df.info(verbose=True)
#merged_df.isnull().sum()

In [None]:
#merged_df.info(verbose=True)
#merged_df.dtypes

In [None]:
# Read in and merge on the cleaned labelled data
# (Disabled step, kept for reference. Every line of the dict literal is now
# commented out: previously only the opening line was, which left the body as
# stray indented code and made the cell fail to parse.)
#dtype_dict = {
#    'LoanNumber':'str',
#    'Unnamed: 0': 'int64',
#    'BorrowerName':'str',
#    'BorrowerNameProc':'str',
#    'index': 'int64',
#    'State':'str',
#    'Lookup':'str',
#    'Fraud': 'int64',
#    'Status': 'str',
#    'lookupproc':'str',
#    'lookup_li':'str',
#    'case_index': 'int64'
#}
#labelled_loans = pd.read_csv(proj_dir_data_proc+'labelled_unique.csv', header=0, dtype = dtype_dict)
#labelled_loans.shape

# Merge on the case data, only keeping the fraud status
#df_labelled = df_ppp_data.merge(labelled_loans, left_on = 'LoanNumber', right_on = 'LoanNumber', how = 'left')
#df_labelled.columns

In [None]:
# Columns carried forward into the modelling data set.
trunc_cols = ['LoanNumber',
              'Term',
              'InitialApprovalAmount',
              'CurrentApprovalAmount',
              'UndisbursedAmount',
              'JobsReported',
              'NAICSCode',
              'UTILITIES_PROCEED',
              'PAYROLL_PROCEED',
              'MORTGAGE_INTEREST_PROCEED',
              'RENT_PROCEED',
              'REFINANCE_EIDL_PROCEED',
              'HEALTH_CARE_PROCEED',
              'DEBT_INTEREST_PROCEED',
              'ForgivenessAmount',
              'ProcessingMethod_cat',
              'LoanStatus_cat',
              'RuralUrbanIndicator_cat',
              'HubzoneIndicator_cat',
              'LMIIndicator_cat',
              'Race_cat',
              'Ethnicity_cat',
              'BusinessType_cat',
              'Gender_cat',
              'Veteran_cat',
              'NonProfit_cat',
              'Verified_Address',
              'pay_ratio',
              'Fraud']

# Subset variables and shuffle the rows. frac=1 keeps every row (lower it to
# actually down-sample); the fixed random_state makes the row order, and thus
# the saved file and later splits, reproducible across runs.
df_ppp_trunc = merged_df[trunc_cols].sample(frac=1, replace=False, random_state=21)
df_ppp_trunc.shape

In [None]:
# Count missing values per column of the truncated frame
df_ppp_trunc.isnull().sum()

In [None]:
# Fill in NA values for all but fraud
# Work on a real copy: a plain assignment would alias df_ppp_trunc, so the
# fills below would silently rewrite the "uncleaned" frame as well.
df_ppp_trunc_clean = df_ppp_trunc.copy()

# Missing NAICS codes get the placeholder 10, then every code is reduced to its
# leading two digits as an integer. The raw string avoids the invalid-escape
# warning for \d, and expand=False returns a Series instead of a DataFrame.
na_cols = ['NAICSCode']
df_ppp_trunc_clean[na_cols] = df_ppp_trunc[na_cols].fillna(value=10)
df_ppp_trunc_clean['NAICSCode'] = (
    df_ppp_trunc_clean['NAICSCode']
    .astype(str)
    .str.extract(r'(^\d{2})', expand=False)
    .astype(int)
)

# Amount / count columns: a missing value is filled with 0.
na_cols = ['UndisbursedAmount',
           'JobsReported',
           'UTILITIES_PROCEED',
           'PAYROLL_PROCEED',
           'MORTGAGE_INTEREST_PROCEED',
           'RENT_PROCEED',
           'REFINANCE_EIDL_PROCEED',
           'HEALTH_CARE_PROCEED',
           'DEBT_INTEREST_PROCEED',
           'ForgivenessAmount'
          ]

df_ppp_trunc_clean[na_cols] = df_ppp_trunc[na_cols].fillna(value=0)
# A missing pay ratio is filled with 1.
df_ppp_trunc_clean['pay_ratio'] = df_ppp_trunc['pay_ratio'].fillna(value=1)
# df_ppp_trunc_clean

In [None]:
# Re-count missing values per column on the cleaned frame.
df_ppp_trunc_clean.isnull().sum()

In [None]:
# Histogram of pay_ratio in the cleaned frame.
df_ppp_trunc_clean.pay_ratio.hist()

In [None]:
# Save cleaned up data
# NOTE(review): to_csv writes the row index by default, which is why the file
# comes back with an 'Unnamed: 0' column when it is re-read below.
df_ppp_trunc_clean.to_csv(proj_dir_data_proc+'modelling_data.csv')

# Load Final Processed Data Set

In [None]:
# Re-load the modelling file written by the cleaning step.
filename = 'modelling_data.csv'
df_data = pd.read_csv(f'{proj_dir_data_proc}{filename}', header=0)

In [None]:
# Column names of the loaded modelling data.
df_data.columns.tolist()

In [None]:
# Per-column count of missing values (the heading text says "Loan Number",
# but the counts printed cover every column).
print('\n\nLoan Number # of NA \n', df_data.isnull().sum(axis = 0))

In [None]:
# First five rows as a quick sanity check.
df_data.head(5)

In [None]:
# Print each column name next to its dtype.
for label, col_dtype in df_data.dtypes.items():
  print(label, ': ', col_dtype)

In [None]:
# Missing Fraud values are filled with 0; other columns are left untouched.
df_data = df_data.fillna(value={'Fraud':0})

# PyOD XGBOD

In [None]:
from pyod.models.xgbod import XGBOD

In [None]:
# Work on a deep copy so the loaded frame stays intact for the other sections.
df_pyod = df_data.copy(deep=True)
# Columns excluded from the XGBOD run. errors='ignore' lets the cell run when
# the loaded file lacks some of them (e.g. Fraud_Aug / Labelled_Loan_Binary are
# not among the columns written by the cleaning step in this notebook).
cols = ['Unnamed: 0', 'Fraud_Aug', 'Verified_Address', 'pay_ratio', 'Labelled_Loan_Binary']
df_pyod = df_pyod.drop(columns=cols, errors='ignore')

In [None]:
# 1% of the rows for training and another 1% for testing; Fraud is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_pyod.drop(columns=['Fraud']),
    df_pyod['Fraud'],
    train_size=0.01,
    test_size=0.01,
    random_state=21,
)

contamination = 0.001

In [None]:
# Display name handed to evaluate_print for this model.
clf_name = 'XGBOD'

In [None]:
# Fit XGBOD on the training split; y_train supplies the labels.
# NOTE(review): confirm that XGBOD actually uses n_components.
clf = XGBOD(n_components=1,random_state=21)
clf.fit(X_train,y_train)

In [None]:
# Training-set results from the fitted detector:
# labels_ are binary (0: inliers, 1: outliers), decision_scores_ are raw outlier scores.
y_train_pred, y_train_scores = clf.labels_, clf.decision_scores_

In [None]:
# evaluate and print the results for the training split (true labels vs. raw scores)
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)

In [None]:
# Inspect the training features.
X_train

In [None]:
# Sum of the training labels. Uses y_train directly because y1 is only
# assigned in the next cell, and calls sum() instead of showing the bound method.
y_train.sum()

In [None]:
# Compare true labels and XGBOD training predictions against the loan amount.
x = X_train['InitialApprovalAmount']
y1 = y_train
y2 = y_train_pred

# One scatter call per series: scatter's 3rd-5th positional arguments are
# size / colour / marker, so passing plot-style format strings there fails.
plt.scatter(x, y1, c='r', marker='s', label='Fraud label')
plt.scatter(x, y2, c='g', marker='^', label='XGBOD prediction')
plt.xlabel('InitialApprovalAmount')
plt.ylabel('Label')
plt.legend()

plt.show()

# PyOD Autoencoder

In [None]:
# Fresh deep copy of the loaded data for the AutoEncoder section.
df_pyod = df_data.copy(deep=True)

In [None]:
# Columns removed before training (each name listed once; the result is the same set).
cols = {
    'Unnamed: 0',
    'LoanNumber',
    'Fraud_Aug',
    'Verified_Address',
    'pay_ratio',
    'Fraud',
    'pay_100k',
}
df_pyod = df_pyod.drop(columns=cols)

In [None]:
# PyOD is installed again here: first a plain install, then an upgrade to the latest release.
%pip install pyod            # normal install
%pip install --upgrade pyod  # or update if needed

In [None]:
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

In [None]:
# Settings for the AutoEncoder section. contamination and random_state are read
# by the cells below; hidden_neurons, epochs and batch_size are not referenced
# anywhere else in the visible notebook.
hidden_neurons = [10,20,10]
epochs = 1
batch_size = 32
contamination = 0.001
random_state = 21

In [None]:
# 0.1% of the rows each for train and test; Labelled_Loan_Binary is the target column.
X_train, X_test, y_train, y_test = train_test_split(
    df_pyod.drop(columns=['Labelled_Loan_Binary']),
    df_pyod['Labelled_Loan_Binary'],
    train_size=0.001,
    test_size=0.001,
    random_state=random_state,
)


In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
# train AutoEncoder detector
# Only X_train is passed to fit; y_train is not used here.
# NOTE(review): hidden_neurons, epochs, batch_size and random_state defined in
# the settings cell are not passed to the constructor; epochs is hard-coded to 30.
clf_name = 'AutoEncoder'
clf = AutoEncoder(epochs=30, verbose=2,  contamination=contamination)
clf.fit(X_train)

# SKLearn Semi-supervised learning

In [None]:
# Fresh deep copy of the loaded data for the LabelSpreading section.
df_sklearn = df_data.copy(deep=True)

In [None]:
# Columns removed before training (each name listed once; the result is the same set).
cols = {
    'Unnamed: 0',
    'LoanNumber',
    'Fraud_Aug',
    'Verified_Address',
    'pay_ratio',
    'Fraud',
    'pay_100k',
}
df_sklearn = df_sklearn.drop(columns=cols)

In [None]:
# List the remaining columns (call tolist() rather than displaying the bound method).
df_sklearn.columns.tolist()

In [None]:
# Build Labelled_Loan_Binary_sklearn: -1 where Labelled_Loan_Binary is 0, and 1
# for every other row (the column starts out NaN there and is filled with 1).
# presumably -1 is meant as sklearn's "unlabelled" marker — TODO confirm
df_sklearn.loc[df_sklearn['Labelled_Loan_Binary'] == 0, ['Labelled_Loan_Binary_sklearn']] = -1
df_sklearn = df_sklearn.fillna(value={'Labelled_Loan_Binary_sklearn':1})

In [None]:
from sklearn.semi_supervised import LabelSpreading

In [None]:
# LabelSpreading with a k-NN kernel (7 neighbours), alpha=0.01, n_jobs=-1.
label_spread = LabelSpreading(kernel="knn", n_neighbors=7, alpha=0.01, n_jobs=-1)

In [None]:
# Features exclude the label AND its -1/1 re-encoding (Labelled_Loan_Binary_sklearn).
# The derived column is a deterministic function of the label, so keeping it in
# X leaks the target into the features. errors='ignore' keeps the cell usable
# when the derived column has not been created.
# NOTE(review): Labelled_Loan_Binary_sklearn is never used as the target here —
# confirm whether y should be that column instead.
X_train, X_test, y_train, y_test = train_test_split(
    df_sklearn.drop(['Labelled_Loan_Binary', 'Labelled_Loan_Binary_sklearn'], axis=1, errors='ignore'),
    df_sklearn[['Labelled_Loan_Binary']],
    test_size=0.1, train_size=0.1, random_state=21)

In [None]:
# Convert the training split to plain arrays with length-1 axes removed
# (y_train goes from a one-column frame to a 1-D vector).
X_train, y_train = np.array(X_train).squeeze(), np.array(y_train).squeeze()

In [None]:
# Sum of the held-out label column.
y_test.sum()

In [None]:
# Fit LabelSpreading on the array-converted training split.
label_spread.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out features.
preds = label_spread.predict(X_test)

In [None]:
# Sum of the predicted labels.
preds.sum()

In [None]:
# Score of the fitted model on the held-out split.
print(label_spread.score(X_test, y_test))

# RoSAS

In [None]:
# 0.1% of the rows each for train and test, taken from the full loaded frame; Fraud is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_data.drop(columns='Fraud'),
    df_data['Fraud'],
    train_size=0.001,
    test_size=0.001,
    random_state=21,
)

In [None]:
# Fit RoSAS with the training labels and score the held-out split.
# NOTE(review): the only RoSAS import in the visible notebook is commented out
# (`# from deepod.models.tabular import RoSAS`); restore it before running this cell.
clf = RoSAS()
clf.fit(X_train, y=y_train)
scores = clf.decision_function(X_test)

# XGBOD

In [None]:
# 1% of the rows each for train and test, taken from the full loaded frame; Fraud is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_data.drop(columns='Fraud'),
    df_data['Fraud'],
    train_size=0.01,
    test_size=0.01,
    random_state=21,
)

In [None]:
# Fit XGBOD (seeded) on the training split; clf_name labels the printed evaluation.
clf_name = 'XGBOD'
clf = XGBOD(random_state=21)
clf.fit(X_train, y_train)

In [None]:


# Training-set results from the fitted detector:
# labels_ are binary (0: inliers, 1: outliers), decision_scores_ are raw outlier scores.
y_train_pred, y_train_scores = clf.labels_, clf.decision_scores_


In [None]:
# Report evaluation metrics for the training split (true labels vs. raw scores).
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)

In [None]:
# Held-out results: predict gives outlier labels (0 or 1),
# decision_function gives the outlier scores.
y_test_pred, y_test_scores = clf.predict(X_test), clf.decision_function(X_test)

In [None]:
# evaluate and print the results for the held-out split (true labels vs. raw scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# PyOD Anomaly Detection

In [None]:
# Re-load the modelling file; df_labelled keeps Fraud, while
# df_ppp_trunc_clean is the same frame with the Fraud column removed.
df_labelled = pd.read_csv(f'{proj_dir_data_proc}modelling_data.csv', header=0)
df_ppp_trunc_clean = df_labelled[[col for col in df_labelled.columns if col != 'Fraud']]

In [None]:
# Columns that will be fed to the unsupervised detectors.
df_ppp_trunc_clean.columns

In [None]:
# PyOD is (re)installed here before importing the COPOD detector.
%pip install pyod
from pyod.models.copod import COPOD

In [None]:
# COPOD (Copula Based Outlier Detector)
# 1 thread 6m on A100
# contamination=0.01 sets the assumed outlier share to 1%; the fit is
# unsupervised (no labels are passed).
clf_copod = COPOD(contamination = 0.01, n_jobs = 1)
clf_copod.fit(df_ppp_trunc_clean)

In [None]:
# COPOD outlier labels for the same rows the detector was fitted on.
pred_copod = clf_copod.predict(df_ppp_trunc_clean)

In [None]:
# 2x2 grid: approved amount on x, one borrower feature per panel on y. Each
# panel gets two layers: COPOD predictions (cmap) and, drawn on top, the known
# Fraud labels (cmap1).
x = df_ppp_trunc_clean['CurrentApprovalAmount']
y1 = df_ppp_trunc_clean['PAYROLL_PROCEED']
y2 = df_ppp_trunc_clean['UTILITIES_PROCEED']
y3 = df_ppp_trunc_clean['Verified_Address']
y4 = df_ppp_trunc_clean['pay_ratio']

labels = ['PAYROLL_PROCEED', 'UTILITIES_PROCEED', 'Verified_Address', 'pay_ratio']

cmap = 'winter'
cmap1 = 'binary'

fig, ax = plt.subplots(nrows=2, ncols=2, figsize = [10,8])

panel_titles = ['Payroll Spending', 'Utilities Spending', 'Verified Address', 'Pay Ratio']
handles = []
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for axis, series, panel_title in zip(ax.flat, (y1, y2, y3, y4), panel_titles):
    handles.append(axis.scatter(x, series, c=pred_copod, s=3, cmap=cmap))
    handles.append(axis.scatter(x, series, c=df_labelled.Fraud, s=3, cmap=cmap1))
    axis.set_title(panel_title)
    axis.set_ylabel(panel_title)
a0, a1, a2, a3, a4, a5, a6, a7 = handles

# Only the bottom row carries the x-axis label.
for axis in ax[1]:
    axis.set_xlabel('Approved Loan Amount')

# One colorbar per layer type: a0 is a prediction layer, a3 a Fraud-label layer.
cbar0 = fig.colorbar(a0, location='bottom')
cbar1 = fig.colorbar(a3, location='bottom')

fig.tight_layout(pad=5.0)
fig.suptitle('Unsupervised COPOD Outlier Detection Plots comparing Approved Loan Amounts to Borrower Spending Overlaid With Fraud Status', wrap=True, horizontalalignment='center')

text = 'Figure: COPOD model with unsupervised learning setting an anomaly (contamination) rate of 1% based on varying estimates of potential fraudulent loans of 10-15%'

plt.figtext(0.5, 0.01, text, wrap=True, horizontalalignment='center', fontsize=12)
plt.show()

# t-SNE Dimensionality Reduction

In [None]:

# Apply t-SNE
# Project the feature frame to 2-D using all cores. random_state is fixed
# because the embedding is stochastic; without it the downstream outlier flags
# and figures change from run to run.
tsne = TSNE(n_components=2, n_jobs = -1, random_state=21)
loan_tsne = tsne.fit_transform(df_ppp_trunc_clean)

In [None]:
# Visualize t-SNE output: one very small marker per loan in the 2-D embedding
plt.scatter(loan_tsne[:,0], loan_tsne[:,1], s=0.01) # , c=iris.target
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE Output')
plt.show()

In [None]:
# Isolation forest on the 2-D t-SNE embedding, assumed outlier share 1%.
# random_state is fixed so the random trees, and thus the flags, are reproducible.
i_forest_tsne = IsolationForest(n_estimators=200, contamination=0.01, verbose=2, n_jobs=-1, random_state=21)
i_forest_tsne.fit(loan_tsne)

In [None]:
# Outlier flags for every embedded point, from the forest fitted on the embedding.
pred_tsne = i_forest_tsne.predict(loan_tsne)

In [None]:
# Forgiveness amount against four borrower characteristics, coloured by the
# outlier flags of the isolation forest fitted on the t-SNE embedding.
x = df_ppp_trunc_clean['ForgivenessAmount']
y1 = df_ppp_trunc_clean['RuralUrbanIndicator_cat']
y2 = df_ppp_trunc_clean['NonProfit_cat']
y3 = df_ppp_trunc_clean['Race_cat']
y4 = df_ppp_trunc_clean['NAICSCode']

cmap = 'winter'

# Create this cell's own figure instead of drawing on whatever fig/ax an
# earlier cell left behind, and colour by pred_tsne (`pred` is only computed
# later in the notebook, so it is undefined on a fresh top-to-bottom run).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=[10, 8])

a0 = ax[0, 0].scatter(x, y1, c=pred_tsne, s=3, cmap=cmap)
a1 = ax[0, 1].scatter(x, y2, c=pred_tsne, s=3, cmap=cmap)
a2 = ax[1, 0].scatter(x, y3, c=pred_tsne, s=3, cmap=cmap)
a3 = ax[1, 1].scatter(x, y4, c=pred_tsne, s=3, cmap=cmap)

ax[0, 0].set_title('Rural / Urban')
ax[0, 1].set_title('Nonprofit Status')
ax[1, 0].set_title('Race (Self-Reported)')
ax[1, 1].set_title('NAICS Category')

ax[0, 0].set_ylabel('Rural / Urban')
ax[0, 1].set_ylabel('Nonprofit Status')
ax[1, 0].set_ylabel('Race (Self-Reported)')
ax[1, 1].set_ylabel('NAICS Category')

ax[1, 0].set_xlabel('Forgiveness Amount')
ax[1, 1].set_xlabel('Forgiveness Amount')

cbar0 = fig.colorbar(a0, ax=ax[0][0])

fig.tight_layout(pad=5.0)
fig.suptitle('Unsupervised IsolationForest Outlier Detection Plots comparing Forgiveness Amount to Borrower Characteristics', wrap=True, horizontalalignment='center')
plt.show()




In [None]:
# t-SNE embedding coloured by the isolation-forest flags in pred_tsne.
fig, ax = plt.subplots(figsize = [10,10])
ax.set_title('Unsupervised IsolationForest outlier detection after t-SNE Dimensionality Reduction on PPP loan borrower data')

# Two-colour map: the flags only take the values -1 and 1 (see the colorbar ticks).
cmap = plt.get_cmap('coolwarm', 2)

plt.scatter(loan_tsne[:, 0], loan_tsne[:, 1], c=pred_tsne, s=0.01, cmap=cmap)

plt.colorbar(ticks=[-1,1])
# Caption placed at the bottom centre of the figure.
text = 'Figure: t-SNE dimensionality reduction of sub-sampled (~1%) primary loan data set including 24 borrower and loan features excluding address and loan servicing features. IsolationForest model with unsupervised learning setting an anomaly (contamination) rate of 1% based on varying estimates of potential fraudulent loans of 10-15%.'

plt.figtext(0.5, 0.01, text, wrap=True, horizontalalignment='center', fontsize=12)

In [None]:
# Anomaly score per embedded point, negated so that larger means more anomalous.
# Must come from the forest fitted on the t-SNE embedding (i_forest_tsne);
# `i_forest` is defined later and is trained on the full feature frame.
pred_scores = -1*i_forest_tsne.score_samples(loan_tsne)

In [None]:
# t-SNE embedding coloured by the continuous (negated) anomaly score.
plt.scatter(loan_tsne[:, 0], loan_tsne[:, 1], c=pred_scores, s=0.1, cmap='coolwarm')
plt.colorbar(label='Simplified Anomaly Score')
plt.show()

# PCA Analysis

In [None]:
# Apply PCA: project the feature frame onto its first two principal components
pca = PCA(n_components=2)
loan_pca = pca.fit_transform(df_ppp_trunc_clean)

In [None]:
# Visualize PCA output: one marker per loan in the 2-D projection
plt.scatter(loan_pca[:,0], loan_pca[:,1])
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('PCA Output')
plt.show()

In [None]:
# Isolation forest on the 2-D PCA projection, assumed outlier share 1%.
# random_state is fixed so the random trees, and thus the flags, are reproducible.
i_forest_pca = IsolationForest(n_estimators=200, contamination=0.01, verbose=2, random_state=21)

i_forest_pca.fit(loan_pca)


In [None]:
# Outlier flags for the PCA projection. Use the forest fitted on loan_pca
# (i_forest_pca); `i_forest` is defined later and trained on different features.
pred_pca = i_forest_pca.predict(loan_pca)

In [None]:
# PCA projection coloured by the outlier flags in pred_pca.
plt.scatter(loan_pca[:, 0], loan_pca[:, 1], c=pred_pca, s=2, cmap='coolwarm')

In [None]:
# Anomaly score per projected point, negated so that larger means more anomalous.
# Scored with the forest fitted on loan_pca (i_forest_pca), not the later `i_forest`.
pred_scores_pca = -1*i_forest_pca.score_samples(loan_pca)

In [None]:
# PCA projection coloured by the continuous (negated) anomaly score.
plt.scatter(loan_pca[:, 0], loan_pca[:, 1], c=pred_scores_pca, s=0.1, cmap='coolwarm')
plt.colorbar(label='Simplified Anomaly Score')
plt.show()

# Isolation Forest without Dimensionality Reduction

In [None]:
# Isolation forest on the full feature frame (no dimensionality reduction).
# random_state is fixed so the random trees, and thus the flags, are reproducible.
# NOTE(review): contamination is 0.001 (0.1%) here, while the figure captions
# in the plotting cells of this section state a rate of 1% — confirm which is intended.
i_forest = IsolationForest(n_estimators=200, contamination=0.001, verbose=2, n_jobs = 1, random_state=21)
i_forest.fit(df_ppp_trunc_clean)

In [None]:
# Outlier flags for the same rows the forest was fitted on.
pred = i_forest.predict(df_ppp_trunc_clean)

In [None]:
# trunc_cols = ['LoanNumber', 'Term', 'InitialApprovalAmount',
#        'CurrentApprovalAmount', 'UndisbursedAmount', 'JobsReported',
#        'NAICSCode', 'UTILITIES_PROCEED',
#        'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
#        'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
#        'DEBT_INTEREST_PROCEED', 'ForgivenessAmount', 'ProcessingMethod_cat', 'LoanStatus_cat',
#        'RuralUrbanIndicator_cat', 'HubzoneIndicator_cat', 'LMIIndicator_cat',
#        'Race_cat', 'Ethnicity_cat', 'BusinessType_cat', 'Gender_cat',
#        'Veteran_cat', 'NonProfit_cat']

In [None]:
# Quick look: approved amount vs. payroll proceeds, coloured by the outlier flags in pred.
plt.scatter(df_ppp_trunc_clean['CurrentApprovalAmount'], df_ppp_trunc_clean['PAYROLL_PROCEED'], c=pred, s=2, cmap='RdBu')

In [None]:
# 2x2 grid: approved amount on x, one spending category per panel on y. Each
# panel gets two layers: isolation-forest flags (cmap) and, drawn on top, the
# known Fraud labels (cmap1).
x = df_ppp_trunc_clean['CurrentApprovalAmount']
y1 = df_ppp_trunc_clean['PAYROLL_PROCEED']
y2 = df_ppp_trunc_clean['UTILITIES_PROCEED']
y3 = df_ppp_trunc_clean['MORTGAGE_INTEREST_PROCEED']
y4 = df_ppp_trunc_clean['RENT_PROCEED']

labels = ['PAYROLL_PROCEED', 'UTILITIES_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED']

cmap = 'winter'
cmap1 = 'binary'

fig, ax = plt.subplots(nrows=2, ncols=2, figsize = [10,8])

panel_titles = ['Payroll Spending', 'Utilities Spending', 'Mortgage Interest Spending', 'Rent Spending']
handles = []
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for axis, series, panel_title in zip(ax.flat, (y1, y2, y3, y4), panel_titles):
    handles.append(axis.scatter(x, series, c=pred, s=3, cmap=cmap))
    handles.append(axis.scatter(x, series, c=df_labelled.Fraud, s=3, cmap=cmap1))
    axis.set_title(panel_title)
    axis.set_ylabel(panel_title)
a0, a1, a2, a3, a4, a5, a6, a7 = handles

# Only the bottom row carries the x-axis label.
for axis in ax[1]:
    axis.set_xlabel('Approved Loan Amount')

# One colorbar per layer type: a0 is a prediction layer, a3 a Fraud-label layer.
cbar0 = fig.colorbar(a0, location='bottom')
cbar1 = fig.colorbar(a3, location='bottom')

fig.tight_layout(pad=5.0)
fig.suptitle('Unsupervised IsolationForest Outlier Detection Plots comparing Approved Loan Amounts to Borrower Spending With Fraud Status', wrap=True, horizontalalignment='center')

text = 'Figure: IsolationForest model with unsupervised learning setting an anomaly (contamination) rate of 1% based on varying estimates of potential fraudulent loans of 10-15%'

plt.figtext(0.5, 0.01, text, wrap=True, horizontalalignment='center', fontsize=12)
plt.show()

In [None]:
# trunc_cols = ['LoanNumber', 'Term', 'InitialApprovalAmount',
#        'CurrentApprovalAmount', 'UndisbursedAmount', 'JobsReported',
#        'NAICSCode', 'UTILITIES_PROCEED',
#        'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
#        'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
#        'DEBT_INTEREST_PROCEED', 'ForgivenessAmount', 'ProcessingMethod_cat', 'LoanStatus_cat',
#        'RuralUrbanIndicator_cat', 'HubzoneIndicator_cat', 'LMIIndicator_cat',
#        'Race_cat', 'Ethnicity_cat', 'BusinessType_cat', 'Gender_cat',
#        'Veteran_cat', 'NonProfit_cat']

In [None]:
# 2x2 grid: forgiveness amount on x, one borrower characteristic per panel on y,
# coloured by the isolation-forest flags in pred; every panel gets its own colorbar.
x = df_ppp_trunc_clean['ForgivenessAmount']
y1 = df_ppp_trunc_clean['RuralUrbanIndicator_cat']
y2 = df_ppp_trunc_clean['NonProfit_cat']
y3 = df_ppp_trunc_clean['Race_cat']
y4 = df_ppp_trunc_clean['NAICSCode']

cmap = 'winter'

fig, ax = plt.subplots(nrows=2, ncols=2, figsize = [10,8])

panel_titles = ['Rural / Urban', 'Nonprofit Status', 'Race (Self-Reported)', 'NAICS Category']
handles = []
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for axis, series, panel_title in zip(ax.flat, (y1, y2, y3, y4), panel_titles):
    handles.append(axis.scatter(x, series, c=pred, s=3, cmap=cmap))
    axis.set_title(panel_title)
    axis.set_ylabel(panel_title)
a0, a1, a2, a3 = handles

# Only the bottom row carries the x-axis label.
for axis in ax[1]:
    axis.set_xlabel('Forgiveness Amount')

# Colorbars are created in panel order, each attached to its own axes.
cbar0, cbar1, cbar2, cbar3 = (
    fig.colorbar(handle, ax=axis) for handle, axis in zip(handles, ax.flat)
)

fig.tight_layout(pad=5.0)
fig.suptitle('Unsupervised IsolationForest Outlier Detection Plots comparing Forgiveness Amount to Borrower Characteristics', wrap=True, horizontalalignment='center')

text = 'Figure: IsolationForest model with unsupervised learning setting an anomaly (contamination) rate of 1% based on varying estimates of potential fraudulent loans of 10-15%'

plt.figtext(0.5, 0.01, text, wrap=True, horizontalalignment='center', fontsize=12)
plt.show()