# Semi-Supervised Models

Here, we use the train / test split created in the data preprocessing notebook to train and evaluate the semi-supervised model XGBOD.

# Notebook Setup

In [None]:
# Check instance specs
# !df -h
# !cat /proc/cpuinfo
# !cat /proc/meminfo

In [None]:
import os
import re
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import seaborn as sns
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn import svm
from xgboost import plot_tree
from xgboost import plot_importance
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
%matplotlib inline

In [None]:
# Installation of PyOD library for XGBOD semi-supervised model
# NOTE(review): the install is unpinned, so the PyOD version resolved here can differ
# between runs; consider pinning a version for reproducibility.
%pip install pyod
from pyod.models.xgbod import XGBOD
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print



## Notebook Functions / Variables

In [None]:
# Shared settings reused by the modelling cells in this notebook
n_jobs = -1  # -1 lets models that accept n_jobs run on multiple cores
contamination = 0.08  # target outlier rate identified from the literature review
random_state = 21  # fixed seed handed to the models

## Connecting Google Drive for data storage


In [None]:
# Mount Google Drive and confirm access permissions to permitaccess for data located on Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Specify project directory personal filepaths under Google Drive
user_proj_path = ''

proj_dir = '/content/drive/MyDrive/' + user_proj_path + 'plodi/'
proj_dir_data_raw = proj_dir + 'data/raw/'
proj_dir_data_proc = proj_dir + 'data/processed/'

dirs = [proj_dir, proj_dir_data_raw, proj_dir_data_proc]

for dir in dirs:
    print('-----------------------------------------------------------------------')
    print('Directory contents for ', dir)
    %ls {dir}

Mounted at /content/drive
-----------------------------------------------------------------------
Directory contents for  /content/drive/MyDrive/plodi/
[0m[01;34mdata[0m/  [01;34mnotebooks[0m/
-----------------------------------------------------------------------
Directory contents for  /content/drive/MyDrive/plodi/data/raw/
2-6%20digit_2017_Codes.xlsx      public_up_to_150k_11_230630.csv  public_up_to_150k_6_230630.csv
2-6%20digit_2022_Codes.xlsx      public_up_to_150k_12_230630.csv  public_up_to_150k_7_230630.csv
6-digit_2017_Codes.xlsx          public_up_to_150k_1_230630.csv   public_up_to_150k_8_230630.csv
6-digit_2022_Codes.xlsx          public_up_to_150k_2_230630.csv   public_up_to_150k_9_230630.csv
ppp-data-dictionary.xlsx         public_up_to_150k_3_230630.csv   sba_ppp_combined.csv
public_150k_plus_230630.csv      public_up_to_150k_4_230630.csv   sba_ppp_merged.csv
public_up_to_150k_10_230630.csv  public_up_to_150k_5_230630.csv   US_Census_data.xlsx
---------------------

In [None]:
# drive.mount("/content/drive", force_remount=True) # Run if the Google drive needs to be remounted

# Run after completing Notebook run to gracefully disconnect Gdrive
# drive.flush_and_unmount()

Mounted at /content/drive


# Load Preprocessed Data and Model Downsampled Test / Train Data

In [None]:
# Load the full modelling table plus the downsampled train/test splits,
# every frame indexed by the loan identifier column.
index_col = 'LoanNumber'


def _read_processed(file_name):
    """Read one CSV from the processed-data directory, indexed by `index_col`."""
    return pd.read_csv(f'{proj_dir_data_proc}{file_name}', header=0, index_col=index_col)


df_model_data = _read_processed('modelling_data_cat_preproc.csv')
X_train, X_test = _read_processed('x_train.csv'), _read_processed('x_test.csv')
y_train, y_test = _read_processed('y_train.csv'), _read_processed('y_test.csv')

# Semi-Supervised Learning

## XGBOD (Extreme Gradient Boosting Outlier Detection)

In [None]:
# Fit XGBOD on the labelled training split.
clf_name = 'XGBOD'
# `contamination` is intentionally not passed: the recorded output of this cell shows
# XGBoost reporting it as "not used", so it never influenced the fitted model.
# NOTE(review): `silent` is reported the same way but is left in place here; confirm
# against the installed PyOD / XGBoost versions before removing it.
clf = XGBOD(random_state=random_state, silent=False, n_jobs=n_jobs)
# y_train is read from CSV as a single-column DataFrame; flatten it to shape
# (n_samples,) so scikit-learn does not emit the column-vector warning during fit.
clf.fit(X_train, y_train.to_numpy().ravel())

  y = column_or_1d(y, warn=True)
Parameters: { "contamination", "silent" } are not used.



XGBOD(base_score=0.5, booster='gbtree', colsample_bylevel=1,
   colsample_bytree=1,
   estimator_list=[KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
  radius=1.0), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=1, no...x_features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=21,
    verbose=0)],
   gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
   min_child_weight=1, n_estimators=100, n_jobs=-1, nthread=None,
   objective='binary:logistic', random_state=21, reg_alpha=0, reg_lambda=1,
   scale_pos_weight=1, silent=False,
   standardization_flag_list=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False

In [None]:
# Report ROC and precision @ rank n on the data the model was fitted on
print('Training Data:')
evaluate_print(clf_name=clf_name, y=y_train, y_pred=clf.decision_scores_)

Training Data:
XGBOD ROC:0.9254, precision @ rank n:0.6008


In [None]:
# Score the held-out split: continuous outlier scores first, then the predicted labels
y_test_scores = clf.decision_function(X_test)
y_test_pred = clf.predict(X_test)

print('Test Data:')
evaluate_print(clf_name=clf_name, y=y_test, y_pred=y_test_scores)

Test Data:
XGBOD ROC:0.8059, precision @ rank n:0.464


In [None]:
# Per-class precision / recall / F1 on the test split, printed to four decimal places
print(classification_report(y_true=y_test, y_pred=y_test_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9362    0.9986    0.9664      1441
           1     0.9310    0.2160    0.3506       125

    accuracy                         0.9361      1566
   macro avg     0.9336    0.6073    0.6585      1566
weighted avg     0.9358    0.9361    0.9173      1566



In [None]:
# Display the fitted detector's parameters (including its list of base estimators) for the record
clf.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'estimator_list': [KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=1, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, n_jobs=1, n_neighbors=3, novelty=True, p=2),
  KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
    metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
    radius=1.0),
  LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
    metric_params=None, 