## Introduction

This notebook runs a co-training model as a semi-supervised example on the full processed data set. Its results are not directly comparable to the performance of the other models on the train/test set.

In [None]:
import os
import re
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
!pip install pyod
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier



In [None]:
from pyod.models.xgbod import XGBOD
import matplotlib.pyplot as plt
from xgboost import plot_tree
from xgboost import plot_importance
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [None]:
# Mount Google Drive so this notebook can read the project files stored there.
# Colab asks for access permission to your Drive when this cell runs.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
# To force a fresh mount instead, call:
# drive.mount("/content/drive", force_remount=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the processed modelling data set from the mounted Drive folder.
loan_csv_path = '/content/drive/MyDrive/datasci-210-project/fraud-detect/data/processed/modelling_data_cat_preproc.csv'
loan = pd.read_csv(loan_csv_path)


In [None]:
# Collect every column name of the loaded frame and display the list.
columns_list = list(loan.columns)
columns_list

['LoanNumber',
 'InitialApprovalAmount',
 'UTILITIES_PROCEED',
 'PAYROLL_PROCEED',
 'MORTGAGE_INTEREST_PROCEED',
 'RENT_PROCEED',
 'REFINANCE_EIDL_PROCEED',
 'HEALTH_CARE_PROCEED',
 'DEBT_INTEREST_PROCEED',
 'ForgivenessAmount',
 'pay_ratio',
 'pay_ratio_binary',
 'SBAOfficeCode_cat',
 'ProcessingMethod_cat',
 'LoanStatus_cat',
 'Term_cat',
 'ServicingLenderName_cat',
 'RuralUrbanIndicator_cat',
 'HubzoneIndicator_cat',
 'LMIIndicator_cat',
 'BusinessAgeDescription_cat',
 'CD_cat',
 'Race_cat',
 'Ethnicity_cat',
 'BusinessType_cat',
 'OriginatingLender_cat',
 'Gender_cat',
 'Veteran_cat',
 'NonProfit_cat',
 'NAICSCode_2digits_cat',
 'Verified_Address_cat',
 'CaseLabel_cat']

In [None]:
# Distinct values present in the case label column.
unique_values = pd.unique(loan['CaseLabel_cat'])
unique_values

array([0, 1])

In [None]:
# Rows whose case label is 0 form the unlabeled pool used for co-training.
is_unlabeled = loan['CaseLabel_cat'] == 0
unlabeled_data = loan.loc[is_unlabeled]

# Remove the label and the loan number so they are not passed to the classifiers.
X_unlabeled = unlabeled_data.drop(columns=['CaseLabel_cat', 'LoanNumber'])

In [None]:
# Load the same train and test split files that the other models use.
# All four CSVs live in one folder and share the same date suffix.
processed_dir = '/content/drive/MyDrive/datasci-210-project/fraud-detect/data/processed'
X_train, X_test, y_train, y_test = (
    pd.read_csv(f'{processed_dir}/{split_name} 20231212.csv')
    for split_name in ('x_train', 'x_test', 'y_train', 'y_test')
)

In [None]:
# Columns that are cast to int on X_unlabeled before the classifiers predict on it.
columns_to_convert = [
    'InitialApprovalAmount',
]

In [None]:
# Remove the two columns that are not used as model inputs, and reduce each
# label frame to its single target column.
non_feature_columns = ['Unnamed: 0', 'pay_100k']

X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test.drop(columns=non_feature_columns)

y_train = y_train['Labelled_Loan_Binary']
y_test = y_test['Labelled_Loan_Binary']


In [None]:
# Co-training iteratively trains classifiers on different views of the data,
# selects confident predictions on unlabeled rows, adds them to the labeled set,
# and retrains the classifiers. The process refines the model by leveraging
# both labeled and unlabeled data.

# Initialize two independent classifiers, one per view.
classifer1 = MultinomialNB()
# Seed the tree: scikit-learn permutes features at every split and breaks ties
# between equally good splits at random, so an unseeded tree can produce
# different pseudo-labels from one run to the next.
classifer2 = DecisionTreeClassifier(random_state=42)

In [None]:
# Divide the labeled data into two views: each classifier sees a different,
# non-overlapping subset of the columns.
features1 = [
    'pay_ratio',
    'SBAOfficeCode_cat',
    'PAYROLL_PROCEED',
    'InitialApprovalAmount',
    'Term_cat',
    'NAICSCode_2digits_cat',
]
# 'OriginatingLender_cat' was listed twice here; the repeat is removed so the
# selected frame has unique column labels.
# NOTE(review): the second entry may have been meant to be another column - confirm.
features2 = [
    'OriginatingLender_cat',
    'ForgivenessAmount',
    'Race_cat',
    'CD_cat',
    'ServicingLenderName_cat',
]

In [None]:
# Fit each classifier on its own view (column subset) of the labeled training data.
view1_train = X_train[features1]
view2_train = X_train[features2]

classifer1.fit(view1_train, y_train)
classifer2.fit(view2_train, y_train)

In [None]:
# Cast the columns named in columns_to_convert to int on the unlabeled frame.
int_casts = {column: int for column in columns_to_convert}
X_unlabeled = X_unlabeled.astype(int_casts)

In [None]:
# Predict a label for every unlabeled row, each classifier using its own view.
predictions1, predictions2 = (
    model.predict(X_unlabeled[view])
    for model, view in ((classifer1, features1), (classifer2, features2))
)

In [None]:
# Boolean mask of the unlabeled rows on which both classifiers predict the same label.
agree_indices = np.equal(predictions1, predictions2)

# No probability threshold is applied: agreement alone is what counts as a
# "confident" prediction here.
confident_agree_indices = agree_indices

In [None]:
# Wrap the current training features and labels as DataFrames so the
# pseudo-labeled rows can be appended to them with pd.concat.
X_train_df, y_train_df = map(pd.DataFrame, (X_train, y_train))

In [None]:
# Add the rows selected by confident_agree_indices, with their predicted labels,
# to the labeled training data. X_train / y_train are left as they are when no
# row was selected.
if confident_agree_indices.any():
    # Select the rows once and align them to the training columns. A column that
    # is missing from X_unlabeled raises a KeyError here, instead of pd.concat
    # silently creating NaN-filled columns; columns the training frame does not
    # have are left out.
    X_confident = X_unlabeled.loc[confident_agree_indices, list(X_train_df.columns)]
    y_confident = predictions1[confident_agree_indices]  # same as predictions2 on these rows

    # One-column label frame, named like the existing label column so that
    # concat stacks the labels vertically.
    y_confident_series = pd.DataFrame({'Labelled_Loan_Binary': y_confident})

    # X_confident is already a DataFrame; the name is kept for later cells.
    X_confident_df = X_confident

    # Append the new rows below the existing labeled rows with a fresh index.
    X_train = pd.concat([X_train_df, X_confident_df], ignore_index=True)
    y_train = pd.concat([y_train_df, y_confident_series], ignore_index=True)

In [None]:
# Stack features and labels side by side into one NumPy array (labels become
# the last column), then split them again so both are plain arrays.
combined_data = np.column_stack((X_train, y_train))

X_train, y_train = combined_data[:, :-1], combined_data[:, -1]

In [None]:
# Retrain classifer1 on the updated labeled dataset (original rows plus the
# pseudo-labeled rows). X_train holds every feature column at this point, not
# only features1, and classifer2 is not refit in this cell.
classifer1.fit(X_train, y_train)  # last expression, so the fitted estimator is displayed

In [None]:
# Score the retrained classifer1 on the held-out test split.
predictions_test = classifer1.predict(X_test)

# Individual metrics, kept as variables alongside the printed report.
accuracy, precision, recall, f1 = (
    metric(y_test, predictions_test)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

report = classification_report(y_test, predictions_test, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9306    0.7349    0.8212      1441
           1     0.1075    0.3680    0.1664       125

    accuracy                         0.7056      1566
   macro avg     0.5190    0.5515    0.4938      1566
weighted avg     0.8649    0.7056    0.7690      1566



