<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/ETL/PPP_Loan_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
import hashlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Path to the PPP loan applications CSV (a /content/drive location, i.e.
# presumably a mounted Google Drive in Colab).
PPP_LOAN_APPS_CSV = "/content/drive/MyDrive/NCU/Dissertation/Data/PPP_Loan_apps.csv"

# Read the raw applications; later cells copy from this frame.
ppp_data_raw = pd.read_csv(PPP_LOAN_APPS_CSV)
ppp_data_raw

In [None]:
# Count missing values per column of the raw PPP data.
# ppp_data_raw is used because ppp_data is only created in a later cell;
# referencing ppp_data here raises NameError on a fresh kernel.
nan_counts = ppp_data_raw.isna().sum()
print(nan_counts)

In [None]:
# List the distinct values of the ProcessingMethod column in the raw data.
# ppp_data_raw is used because ppp_data does not exist until a later cell.
# NOTE: the variable name is kept as-is, but it holds ProcessingMethod values,
# not loan statuses.
distinct_loan_status = ppp_data_raw['ProcessingMethod'].unique()
print(distinct_loan_status)

Generate a sample known-fraud dataset

In [None]:

# Build a randomly generated sample of "known fraud" records.
num_records = 100

# Seeded generator so that Restart & Run All reproduces the same sample.
rng = np.random.default_rng(42)

# Candidate approval dates: every calendar day of 2020 and 2021.
approval_dates = pd.date_range('2020-01-01', '2021-12-31').to_numpy()

fraud_records = {
    # dtype is explicit because the upper bound does not fit in a 32-bit
    # integer; the upper bound itself is exclusive.
    'LoanNumber': rng.integers(1000000000, 9999999999, size=num_records, dtype=np.int64),
    # Dates are stored as 'YYYY-MM-DD' strings.
    'DateApproved': pd.to_datetime(rng.choice(approval_dates, size=num_records)).strftime('%Y-%m-%d'),
    # Company1 ... Company<num_records>
    'BorrowerName': ['Company' + str(i) for i in range(1, num_records + 1)],
    'FraudAmount': rng.integers(100000, 500000, size=num_records),
    'LoanStatus': rng.choice(['Charged Off', 'Paid in Full', 'Exempt'], size=num_records),
    'FraudDescription': rng.choice(
        [
            'Submitted false payroll information',
            'Created a shell company to receive loan',
            'Misused funds for personal expenses',
            'Other fraudulent activity',
        ],
        size=num_records,
    ),
    'Source': rng.choice(['DOJ', 'SBA OIG', 'PRAC', 'Other'], size=num_records),
}

# A separate name is used for the dict so known_fraud_cases is always a DataFrame.
known_fraud_cases = pd.DataFrame(fraud_records)

# Show only the first rows instead of printing the whole frame.
known_fraud_cases.head()

Step 1: Data Collection - Loading PPP Data and Known Fraud Cases
We’ll first load both the PPP dataset and the known_fraud_cases.csv file created from official sources.

In [None]:
# Known fraud cases are the positive class (fraud_label = 1).
known_fraud_cases = known_fraud_cases.assign(fraud_label=1)

# Work on a copy of the raw PPP data; every loan starts out labelled as
# non-fraud (fraud_label = 0).
ppp_data = ppp_data_raw.assign(fraud_label=0)

# Unlabeled copy for the unsupervised workflow.
ppp_unsupervised = ppp_data.copy()

# PPP rows stacked with the known fraud cases for the semi-supervised workflow.
ppp_supervised = pd.concat([ppp_data, known_fraud_cases], ignore_index=True)

Step 2: Preprocessing for Unsupervised Learning: Combined Dataset
The combined dataset includes known fraud cases and general PPP data. Our focus here is on ensuring that critical fields are preserved, categorical fields are encoded, and any essential fields with missing values are handled in alignment with Chapter 3.

In [None]:
# Step 1: Hash PII Fields (BorrowerName, BorrowerAddress, FranchiseName, ServicingLenderAddress, ServicingLenderName, OriginatingLender)
def hash_column(df, column):
    """Replace ``df[column]`` with an integer digest of each value and return ``df``.

    Each value is converted with ``str()``, hashed with SHA-256 and reduced
    modulo 10**8. The frame is modified in place and also returned so the
    call can be used in an assignment. Missing values are hashed through
    their string form like any other value.
    """
    df[column] = df[column].apply(lambda x: int(hashlib.sha256(str(x).encode()).hexdigest(), 16) % (10 ** 8))
    return df

# Hash specified PII columns
pii_columns = ['BorrowerName', 'BorrowerAddress', 'FranchiseName', 'ServicingLenderAddress', 'ServicingLenderName', 'OriginatingLender']
for col in pii_columns:
    ppp_unsupervised = hash_column(ppp_unsupervised, col)

# Step 2: Convert Date Columns to Year, Month, Day Features (DateApproved, LoanStatusDate, ForgivenessDate)
date_columns = ['DateApproved', 'LoanStatusDate', 'ForgivenessDate']
for date_col in date_columns:
    parsed_dates = pd.to_datetime(ppp_unsupervised[date_col], errors='coerce')  # Invalid dates become NaT
    # NaT parts are filled with 0 so the derived columns can be plain integers.
    ppp_unsupervised[f'{date_col}_year'] = parsed_dates.dt.year.fillna(0).astype(int)
    ppp_unsupervised[f'{date_col}_month'] = parsed_dates.dt.month.fillna(0).astype(int)
    ppp_unsupervised[f'{date_col}_day'] = parsed_dates.dt.day.fillna(0).astype(int)
# Drop the original date columns once their parts have been extracted.
ppp_unsupervised = ppp_unsupervised.drop(columns=date_columns)

# Step 3: Label Encode Categorical Columns
label_encode_cols = [
    'BorrowerCity', 'BorrowerState', 'LoanStatus', 'ServicingLenderCity', 'ServicingLenderState',
    'BusinessAgeDescription', 'ProjectCity', 'ProjectCountyName', 'ProjectState', 'BusinessType',
    'OriginatingLenderCity', 'OriginatingLenderState', 'Race', 'Ethnicity'
]

# Fitted encoders are kept per column so codes can be mapped back to labels.
label_encoders = {}
for col in label_encode_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into a string category of their own.
    ppp_unsupervised[col] = le.fit_transform(ppp_unsupervised[col].astype(str))
    label_encoders[col] = le

# Step 4: One-Hot Encode Binary Columns
one_hot_cols = [
    'ProcessingMethod', 'RuralUrbanIndicator', 'HubzoneIndicator', 'LMIIndicator', 'Gender',
    'Veteran', 'NonProfit'
]
ppp_unsupervised = pd.get_dummies(ppp_unsupervised, columns=one_hot_cols, drop_first=True)

# Step 5: Mode Impute for NAICSCode
ppp_unsupervised['NAICSCode'] = ppp_unsupervised['NAICSCode'].fillna(ppp_unsupervised['NAICSCode'].mode()[0])

# Step 6: Median Impute for Financial and Numeric Columns
numeric_cols = [
    'CurrentApprovalAmount', 'InitialApprovalAmount', 'JobsReported', 'ForgivenessAmount',
    'UTILITIES_PROCEED', 'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
    'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED', 'DEBT_INTEREST_PROCEED', 'UndisbursedAmount'
]
for col in numeric_cols:
    ppp_unsupervised[col] = ppp_unsupervised[col].fillna(ppp_unsupervised[col].median())

# Step 7: Convert Location Identifiers to Numeric (BorrowerZip, ServicingLenderZip, ProjectZip, CD)
location_cols = ['BorrowerZip', 'ServicingLenderZip', 'ProjectZip', 'CD']
for col in location_cols:
    numeric_location = pd.to_numeric(ppp_unsupervised[col], errors='coerce')
    ppp_unsupervised[col] = numeric_location.fillna(0).astype(int)  # Unparseable values become 0

# Step 8: Derived Column - ForgivenessAmountRatio
# Dividing by a zero approval amount gives +/-inf (or NaN for 0/0). Both are
# mapped to 0; fillna alone would leave the infinities in the data.
# The result is assigned back rather than filled with inplace=True on a
# column selection, which does not reliably update the frame.
forgiveness_ratio = ppp_unsupervised['ForgivenessAmount'] / ppp_unsupervised['CurrentApprovalAmount']
ppp_unsupervised['ForgivenessAmountRatio'] = forgiveness_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# Step 9: Verify No Missing Values
# NOTE(review): columns not handled in the steps above may still contain NaN,
# so this total is a check and is not guaranteed to be 0.
missing_values = ppp_unsupervised.isnull().sum().sum()
print("Total Missing Values after Preprocessing:", missing_values)

# Retain LoanNumber and SBAOfficeCode without modification

print("Processed ppp_data sample:\n", ppp_unsupervised.head())
print("Processed data column types:\n", ppp_unsupervised.dtypes)


Preprocessing for Semi-Supervised Learning

In [None]:
# Step 1: Hash PII Fields (BorrowerName, BorrowerAddress, FranchiseName, ServicingLenderAddress, ServicingLenderName, OriginatingLender)
# Each value is converted with str(), hashed with SHA-256 and reduced modulo
# 10**8. The hashing is written inline so this cell does not define a second
# copy of a helper function.
pii_columns = ['BorrowerName', 'BorrowerAddress', 'FranchiseName', 'ServicingLenderAddress', 'ServicingLenderName', 'OriginatingLender']
for col in pii_columns:
    ppp_supervised[col] = ppp_supervised[col].apply(
        lambda value: int(hashlib.sha256(str(value).encode()).hexdigest(), 16) % (10 ** 8)
    )

# Step 2: Convert Date Columns to Year, Month, Day Features (DateApproved, LoanStatusDate, ForgivenessDate)
date_columns = ['DateApproved', 'LoanStatusDate', 'ForgivenessDate']
for date_col in date_columns:
    parsed_dates = pd.to_datetime(ppp_supervised[date_col], errors='coerce')  # Invalid dates become NaT
    # NaT parts are filled with 0 so the derived columns can be plain integers.
    ppp_supervised[f'{date_col}_year'] = parsed_dates.dt.year.fillna(0).astype(int)
    ppp_supervised[f'{date_col}_month'] = parsed_dates.dt.month.fillna(0).astype(int)
    ppp_supervised[f'{date_col}_day'] = parsed_dates.dt.day.fillna(0).astype(int)
# Drop the original date columns once their parts have been extracted.
ppp_supervised = ppp_supervised.drop(columns=date_columns)

# Step 3: Label Encode Categorical Columns
label_encode_cols = [
    'BorrowerCity', 'BorrowerState', 'LoanStatus', 'ServicingLenderCity', 'ServicingLenderState',
    'BusinessAgeDescription', 'ProjectCity', 'ProjectCountyName', 'ProjectState', 'BusinessType',
    'OriginatingLenderCity', 'OriginatingLenderState', 'Race', 'Ethnicity'
]

# Fitted encoders are kept per column so codes can be mapped back to labels.
label_encoders = {}
for col in label_encode_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into a string category of their own.
    ppp_supervised[col] = le.fit_transform(ppp_supervised[col].astype(str))
    label_encoders[col] = le

# Step 4: One-Hot Encode Binary Columns
one_hot_cols = [
    'ProcessingMethod', 'RuralUrbanIndicator', 'HubzoneIndicator', 'LMIIndicator', 'Gender',
    'Veteran', 'NonProfit'
]
ppp_supervised = pd.get_dummies(ppp_supervised, columns=one_hot_cols, drop_first=True)

# Step 5: Mode Impute for NAICSCode
ppp_supervised['NAICSCode'] = ppp_supervised['NAICSCode'].fillna(ppp_supervised['NAICSCode'].mode()[0])

# Step 6: Median Impute for Financial and Numeric Columns
numeric_cols = [
    'CurrentApprovalAmount', 'InitialApprovalAmount', 'JobsReported', 'ForgivenessAmount',
    'UTILITIES_PROCEED', 'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
    'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED', 'DEBT_INTEREST_PROCEED', 'UndisbursedAmount'
]
for col in numeric_cols:
    ppp_supervised[col] = ppp_supervised[col].fillna(ppp_supervised[col].median())

# Step 7: Convert Location Identifiers to Numeric (BorrowerZip, ServicingLenderZip, ProjectZip, CD)
location_cols = ['BorrowerZip', 'ServicingLenderZip', 'ProjectZip', 'CD']
for col in location_cols:
    numeric_location = pd.to_numeric(ppp_supervised[col], errors='coerce')
    ppp_supervised[col] = numeric_location.fillna(0).astype(int)  # Unparseable values become 0

# Step 8: Derived Column - ForgivenessAmountRatio
# Dividing by a zero approval amount gives +/-inf (or NaN for 0/0). Both are
# mapped to 0; fillna alone would leave the infinities in the data.
# The result is assigned back rather than filled with inplace=True on a
# column selection, which does not reliably update the frame.
forgiveness_ratio = ppp_supervised['ForgivenessAmount'] / ppp_supervised['CurrentApprovalAmount']
ppp_supervised['ForgivenessAmountRatio'] = forgiveness_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# Step 9: Verify No Missing Values
# NOTE(review): this frame is a concat of the PPP rows and the known fraud
# cases. Columns present in only one of the two inputs and not handled above
# (e.g. FraudAmount, FraudDescription, Source, unless the CSV also has them)
# stay NaN for the other rows, so expect a non-zero total here.
missing_values = ppp_supervised.isnull().sum().sum()
print("Total Missing Values after Preprocessing:", missing_values)

# Retain LoanNumber and SBAOfficeCode without modification

print("Processed ppp_data sample:\n", ppp_supervised.head())
print("Processed data column types:\n", ppp_supervised.dtypes)


Unsupervised Learning



In [None]:
# Step 1: Build the feature matrix from ppp_unsupervised, excluding the fraud label.
# The label column is named 'fraud_label'. errors='ignore' keeps the cell
# runnable if the column has already been removed.
X_unsupervised = ppp_unsupervised.drop(columns=['fraud_label'], errors='ignore')

In [None]:
# Step 2: Fit a PCA that keeps every component, then use the cumulative
# explained variance to decide how many components are needed.
pca_full = PCA().fit(X_unsupervised)

# Running total of the per-component explained variance ratios.
explained_variance_ratio = pca_full.explained_variance_ratio_.cumsum()

# Cumulative explained variance against the number of components kept.
component_counts = np.arange(1, explained_variance_ratio.size + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, explained_variance_ratio, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Cumulative Explained Variance')
ax.grid(True)
plt.show()

# First position where the cumulative ratio reaches 0.98, converted from a
# 0-based index to a component count.
reaches_target = explained_variance_ratio >= 0.98
optimal_components = np.argmax(reaches_target) + 1
print(f"Optimal number of components to retain 98% variance: {optimal_components}")

In [None]:
# Step 3: Apply PCA with the determined optimal number of components.
# random_state is fixed so the projection is reproducible if scikit-learn's
# automatic solver choice selects a randomized SVD.
pca = PCA(n_components=optimal_components, random_state=42)
X_reduced = pca.fit_transform(X_unsupervised)

In [None]:
# Step 4: K-Means Clustering with Optimal K (Elbow Method)
K_range = range(1, 11)  # Candidate cluster counts: 1 through 10
sum_of_squared_distances = []

for K in K_range:
    # Fit on the PCA-reduced data and record the inertia for this K.
    kmeans = KMeans(n_clusters=K, random_state=42).fit(X_reduced)
    sum_of_squared_distances.append(kmeans.inertia_)

# Elbow curve: inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, sum_of_squared_distances, marker='o')
ax.set_title('Elbow Method for Optimal K in K-Means')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Sum of Squared Distances (Inertia)')
ax.grid(True)
plt.show()

In [None]:
# Number of clusters, chosen by reading the elbow plot (e.g. K=4 if the elbow
# is observed there). Update this after inspecting the plot.
optimal_K = 10

# Run K-Means with the chosen K on the PCA-reduced data.
kmeans = KMeans(n_clusters=optimal_K, random_state=42)
kmeans_labels = kmeans.fit_predict(X_reduced)

# Step 5: DBSCAN for Anomaly Detection (optional, on PCA-reduced data).
# DBSCAN comes from the notebook's top import cell, so this cell also runs on
# a fresh kernel. Points DBSCAN treats as noise get the label -1.
dbscan = DBSCAN(eps=0.5, min_samples=10)
dbscan_labels = dbscan.fit_predict(X_reduced)

# Add cluster labels to the ppp_unsupervised DataFrame
ppp_unsupervised['kmeans_cluster'] = kmeans_labels
ppp_unsupervised['dbscan_cluster'] = dbscan_labels

# Visualization of K-Means Clustering in Reduced Dimensions
fig, ax = plt.subplots(figsize=(8, 5))

if X_reduced.shape[1] > 1:
    # At least two components: plot the first against the second.
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=kmeans_labels, cmap='viridis', s=10)
    ax.set_ylabel('Principal Component 2')
else:
    # Only one component: plot it against the row position instead.
    ax.scatter(X_reduced[:, 0], range(len(X_reduced)), c=kmeans_labels, cmap='viridis', s=10)
    ax.set_ylabel('Data Point Index')

ax.set_title(f'K-Means Clustering on PCA-Reduced PPP Unsupervised Data (K={optimal_K})')
ax.set_xlabel('Principal Component 1')
plt.show()

In [None]:
# DBSCAN on the preprocessed features without PCA.
# Exclude the fraud label and any cluster assignments already stored on the
# frame, so DBSCAN is fitted on features only. errors='ignore' lets the cell
# run whether or not those columns exist yet.
non_feature_cols = ['fraud_label', 'kmeans_cluster', 'dbscan_cluster']
X_original = ppp_unsupervised.drop(columns=non_feature_cols, errors='ignore')

# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=10)
dbscan_labels = dbscan.fit_predict(X_original)

# Store the labels and visualize the clusters with a pair plot.
ppp_unsupervised['dbscan_cluster'] = dbscan_labels
# NOTE(review): pairplot draws a panel for every pair of numeric columns; on
# the full frame this is likely to be very slow. Consider passing a column
# subset and/or a row sample.
sns.pairplot(ppp_unsupervised, hue='dbscan_cluster', palette='viridis', plot_kws={'alpha':0.5})
plt.show()