In [1]:
import pandas as pd
import re

# === STEP 1: Load Raw PPP Loan Data ===
def load_ppp_loan_data(filepath):
    """
    Read the PPP loan CSV at ``filepath`` into a DataFrame.

    The file is decoded as UTF-8 first; if that raises a
    ``UnicodeDecodeError`` the read is retried as ISO-8859-1.
    The number of loaded rows is printed before returning.
    """
    def _read(encoding):
        return pd.read_csv(filepath, encoding=encoding)

    try:
        frame = _read('utf-8')
    except UnicodeDecodeError:
        # Fall back to a single-byte encoding for files that are not valid UTF-8.
        frame = _read('ISO-8859-1')

    row_count = len(frame)
    print(f" Loaded {row_count:,} records.")
    return frame


# === STEP 2: Clean Column Names ===
def clean_column_names(df):
    """
    Normalize the column labels of ``df`` in place and return ``df``.

    Each label is stripped of surrounding whitespace, lowercased, and has
    every space and dash replaced by an underscore.
    """
    normalized = df.columns.str.strip().str.lower()
    for separator in (" ", "-"):
        # Literal (non-regex) replacement of each separator character.
        normalized = normalized.str.replace(separator, "_", regex=False)

    df.columns = normalized
    return df


# === STEP 3: Filter and Clean Core Records ===
def clean_records(df):
    """
    Removes duplicates, filters invalid loan amounts, and parses dates.

    Each step runs only if its column is present:

    * ``loannumber``: keep the first row for each loan number.
    * ``currentapprovalamount``: keep rows whose amount parses as a number
      greater than zero (zero, negative, missing and non-numeric values are
      dropped). The column itself is left in its original dtype.
    * ``dateapproved``: convert to datetime and drop rows whose date cannot
      be parsed.

    The input DataFrame is never modified; callers must use the return value.
    """
    if 'loannumber' in df.columns:
        df = df.drop_duplicates(subset='loannumber')

    if 'currentapprovalamount' in df.columns:
        amounts = pd.to_numeric(df['currentapprovalamount'], errors='coerce')
        # .copy() detaches the filtered rows so that later column assignments
        # on the result do not raise SettingWithCopyWarning.
        df = df[amounts > 0].copy()

    if 'dateapproved' in df.columns:
        parsed = pd.to_datetime(df['dateapproved'], errors='coerce')
        has_date = parsed.notna()
        # assign() builds a new frame instead of writing into `df`, which may
        # still be the caller's object (or a slice of it) at this point.
        df = df[has_date].assign(dateapproved=parsed[has_date])

    return df


# === STEP 4: Normalize Borrower Names ===
# Trailing legal-form suffix, preceded by whitespace and/or commas so that
# "acme, llc" and "acme llc" both reduce to "acme". Periods are removed before
# this pattern is applied, so it does not need to match them.
_LEGAL_SUFFIX_RE = re.compile(r"[\s,]+(?:llc|inc|corp|ltd)$")


def normalize_entity(entity):
    """
    Return a simplified, lowercase form of a business name.

    Periods are removed, surrounding whitespace is stripped, and one trailing
    legal suffix (llc, inc, corp, ltd) is dropped together with the whitespace
    or commas that separate it from the name. Non-string input (e.g. NaN or
    None) yields an empty string.
    """
    if not isinstance(entity, str):
        return ""

    cleaned = entity.lower().replace(".", "").strip()
    return _LEGAL_SUFFIX_RE.sub("", cleaned)

def normalize_borrower_names(df):
    """
    Add a ``normalized_borrowername`` column derived from ``borrowername``.

    Each borrower name is passed through ``normalize_entity``. If the
    ``borrowername`` column is absent, ``df`` is returned unchanged.
    The column is added to ``df`` itself, which is also returned.
    """
    if 'borrowername' not in df.columns:
        return df

    normalized = df['borrowername'].apply(normalize_entity)
    df['normalized_borrowername'] = normalized
    return df


# === MASTER FUNCTION TO RUN THESE STEPS ===
def preprocess_ppp_loans(filepath):
    """
    Load the PPP loan CSV at ``filepath`` and run every cleaning step on it.

    Steps, in order: column-name cleanup, record filtering, borrower-name
    normalization. Prints the final row count and returns the DataFrame.
    """
    pipeline = (clean_column_names, clean_records, normalize_borrower_names)

    loans = load_ppp_loan_data(filepath)
    for step in pipeline:
        loans = step(loans)

    print(f" Final dataset: {len(loans):,} cleaned records.")
    return loans


In [2]:
# Run the full preprocessing pipeline on the raw PPP loan CSV.
df_cleaned = preprocess_ppp_loans("Data/Raw/PPP_Loan_apps.csv")
# Save the cleaned records; index=False keeps the DataFrame index out of the file.
df_cleaned.to_csv("Data/Cleaned/clean_ppp_loans22apr25.csv", index=False)


 Loaded 968,525 records.
 Final dataset: 968,525 cleaned records.
