<a target="_blank" href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/ETL/PRAC_Manual_review.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
import json
import pandas as pd

# Load press releases
with open("Data/Raw/pandemic_reports.json", encoding="utf-8") as f:
    data = json.load(f)

# Convert to DataFrame
prac_df = pd.DataFrame(data)

# Preview and save
prac_df.to_csv("Data/Processed/prac_press_releases.csv", index=False)
print(f"Saved {len(prac_df)} PRAC press releases to CSV.")


Saved 2523 PRAC press releases to CSV.


In [2]:
import json
import pandas as pd

# Load full PRAC press release dataset
with open("Data/Raw/pandemic_reports.json", encoding="utf-8") as f:
    data = json.load(f)

# Convert to DataFrame
prac_df = pd.DataFrame(data)

# Combine title + body text to search
prac_df["full_text"] = prac_df["title"].fillna("") + " " + prac_df["press_release"].fillna("")

# Filter for "PPP" or "Paycheck Protection Program"
ppp_related = prac_df[prac_df["full_text"].str.contains(r"\b(ppp|paycheck protection program)\b", case=False, na=False)]

# Save filtered results to CSV
ppp_related = ppp_related.drop(columns=["full_text"])
ppp_related.to_csv("Data/Processed/prac_ppp_only.csv", index=False)

print(f"✅ Saved {len(ppp_related)} PPP-related press releases to CSV for manual labeling.")


✅ Saved 1270 PPP-related press releases to CSV for manual labeling.


  ppp_related = prac_df[prac_df["full_text"].str.contains(r"\b(ppp|paycheck protection program)\b", case=False, na=False)]


In [4]:
# Load cleaned PPP loan data (from your earlier pipeline)
loan_df = pd.read_csv("Data/Cleaned/clean_ppp_loans22apr25.csv")

# Ensure borrower names are normalized
loan_df["normalized_borrowername"] = loan_df["borrowername"].str.lower().str.replace(".", "").str.strip()

In [10]:
def search_borrower_name(df, query, case_insensitive=True, show_columns=None, max_results=10):
    """
    Search the loan DataFrame for a borrower name containing the given query.
    
    Parameters:
    - df: DataFrame of PPP loans
    - query: String or substring to search for
    - case_insensitive: Whether to match case-insensitively
    - show_columns: List of columns to show (defaults to key fields)
    - max_results: Max number of rows to return
    
    Returns:
    - DataFrame with matching rows
    """
    if case_insensitive:
        mask = df['borrowername'].str.contains(query, case=False, na=False)
    else:
        mask = df['borrowername'].str.contains(query, na=False)
    
    result_df = df[mask].copy()
    if show_columns is None:
        show_columns = ['loannumber', 'borrowername', 'currentapprovalamount', 'dateapproved', 'projectcity', 'projectstate']
    
    display(result_df[show_columns].head(max_results))
    print(f"🔍 Found {len(result_df)} matching results for '{query}'.")

    return result_df

# Example usage
search_borrower_name(loan_df, "Peace of Mind Services", case_insensitive=True)


Unnamed: 0,loannumber,borrowername,currentapprovalamount,dateapproved,projectcity,projectstate
441677,9490137905,PEACE OF MIND SERVICES INC,182709.0,2020-06-19,Baltimore,MD


🔍 Found 1 matching results for 'Peace of Mind Services'.


Unnamed: 0,unnamed:_0,loannumber,dateapproved,sbaofficecode,processingmethod,borrowername,borroweraddress,borrowercity,borrowerstate,borrowerzip,...,originatinglenderlocationid,originatinglender,originatinglendercity,originatinglenderstate,gender,veteran,nonprofit,forgivenessamount,forgivenessdate,normalized_borrowername
441677,441677,9490137905,2020-06-19,373,PPP,PEACE OF MIND SERVICES INC,3732 COLUMBUS DR,Baltimore,MD,21215-6122,...,456756,Cross River Bank,TEANECK,NJ,Female Owned,Non-Veteran,,,,peace of mind services inc


In [None]:
import pandas as pd

# Load your manually tagged fraud loan data
fraud_groups = pd.read_csv("Data/Processed/manual_fraud_loan_ids.csv")

# Split and explode semicolon-separated Loan_app column
fraud_groups["Loan_app"] = fraud_groups["Loan_app"].astype(str)
fraud_groups["Loan_app"] = fraud_groups["Loan_app"].str.split(";")
fraud_df = fraud_groups.explode("Loan_app")

# Clean up: remove whitespace, convert to integer
fraud_df["Loan_app"] = fraud_df["Loan_app"].str.strip()
fraud_df = fraud_df[fraud_df["Loan_app"] != ""]
fraud_df["LoanNumber"] = fraud_df["Loan_app"].astype("int64")

# Final labeled dataset
fraud_labeled = fraud_df[["LoanNumber"]].drop_duplicates()
fraud_labeled["is_fraudulent"] = 1

# Save it for merging
fraud_labeled.to_csv("Data/Processed/known_fraud_loans.csv", index=False)
print(f" Extracted {len(fraud_labeled)} labeled fraudulent loans.")


 Extracted 302 labeled fraudulent loans.


In [4]:
import pandas as pd

# Load cleaned PPP loan dataset
loan_df = pd.read_csv("Data/Cleaned/clean_ppp_loans22apr25.csv")

# Load labeled fraudulent loan numbers
fraud_labeled = pd.read_csv("Data/Processed/known_fraud_loans.csv")

# Ensure column name is lowercase for merge compatibility
#fraud_labeled.rename(columns={"LoanNumber": "loannumber"}, inplace=True)

# Merge and fill missing labels with 0
loan_df = loan_df.merge(fraud_labeled, on="loannumber", how="left")
loan_df["is_fraudulent"] = loan_df["is_fraudulent"].fillna(0).astype(int)

# Save final labeled dataset
loan_df.to_csv("Data/Processed/ppp_loans_labeled29apr25.csv", index=False)

print(f" Final dataset: {len(loan_df)} loans, {loan_df['is_fraudulent'].sum()} labeled as fraudulent.")


 Final dataset: 968525 loans, 301 labeled as fraudulent.
