<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/ETL/prac_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# NLP + matching tools
import spacy
import re
import pandas as pd
import json
from rapidfuzz import process, fuzz
from tqdm import tqdm

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")  # Optionally 'en_core_web_trf' for better accuracy


In [7]:
# Load press releases (PRAC/DOJ reports)
with open("Data/Raw/pandemic_reports.json", encoding="utf-8") as file:
    press_releases = json.load(file)

print(f" Loaded {len(press_releases)} press releases.")


 Loaded 2523 press releases.


In [10]:
# Load cleaned PPP loan data (from your earlier pipeline)
loan_df = pd.read_csv("Data/Cleaned/clean_ppp_loans22apr25.csv")

# Ensure borrower names are normalized
loan_df["normalized_borrowername"] = loan_df["borrowername"].str.lower().str.replace(".", "").str.strip()


In [11]:
# Improved regex for monetary values
MONEY_REGEX = r"\$[\d,.]+(?:\s?(million|billion|thousand|k|m|b))?"

def parse_money(amount_str):
    multipliers = {'thousand':1e3, 'million':1e6, 'billion':1e9, 'k':1e3, 'm':1e6, 'b':1e9}
    amount_str = amount_str.lower().replace(",", "").replace("$", "").strip()
    match = re.match(r"([\d.]+)\s?(thousand|million|billion|k|m|b)?", amount_str)
    if match:
        number = float(match.group(1))
        multiplier = multipliers.get(match.group(2), 1)
        return number * multiplier
    return None

def extract_entities(text):
    doc = nlp(text)
    orgs = {ent.text.strip() for ent in doc.ents if ent.label_ == "ORG"}
    people = {ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"}
    locations = {ent.text.strip() for ent in doc.ents if ent.label_ in ["GPE", "LOC"]}
    dates = {ent.text.strip() for ent in doc.ents if ent.label_ == "DATE" and re.search(r"\d{4}", ent.text)}
    money_mentions = {ent.text.strip() for ent in doc.ents if ent.label_ == "MONEY"}

    money_from_regex = re.findall(MONEY_REGEX, text)
    parsed_money = [parse_money(m[0] if isinstance(m, tuple) else m) for m in money_from_regex]
    parsed_money = [amt for amt in parsed_money if amt]

    return {
        "orgs": list(orgs),
        "people": list(people),
        "locations": list(locations),
        "dates": list(dates),
        "money_raw": list(money_mentions),
        "loan_amounts": parsed_money
    }


In [12]:
def match_press_release_to_loans(entities, loan_df, top_n=3, score_threshold=85, amount_tolerance=1000):
    matches = []
    name_pool = loan_df["normalized_borrowername"].tolist()

    for name in entities["orgs"] + entities["people"]:
        top_matches = process.extract(name.lower(), name_pool, scorer=fuzz.token_sort_ratio, limit=top_n)
        for match_name, score, idx in top_matches:
            if score < score_threshold:
                continue
            loan_candidate = loan_df.iloc[idx]
            for amount in entities["loan_amounts"]:
                if abs(amount - loan_candidate["currentapprovalamount"]) < amount_tolerance:
                    matches.append({
                        "MatchedLoanNumber": loan_candidate["loannumber"],
                        "MatchedBorrowerName": loan_candidate["borrowername"],
                        "LoanAmount": loan_candidate["currentapprovalamount"],
                        "LoanApprovalDate": loan_candidate["dateapproved"],
                        "MatchScore": score,
                        "AmountDifference": abs(amount - loan_candidate["currentapprovalamount"])
                    })
    return matches


In [13]:
matched_results = []

for release in tqdm(press_releases):
    content = release.get("press_release", "")
    entities = extract_entities(content)
    matches = match_press_release_to_loans(entities, loan_df)

    for match in matches:
        result = {
            "Title": release.get("title"),
            "Date": release.get("date"),
            "Link": release.get("link"),
            **match,
            "is_fraudulent": 1
        }
        matched_results.append(result)

print(f"\n Total matched fraud cases: {len(matched_results)}")


100%|███████████████████████████████████████████████████████████████████████████| 2523/2523 [10:37:00<00:00, 15.15s/it]


 Total matched fraud cases: 0





In [None]:
matched_df = pd.DataFrame(matched_results)
matched_df.to_csv("Data/Processed/matched_fraud_cases24apr25.csv", index=False)
matched_df.to_json("Data/Processed/matched_fraud_cases24apr25.json", orient="records", indent=2)

print(" Matched fraud cases saved.")


In [15]:
from rapidfuzz import process, fuzz
import re

# Define org stopwords to filter irrelevant matches
ORG_STOPWORDS = {
    "fbi", "doj", "irs", "sba", "small business administration", "congress",
    "department of justice", "internal revenue service", "government",
    "city of", "state of", "u.s. department of", "us department of",
    "united states", "us", "u.s.", "the government", "usa"
}

# Normalization function
def normalize_org_name(name):
    if isinstance(name, str):
        name = name.lower().replace(".", "").strip()
        name = re.sub(r"\s+(llc|inc|corp|ltd)\.?$", "", name)
        return name
    return ""

# Apply normalization to loan data
loan_df["normalized_borrowername"] = loan_df["borrowername"].apply(normalize_org_name)
name_pool = loan_df["normalized_borrowername"].tolist()

# Run test on first 5 press releases
for release in press_releases[:5]:
    print("\n" + "=" * 60)
    print(f"📄 TITLE: {release.get('title')}")
    content = release.get("press_release", "")
    
    # Extract entities
    entities = extract_entities(content)

    # Normalize and filter orgs
    orgs = [
        normalize_org_name(name) for name in entities["orgs"]
        if normalize_org_name(name) not in ORG_STOPWORDS and len(name.strip()) > 3
    ]
    
    print("🔎 Filtered ORG entities:", orgs)
    print("💰 Extracted Loan Amounts:", entities["loan_amounts"])

    matched = False

    for org in orgs:
        matches = process.extract(org, name_pool, scorer=fuzz.token_sort_ratio, limit=3)
        
        for match_name, score, idx in matches:
            if score < 70:
                continue
            candidate = loan_df.iloc[idx]
            for amt in entities["loan_amounts"]:
                if abs(amt - candidate["currentapprovalamount"]) < 1000:
                    print(f"\n✅ MATCH FOUND!")
                    print(f"→ Matched Name: {match_name} (Score: {score})")
                    print(f"→ Loan #: {candidate['loannumber']}")
                    print(f"→ Amount: ${candidate['currentapprovalamount']:.2f}")
                    matched = True
                    break
            if matched:
                break
        if matched:
            break

    if not matched:
        print("❌ No matches found.")



📄 TITLE: Baltimore Man Sentenced to Federal Prison for Role in Maryland Unemployment Insurance Scheme
🔎 Filtered ORG entities: ['covid-19', 'national center for disaster fraud', 'the federal bureau of investigation – baltimore field office', 'department', 'the department of justice’s', 'the us department of justice', 'national capital region', 'hotline', 'the coronavirus aid, relief, and economic security', 'bank of america – detection', 'us department of labor’s office of inspector general', 'the maryland department of labor']
💰 Extracted Loan Amounts: []
❌ No matches found.

📄 TITLE: St. Louis Woman Accused of $177,000 Pandemic-Era Tax Credit Scheme
🔎 Filtered ORG entities: ['covid-19', 'us treasury', 'district court']
💰 Extracted Loan Amounts: []
❌ No matches found.

📄 TITLE: Three Pike County, Alabama Residents Charged with Federal Rental Assistance Program Fraud
🔎 Filtered ORG entities: ['montgomery', 'covid-19', 'the emergency rental assistance program', 'schemillia fenn']
💰 Ext

In [16]:
# Same setup as before
from rapidfuzz import process, fuzz
import re

ORG_STOPWORDS = {
    "fbi", "doj", "irs", "sba", "small business administration", "congress",
    "department of justice", "internal revenue service", "government",
    "city of", "state of", "u.s. department of", "us department of",
    "united states", "us", "u.s.", "the government", "usa"
}

def normalize_name(name):
    if isinstance(name, str):
        name = name.lower().replace(".", "").strip()
        name = re.sub(r"\s+(llc|inc|corp|ltd|dba)\.?$", "", name)
        return name
    return ""

loan_df["normalized_borrowername"] = loan_df["borrowername"].apply(normalize_name)
name_pool = loan_df["normalized_borrowername"].tolist()

# Test first 5 press releases
for release in press_releases[:5]:
    print("\n" + "=" * 60)
    print(f"📄 TITLE: {release.get('title')}")
    content = release.get("press_release", "")
    
    entities = extract_entities(content)

    orgs = [
        normalize_name(name) for name in entities["orgs"]
        if normalize_name(name) not in ORG_STOPWORDS and len(name.strip()) > 3
    ]
    people = [
        normalize_name(name) for name in entities["people"]
        if len(name.strip()) > 3
    ]

    print("🔎 ORG entities:", orgs)
    print("🧑 PERSON entities:", people)
    print("💰 Loan Amounts:", entities["loan_amounts"])

    matched = False

    # 1. Try ORG matches
    for name in orgs:
        matches = process.extract(name, name_pool, scorer=fuzz.token_sort_ratio, limit=3)
        for match_name, score, idx in matches:
            if score < 70:
                continue
            candidate = loan_df.iloc[idx]
            for amt in entities["loan_amounts"]:
                if abs(amt - candidate["currentapprovalamount"]) < 1000:
                    print(f"\n✅ ORG MATCH!")
                    print(f"→ Name: {match_name} (Score: {score})")
                    print(f"→ Loan #: {candidate['loannumber']}, Amount: ${candidate['currentapprovalamount']:.2f}")
                    matched = True
                    break
            if matched:
                break
        if matched:
            break

    # 2. If no ORG match, try PERSON name
    if not matched:
        for name in people:
            matches = process.extract(name, name_pool, scorer=fuzz.token_sort_ratio, limit=3)
            for match_name, score, idx in matches:
                if score < 70:
                    continue
                candidate = loan_df.iloc[idx]
                for amt in entities["loan_amounts"]:
                    if abs(amt - candidate["currentapprovalamount"]) < 1000:
                        print(f"\n✅ PERSON MATCH!")
                        print(f"→ Name: {match_name} (Score: {score})")
                        print(f"→ Loan #: {candidate['loannumber']}, Amount: ${candidate['currentapprovalamount']:.2f}")
                        matched = True
                        break
                if matched:
                    break
            if matched:
                break

    if not matched:
        print("❌ No matches found.")



📄 TITLE: Baltimore Man Sentenced to Federal Prison for Role in Maryland Unemployment Insurance Scheme
🔎 ORG entities: ['covid-19', 'national center for disaster fraud', 'the federal bureau of investigation – baltimore field office', 'department', 'the department of justice’s', 'the us department of justice', 'national capital region', 'hotline', 'the coronavirus aid, relief, and economic security', 'bank of america – detection', 'us department of labor’s office of inspector general', 'the maryland department of labor']
🧑 PERSON entities: ['devante smith', 'julie r rubin', 'kelly o hayes', 'hayes', 'tiia woods', 'charge william j delbagno', 'https://wwwjusticegov/disaster-fraud/ncdf-disaster-complaint-formus', 'evelyn lombardo cusson', 'w springer', 'harry m gruber', 'smith']
💰 Loan Amounts: []
❌ No matches found.

📄 TITLE: St. Louis Woman Accused of $177,000 Pandemic-Era Tax Credit Scheme
🔎 ORG entities: ['covid-19', 'us treasury', 'district court']
🧑 PERSON entities: ['ayana j brown'

In [24]:
def search_borrower_name(df, query, case_insensitive=True, show_columns=None, max_results=10):
    """
    Search the loan DataFrame for a borrower name containing the given query.
    
    Parameters:
    - df: DataFrame of PPP loans
    - query: String or substring to search for
    - case_insensitive: Whether to match case-insensitively
    - show_columns: List of columns to show (defaults to key fields)
    - max_results: Max number of rows to return
    
    Returns:
    - DataFrame with matching rows
    """
    if case_insensitive:
        mask = df['borrowername'].str.contains(query, case=False, na=False)
    else:
        mask = df['borrowername'].str.contains(query, na=False)
    
    result_df = df[mask].copy()
    if show_columns is None:
        show_columns = ['loannumber', 'borrowername', 'currentapprovalamount', 'dateapproved', 'projectcity', 'projectstate']
    
    display(result_df[show_columns].head(max_results))
    print(f"🔍 Found {len(result_df)} matching results for '{query}'.")

    return result_df

# Example usage
search_borrower_name(loan_df, "Crowns Construction", case_insensitive=True)


Unnamed: 0,loannumber,borrowername,currentapprovalamount,dateapproved,projectcity,projectstate
439582,1716008001,CROWNS CONSTRUCTION LLC,210897.0,2020-06-22,baltimore,MD


🔍 Found 1 matching results for 'Crowns Construction'.


Unnamed: 0,unnamed:_0,loannumber,dateapproved,sbaofficecode,processingmethod,borrowername,borroweraddress,borrowercity,borrowerstate,borrowerzip,...,originatinglenderlocationid,originatinglender,originatinglendercity,originatinglenderstate,gender,veteran,nonprofit,forgivenessamount,forgivenessdate,normalized_borrowername
439582,439582,1716008001,2020-06-22,373,PPP,CROWNS CONSTRUCTION LLC,700 bonaparte ave,baltimore,MD,21218-6737,...,456756,Cross River Bank,TEANECK,NJ,Female Owned,Non-Veteran,,,,crowns construction


In [25]:
def match_entities_to_loans_with_aggregation(entities, loan_df, score_threshold=75, amount_tolerance=500000):
    """
    Match ORG or PERSON names to multiple PPP loans, and compare total amount to press release fraud amount.
    Returns a list of matching loan records if the total is within range.
    """
    all_matches = []
    name_pool = loan_df["normalized_borrowername"].tolist()
    candidates = []

    # Try matching both org and person entities
    candidate_names = entities["orgs"] + entities["people"]
    candidate_names = [normalize_name(n) for n in candidate_names if len(n.strip()) > 3]

    for name in candidate_names:
        top_matches = process.extract(name, name_pool, scorer=fuzz.token_sort_ratio, limit=5)
        for match_name, score, idx in top_matches:
            if score >= score_threshold:
                matched_loan = loan_df.iloc[idx]
                candidates.append({
                    "match_name": match_name,
                    "score": score,
                    "loan": matched_loan
                })

    if not entities["loan_amounts"]:
        return [], "no_amount_extracted"

    fraud_amount_est = max(entities["loan_amounts"])  # Assume largest mentioned amount is the total fraud
    total_matched_amount = sum([c["loan"]["currentapprovalamount"] for c in candidates])
    amount_diff = abs(fraud_amount_est - total_matched_amount)

    if amount_diff <= amount_tolerance:
        print(f"🟢 Matched total: ${total_matched_amount:,.2f} (expected ~${fraud_amount_est:,.2f})")
        return [c["loan"] for c in candidates], "aggregated_amount_match"

    return [], "no_match"



In [29]:
test_results = []

for release in press_releases[:5]:
    print("\n" + "=" * 60)
    print(f"📄 {release['title']}")
    content = release.get("press_release", "")

    entities = extract_entities(content)
    matched_loans, match_type = match_entities_to_loans_with_aggregation(entities, loan_df)

    print(f"→ ORGs: {entities['orgs']}")
    print(f"→ People: {entities['people']}")
    print(f"→ Loan Amounts (from text): {entities['loan_amounts']}")
    
    if matched_loans:
        print(f"✅ MATCHED {len(matched_loans)} loans via aggregation")
        for loan in matched_loans:
            print(f"   - {loan['borrowername']} (${loan['currentapprovalamount']:,.2f})")
        
        # Store matched loans for later use
        for loan in matched_loans:
            test_results.append({
                "Title": release.get("title"),
                "Date": release.get("date"),
                "Link": release.get("link"),
                "LoanNumber": loan["loannumber"],
                "BorrowerName": loan["borrowername"],
                "LoanAmount": loan["currentapprovalamount"],
                "LoanApprovalDate": loan["dateapproved"],
                "is_fraudulent": 1,
                "MatchType": match_type
            })
    else:
        print("❌ No aggregated match.")



📄 Baltimore Man Sentenced to Federal Prison for Role in Maryland Unemployment Insurance Scheme
🟡 No fraud amount extracted, but name-based matches found.
→ NATIONAL CENTER FOR LEARNING DISABILITIE ($215,662.00)
→ NATIONAL CAPITAL SECURITY, LLC. ($170,832.00)
→ LEGION CAPITAL CORP ($159,200.00)
→ NATIONAL CAPITOL PIZZA LLC ($151,800.00)
→ REGIONAL INTERNATIONAL CORP ($1,732,585.00)
→ CAPITAL REGION OB-GYN, LLC ($192,445.00)
→ HOTLINES INC ($197,395.00)
→ NINE HOTEL LLC ($466,203.00)
→ THOMLINE LLC ($417,056.00)
→ HOTLIX INC. ($194,082.00)
→ HOTLIX INC. ($181,287.00)
→ AMERICAN LEAK DETECTION ($189,000.00)
→ DEVONTE SMITH ($254,800.00)
→ SMITH & DEAN INC ($1,371,800.00)
→ DANIEL SMITH INC ($347,500.00)
→ DANIEL SMITH INC ($347,500.00)
→ SMITH SEVEN LLC ($189,000.00)
→ HAYNES INC ($2,468,798.75)
→ BAYES INC ($591,992.00)
→ BAYES INC ($425,700.00)
→ HAGES INC ($175,000.00)
→ SCHAYES, INC. ($321,330.00)
→ WOODS TANK INC. ($1,137,989.38)
→ EFTIHIA FOODS INC ($175,365.00)
→ TINWOODS INC. ($1

In [27]:
matched_df = pd.DataFrame(test_results)
display(matched_df.head())
print(f"\nTotal matched loans: {len(matched_df)}")



Total matched loans: 0


In [28]:
def is_relevant_org(name):
    """
    Filters out common government orgs or irrelevant entities.
    """
    stop_keywords = [
        'department', 'program', 'committee', 'office', 'division', 'sba',
        'covid', 'irs', 'fbi', 'justice', 'congress', 'relief', 'pandemic', 'response'
    ]
    name_clean = normalize_name(name)
    if name_clean in ORG_STOPWORDS:
        return False
    return not any(keyword in name_clean for keyword in stop_keywords)


def match_entities_to_loans_with_aggregation(entities, loan_df, score_threshold=75, amount_tolerance=500000):
    """
    Matches both ORG and PERSON entities to loans, using loan aggregation if fraud amount is extracted.
    Falls back to name-only match when amount is missing.
    """
    name_pool = loan_df["normalized_borrowername"].tolist()
    candidates = []

    # Combine ORG and PERSON names (with filtering on ORGs)
    candidate_names = [n for n in entities["orgs"] if is_relevant_org(n)] + entities["people"]
    candidate_names = [normalize_name(n) for n in candidate_names if len(n.strip()) > 3]

    for name in candidate_names:
        top_matches = process.extract(name, name_pool, scorer=fuzz.token_sort_ratio, limit=5)
        for match_name, score, idx in top_matches:
            if score >= score_threshold:
                loan = loan_df.iloc[idx]
                candidates.append({
                    "match_name": match_name,
                    "score": score,
                    "loan": loan
                })

    # === CASE 1: Amount available (high-confidence match) ===
    if entities["loan_amounts"]:
        fraud_amount_est = max(entities["loan_amounts"])  # Use highest value as total
        total_matched_amount = sum([c["loan"]["currentapprovalamount"] for c in candidates])
        amount_diff = abs(fraud_amount_est - total_matched_amount)

        if amount_diff <= amount_tolerance:
            print(f"🟢 Matched total: ${total_matched_amount:,.2f} (expected ~${fraud_amount_est:,.2f})")
            return [c["loan"] for c in candidates], "aggregated_amount_match"
        else:
            print(f"⚠️ Total loan amount (${total_matched_amount:,.2f}) not close to expected fraud amount (${fraud_amount_est:,.2f})")
            return [], "amount_mismatch"

    # === CASE 2: No amount, but strong name matches (lower-confidence) ===
    elif candidates:
        print("🟡 No fraud amount extracted, but name-based matches found.")
        for c in candidates:
            print(f"→ {c['loan']['borrowername']} (${c['loan']['currentapprovalamount']:,.2f})")
        return [c["loan"] for c in candidates], "name_only_no_amount"

    # === CASE 3: Nothing matched ===
    print("❌ No relevant matches found.")
    return [], "no_match"
