<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/ETL/prac_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# NLP + matching tools
import json
import re
from pathlib import Path

import pandas as pd
import spacy
from rapidfuzz import process, fuzz
from tqdm import tqdm

# Load the spaCy English pipeline used for entity extraction.
# 'en_core_web_trf' can be substituted here for better accuracy.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)


In [7]:
# Read the PRAC/DOJ press releases from the raw JSON export
with open("Data/Raw/pandemic_reports.json", encoding="utf-8") as reports_fh:
    press_releases = json.loads(reports_fh.read())

print(f" Loaded {len(press_releases)} press releases.")


 Loaded 2523 press releases.


In [10]:
# Load cleaned PPP loan data (from your earlier pipeline)
loan_df = pd.read_csv("Data/Cleaned/clean_ppp_loans22apr25.csv")

# Normalize borrower names for fuzzy matching: lowercase, drop periods, trim.
# - regex=False makes "." a literal period. Interpreted as a regex it would match
#   every character, and the default value of `regex` differs across pandas versions.
# - Missing names become "" so the fuzzy-match name pool contains only strings.
loan_df["normalized_borrowername"] = (
    loan_df["borrowername"]
    .fillna("")
    .astype(str)
    .str.lower()
    .str.replace(".", "", regex=False)
    .str.strip()
)


In [11]:
# Regex for dollar amounts such as "$1,250", "$3.5 million" or "$20k".
# - Every group is non-capturing, so re.findall() returns the whole matched amount.
#   (With a single capturing group, findall returns only that group, i.e. just the
#   suffix such as "million" or "", and the number itself is lost.)
# - The trailing \b keeps a following word from being read as a unit:
#   "$500 more" is "$500", not "$500 m".
# - (?i) accepts capitalised units ("$2 Billion").
MONEY_REGEX = r"(?i)\$\d+(?:,\d+)*(?:\.\d+)?(?:\s?(?:thousand|million|billion|k|m|b)\b)?"

def parse_money(amount_str):
    """Convert a dollar-amount string such as "$1.5 million" into a float.

    The string is lower-cased and stripped of "$" and thousands separators, then a
    leading number and an optional unit (thousand/million/billion or k/m/b) are read
    from the start of the string. Any text after that is ignored.

    Parameters
    ----------
    amount_str : str
        Text of a monetary amount, e.g. "$20,832", "$3.5 million", "$20k".

    Returns
    -------
    float or None
        The amount in dollars, or None when the input is not a string or does not
        start with a number.
    """
    if not isinstance(amount_str, str):
        return None

    multipliers = {'thousand': 1e3, 'million': 1e6, 'billion': 1e9, 'k': 1e3, 'm': 1e6, 'b': 1e9}
    cleaned = amount_str.lower().replace(",", "").replace("$", "").strip()

    # The number pattern allows at most one decimal point, so inputs like "3.5."
    # (sentence-ending period) or "." can no longer reach float() and raise ValueError.
    # The \b after the unit prevents the first letter of a following word from being
    # taken as a unit ("500 more" is 500, not 500 million).
    match = re.match(
        r"(\d+(?:\.\d*)?|\.\d+)\s?(?:(thousand|million|billion|k|m|b)\b)?",
        cleaned,
    )
    if match is None:
        return None

    number = float(match.group(1))
    # group(2) is None when no unit was present -> multiplier of 1
    multiplier = multipliers.get(match.group(2), 1)
    return number * multiplier

def extract_entities(text):
    """Extract named entities and dollar amounts from a press-release text.

    Runs the module-level spaCy pipeline `nlp` over the text and groups entity
    strings by label, then separately scans the raw text with MONEY_REGEX and
    converts each hit to a number with parse_money().

    Parameters
    ----------
    text : str
        Press-release body. None or "" is treated as empty text.

    Returns
    -------
    dict
        "orgs", "people", "locations", "dates", "money_raw": lists of unique entity
        strings (ORG, PERSON, GPE/LOC, DATE containing a 4-digit run, MONEY);
        "loan_amounts": list of floats parsed from the regex matches (zero and
        unparseable amounts are dropped).
    """
    text = text or ""
    doc = nlp(text)

    orgs, people, locations, dates, money_mentions = set(), set(), set(), set(), set()
    for ent in doc.ents:
        label = ent.label_
        value = ent.text.strip()
        if label == "ORG":
            orgs.add(value)
        elif label == "PERSON":
            people.add(value)
        elif label in ("GPE", "LOC"):
            locations.add(value)
        elif label == "DATE":
            # keep only dates that mention a year-like 4-digit run
            if re.search(r"\d{4}", ent.text):
                dates.add(value)
        elif label == "MONEY":
            money_mentions.add(value)

    # Use finditer + group(0) to get the full matched amount. re.findall() returns
    # only the capturing group's text when the pattern has one (the unit suffix, or
    # ""), which made every amount unparseable and left loan_amounts always empty.
    parsed_money = []
    for money_match in re.finditer(MONEY_REGEX, text):
        try:
            amount = parse_money(money_match.group(0))
        except ValueError:
            # malformed number such as "$1.2.3" - skip rather than abort the run
            continue
        if amount:
            parsed_money.append(amount)

    return {
        "orgs": list(orgs),
        "people": list(people),
        "locations": list(locations),
        "dates": list(dates),
        "money_raw": list(money_mentions),
        "loan_amounts": parsed_money
    }


In [12]:
def match_press_release_to_loans(entities, loan_df, top_n=3, score_threshold=85, amount_tolerance=1000):
    """Link entities from one press release to PPP loans by name and amount.

    Each organisation/person name is fuzzy-matched against
    loan_df["normalized_borrowername"]; a candidate loan is reported only when one of
    the extracted dollar amounts is within `amount_tolerance` of the loan's
    "currentapprovalamount".

    Parameters
    ----------
    entities : dict
        Needs the keys "orgs", "people" (lists of str) and "loan_amounts" (list of numbers).
    loan_df : pandas.DataFrame
        Needs the columns "normalized_borrowername", "borrowername", "loannumber",
        "currentapprovalamount" and "dateapproved".
    top_n : int
        Maximum number of fuzzy candidates considered per name.
    score_threshold : int or float
        Minimum token_sort_ratio score for a candidate to be considered.
    amount_tolerance : int or float
        A loan matches when |amount - approval amount| is strictly below this value.

    Returns
    -------
    list of dict
        One record per (name, loan, amount) hit with the keys MatchedLoanNumber,
        MatchedBorrowerName, LoanAmount, LoanApprovalDate, MatchScore, AmountDifference.
    """
    amounts = entities["loan_amounts"]
    if not amounts:
        # A match requires an amount, so skip the expensive fuzzy search entirely.
        return []

    # Normalize query names exactly like the loan names (lowercase, no periods, trimmed)
    # and drop duplicates, e.g. a name tagged both as ORG and PERSON.
    queries = []
    seen = set()
    for raw_name in entities["orgs"] + entities["people"]:
        query = raw_name.lower().replace(".", "").strip()
        if query and query not in seen:
            seen.add(query)
            queries.append(query)
    if not queries:
        return []

    matches = []
    name_pool = loan_df["normalized_borrowername"].tolist()

    for query in queries:
        # score_cutoff makes rapidfuzz discard candidates scoring below the threshold
        top_matches = process.extract(
            query,
            name_pool,
            scorer=fuzz.token_sort_ratio,
            limit=top_n,
            score_cutoff=score_threshold,
        )
        for _match_name, score, idx in top_matches:
            if score < score_threshold:
                continue
            # idx is the position in name_pool, which mirrors the frame's row order
            loan_candidate = loan_df.iloc[idx]
            approved_amount = loan_candidate["currentapprovalamount"]
            for amount in amounts:
                difference = abs(amount - approved_amount)
                if difference < amount_tolerance:
                    matches.append({
                        "MatchedLoanNumber": loan_candidate["loannumber"],
                        "MatchedBorrowerName": loan_candidate["borrowername"],
                        "LoanAmount": approved_amount,
                        "LoanApprovalDate": loan_candidate["dateapproved"],
                        "MatchScore": score,
                        "AmountDifference": difference
                    })
    return matches


In [None]:
# Run entity extraction and loan matching over every press release,
# collecting one output record per matched loan.
matched_results = []

for release in tqdm(press_releases, desc="Matching press releases"):
    # The .get() default only covers a missing key; `or ""` also covers an explicit
    # null value, which would otherwise crash the spaCy call partway through the run.
    content = release.get("press_release") or ""
    if not isinstance(content, str) or not content.strip():
        continue  # no text -> no entities -> nothing to match

    entities = extract_entities(content)
    matches = match_press_release_to_loans(entities, loan_df)

    for match in matches:
        result = {
            "Title": release.get("title"),
            "Date": release.get("date"),
            "Link": release.get("link"),
            **match,
            "is_fraudulent": 1
        }
        matched_results.append(result)

print(f"\n Total matched fraud cases: {len(matched_results)}")


  9%|██████▊                                                                      | 222/2523 [52:38<9:11:11, 14.37s/it]

In [None]:
# Persist the matches as CSV and JSON. The output folder is created first so a
# missing directory cannot fail the write after the long matching run.
output_dir = Path("Data/Processed")
output_dir.mkdir(parents=True, exist_ok=True)

matched_df = pd.DataFrame(matched_results)
matched_df.to_csv(output_dir / "matched_fraud_cases.csv", index=False)
matched_df.to_json(output_dir / "matched_fraud_cases.json", orient="records", indent=2)

print(" Matched fraud cases saved.")
