<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/ETL/prac_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import spacy
import re
from rapidfuzz import fuzz, process
import json

# Load the spaCy "en_core_web_sm" pipeline once at module level; the shared
# `nlp` object is reused by extract_entities() for every text it processes.
nlp = spacy.load("en_core_web_sm")

# Pattern for dollar amounts such as "$1,500", "$2.5 million" or "$250k".
# - Every group is non-capturing so that re.findall() returns the whole matched
#   amount rather than only the unit word.
# - The number part requires at least one digit and at most one decimal point,
#   so sentence punctuation ("$1,000.") or stray "$." is not swallowed.
# - The trailing \b stops one-letter units from matching the first letter of
#   the following word (e.g. "$5 more" must not be read as "$5 m").
MONEY_REGEX = r"\$(?:\d[\d,]*(?:\.\d+)?|\.\d+)(?:\s?(?:thousand|million|billion|k|m|b)\b)?"

def normalize_entity(entity):
    """Return a canonical lowercase form of an entity/borrower name.

    Periods are removed, surrounding whitespace is stripped, and a single
    trailing corporate suffix (llc, inc, corp, ltd) preceded by whitespace is
    dropped. Any non-string input (e.g. NaN or None) yields an empty string.
    """
    if not isinstance(entity, str):
        return ""
    cleaned = entity.lower().replace(".", "").strip()
    # Only a suffix at the very end of the name is removed.
    return re.sub(r"\s+(llc|inc|corp|ltd)\.?$", "", cleaned)

def parse_money(amount_str):
    """Convert a dollar-amount string such as "$1.5 million" to a float.

    Commas and the dollar sign are ignored. An optional unit (thousand,
    million, billion, k, m, b) directly after the number scales the value.

    Args:
        amount_str: Text that starts with an amount, e.g. "$2,500" or "$3m".

    Returns:
        The amount as a float, or None when the input is not a string or
        does not start with a number.
    """
    multipliers = {'thousand': 1e3, 'million': 1e6, 'billion': 1e9, 'k': 1e3, 'm': 1e6, 'b': 1e9}
    if not isinstance(amount_str, str):
        return None
    cleaned = amount_str.lower().replace(",", "").replace("$", "").strip()
    # The number allows at most one decimal point, so float() can never be
    # handed "." or "1.2.3". The \b after the unit keeps "5 more" or "3 by"
    # from being read as 5 million / 3 billion.
    match = re.match(
        r"(\d+(?:\.\d*)?|\.\d+)\s?(?:(thousand|million|billion|k|m|b)\b)?",
        cleaned,
    )
    if match is None:
        return None
    # group(2) is None when no unit is present; fall back to a multiplier of 1.
    return float(match.group(1)) * multipliers.get(match.group(2), 1)

def extract_entities(text):
    """Extract named entities and dollar amounts from a piece of text.

    Args:
        text: Raw text to analyse (passed to the module-level spaCy ``nlp``).

    Returns:
        A dict with these keys, each holding a list:
        - "names": normalized PERSON/ORG entity texts (unique, sorted)
        - "locations": GPE/LOC entity texts (unique, sorted)
        - "dates": DATE entity texts containing a four-digit run (unique, sorted)
        - "money": MONEY entity texts (unique, sorted)
        - "loan_amounts": floats parsed from MONEY_REGEX matches, in order of
          appearance; unparseable and zero amounts are dropped.
    """
    doc = nlp(text)
    ents = doc.ents

    # Sets remove duplicates; sorting makes the order reproducible, since the
    # iteration order of a set of strings can differ from one run to the next.
    entities = {
        "names": sorted({normalize_entity(ent.text) for ent in ents if ent.label_ in ("PERSON", "ORG")}),
        "locations": sorted({ent.text for ent in ents if ent.label_ in ("GPE", "LOC")}),
        "dates": sorted({ent.text for ent in ents if ent.label_ == "DATE" and re.search(r"\d{4}", ent.text)}),
        "money": sorted({ent.text for ent in ents if ent.label_ == "MONEY"}),
    }

    loan_amounts = []
    for money_match in re.finditer(MONEY_REGEX, text):
        # group(0) is always the full matched amount. re.findall() would return
        # only the captured unit ("million", "k", "") when the pattern contains
        # a capturing group, which left this list permanently empty.
        try:
            amount = parse_money(money_match.group(0))
        except ValueError:
            # Malformed number text (e.g. several decimal points); skip it.
            continue
        if amount:
            loan_amounts.append(amount)
    entities["loan_amounts"] = loan_amounts
    return entities

# Load press releases from JSON.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("/content/drive/MyDrive/NCU/Dissertation/Data/pandemic_reports.json", "r", encoding="utf-8") as file:
    press_releases = json.load(file)

# Load loan applications data and add a normalized copy of the borrower name
# for fuzzy matching. normalize_entity() maps non-string values to "".
loan_df = pd.read_csv("/content/drive/MyDrive/NCU/Dissertation/Data/PPP_Loan_apps.csv")
loan_df["NormalizedBorrowerName"] = loan_df["BorrowerName"].apply(normalize_entity)

# Match each press release to at most one loan application: a name extracted
# from the release must fuzzy-match a borrower name (score >= 85) AND one of the
# dollar amounts in the release must be within $1,000 of that loan's amount.
matched_results = []

# Hoisted out of the loops: the column of choices is the same for every query.
borrower_names = loan_df["NormalizedBorrowerName"]

for release in press_releases:
    content = release["press_release"]
    entities = extract_entities(content)

    matched_app = None

    for name in entities["names"]:
        if not name:
            # An empty name carries no information and could spuriously match
            # borrowers whose normalized name is empty.
            continue

        # score_cutoff applies the 85 threshold inside rapidfuzz; extractOne
        # returns None when no choice reaches it (or there are no choices).
        best = process.extractOne(
            name,
            borrower_names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=85,
        )
        if best is None:
            continue

        # For a pandas Series the third element is the index *label*, so the
        # row is looked up with .loc rather than the positional .iloc.
        _, _, idx = best
        potential_match = loan_df.loc[idx]

        # Loan amount matching: any mentioned amount within $1,000 of the
        # approved amount confirms the candidate.
        approved_amount = potential_match["CurrentApprovalAmount"]
        if any(abs(amount - approved_amount) < 1000 for amount in entities["loan_amounts"]):
            matched_app = potential_match
            break

    if matched_app is not None:
        result = {
            "Title": release["title"],
            "Date": release["date"],
            "Link": release["link"],
            "MatchedLoanNumber": matched_app["LoanNumber"],
            "MatchedBorrowerName": matched_app["BorrowerName"],
            "LoanAmount": matched_app["CurrentApprovalAmount"],
            "LoanApprovalDate": matched_app["DateApproved"],
            "is_fraudulent": 1
        }
        matched_results.append(result)

# Save matched results as both CSV and JSON.
# The paths are absolute (leading "/"), matching the input paths above; without
# the leading slash they resolve relative to the current working directory
# instead of the mounted Drive folder.
matched_df = pd.DataFrame(matched_results)
matched_df.to_csv("/content/drive/MyDrive/NCU/Dissertation/Data/matched_fraud_cases_22mar25.csv", index=False)
matched_df.to_json("/content/drive/MyDrive/NCU/Dissertation/Data/matched_fraud_cases_22mar25.json", orient="records", indent=4)

print(f"Matched {len(matched_results)} fraud cases saved.")
