<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/ETL/prac_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import json

# Site root; prepended to relative report links before they are fetched.
base_url = "https://pandemicoversight.gov"
# Browser-like User-Agent sent with every request issued by safe_request().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
}

def safe_request(url):
    """GET *url* with the shared headers and a 10-second timeout.

    Returns the response on success. Any ``requests`` exception, including
    the HTTP error raised by ``raise_for_status()``, is printed and turned
    into a ``None`` return so callers can skip the page instead of crashing.
    """
    try:
        reply = requests.get(url, headers=headers, timeout=10)
        reply.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e} - URL: {url}")
        return None
    return reply

def get_reports(page_number):
    """Scrape one page of the filtered report listing.

    *page_number* is inserted as the ``page`` query parameter of the listing
    URL. Returns a list of dicts with ``title``, ``date`` and ``link`` keys,
    one per ``.views-row`` element that has both a title anchor and a
    ``<time>`` tag. Returns an empty list when the request fails.
    """
    url = f"https://pandemicoversight.gov/oversight/reports?f%5B0%5D=report_type_taxonomy%3A85&page={page_number}"
    page = safe_request(url)
    if not page:
        return []

    listing = BeautifulSoup(page.text, 'html.parser')
    # Pair each row's title anchor with its date tag; either may be None.
    tag_pairs = (
        (
            row.select_one('.display__condensed--title a'),
            row.select_one('.display__condensed--footer time'),
        )
        for row in listing.select('.views-row')
    )
    return [
        {
            'title': anchor.text.strip(),
            'date': stamp.text.strip(),
            'link': anchor['href'],
        }
        for anchor, stamp in tag_pairs
        if anchor and stamp
    ]

def get_press_release(url):
    """Fetch *url* and return the stripped text of its body field.

    Returns an empty string when the request fails or when the page has no
    element matching ``.node-body .field_body``.
    """
    page = safe_request(url)
    if not page:
        return ""

    body = BeautifulSoup(page.text, 'html.parser').select_one('.node-body .field_body')
    if body:
        return body.text.strip()
    return ""

# Walk the paginated listing until a page comes back empty (or the optional
# page cap is reached), attaching the press-release text to every report.
all_reports = []
page_limit = None  # Integer cap for test runs; None crawls every page
page_count = 0

while True:
    if page_limit is not None and page_count >= page_limit:
        break

    print(f"Scraping page: {page_count + 1}")
    reports = get_reports(page_count)

    # An empty page (or a failed request) ends the crawl.
    if not reports:
        break

    for report in reports:
        print(f"Fetching press release for: {report['title']}")
        # Listing links may be relative; make them absolute before fetching.
        if report['link'].startswith('http'):
            full_url = report['link']
        else:
            full_url = base_url + report['link']
        report['press_release'] = get_press_release(full_url)
        time.sleep(1)  # pause between press release requests

    all_reports += reports
    page_count += 1

    # Rewrite the output file after every page so partial progress survives
    # an interrupted run.
    with open('pandemic_reports.json', 'w', encoding='utf-8') as f:
        json.dump(all_reports, f, ensure_ascii=False, indent=4)

    time.sleep(2)  # pause between listing page requests

# Dump everything that was collected
for report in all_reports:
    print(f"Title: {report['title']}\nDate: {report['date']}\nLink: {report['link']}\nPress Release:\n{report['press_release']}\n{'-'*80}\n")


In [None]:
import pandas as pd
import spacy
import re
from rapidfuzz import fuzz, process
import json

# Load spaCy NLP model
# Loaded once at module level; extract_entities() reuses this pipeline for
# every press release instead of reloading it per call.
nlp = spacy.load("en_core_web_sm")

# Regex for monetary amounts: a "$", a run of digits/commas/periods, and an
# optional magnitude word or letter (captured as group 1).
# The trailing \b keeps the single-letter units from swallowing the first
# letter of an ordinary word ("$5 more" must match "$5", not "$5 m").
# NOTE: because the pattern has one capturing group, re.findall() returns
# only that group; use re.finditer() and m.group(0) to get the full amount.
MONEY_REGEX = r"\$[\d,.]+(?:\s?(million|billion|thousand|k|m|b)\b)?"

def normalize_entity(entity):
    """Return a lower-cased, punctuation-light form of a name for matching.

    Periods are removed, surrounding whitespace is stripped, and one trailing
    corporate suffix (llc, inc, corp, ltd) is dropped together with the
    whitespace/commas that separate it from the name, so "Acme, Inc." and
    "ACME INC" both normalize to "acme". Non-string input (e.g. NaN from a
    DataFrame column) yields an empty string.
    """
    if not isinstance(entity, str):
        return ""

    entity = entity.lower().replace(".", "").strip()
    # Periods are already gone, so the suffix needs no optional "." here.
    # Accept commas as separators so "acme, inc" does not leave "acme,".
    entity = re.sub(r"[\s,]+(llc|inc|corp|ltd)$", "", entity)
    return entity.rstrip(" ,")

def parse_money(amount_str):
    """Convert a money string such as "$1.5 million" or "$2,500" to a float.

    Commas and "$" are ignored. An optional unit (thousand/million/billion or
    k/m/b) directly after the number scales the value; the unit must end at a
    word boundary so the first letter of an unrelated word ("5 more") is not
    read as a multiplier.

    Returns None when the string does not start with a number or the numeric
    part is malformed (e.g. "." or "1.2.3").
    """
    multipliers = {'thousand': 1e3, 'million': 1e6, 'billion': 1e9, 'k': 1e3, 'm': 1e6, 'b': 1e9}
    cleaned = amount_str.lower().replace(",", "").replace("$", "").strip()
    match = re.match(r"([\d.]+)(?:\s?(thousand|million|billion|k|m|b)\b)?", cleaned)
    if not match:
        return None

    try:
        # A sentence-ending period can be glued to the number ("1000.").
        number = float(match.group(1).rstrip("."))
    except ValueError:
        # "[\d.]+" also accepts strings like "." or "1.2.3".
        return None

    # group(2) is None when no unit was present; default to a multiplier of 1.
    return number * multipliers.get(match.group(2), 1)

def extract_entities(text):
    """Run NER over *text* and collect the entities used for loan matching.

    Returns a dict with these keys:
      - "names": normalized texts of PERSON and ORG entities
      - "locations": texts of GPE and LOC entities
      - "dates": texts of DATE entities that contain four consecutive digits
      - "money": texts of MONEY entities
      - "loan_amounts": numeric values parsed from every MONEY_REGEX match in
        the raw text; unparseable and zero amounts are dropped

    The first four lists are built from sets, so they are de-duplicated and
    their order is not meaningful.
    """
    doc = nlp(text)
    entities = {
        "names": list({normalize_entity(ent.text) for ent in doc.ents if ent.label_ in ["PERSON", "ORG"]}),
        "locations": list({ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]}),
        "dates": list({ent.text for ent in doc.ents if ent.label_ == "DATE" and re.search(r"\d{4}", ent.text)}),
        "money": list({ent.text for ent in doc.ents if ent.label_ == "MONEY"})
    }

    # Use finditer + group(0) to get the whole "$..." match. re.findall would
    # return only the pattern's capturing group (the unit word, or ""), which
    # parse_money cannot turn into a number.
    parsed_amounts = (parse_money(m.group(0)) for m in re.finditer(MONEY_REGEX, text))
    entities["loan_amounts"] = [amt for amt in parsed_amounts if amt]
    return entities

# Absolute path of the Drive folder that holds the inputs and the outputs.
# The leading "/" matters: a relative "content/drive/..." path resolves
# against the current working directory and fails with
# "Cannot save file into a non-existent directory".
DATA_DIR = "/content/drive/MyDrive/NCU/Dissertation/Data"

# Minimum token_sort_ratio score for a borrower-name match.
MATCH_SCORE_CUTOFF = 85
# Maximum absolute difference (same units as CurrentApprovalAmount) between
# an amount mentioned in the press release and the approved loan amount.
AMOUNT_TOLERANCE = 1000

# Load press releases from JSON. Read explicitly as UTF-8: the scraper cell
# writes its JSON with encoding='utf-8' and ensure_ascii=False.
with open(f"{DATA_DIR}/pandemic_reports.json", "r", encoding="utf-8") as file:
    press_releases = json.load(file)

# Load loan applications data
loan_df = pd.read_csv(f"{DATA_DIR}/PPP_Loan_apps.csv")
loan_df["NormalizedBorrowerName"] = loan_df["BorrowerName"].apply(normalize_entity)

matched_results = []

for release in press_releases:
    content = release.get("press_release", "")
    if not content:
        # Nothing to extract from an empty or missing press release.
        continue
    entities = extract_entities(content)

    matched_app = None

    for name in entities["names"]:
        # score_cutoff lets rapidfuzz discard weak candidates early;
        # extractOne returns None when nothing reaches the cutoff.
        best = process.extractOne(
            name,
            loan_df["NormalizedBorrowerName"],
            scorer=fuzz.token_sort_ratio,
            score_cutoff=MATCH_SCORE_CUTOFF,
        )
        if best is None:
            continue
        match_name, score, idx = best

        # For a pandas Series, the third element is the index label of the
        # matched row, so look it up by label rather than by position.
        potential_match = loan_df.loc[idx]

        # A name match only counts when the release also mentions an amount
        # close to the approved loan amount.
        approved_amount = potential_match["CurrentApprovalAmount"]
        amount_matched = any(
            abs(amount - approved_amount) < AMOUNT_TOLERANCE
            for amount in entities["loan_amounts"]
        )

        if amount_matched:
            matched_app = potential_match
            break

    if matched_app is not None:
        result = {
            "Title": release["title"],
            "Date": release["date"],
            "Link": release["link"],
            "MatchedLoanNumber": matched_app["LoanNumber"],
            "MatchedBorrowerName": matched_app["BorrowerName"],
            "LoanAmount": matched_app["CurrentApprovalAmount"],
            "LoanApprovalDate": matched_app["DateApproved"],
            "is_fraudulent": 1
        }
        matched_results.append(result)

# Save matched results
matched_df = pd.DataFrame(matched_results)
matched_df.to_csv(f"{DATA_DIR}/matched_fraud_cases_22mar25.csv", index=False)
matched_df.to_json(f"{DATA_DIR}/matched_fraud_cases_22mar25.json", orient="records", indent=4)

print(f"Matched {len(matched_results)} fraud cases saved.")


OSError: Cannot save file into a non-existent directory: 'content/drive/MyDrive/NCU/Dissertation/Data'

In [None]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m158.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.12.2


In [None]:
# Show the value left in `reports` by the crawl loop (assuming the scraper
# cell ran in this session): the result of the most recent get_reports()
# call, which is an empty list if the loop stopped on an empty page.
reports