In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Lift the column display limit so wide DataFrames are shown with every column.
pd.set_option('display.max_columns', None)

In [None]:
# Function Defs
def custom_title_case(s):
    """Return ``s`` converted to upper case.

    Despite the name, no title-casing happens here: the result is exactly
    ``s.upper()``.
    """
    uppercased = s.upper()
    return uppercased

def convert_date(d):
    """Reformat a ``MM-DD-YYYY`` date into ``YYYY-MM-DD`` text.

    ``d`` is passed through ``str()`` before parsing. ``datetime.strptime``
    raises ``ValueError`` when the text does not match ``%m-%d-%Y``.
    """
    return datetime.strptime(str(d), "%m-%d-%Y").strftime("%Y-%m-%d")

def fix_recipient_name(recipient):
    """Normalise a recipient label into upper-case "REST FIRST" order.

    A trailing parenthesised suffix (everything from the first ``(`` up to a
    closing ``)`` at the end of the string) is removed and the remainder is
    stripped. If the remainder contains ``', '``, the piece before the first
    separator is moved to the end and the other pieces are joined with single
    spaces. Otherwise the whole remainder is returned. Casing is delegated to
    ``custom_title_case``.
    """
    without_suffix = re.sub(r'\(.*\)$', '', recipient).strip()
    leading, *remaining = without_suffix.split(', ')

    # No ", " separator: nothing to reorder.
    if not remaining:
        return custom_title_case(without_suffix)

    trailing_name = custom_title_case(leading)
    leading_names = custom_title_case(' '.join(remaining))
    return f"{leading_names} {trailing_name}"

def split_contributor(contributor):
    """Split a contributor cell into name and location components.

    The cell is expected to look like ``"LAST, FIRST  CITY, ST 12345"``: the
    name and the address are separated by two spaces.

    Args:
        contributor: Raw contributor text. Non-string values (for example a
            missing cell read as NaN/None) are treated as empty.

    Returns:
        A ``(last_name, first_name, village, zip_code)`` tuple. Names are
        passed through ``custom_title_case``; ``first_name`` is ``''`` when
        the name has no ``', '`` separator. ``village`` is the run of letters
        and spaces before the first comma at the start of the address, and
        ``zip_code`` is a 5-digit group at the very end of the address. Both
        are ``None`` when absent or when there is no address part at all.
    """
    if not isinstance(contributor, str):
        return '', '', None, None

    p = contributor.split('  ')
    # Without the double-space separator there is no address part; fall back
    # to an empty string instead of raising IndexError on p[1].
    address = p[1] if len(p) > 1 else ''

    zip_code_match = re.search(r'(\b\d{5}\b)$', address)
    zip_code = zip_code_match.group(1) if zip_code_match else None

    village_match = re.search(r'(^[a-zA-Z ]+),', address)
    village = village_match.group(1) if village_match else None

    name_parts = p[0].split(', ')
    first_name = custom_title_case(' '.join(name_parts[1:])) if len(name_parts) > 1 else ''
    last_name = custom_title_case(name_parts[0])

    return last_name, first_name, village, zip_code

def clean_amount(amount):
    """Convert a formatted money string such as ``"$2,700"`` to whole dollars.

    Args:
        amount: The amount text. Non-string values are converted with
            ``str()`` first, so numeric cells are accepted too.

    Returns:
        The amount as an ``int``. A minus sign before the first digit makes
        the result negative. Anything after a decimal point is discarded
        rather than being concatenated onto the dollar digits. ``0`` is
        returned when the text contains no digits before the decimal point.
    """
    text = str(amount).strip()

    # Keep only the part before the decimal point so "$1,000.50" is 1000,
    # not 100050.
    whole_part = text.split('.', 1)[0]

    first_digit = re.search(r'\d', whole_part)
    if first_digit is None:
        return 0

    # A '-' ahead of the digits ("-$500" or "$-500") marks a negative amount.
    is_negative = '-' in whole_part[:first_digit.start()]

    value = int(re.sub(r'\D', '', whole_part))
    return -value if is_negative else value

def is_ny_zip_code(zip_code):
    """Tell whether ``zip_code`` is an all-digit string in 10001..14975.

    Falsy values (``None``, ``''``) and strings containing anything other
    than digits yield ``False``.
    """
    # NOTE(review): the bounds are taken as-is from the original; confirm they
    # cover every ZIP code that should count as New York.
    if not zip_code or not zip_code.isdigit():
        return False
    return 10001 <= int(zip_code) <= 14975

def process_data(url, chromedriver_path='/usr/local/bin/chromedriver-linux64/chromedriver', max_pages=None):
    """Scrape every result page behind ``url`` and return the cleaned rows.

    Pages are requested as ``{url}&page=N`` with a headless Chrome driver,
    starting at page 1, until a page yields no table, an empty table, or
    ``max_pages`` is reached. The first ``<table>`` of each page is parsed
    with pandas and the pages are concatenated.

    Cleaning steps, in order:
      * the last row of the concatenated table is dropped;
      * rows whose Category contains the disclaimer text are removed;
      * the party code is extracted from a trailing ``(XYZ)`` on Recipient;
      * Recipient, Amount and Date are normalised with ``fix_recipient_name``,
        ``clean_amount`` and ``convert_date``;
      * Contributor is split into Last Name / First Name / Village / Zip Code;
      * only rows whose Zip Code passes ``is_ny_zip_code`` are kept.

    Args:
        url: Results URL that already carries a query string (``&page=N`` is
            appended to it).
        chromedriver_path: Location of the chromedriver binary.
        max_pages: Optional upper bound on the number of pages fetched;
            ``None`` keeps paging until the results run out.

    Returns:
        A DataFrame with the cleaned rows. It is empty when no page produced
        data or when nothing is left after the disclaimer filter.

    Raises:
        Whatever ``driver.get`` raises on navigation failure. The driver is
        shut down first.
    """
    # Local imports keep this cell self-contained.
    import warnings
    from io import StringIO
    from selenium.common.exceptions import TimeoutException

    disclaimer = 'FEDERAL LAW PROHIBITS THE USE OF CONTRIBUTOR INFORMATION FOR THE PURPOSE OF SOLICITING CONTRIBUTIONS OR FOR ANY COMMERCIAL PURPOSE.'
    columns_order = ['Category', 'Contributor', 'Last Name', 'First Name', 'Village', 'Zip Code', 'Employer', 'Occupation', 'Date', 'Amount', 'Recipient', 'Party', 'Recipient Jurisdiction']
    all_data = []
    page = 1

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # try/finally guarantees the browser process is released even when a
    # navigation error escapes the loop.
    try:
        while max_pages is None or page <= max_pages:
            driver.get(f"{url}&page={page}")

            try:
                table = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.XPATH, '//table'))
                )
                # pandas deprecates passing literal HTML; wrap it in a
                # file-like object instead.
                page_df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]
            except (TimeoutException, ValueError):
                # No table showed up / could be parsed: treated as the end of
                # the results, as before.
                break
            except Exception as exc:
                # Still best-effort, but do not hide unexpected failures.
                warnings.warn(f"Stopped paging {url} at page {page}: {exc!r}")
                break

            if page_df.empty:
                break

            all_data.append(page_df)
            page += 1
    finally:
        driver.quit()

    if not all_data:
        return pd.DataFrame()

    df = pd.concat(all_data, ignore_index=True)
    # The final row is discarded unconditionally, as in the original code.
    # NOTE(review): presumably a trailing footer/disclaimer row -- confirm.
    df = df.iloc[:-1]
    df.columns = ['Category', 'Contributor', 'Employer', 'Occupation', 'Date', 'Amount', 'Recipient', 'Recipient Jurisdiction']

    # Remove disclaimer rows first so the per-row cleaners below never see
    # them. Literal matching (regex=False) on str-cast values also keeps NaN
    # categories from breaking the boolean mask.
    is_disclaimer = df['Category'].astype(str).str.contains(disclaimer, regex=False)
    df = df[~is_disclaimer].copy()
    if df.empty:
        return pd.DataFrame(columns=columns_order)

    df['Party'] = df['Recipient'].str.extract(r'\(([A-Z]+)\)$')
    df['Recipient'] = df['Recipient'].apply(fix_recipient_name)
    df['Amount'] = df['Amount'].apply(clean_amount)

    # Build the split columns in one go; index=df.index keeps them aligned
    # with the filtered rows.
    split_contributions = pd.DataFrame(
        [split_contributor(contributor) for contributor in df['Contributor']],
        columns=['Last Name', 'First Name', 'Village', 'Zip Code'],
        index=df.index,
    )
    df = pd.concat([df, split_contributions], axis=1)

    # astype(bool) keeps this a boolean mask even for an empty result;
    # .copy() avoids assigning into a slice on the next line.
    df = df[df['Zip Code'].apply(is_ny_zip_code).astype(bool)].copy()
    df['Date'] = df['Date'].apply(convert_date)

    return df[columns_order]

def generate_urls_and_aggregate_data(donor_data):
    """Look up every donor in ``donor_data`` and stack the scraped results.

    For each row a donor-lookup URL is built from the ``FNAME`` and ``LNAME``
    columns and handed to ``process_data``. Non-empty results are collected
    and concatenated once at the end.

    Args:
        donor_data: DataFrame with ``FNAME`` and ``LNAME`` columns.

    Returns:
        A single DataFrame with all non-empty results (index reset), or an
        empty DataFrame when no lookup returned data.
    """
    from urllib.parse import quote_plus

    base_url = "https://www.opensecrets.org/donor-lookup/results?name="
    collected = []
    for _, row in donor_data.iterrows():
        # quote_plus turns spaces into '+' (same URL as before for plain
        # names) and escapes characters such as ' or & that would otherwise
        # corrupt the query string.
        full_name = quote_plus(f"{row['FNAME']} {row['LNAME']}")
        data = process_data(f"{base_url}{full_name}")
        if not data.empty:
            collected.append(data)

    if not collected:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the aggregate per donor.
    return pd.concat(collected, ignore_index=True)

In [None]:
# Tests

def test_convert_date():
    pass

def test_split_contributor():
    pass