In [44]:
## import libraries
import requests  # Makes HTTP requests to fetch web pages from URLs
from bs4 import BeautifulSoup  # Parses HTML content into navigable Python objects for web scraping
import pandas as pd  # Creates and manipulates DataFrames for organizing scraped data into tables
import time  # Adds delays between requests to avoid overwhelming the server
from random import uniform  # Generates random time intervals to make scraping delays less predictable

In [46]:
## creating a function that would make a request to the New York State Commission on Ethics and Lobbying in Government website \
# and returns content as soup.

def makeSoup(url):
    
    headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
    response = requests.get(url, headers = headers)
    if response.status_code == 200:
        return BeautifulSoup(response.text, "html.parser")

    else:
        print(f"Your request returned {response.status_code}")

In [54]:
url = "https://ethics.ny.gov/financial-disclosure-statements-elected-officials?page=0"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
print(response.status_code)

200


In [56]:
# for string content like HTML, XML etc
response.text

'\n<!DOCTYPE html>\n<html lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">\n  <head>\n    <meta charset="utf-8" />\n<script async src="https://www.googletagmanager.com/gtag/js?id=G-DHKKCHMQNH"></script>\n<script>window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("set", "developer_id.dMDhkMT", true);gtag("config", "G-DHKKCHMQNH", {"groups":"default","page_placeholder":"PLACEHOLDER_page_location","allow_ad_personalization_signals":false});</script>\n<meta name="description" content="Financial disclosures of statewide elected officials, members of the Assembly, and Senators." />\n<link rel

In [60]:
## convert response.text into a BeautifulSoup object
soup = BeautifulSoup(response.text,"html.parser")
soup


<!DOCTYPE html>

<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
<head>
<meta charset="utf-8"/>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-DHKKCHMQNH"></script>
<script>window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("set", "developer_id.dMDhkMT", true);gtag("config", "G-DHKKCHMQNH", {"groups":"default","page_placeholder":"PLACEHOLDER_page_location","allow_ad_personalization_signals":false});</script>
<meta content="Financial disclosures of statewide elected officials, members of the Assembly, and Senators." name="description"/>
<link href="/profiles/

In [64]:
## prettify our printout
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
 <head>
  <meta charset="utf-8"/>
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=G-DHKKCHMQNH">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("set", "developer_id.dMDhkMT", true);gtag("config", "G-DHKKCHMQNH", {"groups":"default","page_placeholder":"PLACEHOLDER_page_location","allow_ad_personalization_signals":false});
  </script>
  <meta content="Financial disclosures of statewide elected officials, members of the Assembly, and Senators." name="description"/>
  <li

In [72]:
target_politicians = soup.find_all("div", class_="webny-teaser-title")
target_politicians

[<div class="webny-teaser-title">
 <a href="/sempolinski-joseph-2024">
                     Sempolinski, Joseph (2024)
             </a>
 </div>,
 <div class="webny-teaser-title">
 <a href="/2024-addabbo-jr-joseph-p-fds">
                     2024 Addabbo, Jr., Joseph P. FDS
             </a>
 </div>,
 <div class="webny-teaser-title">
 <a href="/ashby-jacob-c-2024">
                     Ashby, Jacob C. (2024)
             </a>
 </div>,
 <div class="webny-teaser-title">
 <a href="/bailey-jamaal-t-2024">
                     Bailey, Jamaal T. (2024)
             </a>
 </div>,
 <div class="webny-teaser-title">
 <a href="/baskin-april-nm-2024">
                     Baskin, April N.M. (2024)
             </a>
 </div>,
 <div class="webny-teaser-title">
 <a href="/borrello-george-m-2024">
                     Borrello, George M. (2024)
             </a>
 </div>,
 <div class="webny-teaser-title">
 <a href="/brisport-jabari-2024">
                     Brisport, Jabari (2024)
             </a>
 

In [84]:
# Because the url is inside an a tag within the div 
a_tags = [target_politician.find("a") for target_politician in target_politicians] 
a_tags

[<a href="/sempolinski-joseph-2024">
                     Sempolinski, Joseph (2024)
             </a>,
 <a href="/2024-addabbo-jr-joseph-p-fds">
                     2024 Addabbo, Jr., Joseph P. FDS
             </a>,
 <a href="/ashby-jacob-c-2024">
                     Ashby, Jacob C. (2024)
             </a>,
 <a href="/bailey-jamaal-t-2024">
                     Bailey, Jamaal T. (2024)
             </a>,
 <a href="/baskin-april-nm-2024">
                     Baskin, April N.M. (2024)
             </a>,
 <a href="/borrello-george-m-2024">
                     Borrello, George M. (2024)
             </a>,
 <a href="/brisport-jabari-2024">
                     Brisport, Jabari (2024)
             </a>,
 <a href="/brouk-samra-2024">
                     Brouk, Samra (2024)
             </a>,
 <a href="/bynoe-siela-2024">
                     Bynoe, Siela A. (2024)
             </a>,
 <a href="/canzoneri-fitzpatrick-patricia-m-2024">
                     Canzoneri-Fitzpatrick, Patricia

In [128]:
#checking document urls for page 1
partial_url = [a_tag.get("href") for a_tag in a_tags] 
partial_url

['/sempolinski-joseph-2024',
 '/2024-addabbo-jr-joseph-p-fds',
 '/ashby-jacob-c-2024',
 '/bailey-jamaal-t-2024',
 '/baskin-april-nm-2024',
 '/borrello-george-m-2024',
 '/brisport-jabari-2024',
 '/brouk-samra-2024',
 '/bynoe-siela-2024',
 '/canzoneri-fitzpatrick-patricia-m-2024']

In [175]:
## get urls for first 22 pages (this includes state senators and assemby people for 2024 and Governor and Lieutenant Governor for 2023)

base_url = "https://ethics.ny.gov/financial-disclosure-statements-elected-officials?"
domain = "https://ethics.ny.gov"

politician_financial_disclosures = []

for page in range (0,22):
    url = f"{base_url}page={page}"
    print (f"Scraping page {page}")

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    target_politicians = soup.find_all("div", class_="webny-teaser-title")

    for politician_div in target_politicians:
        a_tag = politician_div.find("a")
        if a_tag:
            href = a_tag.get("href")
            politician_name = a_tag.get_text(strip=True)
            all_partial_urls.append(href)

            if href:
                disclosure_doc_url = domain + href
                politician_financial_disclosures.append({
                    'name': politician_name,
                    'url': disclosure_doc_url
                })
        
        

  
    #snooze after each page (and not each document)     
    snoozer = uniform(15,30)
    print(f"snoozing for {snoozer} seconds before next scrape")
    time.sleep(snoozer)
                

print("done scraping all urls")

    

Scraping page 0
snoozing for 18.545940635110163 seconds before next scrape
Scraping page 1
snoozing for 24.629049129627738 seconds before next scrape
Scraping page 2
snoozing for 17.01993370395172 seconds before next scrape
Scraping page 3
snoozing for 16.53788169666133 seconds before next scrape
Scraping page 4
snoozing for 27.45808462810461 seconds before next scrape
Scraping page 5
snoozing for 16.83435668437907 seconds before next scrape
Scraping page 6
snoozing for 23.457515973610537 seconds before next scrape
Scraping page 7
snoozing for 27.12445051478615 seconds before next scrape
Scraping page 8
snoozing for 22.23510699926087 seconds before next scrape
Scraping page 9
snoozing for 18.34488724774307 seconds before next scrape
Scraping page 10
snoozing for 29.49760731935792 seconds before next scrape
Scraping page 11
snoozing for 21.962710860988985 seconds before next scrape
Scraping page 12
snoozing for 25.139255780034603 seconds before next scrape
Scraping page 13
snoozing for 

In [177]:
politician_financial_disclosures

[{'name': 'Sempolinski, Joseph (2024)',
  'url': 'https://ethics.ny.gov/sempolinski-joseph-2024'},
 {'name': '2024 Addabbo, Jr., Joseph P. FDS',
  'url': 'https://ethics.ny.gov/2024-addabbo-jr-joseph-p-fds'},
 {'name': 'Ashby, Jacob C. (2024)',
  'url': 'https://ethics.ny.gov/ashby-jacob-c-2024'},
 {'name': 'Bailey, Jamaal T. (2024)',
  'url': 'https://ethics.ny.gov/bailey-jamaal-t-2024'},
 {'name': 'Baskin, April N.M. (2024)',
  'url': 'https://ethics.ny.gov/baskin-april-nm-2024'},
 {'name': 'Borrello, George M. (2024)',
  'url': 'https://ethics.ny.gov/borrello-george-m-2024'},
 {'name': 'Brisport, Jabari (2024)',
  'url': 'https://ethics.ny.gov/brisport-jabari-2024'},
 {'name': 'Brouk, Samra (2024)',
  'url': 'https://ethics.ny.gov/brouk-samra-2024'},
 {'name': 'Bynoe, Siela A. (2024)',
  'url': 'https://ethics.ny.gov/bynoe-siela-2024'},
 {'name': 'Canzoneri-Fitzpatrick, Patricia M. (2024)',
  'url': 'https://ethics.ny.gov/canzoneri-fitzpatrick-patricia-m-2024'},
 {'name': 'Chan, Ste

In [179]:
len(politician_financial_disclosures)

220

In [181]:
df = pd.DataFrame(politician_financial_disclosures)
df

Unnamed: 0,name,url
0,"Sempolinski, Joseph (2024)",https://ethics.ny.gov/sempolinski-joseph-2024
1,"2024 Addabbo, Jr., Joseph P. FDS",https://ethics.ny.gov/2024-addabbo-jr-joseph-p...
2,"Ashby, Jacob C. (2024)",https://ethics.ny.gov/ashby-jacob-c-2024
3,"Bailey, Jamaal T. (2024)",https://ethics.ny.gov/bailey-jamaal-t-2024
4,"Baskin, April N.M. (2024)",https://ethics.ny.gov/baskin-april-nm-2024
...,...,...
215,2024 Lieutenant Governor Antonio Delgado,https://ethics.ny.gov/2024-lieutenant-governor...
216,2024 Attorney General Letitia James,https://ethics.ny.gov/2024-attorney-general-le...
217,2024 Comptroller Thomas P. DiNapoli,https://ethics.ny.gov/2024-comptroller-thomas-...
218,2023 Governor Kathleen Hochul,https://ethics.ny.gov/2023-governor-kathleen-h...


In [189]:
## Remove the 2024s in the name column
df["name"] = df["name"].str.replace(" (2024)", "").str.replace("2024 ", "")
df["name"] = df["name"].str.replace("2023 ", "")
df

Unnamed: 0,name,url
0,"Sempolinski, Joseph",https://ethics.ny.gov/sempolinski-joseph-2024
1,"Addabbo, Jr., Joseph P. FDS",https://ethics.ny.gov/2024-addabbo-jr-joseph-p...
2,"Ashby, Jacob C.",https://ethics.ny.gov/ashby-jacob-c-2024
3,"Bailey, Jamaal T.",https://ethics.ny.gov/bailey-jamaal-t-2024
4,"Baskin, April N.M.",https://ethics.ny.gov/baskin-april-nm-2024
...,...,...
215,Lieutenant Governor Antonio Delgado,https://ethics.ny.gov/2024-lieutenant-governor...
216,Attorney General Letitia James,https://ethics.ny.gov/2024-attorney-general-le...
217,Comptroller Thomas P. DiNapoli,https://ethics.ny.gov/2024-comptroller-thomas-...
218,Governor Kathleen Hochul,https://ethics.ny.gov/2023-governor-kathleen-h...


In [193]:
# Changing column names 
df.columns = ["Name of Politician", "Link to Financial Disclosure Document"]
df

Unnamed: 0,Name of Politician,Link to Financial Disclosure Document
0,"Sempolinski, Joseph",https://ethics.ny.gov/sempolinski-joseph-2024
1,"Addabbo, Jr., Joseph P. FDS",https://ethics.ny.gov/2024-addabbo-jr-joseph-p...
2,"Ashby, Jacob C.",https://ethics.ny.gov/ashby-jacob-c-2024
3,"Bailey, Jamaal T.",https://ethics.ny.gov/bailey-jamaal-t-2024
4,"Baskin, April N.M.",https://ethics.ny.gov/baskin-april-nm-2024
...,...,...
215,Lieutenant Governor Antonio Delgado,https://ethics.ny.gov/2024-lieutenant-governor...
216,Attorney General Letitia James,https://ethics.ny.gov/2024-attorney-general-le...
217,Comptroller Thomas P. DiNapoli,https://ethics.ny.gov/2024-comptroller-thomas-...
218,Governor Kathleen Hochul,https://ethics.ny.gov/2023-governor-kathleen-h...


In [207]:
pip install wget

Note: you may need to restart the kernel to use updated packages.


In [211]:
import wget

In [224]:
# for url in df['Link to Financial Disclosure Document']:
#     try:
#         wget.download(url)
#         sleep(uniform(10, 20))
#     print(f"\nDownloading {i+1} of {len(df)}: {politician_name}")
#     except:
#     print(f"Error: {e}")

In [238]:
## Download the pdfs
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}

for i, row in df.iterrows():
    url = row['Link to Financial Disclosure Document']
    name = row['Name of Politician']
    
    response = requests.get(url, headers=headers)
    open(f'{name}.pdf', 'wb').write(response.content) #save the pdfs and name them as politician's name
    print(f'Downloaded {name}')
    sleep(10)

Downloaded Sempolinski, Joseph
Downloaded Addabbo, Jr., Joseph P. FDS
Downloaded Ashby, Jacob C.
Downloaded Bailey, Jamaal T.
Downloaded Baskin, April N.M.
Downloaded Borrello, George M.
Downloaded Brisport, Jabari
Downloaded Brouk, Samra
Downloaded Bynoe, Siela A.
Downloaded Canzoneri-Fitzpatrick, Patricia M.
Downloaded Chan, Stephen
Downloaded Cleare, Cordell
Downloaded Comrie, Sr., Leroy G.
Downloaded Cooney, Jeremy A.
Downloaded Fahy, Patricia FDS
Downloaded Fernandez, Nathalia
Downloaded Gallivan, Patrick M. FDS
Downloaded Gianaris, Michael
Downloaded Gonzalez, Kristen S.
Downloaded Gounardes, Andrew S.
Downloaded Griffo, Joseph A. FDS
Downloaded Harckham, Peter B.
Downloaded Helming, Pamela A.
Downloaded Hinchey, Michelle
Downloaded Hoylman-Sigal, Brad
Downloaded Jackson, Robert
Downloaded Kavanagh, Brian FDS
Downloaded Krueger, Liz FDS
Downloaded Lanza, Andrew
Downloaded Liu, John C.
Downloaded Martinez, Monica R.
Downloaded Martins, Jack FDS
Downloaded Mattera, Mario
Downloaded

In [130]:

### Code i was trying with at first, used AI to tweak it. 


# base_url = "https://ethics.ny.gov"  # full url is https://ethics.ny.gov/borrello-george-m-2024

# politicians_list = []

# ## loop through 22 pages, since first 22 pages have 2024 politicians

# # Loop through each disclosure URL
# for index, path in enumerate(disclosure_url):
#     url = f"{base_url}{path}"
#     print(f"Scraping URL {index + 1}/{len(disclosure_url)}: {url}")

# for page_num in range (0,21):
    
#     url = f"{base_url}{disclosure_url}"
#     print(f"Scraping page {page_num}: {url}")

#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         target_politicians = soup.find_all("div", class_="webny-teaser-title")
           
            
#         for politician_div in target_politicians:
#             a_tag = politician_div.find("a")
#             href = a_tag.get("href")
#             politicians_list.append(href)

#     except Exception as e:
#         print(f"Error on page {page_num}: {e}")
        
#     finally:
#             snoozer = uniform(5,8)
#             print(f"snoozing for {snoozer} seconds before next scrape")
#             time.sleep(snoozer)
        
# print("done scraping all urls")
        
        
    