In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

In [2]:
# Load the directory of sources from the local CSV file.
# Later cells read the `id` column of this frame to build each source's URL.
df = pd.read_csv("MDEQ-SRN-directory.csv")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,name,zip_code,county,full_address,geometry
0,A6260,ALGONAC CAST PRODUCTS INC,48001.0,SAINT CLAIR,"9300 STONE ROAD, ALGONAC, MI","[-82.54572189999999, 42.6299997]"
1,N8094,SPEEDWAY SUPERAMERICA LLC (REM SITE SSA 8702),48001.0,SAINT CLAIR,"440 POINTE TREMBLE RD, ALGONAC, MI","[-82.5371631, 42.6138238]"
2,N6769,SUNSATION PRODUCTS INC,48001.0,SAINT CLAIR,"9635 KRETZ DR, ALGONAC, MI","[-82.5700738, 42.6222489]"
3,P1024,ALTA EQUIPMENT COMPANY,48001.0,KENT,"8840 BYRON COMMERCE DRIVE, BYRON CENTER, MI","[-85.6703763, 42.80414830000001]"
4,U741802378,FORMER RESTAURANT,48001.0,SAINT CLAIR,"4219 POINTE TREMBLE ROAD, CLAY TOWNSHIP, MI","[-82.56713599999999, 42.620197]"


In [3]:
# Build the directory-listing URL for every source by dropping its id
# into the MI DEQ download path.
source_links = [
    f"https://www.deq.state.mi.us/aps/downloads/SRN/{source_id}/"
    for source_id in df["id"]
]
# Show the first few URLs as a sanity check.
source_links[:5]

['https://www.deq.state.mi.us/aps/downloads/SRN/A6260/',
 'https://www.deq.state.mi.us/aps/downloads/SRN/N8094/',
 'https://www.deq.state.mi.us/aps/downloads/SRN/N6769/',
 'https://www.deq.state.mi.us/aps/downloads/SRN/P1024/',
 'https://www.deq.state.mi.us/aps/downloads/SRN/U741802378/']

In [None]:
# Loop through the list of links and scrape the contents of the directories.
#
# Results produced by this cell:
#   all_sources_data   - one list per source; each entry is a dict with
#                        source_id / doc_type / date / doc_url parsed from a
#                        document's file name
#   all_sources_extras - one list per source that had links whose names don't
#                        fit the regexes; each entry has doc_name / doc_url
#   mistakes           - <a> tags that could not be recorded at all (no href)
#   failed_sources     - (url, exception) pairs for listings that could not be
#                        fetched, so one bad request no longer aborts the run

BASE_URL = "https://www.deq.state.mi.us"
# Seconds to wait on the server before giving up on a single listing;
# without a timeout a stalled connection would hang the whole scrape.
REQUEST_TIMEOUT = 30

# File-name patterns, compiled once instead of on every link.
SOURCE_ID_RE = re.compile(r"^\w\w?\d+")      # leading source id
DOC_TYPE_RE = re.compile(r"_([A-Z]+\d?)_")   # document code between underscores
DATE_RE = re.compile(r"_(\d{8})")            # eight-digit date after an underscore


def _parse_document_name(name):
    """Extract the fields encoded in a document file name.

    Returns a dict with 'source_id', 'doc_type' and 'date' (the first match of
    each pattern), or None when any of the three patterns does not match.
    """
    source_id = SOURCE_ID_RE.search(name)
    doc_type = DOC_TYPE_RE.search(name)
    date = DATE_RE.search(name)
    if source_id is None or doc_type is None or date is None:
        return None
    return {
        'source_id': source_id.group(0),
        'doc_type': doc_type.group(1),
        'date': date.group(1),
    }


all_sources_data = []
all_sources_extras = []
mistakes = []
failed_sources = []
for source in source_links:
    # company_data captures data from PDF names and urls
    company_data = []
    # company_extras captures PDF names and urls that don't fit the regex
    company_extras = []

    try:
        response = requests.get(source, timeout=REQUEST_TIMEOUT)
        # Don't parse an HTTP error page as if it were a directory listing.
        response.raise_for_status()
    except requests.RequestException as error:
        failed_sources.append((source, error))
        # Keep one entry per source so the result stays aligned with source_links.
        all_sources_data.append(company_data)
        continue

    doc = BeautifulSoup(response.content, "html.parser")
    for link in doc.find_all('a'):
        if link.text == '[To Parent Directory]':
            continue

        href = link.get('href')
        if href is None:
            # An anchor without an href can't be turned into a document URL.
            mistakes.append(link)
            continue

        # Assumes hrefs in the listing are root-relative paths - TODO confirm.
        doc_url = BASE_URL + href
        parsed = _parse_document_name(link.text)
        if parsed is None:
            company_extras.append({'doc_name': link.text, 'doc_url': doc_url})
        else:
            parsed['doc_url'] = doc_url
            company_data.append(parsed)

    all_sources_data.append(company_data)
    # Only include the list of extras if it isn't empty
    if company_extras:
        all_sources_extras.append(company_extras)

In [None]:
# Turn the list of lists of dicts of company data into a single dataframe.
# Flattening first builds one DataFrame instead of one per source, and still
# works (giving an empty frame) when there is nothing to concatenate, where
# pd.concat would raise on an empty list.
document_records = [
    record
    for one_list in all_sources_data
    for record in one_list
]
df = pd.DataFrame(document_records)

In [None]:
# Write the current `df` to CSV; index=False leaves the row index out of the file.
df.to_csv("MI_Sources_All_Documents.csv", index=False)

In [None]:
# Turn the list of lists of dicts of extra documents into a single dataframe.
# all_sources_extras only holds non-empty lists and may itself be empty when
# every file name fit the regexes; flattening the records handles that case
# (pd.concat raises on an empty list) and avoids one DataFrame per source.
extra_records = [
    record
    for one_list in all_sources_extras
    for record in one_list
]
df = pd.DataFrame(extra_records)
df

In [None]:
# Write the current `df` to CSV; index=False leaves the row index out of the file.
df.to_csv("MI_Sources_All_Extras.csv", index=False)