In [None]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [12]:
# Load the directory of sources from CSV and preview the first rows
source_list_csv = "csv/MDEQ-SRN-source-list-final.csv"
df = pd.read_csv(source_list_csv)
df.head()

Unnamed: 0,id,name,zip_code,county,full_address,geometry
0,A6260,ALGONAC CAST PRODUCTS INC,48001.0,SAINT CLAIR,"9300 STONE ROAD, ALGONAC, MI","[-82.54572189999999, 42.6299997]"
1,N8094,SPEEDWAY SUPERAMERICA LLC (REM SITE SSA 8702),48001.0,SAINT CLAIR,"440 POINTE TREMBLE RD, ALGONAC, MI","[-82.5371631, 42.6138238]"
2,N6769,SUNSATION PRODUCTS INC,48001.0,SAINT CLAIR,"9635 KRETZ DR, ALGONAC, MI","[-82.5700738, 42.6222489]"
3,P1024,ALTA EQUIPMENT COMPANY,48001.0,KENT,"8840 BYRON COMMERCE DRIVE, BYRON CENTER, MI","[-85.6703763, 42.80414830000001]"
4,U741802378,FORMER RESTAURANT,48001.0,SAINT CLAIR,"4219 POINTE TREMBLE ROAD, CLAY TOWNSHIP, MI","[-82.56713599999999, 42.620197]"


In [13]:
# Build one MI DEQ directory URL per source by substituting each
# source id from the directory into the download path.
source_links = [
    f"https://www.deq.state.mi.us/aps/downloads/SRN/{source_id}/"
    for source_id in df["id"]
]
len(source_links)

5837

In [None]:
# Loop through the list of links and scrape the contents of the directories.
#
# Every anchor in a directory listing is sorted into one of two buckets:
#   * "data"   - the link text matches the expected file-name pattern, so the
#                source id, document code and date can be extracted from it
#   * "extras" - the link text does not match; only its name and url are kept
#
# Names produced by this cell:
#   all_sources_data   - one list per source of dicts with the keys
#                        source_id, doc_type, date, doc_url
#   all_sources_extras - one list per source of dicts with the keys
#                        doc_name, doc_url
#   mistakes           - anchor tags that could not be processed (no href)
#   failed_sources     - (url, exception) pairs for directories whose request
#                        failed, timed out or returned an HTTP error status
#   df                 - every parsed document combined into one DataFrame
SITE_ROOT = 'https://www.deq.state.mi.us'
PARENT_DIRECTORY_URL = 'https://www.deq.state.mi.us/aps/downloads/SRN/'
# Without a timeout a single stalled connection would hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30

# Compiled once up front instead of being looked up again for every link.
SOURCE_ID_RE = re.compile(r"^\w\w?\d+")
DOC_TYPE_RE = re.compile(r"_([A-Z]+\d?\d?)_", re.IGNORECASE)
DATE_RE = re.compile(r"_(\d{8})")

all_sources_data = []
all_sources_extras = []
mistakes = []
failed_sources = []

for source in tqdm(source_links):
    try:
        response = requests.get(source, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        # Record the failure and keep going rather than losing the whole run.
        failed_sources.append((source, error))
        continue

    doc = BeautifulSoup(response.content, "html.parser")
    # source_data captures data from PDF names and urls
    source_data = []
    # source_extras captures PDF names and urls that don't fit the regex
    source_extras = []

    for link in doc.find_all('a'):
        href = link.get('href')
        if href is None:
            # An anchor without a target cannot be turned into a document url.
            mistakes.append(link)
            continue

        doc_url = SITE_ROOT + href
        # I don't want the ['To Parent Directory'] link:
        if doc_url == PARENT_DIRECTORY_URL:
            continue

        source_id_match = SOURCE_ID_RE.search(link.text)
        doc_type_match = DOC_TYPE_RE.search(link.text)
        date_match = DATE_RE.search(link.text)

        if source_id_match and doc_type_match and date_match:
            source_data.append({
                'source_id': source_id_match.group(0),
                'doc_type': doc_type_match.group(1),
                'date': date_match.group(1),
                'doc_url': doc_url,
            })
        else:
            # If something doesn't fit the regex, keep its name and url
            source_extras.append({
                'doc_name': link.text,
                'doc_url': doc_url,
            })

    # Only include the list of data if it isn't empty
    if source_data:
        all_sources_data.append(source_data)
    # Only include the list of extras if it isn't empty
    if source_extras:
        all_sources_extras.append(source_extras)

# Adding the results to a dataframe. This is done once, after the loop:
# rebuilding the frame on every iteration is quadratic in the number of sources.
# NOTE: this rebinds `df`, which until now held the source list read from CSV;
# the following cells expect the documents table under this name.
if all_sources_data:
    list_of_dfs = [pd.DataFrame(one_list) for one_list in all_sources_data]
    df = pd.concat(list_of_dfs, ignore_index=True)

In [35]:
# Show the documents DataFrame produced by the scraping loop
df

Unnamed: 0,source_id,doc_type,date,doc_url
0,N8277,ACO,20151217,https://www.deq.state.mi.us/aps/downloads/SRN/...
1,N8274,ACO,20151217,https://www.deq.state.mi.us/aps/downloads/SRN/...
2,U04060035,SAR,20190716,https://www.deq.state.mi.us/aps/downloads/SRN/...
3,U04060035,SAR,20200507,https://www.deq.state.mi.us/aps/downloads/SRN/...
4,N7824,FCE,20160525,https://www.deq.state.mi.us/aps/downloads/SRN/...
...,...,...,...,...
18272,N1784,TEST,20211102,https://www.deq.state.mi.us/aps/downloads/SRN/...
18273,P0374,FCE,20220108,https://www.deq.state.mi.us/aps/downloads/SRN/...
18274,P0374,SAR,20211104,https://www.deq.state.mi.us/aps/downloads/SRN/...
18275,N7508,TEST,20211005,https://www.deq.state.mi.us/aps/downloads/SRN/...


In [36]:
# Write the current DataFrame to CSV, leaving out the index column
documents_csv = "csv/MDEQ-SRN-documents.csv"
df.to_csv(documents_csv, index=False)

In [39]:
# Turning my list of lists of dicts of extra documents into a dataframe.
# NOTE: this rebinds `df`; from here on it holds the extra documents.
list_of_dfs = [pd.DataFrame(one_list) for one_list in all_sources_extras]
if list_of_dfs:
    df = pd.concat(list_of_dfs, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list, so when no extras were
    # collected fall back to an empty frame with the expected columns.
    df = pd.DataFrame(columns=["doc_name", "doc_url"])

In [42]:
# Write the current DataFrame to CSV, leaving out the index column
extra_documents_csv = "csv/MDEQ-SRN-extra-documents.csv"
df.to_csv(extra_documents_csv, index=False)