In [None]:
import requests
from bs4 import BeautifulSoup

def get_cam_listings():
    '''
    This function looks at the TCEQ's Monitor Site Listings Website and pulls the CAMS ID for each monitor available across the state.
    The IDs are read from the link text in each data row of the fifth table on the page (the first row is skipped as a header).
    Rows that do not contain a link are ignored instead of crashing the scrape.
    Returns: a list containing the ID (as text) of each CAMS monitor.
    Raises: requests.HTTPError if the site answers with an error status; IndexError if the page has fewer than five tables.
    '''
    resp = requests.get(
        "https://www.tceq.texas.gov/cgi-bin/compliance/monops/site_info.pl",
        timeout=30,  # never hang the notebook forever on an unresponsive server
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, 'html.parser')
    tables = soup.find_all("table")
    if len(tables) <= 4:
        raise IndexError(
            f"Expected at least 5 tables on the site listing page, found {len(tables)}"
        )
    table = tables[4]

    cam_options = []
    for tr in table.find_all('tr')[1:]:
        link = tr.find('a')
        # Spacer / sub-header rows carry no anchor; skip them.
        if link is not None:
            cam_options.append(link.text)
    return cam_options

def get_and_store_html(id):
    '''
    This function downloads to local storage the current day's monitoring data page for a single CAMS monitor.
    The page is saved as "./cams/<id>.html"; the "cams" subdirectory is created if it does not exist yet.
    id: the CAMS monitor ID, used both in the request URL and in the output file name.
    Returns: Nothing.
    Raises: requests.HTTPError if the site answers with an error status (nothing is written in that case).
    '''
    import os  # local import: this cell's import block does not bring in os

    resp = requests.get(
        f"https://www.tceq.texas.gov/cgi-bin/compliance/monops/daily_summary.pl?cams={id}",
        timeout=30,  # never hang the notebook forever on an unresponsive server
    )
    # Do not store an HTTP error page as if it were monitoring data.
    resp.raise_for_status()
    html = resp.text

    # open() does not create missing directories, so make sure the target exists.
    os.makedirs("cams", exist_ok=True)
    with open(f"cams/{id}.html", 'w') as f:
        f.write(html)

# Fetch the list of CAMS monitor IDs from the TCEQ site listing and show it.
# for i in range(40,100):
cams_list = get_cam_listings()
print(cams_list)
# The download loop below is currently disabled; build_df (defined in a later cell)
# reads "cams/<id>.html", so those files must already exist on disk for it to work.
# for i in cams_list:
#     get_and_store_html(i)
#     print('success!',i)


# soup = BeautifulSoup(resp.text) 
# tables = soup.find_all("table")
# table = tables[3]
# table

In [19]:
import pandas as pd
from bs4 import BeautifulSoup

#So far this only works if the monitor has BOTH parameters. Need to separate it out so that it can read either-or.

def build_df(id):
    '''
    This function collects the desired parameter values ("Ozone" and "PM-2.5") for the current day given a specified monitor ID input.
    The values are read from the locally-saved page "cams/<id>.html".
    A monitor only needs to report at least one of the parameters: a parameter that is not present on the page
    is filled with "NA" for every hour, and a shorter row is padded with "NA" so all columns have the same length.
    Returns: a dataframe object with columns "Ozone", "PM-2.5", "CAMS_ID" and "Hour" for a given single monitor.
             Values are the raw table strings (numbers or status codes such as "NA").
    Raises: ValueError if none of the parameters is found on the page.
    '''
    # Placeholder for hours/parameters with no reading. "NA" is also one of the
    # status codes the cleaning cell already converts to a missing value.
    missing = "NA"

    def get_param_index(param):
        '''
        This function is called by "get_values". It is used to find the table index of the desired parameter.
        Only rows 2 .. len(trs)-5 are searched (same window as before; the rows outside it are skipped).
        Returns: an integer value of the index, or None when no row name contains the parameter.
        '''
        for i in range(2, len(trs) - 4):
            tds = trs[i].find_all("td")
            if not tds:
                # A row without any <td> cannot name a parameter.
                continue
            if param in tds[0].text.strip():
                return i
        return None

    def get_values_at_idx(i) -> list:
        '''
        This function is called by "get_values". it is used to find the values in the TCEQ monitoring table at a specific index.
        All leading and trailing whitespaces are removed from each cell's text.
        First column is the parameter name. last two columns are parameter name and Parameter Occurence Code (POC). Skip these.
        Returns: a list of recorded values for the day so far. Numeric and Text both possible for individual td values.
        '''
        return [td.text.strip() for td in trs[i].find_all("td")[1:-2]]

    def get_values(param):
        '''
        This function finds the row of the input parameter and reads its values.
        Returns: a list of recorded values for the day so far, or None when the monitor does not report the parameter.
        '''
        idx = get_param_index(param)
        if idx is None:
            return None
        return get_values_at_idx(idx)

    def get_trs_from_id(id) -> list:
        '''
        This function opens a locally-saved HTML page and returns the table rows for the last table on the page.
        Returns: All table rows within the page's last table.
        '''
        with open(f"cams/{id}.html", 'r') as f:
            html = f.read()

        # NOTE(review): no parser is named here, so bs4 picks one itself; left as-is so
        # existing pages keep parsing the same way.
        soup = BeautifulSoup(html)
        table = soup.find_all('table')[-1]
        return table.find_all('tr')

    trs = get_trs_from_id(id)
    params = ["Ozone", "PM-2.5"]

    # Look every parameter up once; None marks a parameter this monitor does not report.
    found = {param: get_values(param) for param in params}
    available = [vals for vals in found.values() if vals is not None]
    if not available:
        raise ValueError(f"CAMS {id}: none of the parameters {params} were found")

    # Use the longest row as the number of hours so every column lines up.
    num_hours = max(len(vals) for vals in available)

    data = {}
    for param in params:
        vals = found[param] if found[param] is not None else []
        data[param] = vals + [missing] * (num_hours - len(vals))

    df = pd.DataFrame(data)
    df['CAMS_ID'] = id
    df['Hour'] = list(range(num_hours))
    return df

# Build one dataframe per monitor and stack them into a single frame.
dfs = []
# for i in cams_list:
for i in [3]:
    try:
        dfs.append(build_df(i))
    except Exception as e:
        # Best effort: one bad monitor must not stop the run, but say which one
        # was skipped and why instead of hiding the failure.
        print(f"Skipping CAMS {i}: {type(e).__name__}: {e}")

if not dfs:
    # pd.concat([]) would fail with a generic message; explain the real cause.
    raise ValueError("No monitor data could be loaded; see the skipped monitors reported above")

df = pd.concat(dfs).reset_index(drop=True)
# df
    

In [None]:
# Exploratory check of which monitor IDs ended up in df.
# Only the value of the last expression in the cell is displayed.
# sorted(df._id.unique())
df.CAMS_ID.unique()
# NOTE(review): this tests for the *string* '3'. The loop that builds df passes the
# integer 3, so this is presumably expected to be False — confirm the intended dtype.
'3' in df.CAMS_ID.unique()

In [20]:
import os

# Status codes that can appear in the table in place of a numeric reading.
val_errors = ["NA","LST","FEW","CAL","LIM","NOD","NEG","QAS","PMA","SPN"]


def _to_float_or_none(x):
    '''
    Convert one raw table value to a float.
    Returns: None for any of the status codes in val_errors, otherwise float(x).
    '''
    return float(x) if x not in val_errors else None


# Same conversion for both pollutant columns.
for _col in ['Ozone', 'PM-2.5']:
    df[_col] = df[_col].apply(_to_float_or_none)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("transformed", exist_ok=True)
df.to_csv("transformed/cleaner.csv", index=False)
df

Unnamed: 0,Ozone,PM-2.5,CAMS_ID,Hour
0,22.0,6.0,3,0
1,20.0,3.0,3,1
2,21.0,0.0,3,2
3,22.0,3.0,3,3
4,20.0,4.0,3,4
5,19.0,4.0,3,5
6,19.0,5.0,3,6
7,21.0,8.0,3,7
8,24.0,18.0,3,8
9,30.0,17.0,3,9


In [36]:
import plotly.express as px

# Scatter of PM-2.5 against Ozone for monitor 3 only, one point per hour,
# coloured by hour and labelled with the monitor ID.
# CAMS_ID holds integers here, so compare against 3 rather than "3".
filt = df["CAMS_ID"].eq(3)

fig = px.scatter(
    df.loc[filt],
    x="PM-2.5",
    y="Ozone",
    color="Hour",
    text="CAMS_ID",
    title="Hourly Ozone and PM2.5 Concentrations Across Texas",
    labels={
        "Hour": "Hour",
        "Ozone": "Ozone Concentration (ppb)",
        "PM-2.5": "Particulate Concentration (ug/m3)",
    },
)
# Place the ID labels beside the markers instead of on top of them.
fig.update_traces(textposition='top right')

In [44]:
# Exploratory checks on df; only the last expression's value is displayed.
# Row 0, column position 2, compared with the integer 3.
df.iloc[0,2] == 3
# Ozone values for index labels 0 through 3 (.loc slices include the end label).
df.loc[0:3,'Ozone']
# Same comparison as the first line, but selecting the column by name.
df.loc[0,'CAMS_ID'] == 3
# filt = df["CAMS_ID"] == "3"
# df[filt]

True

In [None]:
class Example:
    '''
    Minimal demonstration class that stores a single name.
    '''

    def __init__(self, name: str) -> None:
        '''
        name: the value kept on the instance as the "name" attribute.
        '''
        self.name = name

# Create an instance and display its name attribute as the cell output.
eg = Example("parker")
eg.name

In [None]:
# NOTE(review): `_get_param_index` is not defined in any of the visible cells. The only
# similarly named helper, `get_param_index`, is nested inside build_df and cannot be
# called from here, so this cell presumably fails with NameError on a fresh kernel.
oz_idx = _get_param_index("Ozone")
pm_idx = _get_param_index("PM")

In [None]:
# Print the text of each child of the fifth-from-last table row.
# NOTE(review): in the visible cells `trs` is only assigned inside build_df, so it
# presumably does not exist at notebook level on a fresh kernel — confirm.
# `pm25_tr` is assigned but the loop indexes trs[-5] again rather than using it.
pm25_tr = trs[-5]
for td in trs[-5]:
    print(td.text)