In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

import pathlib
from dateutil.parser import parse
import csv
from io import StringIO
from statsmodels.nonparametric.smoothers_lowess import lowess

In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg

In [3]:
%%time 
# Update the cached copy of the website. 
#!wget -q --mirror --convert-links --adjust-extension --page-requisites --no-parent -P ../mirror http://jtimmer.cts.com/

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs


In [4]:
from itertools import zip_longest

def seq_index(a, b):
    return [(i, i+len(b)) for i in range(len(a)) if a[i:i+len(b)] == b]
    
    
def find_start_line(d):
    '''Find the header that marks the start of data'''
    h = [ str(e) for e in list(range(24))] # look for the line with the hours 0...23
    
    for i in range(10):
        if seq_index(d[i], h):
            return i
    
    return None

header_map = {
    'RDS':'Hr. of Max'  ,
    'Site Name': 'SiteName'
}

gauge = None
frames = []
for i, file in enumerate(pathlib.Path('../mirror').glob("**/*.CSV")):
    
    if file.name.startswith('8_') or 'current' in file.name: # 8 hour average files
        continue

    parts = file.stem.split('_')
    
    # Filenames are one day after 
    date = parse(parts[-1]).date() - pd.to_timedelta(1, unit='d')
    data = list(csv.reader(file.open(encoding='latin1')))
    
    sl = find_start_line(data)


    if sl is None:
        print("ERROR No Start Line in ", file)
        continue
    
    data = data[sl:]
    
    # Check that the header is the same for every file. 
    header = list( a if a else b for a,b in zip_longest(data[0], data[1]) )
    header = [ header_map.get(e,e) for e in header]
    headerline = ' '.join(header).strip()
    if gauge is None: 
        gauge = headerline
   
    if headerline != gauge:
        print ('ERROR', file)
        print (headerline)
        print (gauge)
        break
    
    rows = [ [date]+r for r in data[2:] ]
    frames.append(pd.DataFrame(rows, columns = ['date'] + header))
    
df = pd.concat(frames, ignore_index=True)  
df['Parameter'] = df.Parameter.replace('',None).fillna(method='ffill')

ERROR No Start Line in  ../mirror/jtimmer.cts.com/2014/Oct14/yesterday_20141025.CSV
ERROR No Start Line in  ../mirror/jtimmer.cts.com/2014/Oct14/yesterday_20141024.CSV
ERROR No Start Line in  ../mirror/jtimmer.cts.com/2014/Oct14/yesterday_20141026.CSV
ERROR No Start Line in  ../mirror/jtimmer.cts.com/2014/Oct14/yesterday_20141027.CSV
ERROR No Start Line in  ../mirror/jtimmer.cts.com/2014/Oct14/yesterday_20141023.CSV
ERROR No Start Line in  ../mirror/jtimmer.cts.com/2014/Oct14/yesterday_20141022.CSV


In [5]:
t = df.set_index(['date','Parameter','SiteName'])
daily_cols = ['Hr. of Max', 'Max', 'Summary']
daily = t[daily_cols].drop_duplicates().copy()
daily.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Hr. of Max,Max,Summary
date,Parameter,SiteName,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-04,01 OZONE PPM,ALPINE,13.0,0.066,0.047
2020-05-04,01 OZONE PPM,CHULA VISTA,18.0,0.043,0.03
2020-05-04,01 OZONE PPM,DOWNTOWN SES,11.0,0.046,0.034
2020-05-04,01 OZONE PPM,EL CAJON LES,12.0,0.057,0.036
2020-05-04,01 OZONE PPM,ESCONDIDO,,,


In [6]:
hourly = df.set_index(['date','Parameter','SiteName']).drop(columns=daily).stack().to_frame('value').reset_index().rename(columns={'level_3':'hour'}).copy()
hourly.head()
hourly['date'] = hourly.apply(lambda r: r.date.replace(hour=int(r.hour)), axis=1)

In [7]:
hourly['value'] = pd.to_numeric(hourly.value, errors='coerce')
hourly.head()

Unnamed: 0,date,Parameter,SiteName,hour,value
0,2020-05-04 00:00:00,01 OZONE PPM,ALPINE,0,0.029
1,2020-05-04 01:00:00,01 OZONE PPM,ALPINE,1,0.034
2,2020-05-04 02:00:00,01 OZONE PPM,ALPINE,2,
3,2020-05-04 03:00:00,01 OZONE PPM,ALPINE,3,
4,2020-05-04 04:00:00,01 OZONE PPM,ALPINE,4,0.043


In [10]:
hourly.to_csv('../data/hourly_pollution.csv', index=False)
daily.to_csv('../data/daily_pollution.csv', index=False)

In [None]:
hourly.Parameter.value_counts().sort_index()

In [None]:
hourly.SiteName.value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(16,4))
dtn_nox = hourly[(hourly.SiteName == 'KEARNY MESA') & (hourly.Parameter == '02 NOX PPM')]
t = dtn_nox.set_index('date').value.resample('W').mean().dropna()

smoothed = lowess(t,  t.index, .04)

sns.lineplot(t.index, smoothed[:,1], ax=ax)
sns.scatterplot(t.index, t, ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(16,4))
sns.lineplot(t.index, smoothed[:,1], ax=ax)
sns.scatterplot(t.index, t, ax=ax)
ax.set_xlim('2017-11-01', '2018-06-01')