In [1]:
import pandas as pd
import numpy as np

import time
import re

In [2]:
# Pollutants to load. Other candidates noted for this dataset (not loaded
# here): "NO2", "SO2", "Pb", "CO".
pollutants = ["Ozone", "PM25", "PM10"]

# Key columns shared by every per-pollutant file; they are the merge keys.
common_cols = ["Date", "Site ID", "SITE_LATITUDE", "SITE_LONGITUDE"]
# Empty seed frame so the first pollutant goes through the same outer merge
# as the following ones.
df = pd.DataFrame(columns=common_cols)

# Maps pollutant name -> name of its measurement column.
data_cols = {}

# File name template, filled with (year, lower-cased pollutant name).
datafile = 'airQual_CA_{}_{}.csv'
# The measurement column is selected by position, not by name.
data_col_idx = 4

for p in pollutants:
    # Stack the yearly files of this pollutant.
    dfs_p = [pd.read_csv(datafile.format(y, p.lower())) for y in [2018, 2019]]
    df_p = pd.concat(dfs_p)
    init_len = len(df_p)

    # Keep only rows of the most frequent parameter description. mode() can
    # return several values on a tie; take the first instead of calling
    # .item(), which raises when there is more than one.
    parameter = df_p["AQS_PARAMETER_DESC"].mode().iloc[0]
    # .copy() makes the filtered frame independent, so the Date assignment
    # below does not write into a slice (SettingWithCopyWarning).
    df_p = df_p[df_p['AQS_PARAMETER_DESC'] == parameter].copy()
    df_p["Date"] = pd.to_datetime(df_p["Date"])

    # Row counts before and after the parameter filter.
    print(p, init_len, len(df_p))

    data_col = df_p.columns[data_col_idx]
    data_cols[p] = data_col
    keep_cols = common_cols + [data_col]

    # Outer merge keeps every (date, site) seen for any pollutant; rows that
    # lack a pollutant get NaN in that pollutant's column.
    df = df.merge(df_p[keep_cols], on=common_cols, how="outer")

Ozone 116698 116698
PM25 104937 75810
PM10 67576 67576


In [3]:
# Keep only the rows in which no column is missing.
df = df.dropna(axis=0, how="any")

In [4]:
# Per-column count of missing values still present in df.
df.isnull().sum()

Date                                    0
Site ID                                 0
SITE_LATITUDE                           0
SITE_LONGITUDE                          0
Daily Max 8-hour Ozone Concentration    0
Daily Mean PM2.5 Concentration          0
Daily Mean PM10 Concentration           0
dtype: int64

In [5]:
# Number of rows in df.
df.shape[0]

30806

In [6]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Date,Site ID,SITE_LATITUDE,SITE_LONGITUDE,Daily Max 8-hour Ozone Concentration,Daily Mean PM2.5 Concentration,Daily Mean PM10 Concentration
2414,2018-01-01,60070008,39.76168,-121.84047,0.018,23.2,35.0
2416,2018-01-03,60070008,39.76168,-121.84047,0.013,24.4,41.0
2417,2018-01-04,60070008,39.76168,-121.84047,0.023,17.5,26.0
2418,2018-01-05,60070008,39.76168,-121.84047,0.019,10.8,16.0
2419,2018-01-06,60070008,39.76168,-121.84047,0.022,9.8,17.0


In [7]:
# One row per (Date, Site ID): the first occurrence is kept, later ones dropped.
df = df.drop_duplicates(subset=["Date", "Site ID"], keep="first")

In [8]:
# Run configuration.
# NOTE(review): min_temps and n_signals are not referenced by the other
# visible cells - confirm whether they are still needed.
min_temps = 365
n_signals = 365
knn = 5            # becomes part of the output file name
dir_graph = False  # when True, '_dir' is appended to the output file name

# Base name of the output file (a previous alternative was
# '../temperatures2003_3months'); the options above are appended to it.
base_name = 'data/air_quality2018-19CA'
file_name = '{}_knn{}'.format(base_name, knn)
file_name = file_name + '_dir' if dir_graph else file_name

In [9]:
# Distinct site identifiers found in df. Going through set() removes the
# repeats; the order of the resulting list is whatever the set yields.
unique_sites = set(df["Site ID"].tolist())
stations_ids = list(unique_sites)
len(stations_ids)

58

In [13]:
# Every calendar day from 2018-01-01 to 2019-12-31 inclusive, daily frequency.
ranges_2021819 = pd.date_range(start="2018-01-01", end="2019-12-31", freq="D")

In [14]:
# Build one daily series per pollutant for every station with enough data.
# Stations with fewer than MIN_VALS daily records are skipped; stations with
# some missing days are completed by linear interpolation.
MIN_VALS = 2*330
# Number of days in the 2018-2019 window (neither year is a leap year).
N_DAYS = 2*365

pm25 = []
oz = []
pm10 = []
# Stations actually kept, in the same order as the rows of pm25 / oz / pm10.
stations_l = []
data_cols_names = list(data_cols.values())

for s in stations_ids:
    station_vals = df[df['Site ID'] == s].copy()

    if len(station_vals) < MIN_VALS:
        # Too many missing days: skip this station.
        continue
    elif len(station_vals) == N_DAYS:
        # Complete station: every day is present.
        print("Complete ", s)
        stations_l.append(s)
        # Sort by date so the series is chronological, like the interpolated
        # branch below (which sorts its index).
        df_s = station_vals.sort_values('Date')[data_cols_names]
    elif len(station_vals) > N_DAYS:
        # More rows than days: duplicates survived the earlier clean-up.
        raise RuntimeError("Cannot be")
    else:
        # Add one empty row per missing day, then fill the gaps.
        missing_dates = ranges_2021819.difference(station_vals['Date'])
        station_vals.set_index('Date', inplace=True)
        df_s = pd.concat([station_vals, pd.DataFrame(index=missing_dates)]).sort_index()[data_cols_names]
        # limit_direction='both' also fills gaps at the very start of the
        # window; the default (forward only) would leave leading NaNs that
        # would end up in the saved arrays.
        df_s = df_s.interpolate(method='linear', axis=0, limit_direction='both')
        print("Interpolated ", s)
        stations_l.append(s)

    pm25.append(df_s[data_cols["PM25"]].values)
    oz.append(df_s[data_cols["Ozone"]].values)
    pm10.append(df_s[data_cols["PM10"]].values)

Interpolated  60832004
Interpolated  60832011
Interpolated  60090001
Interpolated  60379033
Interpolated  60831008
Interpolated  60710306
Interpolated  60190500
Interpolated  61113001
Interpolated  60798002
Interpolated  60590007
Interpolated  60190011
Interpolated  60830011
Interpolated  61112002
Interpolated  60690002
Interpolated  60658005
Interpolated  60530008
Interpolated  60290011


In [15]:
# One row per selected station, in the SAME order as stations_l.
# The signal arrays (pm25 / oz / pm10) are appended in stations_l order, so
# the coordinates derived from `stations` must follow that order too.
# Keeping the order of first appearance in df (as drop_duplicates alone does)
# would pair coordinates and graph nodes with the wrong signals.
N = len(stations_l)
stations = (
    df.loc[df['Site ID'].isin(stations_l)]
    .drop_duplicates('Site ID')
    .set_index('Site ID')
    .loc[stations_l]      # reorder rows to match stations_l
    .reset_index()
)
assert N == len(stations)
assert stations['Site ID'].tolist() == list(stations_l)
N

17

In [16]:
# Earth radius in km
R_EARTH = 6371

# Station longitude / latitude converted from degrees to radians
lon_rad = stations["SITE_LONGITUDE"].to_numpy()*np.pi/180
lat_rad = stations["SITE_LATITUDE"].to_numpy()*np.pi/180

# Coords: column 0 = longitude (rad), column 1 = latitude (rad)
Coords = np.empty((N, 2))
Coords[:, 0] = lon_rad
Coords[:, 1] = lat_rad

# Coords_km: planar coordinates in km. The longitude term is scaled by the
# cosine of each station's own latitude; the latitude term is R_EARTH * lat.
Coords_km = np.empty((N, 2))
Coords_km[:, 0] = R_EARTH*Coords[:, 0]*np.cos(Coords[:, 1])
Coords_km[:, 1] = R_EARTH*Coords[:, 1]

In [17]:
# Pairwise distance matrix in km: Euclidean norm between the rows of
# Coords_km. Each pair is computed once and written to both triangles, so D
# is symmetric with a zero diagonal.
D = np.zeros((N, N))
for row in range(N - 1):
    for col in range(row + 1, N):
        gap = Coords_km[row, :] - Coords_km[col, :]
        D[row, col] = D[col, row] = np.linalg.norm(gap)

In [18]:
# Weighted k-nearest-neighbour adjacency matrix built from the distances D.
#
# This cell sets the diagonal of D to inf (to exclude self-neighbours) and D
# keeps that diagonal afterwards. The kernel is therefore computed from a
# copy whose diagonal is reset to 0, so re-running the cell gives the same
# result instead of dividing by sum(D) == inf.
D_finite = D.copy()
np.fill_diagonal(D_finite, 0.0)

# Exponential kernel: sum(D)/N**2 is the mean entry of D, so this is
# exp(-D / mean(D)).
P = np.exp(-D_finite/np.sum(D_finite)*N**2)
# Column sums of the kernel, used to normalise the edge weights.
P_n = np.sum(P, axis=0)

# Exclude each node from its own neighbour list.
np.fill_diagonal(D, np.inf)

# idx[i] holds the indices of the knn closest nodes to node i.
idx = D.argsort()[:, :knn]
A = np.zeros(D.shape)
for i in range(N):
    # Weight of edge i -> j is the kernel value divided by node j's kernel sum.
    A[i, idx[i, :]] = P[i, idx[i, :]]/P_n[idx[i, :]]
    if not dir_graph:
        # Undirected graph: mirror the weights. A later row may overwrite a
        # mirrored entry written by an earlier row.
        A[idx[i, :], i] = A[i, idx[i, :]]

In [19]:
# Binary (unweighted) adjacency: 1.0 wherever A has a non-zero weight.
has_edge = A != 0
A_bin = has_edge.astype(float)

# Sparsity summary of A and mean number of edges per node.
n_zero = np.sum(A == 0)
n_nonzero = np.sum(has_edge)
mean_degree = np.mean(np.sum(A_bin, axis=0))
print('Zeros:', n_zero)
print('Non Zeros:', n_nonzero)
print('Mean degree of A:', mean_degree)

Zeros: 179
Non Zeros: 110
Mean degree of A: 6.470588235294118


In [20]:
# Append the number of nodes to the output name and save everything.
# Any '_N<digits>' suffix left by a previous run of this cell is removed
# first, so re-running the cell does not keep growing the file name.
file_name = re.sub(r'_N\d+$', '', file_name) + '_N' + str(N)

# np.savez appends '.npz' to the name when it is missing.
np.savez(file_name, A=A, pm25=np.array(pm25), oz=np.array(oz), pm10=np.array(pm10), Coords=Coords,
         Coords_km=Coords_km, A_bin=A_bin, D=D)
print('File saved as ', file_name)

File saved as  data/air_quality2021CA_knn5_N17
