In [30]:
import pandas as pd
import numpy as np

import tarfile

import time
import re

In [31]:
# 54 nodes
#keep_states = "WA US|OR US"#|ID US|MT US"#|CA US|NV US|WY US|UT US|AZ US"

#keep_states = "CO US" # Colorado - 13 nodes with 2018 data

#keep_states = "CO US|UT US" # 17 nodes

keep_states = "CA US" # 31 nodes

# 28 nodes
#keep_states = "WA US"#|OR US|ID US|MT US"#|CA US|NV US|WY US|UT US|AZ US"
ks_re = re.compile(keep_states)

min_temps = 365
n_signals = 365
knn = 5
dir_graph = False
file_name = 'temperatures2017-18'
# file_name = '../temperatures2003_3months'
file_name += '_knn' + str(knn)
if dir_graph:
    file_name += '_dir'

In [32]:
dfs = []
temp = []
windsp = []
prec = []
pres = []
attrs = ["TEMP", "WDSP", "PRCP", "STP"]

years = [2017, 2018, 2019]
stations = {}

for y in years:
    tar = tarfile.open(str(y) + ".tar.gz")
    stations[y] = []
    for member in tar.getmembers():
        f = tar.extractfile(member)
        df = pd.read_csv(f)
        f.close()
        df.dropna(inplace=True)

        if len(df) == 0:
            continue
        
        station_name = df.NAME.iloc[0]

        if ks_re.search(station_name) and len(df) >= n_signals and not (np.any(df[attrs].values == 999.9) or np.any(df[attrs].values == 99.99)):
            stations[y].append(station_name)
            dfs.append(df)
            temp.append(df['TEMP'].values)
            windsp.append(df['WDSP'].values)
            prec.append(df['PRCP'].values)
            pres.append(df['STP'].values)

assert len(dfs) > 0, "No data with specified criteria"
df = pd.concat(dfs)
temps = np.array(temp)
windsps = np.array(windsp)
precs = np.array(prec)
press = np.array(pres)

In [33]:
len(set(stations[2017]).intersection(stations[2018])), len(set(stations[2017]).intersection(stations[2019])), len(set(stations[2018]).intersection(stations[2019])), len(set(stations[2017]).intersection(stations[2018]).intersection(stations[2019]))

(19, 13, 12, 8)

In [34]:
stations_1718 = list(set(stations[2017]).intersection(stations[2018]))

In [35]:
df1718 = df[df['NAME'].isin(stations_1718)]
df1718['DATE'] = pd.to_datetime(df1718['DATE'])
df1718 = df1718[(df1718['DATE'].dt.year == 2017) | (df1718['DATE'].dt.year == 2018)]

In [36]:
temp = []
windsp = []
prec = []
pres = []

for st in stations_1718:
    df_st = df1718[df1718['NAME'] == st].copy()
    df_st.sort_values('DATE', inplace=True)
    temp.append(df_st['TEMP'].values)
    windsp.append(df_st['WDSP'].values)
    prec.append(df_st['PRCP'].values)
    pres.append(df_st['STP'].values)

temps = np.array(temp)
windsps = np.array(windsp)
precs = np.array(prec)
press = np.array(pres)

In [37]:
temps.shape, windsps.shape, precs.shape, press.shape

((19, 730), (19, 730), (19, 730), (19, 730))

In [38]:
# # Constants
# Limit temperature in Fº
MAX_TEMP = 140
MIN_TEMP = -30
assert np.all(temps <= MAX_TEMP) and np.all(temps >= MIN_TEMP)

In [45]:
stations = df1718.drop_duplicates('NAME').sort_values('NAME')
N = len(stations)
N

19

In [52]:
stations.sort_values('NAME')['NAME'], sorted(stations_1718)

(0             CONCORD BUCHANAN FIELD, CA US
 0        FULLERTON MUNICIPAL AIRPORT, CA US
 0        HAWTHORNE MUNICIPAL AIRPORT, CA US
 0               HAYWARD AIR TERMINAL, CA US
 0                          LANCASTER, CA US
 0               OAKLAND METROPOLITAN, CA US
 0                   PALMDALE AIRPORT, CA US
 0                     RAMONA AIRPORT, CA US
 0        RIVERSIDE MUNICIPAL AIRPORT, CA US
 0            SACRAMENTO AIRPORT ASOS, CA US
 0    SACRAMENTO METROPOLITAN AIRPORT, CA US
 0    SAN DIEGO INTERNATIONAL AIRPORT, CA US
 0     SAN JOSE INTERNATIONAL AIRPORT, CA US
 0    SAN LUIS OBISPO MCCHESNEY FIELD, CA US
 0                           SANDBERG, CA US
 0    SANTA BARBARA MUNICIPAL AIRPORT, CA US
 0         SANTA MARIA PUBLIC AIRPORT, CA US
 0     SANTA MONICA MUNICIPAL AIRPORT, CA US
 0           SOUTH LAKE TAHOE AIRPORT, CA US
 Name: NAME, dtype: object,
 ['CONCORD BUCHANAN FIELD, CA US',
  'FULLERTON MUNICIPAL AIRPORT, CA US',
  'HAWTHORNE MUNICIPAL AIRPORT, CA US',
  

In [46]:
# Read stations coordinates and convert to radians
Coords = np.zeros((N, 2))
Coords[:, 0] = stations.LONGITUDE.to_numpy()*np.pi/180
Coords[:, 1] = stations.LATITUDE.to_numpy()*np.pi/180

# Earth radius in km
R_EARTH = 6371
# Coordinates in km
Coords_km = np.zeros((N, 2))
Coords_km[:, 0] = R_EARTH*Coords[:, 0]*np.cos(Coords[:, 1])
Coords_km[:, 1] = R_EARTH*Coords[:, 1]

In [47]:
# For geodesic distance in km
D = np.zeros((N, N))
for i in range(N):
    for j in range(i+1, N):
        D[i, j] = np.linalg.norm(Coords_km[i, :] - Coords_km[j, :])
D = D + D.T

In [48]:
P = np.exp(-D/np.sum(D)*N**2)
P_n = np.sum(P, axis=0)
np.fill_diagonal(D, np.inf)

idx = D.argsort()[:, :knn]
A = np.zeros(D.shape)
for i in range(N):
    A[i, idx[i, :]] = P[i, idx[i, :]]/P_n[idx[i, :]]
    if not dir_graph:
        A[idx[i, :], i] = A[i, idx[i, :]]

In [49]:
A_bin = np.zeros(A.shape)
A_bin[A != 0] = 1
print('Zeros:', np.sum(A == 0))
print('Non Zeros:', np.sum(A != 0))
print('Mean degree of A:', np.mean(np.sum(A_bin, axis=0)))

Zeros: 243
Non Zeros: 118
Mean degree of A: 6.2105263157894735


In [50]:
file_name += '_N' + str(N)
np.savez(file_name, A=A, temps=temps, precs=precs, windsps=windsps, press=press, Coords=Coords,
         Coords_km=Coords_km, A_bin=A_bin, D=D)
print('File saved as ', file_name)

File saved as  temperatures2017-19_knn5_N53_N19
