In [1]:
import os
import json
import google.cloud.bigquery as bq
import pandas as pd
from keplergl import KeplerGl

In [2]:
# Create the BigQuery client used to run the station query below.
# No project or credentials are passed explicitly, so the client's own defaults apply.
bq_client = bq.Client()

In [3]:
# The SQL files live in a `sql` directory one level above the current working directory.
sql_dir_path = os.path.join(os.getcwd(), '..', 'sql')
query_path = os.path.join(sql_dir_path, 'stations_n_atlantic.sql')

# Read the full query text so it can be submitted as a single string.
with open(query_path) as sql_file:
    query = sql_file.read()

In [4]:
# Submit the query, fetch its result rows, and load them into a DataFrame.
query_job = bq_client.query(query)
stations = query_job.result().to_dataframe()

In [5]:
# Preview the first rows of the query result.
stations.head()

Unnamed: 0,id,country,state,name,latitude,longitude,element,firstyear,lastyear
0,ACW00011647,AC,,ST JOHNS,17.1333,-61.7833,TMAX,1961.0,1961.0
1,ACW00011647,AC,,ST JOHNS,17.1333,-61.7833,TMIN,1961.0,1961.0
2,BB000078954,BB,,GRANTLEY ADAMS,13.067,-59.483,TMAX,1944.0,2018.0
3,BB000078954,BB,,GRANTLEY ADAMS,13.067,-59.483,TMIN,1944.0,2018.0
4,BDM00078016,BD,,L F WADE INTL AP KINDLEY FLD,32.3667,-64.6833,TMAX,1949.0,2020.0


In [6]:
# Count the distinct station ids and country codes in the query result.
# NOTE(review): `n_coountries` is misspelled; the name is kept so any later reference still resolves.
n_stations = stations['id'].nunique(dropna=False)
n_coountries = stations['country'].nunique(dropna=False)

print(f'There are {n_coountries} countries and {n_stations} stations within the polygon.')

There are 30 countries and 1932 stations within the polygon.


In [7]:
# For every country/element pair, find the earliest first year and the latest
# last year reported by any of its stations.
year_aggs = {'firstyear': 'min', 'lastyear': 'max'}
p_cntr_element = stations.pivot_table(
    index='country',
    columns='element',
    values=list(year_aggs),
    aggfunc=year_aggs,
)

In [8]:
# Show the per-country coverage table (first/last year for each element).
p_cntr_element

Unnamed: 0_level_0,firstyear,firstyear,firstyear,firstyear,firstyear,lastyear,lastyear,lastyear,lastyear,lastyear
element,AWND,EVAP,MXPN,TMAX,TMIN,AWND,EVAP,MXPN,TMAX,TMIN
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AC,,,,1961.0,1961.0,,,,1961.0,1961.0
BB,,,,1944.0,1944.0,,,,2018.0,2018.0
BD,1986.0,,,1945.0,1945.0,1995.0,,,2020.0,2020.0
BF,,,,1952.0,1952.0,,,,1963.0,1963.0
BR,,,,1974.0,1973.0,,,,1974.0,1975.0
CA,,,,1870.0,1870.0,,,,2020.0,2020.0
CO,,,,1961.0,1961.0,,,,2020.0,2020.0
CU,1986.0,,,1945.0,1945.0,2003.0,,,2020.0,2020.0
CV,,,,1973.0,1973.0,,,,2020.0,2020.0
DO,,,,1971.0,1971.0,,,,1973.0,1973.0


It looks like the most reliable data are `TMIN` and `TMAX`. In addition, the stations in Puerto Rico (RQ) and the United States Virgin Islands (VQ) seem to contain a full set of data for all required elements. We may try to incorporate those in our analysis anyway.

In [9]:
# A country's TMIN record is kept only if it starts no later than `max_first_year`
# and extends to at least `min_last_year`.
min_last_year = 2015
max_first_year = 1960

# Flag countries whose TMIN coverage ends too early or starts too late.
tmin_ends_early = p_cntr_element[('lastyear', 'TMIN')].lt(min_last_year)
tmin_starts_late = p_cntr_element[('firstyear', 'TMIN')].gt(max_first_year)
cntr_to_exclude = p_cntr_element.loc[tmin_ends_early | tmin_starts_late].index.tolist()

excluded_codes = ", ".join(cntr_to_exclude)
print(f'We will exclude stations from the following countries: {excluded_codes} because of incomplete data.')

We will exclude stations from the following countries: AC, BF, BR, CO, CV, DO, JM, NN, NS, SB, ST, UC because of incomplete data.


In [10]:
# Keep only the stations whose country was not flagged for exclusion.
in_excluded_country = stations['country'].isin(cntr_to_exclude)
sel_stations = stations.loc[~in_excluded_country].copy()

# Report how many distinct stations the country filter removed.
n_discarded = n_stations - sel_stations['id'].nunique(dropna=False)
print(f'{n_discarded} stations have been discarded due to insufficient data.')

41 stations have been discarded due to insufficient data.


In [11]:
# Reduce to one row per distinct (id, latitude, longitude) combination;
# the per-element rows would otherwise repeat each station.
station_coords = ['id', 'latitude', 'longitude']
unq_sel_stations = sel_stations.loc[:, station_coords].drop_duplicates().copy()

In [12]:
# GeoJSON-style polygon with a single ring of [longitude, latitude] vertices.
# The ring is closed by repeating the first vertex as the last one.
polygon = dict(
    type="Polygon",
    coordinates=[[
        [-1.306379, 49.316014],
        [-10.342842, 35.002107],
        [-19.931662, 13.986127],
        [-9.352620, 0.0],
        [-49.101915, 0],
        [-71.903541, 13.516013],
        [-77.111534, 7.367272],
        [-84.601856, 10.676294],
        [-100.744864, 23.415771],
        [-97.158643, 29.735383],
        [-76.957330, 34.883146],
        [-70.455789, 43.963237],
        [-21.150974, 49.144944],
        [-1.306379, 49.316014],
    ]],
)

In [13]:
# Load the saved map configuration that is handed to the KeplerGl widget.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('weather_station_map_conf.json', 'r', encoding='utf-8') as f:
    widget_conf = json.load(f)

In [14]:
# Build the map widget with the selected stations and the search polygon as datasets.
# The map configuration must be passed as `config`; KeplerGl does not recognise
# other spellings such as `conf`, and an unrecognised keyword leaves the widget
# with its default (empty) configuration.
widget = KeplerGl(height=480,
                  data={'stations': unq_sel_stations,
                        'poly': polygon},
                  config=widget_conf)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [15]:
# Render the interactive map as the cell output.
widget

KeplerGl(data={'stations':                id  latitude  longitude
2     BB000078954   13.0670   -59.4830
4    …