In [56]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [57]:
# Load the project settings from the local configuration file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail fast here instead of hitting a KeyError in a
# later cell when a section is looked up.
loaded_cfg_files = config.read("capstone.cfg")
if not loaded_cfg_files:
    raise FileNotFoundError(f"capstone.cfg not found (looked in {os.getcwd()})")
loaded_cfg_files

['capstone.cfg']

In [58]:
# Switch the working directory to the project directory declared in the config.
# NOTE(review): presumably needed so that relative paths used afterwards
# (e.g. the JDBC driver jar) resolve against the project directory - confirm.
project_dir = config["PATH"]["project"]
os.chdir(project_dir)

Create the Spark session, adding the PostgreSQL JDBC driver so that data can be loaded from an existing PostgreSQL database.


In [59]:
# Register the PostgreSQL JDBC driver jar with Spark (both as a session jar and on
# the driver class path) so that tables can be read from an existing PostgreSQL DB.
postgres_driver_jar = "postgresql-42.2.18.jar"
spark = (
    SparkSession.builder
    .appName("US_weather")
    .config("spark.jars", postgres_driver_jar)
    .config("spark.driver.extraClassPath", postgres_driver_jar)
    .getOrCreate()
)

Connect to postgres; read stations table from postgres DB "covid"

# Load data

## Load weather stations location data from postgres DB

In [62]:
# Read the `stations` table over JDBC.
# Connection settings are taken from an optional [POSTGRES] section of the config
# file (options: url, user, password) so that credentials do not have to live in
# the notebook. The fallbacks reproduce the previously hard-coded values, so the
# cell behaves as before when that section or option is absent.
stations = (
    spark.read.format("jdbc")
    .option(
        "url",
        config.get("POSTGRES", "url", fallback="jdbc:postgresql://localhost:5432/covid"),
    )
    .option("dbtable", "stations")
    .option("user", config.get("POSTGRES", "user", fallback="sb"))
    .option("password", config.get("POSTGRES", "password", fallback="sb"))
    .load()
)

In [61]:
# Print the column names, types and nullability of the stations DataFrame.
stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



## Load US weather data (prefiltered from the worldwide data set)

Derived from the raw 2020.csv by keeping only US stations and removing failed measurements.
See US_weather_exporation.ipynb

In [63]:
%%time
weather_path = os.path.join(config["PATH"]["project"], "weather_2020_with_stations.parquet/" )
weather_2020 = spark.read.load(weather_path)

In [64]:
# Print the column names, types and nullability of the weather DataFrame.
weather_2020.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)



In [65]:
%%time
# Total number of rows in the weather data set (count() is an action, so this
# is where Spark actually scans the data).
nb_weather_records = weather_2020.count()
# Last expression of the cell: displays the count.
nb_weather_records

CPU times: user 6.26 ms, sys: 3.53 ms, total: 9.79 ms
Wall time: 43.4 s


25074507

In [72]:
# Bring every distinct value of the "measured" column back to the driver
# as a list of Row objects.
distinct_measured = weather_2020.select("measured").distinct()
all_elements = distinct_measured.collect()

In [73]:
# Display the collected list of distinct "measured" values.
all_elements

[Row(measured='WESD'),
 Row(measured='PGTM'),
 Row(measured='AWDR'),
 Row(measured='WT07'),
 Row(measured='SX33'),
 Row(measured='EVAP'),
 Row(measured='SN53'),
 Row(measured='WT10'),
 Row(measured='SN35'),
 Row(measured='TMIN'),
 Row(measured='MDPR'),
 Row(measured='WT09'),
 Row(measured='SX51'),
 Row(measured='WT05'),
 Row(measured='SN36'),
 Row(measured='SN32'),
 Row(measured='SN31'),
 Row(measured='SX52'),
 Row(measured='PSUN'),
 Row(measured='SX56'),
 Row(measured='WT06'),
 Row(measured='SN56'),
 Row(measured='WT04'),
 Row(measured='WT01'),
 Row(measured='MXPN'),
 Row(measured='WSFG'),
 Row(measured='WT11'),
 Row(measured='DAPR'),
 Row(measured='TOBS'),
 Row(measured='SX35'),
 Row(measured='SX31'),
 Row(measured='SX32'),
 Row(measured='SX55'),
 Row(measured='TMAX'),
 Row(measured='WDMV'),
 Row(measured='SNOW'),
 Row(measured='WDFG'),
 Row(measured='TSUN'),
 Row(measured='WSFI'),
 Row(measured='WESF'),
 Row(measured='WDF2'),
 Row(measured='WDF5'),
 Row(measured='WT02'),
 Row(measur

In [74]:
# Values of the "measured" column that the rest of the analysis is restricted to
# (the "main elements"; TODO confirm the meaning of each code against the
# data source documentation).
l_measurements = [
    "SNOW",
    "SNWD",
    "PRCP",
    "TMAX",
    "TMIN",
    "TAVG",
]

In [76]:
# For every (station, measured element) pair, count the distinct dates on which
# a record exists; the result column is named "nb_days".
ndays_station_msr = (
    weather_2020
    .groupBy("station_id", "measured")
    .agg(F.countDistinct("date").alias("nb_days"))
)

Keep only the measurements for the main elements.

In [78]:
# Restrict the per-(station, element) day counts to the elements listed in
# l_measurements.
is_main_element = F.col("measured").isin(l_measurements)
ndays_station_msr_filterered = ndays_station_msr.filter(is_main_element)

In [None]:
# (Removed an unfinished scratch expression: it referenced stations_msr_filtered2
# before that name is assigned and accessed an attribute that does not exist,
# so it could only fail on a fresh Restart & Run All.)

In [79]:
%%time
# Number of (station, element) pairs left after restricting to the main elements.
ndays_station_msr_filterered.count()

CPU times: user 39.1 ms, sys: 1.85 ms, total: 40.9 ms
Wall time: 3min 11s


81991

Keep only the (station, element) pairs where the number of days with a measurement is > 350 over the year (i.e. almost one measurement per day).

In [80]:
%%time
stations_msr_filtered2 = ndays_station_msr_filterered.filter( ndays_station_msr["nb_days"] > 350)

CPU times: user 2.59 ms, sys: 399 µs, total: 2.99 ms
Wall time: 132 ms


In [81]:
%%time
# Number of (station, element) pairs with more than 350 days of measurements.
stations_msr_filtered2.count()

CPU times: user 45.1 ms, sys: 21.5 ms, total: 66.6 ms
Wall time: 4min 50s


32738

Nb of stations with at least 1 main element measured most of the year

In [83]:
%%time
stations_msr_filtered2.select("station_id").distinct().count()

CPU times: user 41.8 ms, sys: 24.3 ms, total: 66.1 ms
Wall time: 4min 48s


13318

Nb of stations per measurement

In [86]:
%%time 
stations_msr_filtered2.groupBy("measured").agg( F.countDistinct("station_id") ).collect()

CPU times: user 39.6 ms, sys: 22.5 ms, total: 62.2 ms
Wall time: 3min 28s


[Row(measured='TMIN', count(DISTINCT station_id)=5668),
 Row(measured='TMAX', count(DISTINCT station_id)=5713),
 Row(measured='SNOW', count(DISTINCT station_id)=3505),
 Row(measured='SNWD', count(DISTINCT station_id)=3931),
 Row(measured='TAVG', count(DISTINCT station_id)=2222),
 Row(measured='PRCP', count(DISTINCT station_id)=11699)]

In [89]:
%%time
stations_msr_filtered2.groupBy("station_id").agg( F.count("measured").alias("n_elem_measured"))\
                        .groupBy("n_elem_measured").count().collect()

CPU times: user 51.8 ms, sys: 78.9 ms, total: 131 ms
Wall time: 4min 24s


[Row(n_elem_measured=6, count=181),
 Row(n_elem_measured=5, count=2563),
 Row(n_elem_measured=1, count=6265),
 Row(n_elem_measured=3, count=3250),
 Row(n_elem_measured=2, count=707),
 Row(n_elem_measured=4, count=352)]

In [92]:
%%time
stations_msr_filtered2.filter( stations_msr_filtered2["measured"].isin( ["SNOW", "SNWD"]))\
    .groupBy("station_id")\
    .agg(F.count("measured"))\
    .count().collect()

AttributeError: 'int' object has no attribute 'collect'

In [None]:
# Number of stations with at least one element measured on more than 350
# distinct days (the result of count() is an int, not a DataFrame).
# `df_nb_days` is not defined anywhere in this notebook, so the cell failed on a
# fresh kernel; the per-(station, element) day counts with the "nb_days" column
# are held in `ndays_station_msr`.
# NOTE(review): confirm that the unfiltered counts are intended here rather than
# the main-element subset (ndays_station_msr_filterered).
universal_stations = (
    ndays_station_msr
    .filter(F.col("nb_days") > 350)
    .groupBy("station_id")
    .min("nb_days")
    .count()
)