In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [2]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

['capstone.cfg']

In [3]:
project_path = config["PATH"]["project"]
os.chdir(project_path)

Create spark session. Add driver postgress to enable to load from existing postgres DB


In [4]:
# add driver postgress to enable to load from existing postgres DB
spark = SparkSession \
    .builder \
    .appName("US_weather") \
    .config("spark.jars", "postgresql-42.2.18.jar")\
    .config( "spark.driver.extraClassPath", "postgresql-42.2.18.jar")\
    .getOrCreate()

Connect to postgres; read stations table from postgres DB "covid"

# Load data

## Load world US weather (prefiltered)

from raw 2020.csv, filter to keep only US stations, and remove failed measurements.
See US_weather_exporation.ipynb

In [5]:
%%time
weather_path = os.path.join(config["PATH"]["project"], "weather_2020_with_stations.parquet/" )
weather_2020 = spark.read.load(weather_path)

CPU times: user 4.88 ms, sys: 650 µs, total: 5.53 ms
Wall time: 13.7 s


In [None]:
weather_2020.printSchema()

In [9]:
%%time
nb_weather_records = weather_2020.count()
nb_weather_records

CPU times: user 6.92 ms, sys: 8.65 ms, total: 15.6 ms
Wall time: 1min 9s


25074507

## Keep only rows for selected measurements (rain, temperature ...)

In [13]:
#all_elements = weather_2020.select("measured").distinct().collect()
#all_elements

[Row(measured='WESD'),
 Row(measured='PGTM'),
 Row(measured='AWDR'),
 Row(measured='WT07'),
 Row(measured='SX33'),
 Row(measured='EVAP'),
 Row(measured='SN53'),
 Row(measured='WT10'),
 Row(measured='SN35'),
 Row(measured='TMIN'),
 Row(measured='MDPR'),
 Row(measured='WT09'),
 Row(measured='SX51'),
 Row(measured='WT05'),
 Row(measured='SN36'),
 Row(measured='SN32'),
 Row(measured='SN31'),
 Row(measured='SX52'),
 Row(measured='PSUN'),
 Row(measured='SX56'),
 Row(measured='WT06'),
 Row(measured='SN56'),
 Row(measured='WT04'),
 Row(measured='WT01'),
 Row(measured='MXPN'),
 Row(measured='WSFG'),
 Row(measured='WT11'),
 Row(measured='DAPR'),
 Row(measured='TOBS'),
 Row(measured='SX35'),
 Row(measured='SX31'),
 Row(measured='SX32'),
 Row(measured='SX55'),
 Row(measured='TMAX'),
 Row(measured='WDMV'),
 Row(measured='SNOW'),
 Row(measured='WDFG'),
 Row(measured='TSUN'),
 Row(measured='WSFI'),
 Row(measured='WESF'),
 Row(measured='WDF2'),
 Row(measured='WDF5'),
 Row(measured='WT02'),
 Row(measur

In [12]:
l_measurements= ["SNOW", "SNWD", "PRCP", "TMAX", "TMIN", "TAVG"]

In [7]:
ndays_station_msr = weather_2020.groupBy("station_id", "measured").agg( F.countDistinct("date").alias("nb_days") )

keep only measurements for main elements

In [8]:
ndays_station_msr_filterered = ndays_station_msr.filter( ndays_station_msr["measured"].isin(l_measurements))

In [None]:
%%time 
ndays_station_msr_filterered.write.mode("overwrite").parquet("OUT_DATA/stations_selected-measures_ndays")

In [17]:
%%time
ndays_station_msr_filterered.count()

CPU times: user 39 ms, sys: 10.7 ms, total: 49.7 ms
Wall time: 3min 37s


82204

Keep only (station, element) were nb of measures > 350 over the year (i.e. almost 1 measure per day)

In [9]:
%%time
stations_msr_filtered2 = ndays_station_msr_filterered.filter( ndays_station_msr_filterered["nb_days"] > 350)

CPU times: user 2.65 ms, sys: 402 µs, total: 3.05 ms
Wall time: 66 ms


In [10]:
%%time
df_filtered_stations = stations_msr_filtered2.toPandas()

CPU times: user 524 ms, sys: 56.8 ms, total: 580 ms
Wall time: 4min 52s


In [11]:
df_filtered_stations.to_csv("significant_stations.csv")

Nb of stations with at least 1 main element measured most of the year

In [21]:
%%time
len(df_filtered_stations["station_id"].unique())
#13361

CPU times: user 11.1 ms, sys: 11.1 ms, total: 22.2 ms
Wall time: 39.4 ms


13361

In [26]:
# stations are now directly read from pandas
#df_stations = stations.toPandas()

In [31]:
df_filtered_stations.head()

Unnamed: 0,station_id,measured,nb_days
0,US1MDHW0007,PRCP,366
1,US1MNSE0002,PRCP,366
2,USW00014820,TAVG,366
3,USC00132638,SNOW,355
4,USR0000CROC,TMIN,366


In [32]:
df_filtered_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32738 entries, 0 to 32737
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   station_id  32738 non-null  object
 1   measured    32738 non-null  object
 2   nb_days     32738 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 767.4+ KB
