In [2]:
import configparser
import os
from datetime import date, timedelta
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sqlalchemy import create_engine

In [3]:
# Load the notebook configuration from the current working directory.
# ConfigParser.read() silently skips files it cannot open and only reports the ones it
# parsed, so a missing file is turned into an explicit error here instead of a confusing
# KeyError on the first config[...] lookup.
config_file = "capstone.cfg"
config = configparser.ConfigParser()
loaded_files = config.read(config_file)
if not loaded_files:
    raise FileNotFoundError("Configuration file not found: {!r}".format(config_file))
# Displayed as the cell output: the list of configuration files actually parsed.
loaded_files

['capstone.cfg']

In [4]:
# Make the configured project directory the working directory, so that the relative
# paths used in this notebook (JDBC driver jar, CSV output) resolve against it.
path_settings = config["PATH"]
project_path = path_settings["project"]
os.chdir(project_path)

Create the Spark session. Add the PostgreSQL JDBC driver so that data can be loaded from the existing Postgres DB.


In [5]:
# Build (or reuse) the Spark session for this notebook.
# The PostgreSQL JDBC driver jar is registered both as a Spark jar and on the driver
# class path so that tables can be loaded from the existing Postgres DB.
spark = (
    SparkSession.builder
    .appName("US_weather")
    .config("spark.jars", "postgresql-42.2.18.jar")
    .config("spark.driver.extraClassPath", "postgresql-42.2.18.jar")
    .getOrCreate()
)

Connect to postgres; read stations table from postgres DB "covid"

# Load data

## Load weather stations location data from postgres DB

In [5]:
# Read the `stations` table of the local Postgres database "covid" over JDBC.
# Credentials are taken from the PGUSER / PGPASSWORD environment variables so they do not
# have to live in the notebook; the fallbacks keep the previous local-development values.
# TODO: drop the fallbacks once the variables are set wherever this notebook runs.
stations = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/covid")
    .option("dbtable", "stations")
    .option("user", os.environ.get("PGUSER", "sb"))
    .option("password", os.environ.get("PGPASSWORD", "sb"))
    .load()
)

In [6]:
# Show the column names and types of the stations table as read over JDBC.
stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



## Load world US weather (prefiltered)

This dataset was derived from the raw 2020.csv by keeping only US stations and removing failed measurements.
See US_weather_exporation.ipynb

In [7]:
%%time
# Location of the prefiltered 2020 weather dataset inside the configured project directory.
weather_path = os.path.join(
    config["PATH"]["project"],
    "weather_2020_with_stations.parquet/",
)
# Loaded with Spark's default data source format.
weather_2020 = spark.read.load(weather_path)

CPU times: user 2.38 ms, sys: 4.47 ms, total: 6.84 ms
Wall time: 12.2 s


In [8]:
# Show the column names and types of the loaded weather data.
weather_2020.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)



In [9]:
%%time
# Total number of rows in the prefiltered weather data.
# count() is a Spark action that scans the dataset (~58 s wall time in the recorded run).
nb_weather_records = weather_2020.count()
nb_weather_records

CPU times: user 10.2 ms, sys: 4.44 ms, total: 14.7 ms
Wall time: 57.9 s


25074507

In [10]:
# Collect every distinct value of the `measured` column to the driver.
all_elements = (
    weather_2020
    .select("measured")
    .distinct()
    .collect()
)

In [None]:
# Display the collected distinct values of `measured`.
all_elements

In [13]:
# Element codes (values of the `measured` column) treated as the main measurements;
# everything else is dropped from the per-station statistics.
l_measurements = [
    "SNOW",
    "SNWD",
    "PRCP",
    "TMAX",
    "TMIN",
    "TAVG",
]

In [11]:
# For every (station, element) pair, count the number of distinct dates with a record.
ndays_station_msr = (
    weather_2020
    .groupBy("station_id", "measured")
    .agg(F.countDistinct("date").alias("nb_days"))
)

Keep only the measurements for the main elements.

In [14]:
# Restrict the per-(station, element) day counts to the elements listed in l_measurements.
# where() is an alias of filter().
ndays_station_msr_filterered = ndays_station_msr.where(
    ndays_station_msr["measured"].isin(l_measurements)
)

In [None]:
%%time
# Number of (station, element) pairs left after restricting to the main elements.
ndays_station_msr_filterered.count()

Keep only the (station, element) pairs for which the number of days with a measurement is > 350 over the year (i.e. almost one measurement per day).

In [15]:
%%time
# Keep only the (station, element) pairs measured on more than `min_days_measured`
# distinct days, i.e. almost every day of the year.
# The column is referenced by name (F.col) so that it is resolved against the frame being
# filtered rather than through the parent DataFrame `ndays_station_msr`.
min_days_measured = 350
stations_msr_filtered2 = ndays_station_msr_filterered.filter(
    F.col("nb_days") > min_days_measured
)

CPU times: user 0 ns, sys: 2.78 ms, total: 2.78 ms
Wall time: 66.1 ms


In [16]:
%%time
# Bring the aggregated result to the driver as a pandas DataFrame.
# toPandas() is a Spark action, so the groupBy/filter pipeline defined above is executed
# here (3min 49s wall time in the recorded run).
df_filtered_stations = stations_msr_filtered2.toPandas()

CPU times: user 484 ms, sys: 73.8 ms, total: 558 ms
Wall time: 3min 49s


Number of stations with at least one main element measured most of the year

In [16]:
%%time
# Count the distinct station ids that survived the filtering.
distinct_station_ids = df_filtered_stations["station_id"].unique()
len(distinct_station_ids)

CPU times: user 9.88 ms, sys: 295 µs, total: 10.2 ms
Wall time: 13.1 ms


13318

In [24]:
%%time
# Load the `stations` table of the local Postgres database "covid" straight into pandas.
# User and password come from the PGUSER / PGPASSWORD environment variables (falling back
# to the previous local-development values) and are URL-quoted so that special characters
# such as "@" or ":" cannot corrupt the connection URL.
# TODO: drop the fallbacks once the variables are set wherever this notebook runs.
pg_user = quote_plus(os.environ.get("PGUSER", "sb"))
pg_password = quote_plus(os.environ.get("PGPASSWORD", "sb"))
engine = create_engine(
    "postgresql://{}:{}@localhost:5432/covid".format(pg_user, pg_password)
)
df_stations = pd.read_sql_table("stations", engine)

CPU times: user 297 ms, sys: 38.7 ms, total: 335 ms
Wall time: 604 ms


In [26]:
# NOTE: df_stations is now read directly with pandas (pd.read_sql_table) rather than
# converted from the Spark DataFrame; the previous approach is kept for reference:
#df_stations = stations.toPandas()

In [31]:
# Preview the first rows of the (station_id, measured, nb_days) result.
df_filtered_stations.head()

Unnamed: 0,station_id,measured,nb_days
0,US1MDHW0007,PRCP,366
1,US1MNSE0002,PRCP,366
2,USW00014820,TAVG,366
3,USC00132638,SNOW,355
4,USR0000CROC,TMIN,366


In [32]:
# Column dtypes, non-null counts and memory footprint of the result.
df_filtered_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32738 entries, 0 to 32737
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   station_id  32738 non-null  object
 1   measured    32738 non-null  object
 2   nb_days     32738 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 767.4+ KB


In [34]:
# Persist the result as CSV, relative to the current working directory.
# index=True is the pandas default, spelled out here: the row index is written as an
# unnamed first column.
# NOTE(review): consider index=False if no downstream reader relies on that column — confirm first.
df_filtered_stations.to_csv("significant_stations.csv", index=True)