In [None]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [None]:
# Load the project settings from the local config file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot find and returns the list
# of files it actually parsed. Check that list so a missing file fails here
# with a clear message instead of as a KeyError in a later cell.
config_files_read = config.read("capstone.cfg")
if not config_files_read:
    raise FileNotFoundError(
        "capstone.cfg not found in current directory: " + os.getcwd()
    )
config_files_read

In [None]:
# Make the project directory (option "project" of the [PATH] config section) the
# working directory, so relative paths used in later cells resolve against it.
# NOTE: "capstone.cfg" is read with a relative path in the previous cell, so
# re-running that cell after this one looks for the file in the project directory.
os.chdir(config["PATH"]["project"])

Create the Spark session. Add the PostgreSQL JDBC driver so that data can be loaded from the existing Postgres DB.


In [None]:
# Build (or reuse) the Spark session. The PostgreSQL JDBC driver jar is registered
# both in spark.jars and on the driver class path so that tables can be loaded
# from the existing Postgres DB.
postgres_jdbc_jar = "postgresql-42.2.18.jar"
spark = (
    SparkSession.builder
    .appName("US_weather")
    .config("spark.jars", postgres_jdbc_jar)
    .config("spark.driver.extraClassPath", postgres_jdbc_jar)
    .getOrCreate()
)

Connect to postgres; read stations table from postgres DB "covid"

# Load data

## Load weather stations location data from postgres DB

In [None]:
# Read the "stations" table from the local Postgres database "covid" over JDBC.
# Credentials can be supplied through the COVID_DB_USER / COVID_DB_PASSWORD
# environment variables so that they do not have to live in the notebook.
# The previously hard-coded values are kept only as a backward-compatible fallback.
# TODO: remove the fallback values once the environment variables are set everywhere.
stations = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/covid")
    .option("dbtable", "stations")
    .option("user", os.environ.get("COVID_DB_USER", "sb"))
    .option("password", os.environ.get("COVID_DB_PASSWORD", "sb"))
    .load()
)

In [None]:
# Print the schema (column names and types) of the stations DataFrame.
stations.printSchema()

## Load world US weather (prefiltered)

Built from the raw 2020.csv, filtered to keep only US stations and with failed measurements removed.
See US_weather_exporation.ipynb

In [None]:
%%time
weather_path = os.path.join(config["PATH"]["project"], "weather_2020_with_stations.parquet/" )
weather_2020 = spark.read.load(weather_path)

In [None]:
# Print the schema (column names and types) of the weather DataFrame.
weather_2020.printSchema()

In [None]:
%%time
# Total number of rows in the weather dataset; the cell is timed because count()
# has to run a Spark job over the data.
nb_weather_records = weather_2020.count()
nb_weather_records

In [None]:
# Bring the distinct values of the "measured" column back to the driver
# (a list of Row objects).
all_elements = (
    weather_2020
    .select("measured")
    .distinct()
    .collect()
)

In [None]:
# Display the distinct "measured" values collected in the previous cell.
all_elements

In [None]:
# The "main" elements kept for the rest of the analysis (values of the
# "measured" column).
l_measurements = [
    "SNOW",
    "SNWD",
    "PRCP",
    "TMAX",
    "TMIN",
    "TAVG",
]

In [None]:
# For every (station, element) pair, count on how many distinct dates it appears.
ndays_station_msr = (
    weather_2020
    .groupBy("station_id", "measured")
    .agg(F.countDistinct("date").alias("nb_days"))
)

Keep only the measurements for the main elements.

In [None]:
# Restrict the per-(station, element) day counts to the elements listed in l_measurements.
ndays_station_msr_filterered = ndays_station_msr.where(
    F.col("measured").isin(l_measurements)
)

In [None]:
# The original cell (`stations_msr_filtered2.selectdis`) was an unfinished line: it
# used a DataFrame that is only created in a later cell and an attribute that does
# not exist, so it failed on a fresh "Restart & Run All".
# Presumably the intent was to look at the distinct elements left after filtering.
ndays_station_msr_filterered.select("measured").distinct().show()

In [None]:
%%time
# Number of (station, element) pairs left after keeping only the main elements.
ndays_station_msr_filterered.count()

Keep only the (station, element) pairs where the number of measurement days is > 350 over the year (i.e. almost one measurement per day).

In [None]:
%%time
stations_msr_filtered2 = ndays_station_msr_filterered.filter( ndays_station_msr["nb_days"] > 350)

In [None]:
%%time
# Bring the filtered aggregate to the driver as a pandas DataFrame; the cells
# below work on it with pandas instead of Spark.
df_filtered_stations = stations_msr_filtered2.toPandas()

Nb of stations with at least 1 main element measured most of the year

In [None]:
%%time
len(df_filtered_stations["station_id"].unique())

Nb of stations per measurement

**From Pyspark**

```
[Row(measured='TMIN', count(DISTINCT station_id)=5668),
 Row(measured='TMAX', count(DISTINCT station_id)=5713),
 Row(measured='SNOW', count(DISTINCT station_id)=3505),
 Row(measured='SNWD', count(DISTINCT station_id)=3931),
 Row(measured='TAVG', count(DISTINCT station_id)=2222),
 Row(measured='PRCP', count(DISTINCT station_id)=11699)]
stations_msr_filtered2
```

In [None]:
%%time 
df_filtered_stations[["station_id", "measured"]].groupby("measured").nunique()

Histogram of the number of stations by number of elements measured

**From Pyspark**

```
[Row(n_elem_measured=6, count=181),
 Row(n_elem_measured=5, count=2563),
 Row(n_elem_measured=1, count=6265),
 Row(n_elem_measured=3, count=3250),
 Row(n_elem_measured=2, count=707),
 Row(n_elem_measured=4, count=352)]
```

In [None]:
# Preview the first rows of the pandas frame.
df_filtered_stations.head()

In [None]:
%%time
df_filtered_stations["exist"] = 1
df_pivot_measure = df_filtered_stations.pivot(index = "station_id", columns= "measured", values = "exist" )


In [None]:
# Preview the pivoted table (stations as rows, elements as columns).
df_pivot_measure.head()

In [None]:
# Number of rows in the pivot table, i.e. one per station_id.
len(df_pivot_measure)

In [None]:
# Constant column of ones: it is summed per group further down to count stations.
df_pivot_measure["flag"] = 1
df_pivot_measure.head(20)

In [None]:
# Elements a station does not have are missing after the pivot: replace them by 0,
# then cast the element columns to integers so each one is a 0/1 presence flag.
# A single vectorised astype() replaces the former per-value Python lambda loop.
df_pivot_measure_2 = (
    df_pivot_measure
    .fillna(0)
    .astype({measure: "int64" for measure in l_measurements})
)
df_pivot_measure_2

**PB WITH GROUP BY !!!!**

In [None]:
# Sum the remaining numeric columns for each SNOW / SNWD / PRCP combination
# (the sum of "flag" is the number of stations in the group).
# station_id is deliberately left in the index: after reset_index() it became a
# regular (non-numeric, presumably string) column that sum() also tried to aggregate.
df_pivot_measure_2.groupby(["SNOW", "SNWD", "PRCP"]).sum()

In [None]:
def indicator_fun(l_cols):
    """Combine a sequence of values into one number, weighting the i-th value by 2**i.

    With 0/1 inputs the result is a bit mask in which bit i is set when
    ``l_cols[i]`` is 1. The items only need to support ``*`` and ``+`` with
    integers. An empty sequence gives 0.
    """
    return sum((1 << position) * value for position, value in enumerate(l_cols))
    
# Encode which elements a station has as a bit mask, one bit per element:
# PRCP=1, SNOW=2, SNWD=4, TAVG=8, TMAX=16, TMIN=32 (bit i <-> indicator_columns[i]).
# Built with indicator_fun, defined at the top of this cell, instead of repeating
# the powers of two by hand.
indicator_columns = ["PRCP", "SNOW", "SNWD", "TAVG", "TMAX", "TMIN"]
df_pivot_measure_2["indicator"] = indicator_fun(
    [df_pivot_measure_2[column] for column in indicator_columns]
)
df_pivot_measure_2["indicator"]

In [None]:
# Number of stations for each distinct indicator value (element combination).
df_pivot_measure_2["indicator"].value_counts()

In [None]:
# One row per indicator value: the max of each element column within the group,
# and the sum of the constant "flag" column (the number of stations in the group),
# sorted with the largest groups first.
# Aggregations are named by string because passing the builtins max / sum to agg()
# is deprecated in recent pandas versions.
(
    df_pivot_measure_2
    .groupby("indicator")
    .agg({
        "SNOW": "max",
        "SNWD": "max",
        "PRCP": "max",
        "TMAX": "max",
        "TMIN": "max",
        "TAVG": "max",
        "flag": "sum",
    })
    .sort_values("flag", ascending=False)
)