In [56]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [57]:
# Load the notebook settings. ConfigParser.read() silently skips files it cannot
# open and returns the list of files actually parsed, so check that list here
# instead of failing later with a confusing KeyError on the first lookup.
config = configparser.ConfigParser()
loaded_files = config.read("capstone.cfg")
if not loaded_files:
    raise FileNotFoundError("capstone.cfg not found in " + os.getcwd())
loaded_files

['capstone.cfg']

In [58]:
# Work from the project directory configured under [PATH] project.
# NOTE(review): the relative "postgresql-42.2.18.jar" paths given to Spark below
# presumably rely on this working directory — confirm before removing.
os.chdir(config["PATH"]["project"])

Create the Spark session. Add the PostgreSQL JDBC driver so that data can be loaded from the existing Postgres DB.


In [59]:
# Build (or reuse) the Spark session. The PostgreSQL JDBC driver jar is registered
# both as a job jar and on the driver class path so the existing Postgres DB can be read.
postgres_driver_jar = "postgresql-42.2.18.jar"

session_builder = SparkSession.builder.appName("US_weather")
session_builder = session_builder.config("spark.jars", postgres_driver_jar)
session_builder = session_builder.config("spark.driver.extraClassPath", postgres_driver_jar)

spark = session_builder.getOrCreate()

Connect to postgres; read stations table from postgres DB "covid"

# Load data

## Load weather stations location data from postgres DB

In [62]:
# Read the `stations` table of the local "covid" Postgres database over JDBC.
# Credentials come from an optional [POSTGRES] section of the config file so they
# do not have to live in the notebook; the previous literals remain the fallback
# so existing setups keep working. raw=True keeps a '%' in the password from
# being treated as configparser interpolation.
pg_user = config.get("POSTGRES", "user", fallback="sb")
pg_password = config.get("POSTGRES", "password", raw=True, fallback="sb")

stations = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/covid")
    .option("dbtable", "stations")
    .option("user", pg_user)
    .option("password", pg_password)
    .load()
)

In [61]:
# Print the column names and types of the `stations` DataFrame.
stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



## Load world US weather (prefiltered)

Built from the raw 2020.csv, filtered to keep only US stations and to remove failed measurements.
See US_weather_exporation.ipynb

In [63]:
%%time
weather_path = os.path.join(config["PATH"]["project"], "weather_2020_with_stations.parquet/" )
weather_2020 = spark.read.load(weather_path)

In [64]:
# Print the column names and types of the `weather_2020` DataFrame.
weather_2020.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)



In [65]:
%%time
# Total number of weather records; count() is a Spark action, so the data is scanned here.
nb_weather_records = weather_2020.count()
nb_weather_records

CPU times: user 6.26 ms, sys: 3.53 ms, total: 9.79 ms
Wall time: 43.4 s


25074507

In [72]:
# Collect the distinct values of the `measured` column to the driver as a list of Rows.
all_elements = weather_2020.select("measured").distinct().collect()

In [73]:
# Display every distinct `measured` code found in the data.
all_elements

[Row(measured='WESD'),
 Row(measured='PGTM'),
 Row(measured='AWDR'),
 Row(measured='WT07'),
 Row(measured='SX33'),
 Row(measured='EVAP'),
 Row(measured='SN53'),
 Row(measured='WT10'),
 Row(measured='SN35'),
 Row(measured='TMIN'),
 Row(measured='MDPR'),
 Row(measured='WT09'),
 Row(measured='SX51'),
 Row(measured='WT05'),
 Row(measured='SN36'),
 Row(measured='SN32'),
 Row(measured='SN31'),
 Row(measured='SX52'),
 Row(measured='PSUN'),
 Row(measured='SX56'),
 Row(measured='WT06'),
 Row(measured='SN56'),
 Row(measured='WT04'),
 Row(measured='WT01'),
 Row(measured='MXPN'),
 Row(measured='WSFG'),
 Row(measured='WT11'),
 Row(measured='DAPR'),
 Row(measured='TOBS'),
 Row(measured='SX35'),
 Row(measured='SX31'),
 Row(measured='SX32'),
 Row(measured='SX55'),
 Row(measured='TMAX'),
 Row(measured='WDMV'),
 Row(measured='SNOW'),
 Row(measured='WDFG'),
 Row(measured='TSUN'),
 Row(measured='WSFI'),
 Row(measured='WESF'),
 Row(measured='WDF2'),
 Row(measured='WDF5'),
 Row(measured='WT02'),
 Row(measur

In [74]:
# Subset of `measured` codes (the "main elements") kept for the rest of the analysis.
l_measurements= ["SNOW", "SNWD", "PRCP", "TMAX", "TMIN", "TAVG"]

In [76]:
# For every (station, element) pair, count the number of distinct dates with a record.
distinct_days = F.countDistinct("date").alias("nb_days")
records_by_station_element = weather_2020.groupBy("station_id", "measured")
ndays_station_msr = records_by_station_element.agg(distinct_days)

Keep only the measurements for the main elements.

In [78]:
# Restrict the per-(station, element) day counts to the main elements only.
is_main_element = ndays_station_msr["measured"].isin(l_measurements)
ndays_station_msr_filterered = ndays_station_msr.filter(is_main_element)

In [None]:
# Removed an unfinished scratch expression (`stations_msr_filtered2.selectdis`):
# it used a DataFrame that is only created further down and a truncated
# attribute name, so it could only raise when the notebook is run top to bottom.

In [79]:
%%time
# Number of (station, main element) pairs; this action runs the groupBy/countDistinct plan.
ndays_station_msr_filterered.count()

CPU times: user 39.1 ms, sys: 1.85 ms, total: 40.9 ms
Wall time: 3min 11s


81991

Keep only the (station, element) pairs with more than 350 days of measurements over the year (i.e. almost one measurement per day).

In [80]:
%%time
stations_msr_filtered2 = ndays_station_msr_filterered.filter( ndays_station_msr["nb_days"] > 350)

CPU times: user 2.59 ms, sys: 399 µs, total: 2.99 ms
Wall time: 132 ms


In [93]:
%%time
# Bring the filtered pairs to the driver as a pandas DataFrame.
# NOTE(review): the earlier count() on the same lineage also took ~3 min, so the Spark
# plan appears to be recomputed here — consider caching the filtered DataFrame.
df_filtered_stations = stations_msr_filtered2.toPandas()

CPU times: user 709 ms, sys: 172 ms, total: 881 ms
Wall time: 3min 55s


Nb of stations with at least 1 main element measured most of the year

In [96]:
%%time
len(df_filtered_stations["station_id"].unique())

CPU times: user 7.11 ms, sys: 3.3 ms, total: 10.4 ms
Wall time: 10.6 ms


13318

Nb of stations per measurement

**From Pyspark**

`
Row(measured='TMIN', count(DISTINCT station_id)=5668),
 Row(measured='TMAX', count(DISTINCT station_id)=5713),
 Row(measured='SNOW', count(DISTINCT station_id)=3505),
 Row(measured='SNWD', count(DISTINCT station_id)=3931),
 Row(measured='TAVG', count(DISTINCT station_id)=2222),
 Row(measured='PRCP', count(DISTINCT station_id)=11699)]
stations_msr_filtered2
`

In [104]:
%%time 
df_filtered_stations[["station_id", "measured"]].groupby("measured").nunique()

CPU times: user 31.4 ms, sys: 0 ns, total: 31.4 ms
Wall time: 29.9 ms


Unnamed: 0_level_0,station_id
measured,Unnamed: 1_level_1
PRCP,11699
SNOW,3505
SNWD,3931
TAVG,2222
TMAX,5713
TMIN,5668


Histogram of the number of stations by number of elements measured

**From Pyspark**

` [Row(n_elem_measured=6, count=181),
 Row(n_elem_measured=5, count=2563),
 Row(n_elem_measured=1, count=6265),
 Row(n_elem_measured=3, count=3250),
 Row(n_elem_measured=2, count=707),
 Row(n_elem_measured=4, count=352)]
`

In [106]:
# Preview the first rows of the filtered (station, element, nb_days) table.
df_filtered_stations.head()

Unnamed: 0,station_id,measured,nb_days
0,US1MDHW0007,PRCP,366
1,US1MNSE0002,PRCP,366
2,USW00014820,TAVG,366
3,USC00132638,SNOW,355
4,USR0000CROC,TMIN,366


In [114]:
%%time
df_filtered_stations["exist"] = 1
df_pivot_measure = df_filtered_stations.pivot(index = "station_id", columns= "measured", values = "exist" )


CPU times: user 72.9 ms, sys: 713 µs, total: 73.6 ms
Wall time: 96.4 ms


In [119]:
# Preview the station x element pivot.
df_pivot_measure.head()

measured,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
US10adam004,1.0,,,,,
US10adam008,1.0,,,,,
US10adam010,1.0,,,,,
US10adam032,1.0,,,,,
US10adam051,1.0,,,,,


In [120]:
# Row count of the pivot (one row per station).
len(df_pivot_measure)

13318

In [128]:
# Constant column of ones; summing it in a groupby gives the number of stations per group.
# NOTE(review): this mutates df_pivot_measure in place, so re-displaying earlier cells will differ.
df_pivot_measure["flag"] = 1
df_pivot_measure.head(20)

measured,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,flag
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
US10adam004,1.0,,,,,,1
US10adam008,1.0,,,,,,1
US10adam010,1.0,,,,,,1
US10adam032,1.0,,,,,,1
US10adam051,1.0,,,,,,1
US10adam054,1.0,1.0,,,,,1
US10adam056,1.0,1.0,1.0,,,,1
US10boon005,1.0,,,,,,1
US10boon010,1.0,,,,,,1
US10buff036,1.0,,,,,,1


In [141]:
# Turn the pivot into a 0/1 availability matrix: missing (station, element) pairs
# become 0, and the element columns are cast from float to integer in a single
# vectorized astype instead of a per-value Python apply.
df_pivot_measure_2 = df_pivot_measure.fillna(0).astype(
    {measure: "int64" for measure in l_measurements}
)
df_pivot_measure_2

measured,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,flag
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
US10adam004,1,0,0,0,0,0,1
US10adam008,1,0,0,0,0,0,1
US10adam010,1,0,0,0,0,0,1
US10adam032,1,0,0,0,0,0,1
US10adam051,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...
USW00096404,1,0,0,0,1,1,1
USW00096405,0,0,0,0,1,1,1
USW00096406,0,0,0,0,1,1,1
USW00096408,1,0,0,0,1,1,1


**Problem with GROUP BY!**

In [150]:
# Sum the remaining 0/1 columns (and `flag`, i.e. the station count) for every
# SNOW/SNWD/PRCP availability combination.
# station_id is left in the index (no reset_index): as a regular column its strings
# would be passed to sum(), which pandas >= 2.0 no longer drops silently.
df_pivot_measure_2.groupby(["SNOW", "SNWD", "PRCP"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,measured,TAVG,TMAX,TMIN,flag
SNOW,SNWD,PRCP,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1159,1358,1349,1383
0,0,1,80,1337,1335,7552
0,1,0,9,16,16,33
0,1,1,776,782,769,845
1,0,0,0,8,8,24
1,0,1,17,105,107,428
1,1,0,0,82,80,179
1,1,1,181,2025,2004,2874


In [161]:
def indicator_fun(l_cols) :
    """Pack a sequence of 0/1 values (or columns) into a single bit-mask value.

    The item at position i is weighted by 2**i and the weighted items are added,
    starting from 0. An empty sequence therefore gives 0.
    """
    return sum((1 << bit) * flag for bit, flag in enumerate(l_cols))
    
# Encode each station's element availability as one integer, using the helper
# defined above instead of a hand-written copy of its weights.
# Bit order (least significant first): PRCP, SNOW, SNWD, TAVG, TMAX, TMIN.
indicator_columns = ["PRCP", "SNOW", "SNWD", "TAVG", "TMAX", "TMIN"]
df_pivot_measure_2["indicator"] = indicator_fun(
    [df_pivot_measure_2[column] for column in indicator_columns]
)
df_pivot_measure_2["indicator"]

station_id
US10adam004     1
US10adam008     1
US10adam010     1
US10adam032     1
US10adam051     1
               ..
USW00096404    49
USW00096405    48
USW00096406    48
USW00096408    49
USW00096409    49
Name: indicator, Length: 13318, dtype: int64

In [167]:
# Number of stations for each availability bit-mask, most frequent first.
df_pivot_measure_2["indicator"].value_counts()

1     6200
55    1813
49    1243
56    1133
7      839
61     733
3      320
48     212
63     181
6       96
51      87
54      79
57      78
5       43
23      31
53      24
29      23
8       21
59      17
2       16
4       16
17      15
33      14
45      12
16      11
39      10
13       8
50       8
60       8
52       7
22       3
40       3
35       3
24       2
21       2
9        1
19       1
38       1
25       1
28       1
32       1
36       1
Name: indicator, dtype: int64

In [177]:
# Decode each indicator value back into its 0/1 element flags and count the stations
# sharing that pattern. All rows of a group carry the same flags (the indicator encodes
# them), so "max" simply returns that value; "flag" is a column of ones, so its sum is
# the station count.
# Aggregations are given by name: passing the builtins max/sum to agg is deprecated
# in recent pandas versions.
(
    df_pivot_measure_2
    .groupby("indicator")
    .agg(
        {
            "SNOW": "max",
            "SNWD": "max",
            "PRCP": "max",
            "TMAX": "max",
            "TMIN": "max",
            "TAVG": "max",
            "flag": "sum",
        }
    )
    .sort_values("flag", ascending=False)
)

Unnamed: 0_level_0,SNOW,SNWD,PRCP,TMAX,TMIN,TAVG,flag
indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,0,1,0,0,0,6200
55,1,1,1,1,1,0,1813
49,0,0,1,1,1,0,1243
56,0,0,0,1,1,1,1133
7,1,1,1,0,0,0,839
61,0,1,1,1,1,1,733
3,1,0,1,0,0,0,320
48,0,0,0,1,1,0,212
63,1,1,1,1,1,1,181
6,1,1,0,0,0,0,96
