In [None]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [None]:
config = configparser.ConfigParser()
# ConfigParser.read() silently ignores files it cannot open and returns the list of
# files it actually parsed, so fail here with a clear message instead of a KeyError
# on config["PATH"] in a later cell.
config_files_read = config.read("capstone.cfg")
if not config_files_read:
    raise FileNotFoundError(
        "capstone.cfg was not found in the current working directory: " + os.getcwd()
    )
config_files_read

In [None]:
# Make the project directory (taken from capstone.cfg) the working directory;
# presumably so relative paths used later (e.g. the JDBC driver jar) resolve there — TODO confirm.
os.chdir(config["PATH"]["project"])

Create the Spark session. Add the PostgreSQL JDBC driver so that data can be loaded from the existing Postgres DB.


In [None]:
# Register the PostgreSQL JDBC driver jar so Spark can load data from the existing Postgres DB.
spark = (
    SparkSession.builder
    .appName("US_weather")
    .config("spark.jars", "postgresql-42.2.18.jar")
    .config("spark.driver.extraClassPath", "postgresql-42.2.18.jar")
    .getOrCreate()
)

Connect to postgres; read stations table from postgres DB "covid"

In [None]:
# Connection settings are read from an optional [POSTGRES] section of capstone.cfg so
# that credentials do not have to live in the notebook. The fallbacks reproduce the
# previous hard-coded local values, so an unchanged config file behaves as before.
pg_url = config.get("POSTGRES", "url", fallback="jdbc:postgresql://localhost:5432/covid")
pg_user = config.get("POSTGRES", "user", fallback="sb")
pg_password = config.get("POSTGRES", "password", fallback="sb")

# Load the whole "stations" table through the JDBC data source.
stations = (
    spark.read.format("jdbc")
    .option("url", pg_url)
    .option("dbtable", "stations")
    .option("user", pg_user)
    .option("password", pg_password)
    .load()
)

In [None]:
# Print the column names and types of the stations DataFrame.
stations.printSchema()

In [None]:
# Preview the first 5 station rows.
stations.show(5)

In [None]:
# Number of rows in the stations table.
stations.count()

Load the worldwide weather data from the CSV file

In [None]:
# Every column is declared as a string; no type conversion happens at load time.
weather_schema = (
    "station_id string, date string, measured string, "
    "v1 string, v2 string, v3 string, v4 string, v5 string"
)
weather_path = os.path.join(config["PATH"]["project"], "DATA/WEATHER/2020.csv")
weather_2020 = spark.read.load(
    weather_path,
    format="csv",
    sep=",",
    schema=weather_schema,
)

In [None]:
# Check that the explicit all-string schema was applied.
weather_2020.printSchema()

In [None]:
%%time
# Total number of weather records before the join with stations;
# kept in a variable because it is the denominator of the ratio computed later.
nb_weather_records = weather_2020.count()
nb_weather_records

In [None]:
%%time
# Preview the first 10 raw weather records.
weather_2020.show(10)

Join weather with stations, then filter to keep only US stations

In [None]:
# Join on station_id with Spark's default join type (inner): weather rows whose
# station_id has no match in `stations` are dropped.
# NOTE(review): this rebinds weather_2020, so re-running the cell joins a second time.
weather_2020 = weather_2020.join( stations, ["station_id"])

In [None]:
# Schema after the join: the weather columns plus the columns coming from stations.
weather_2020.printSchema()

In [None]:
%%time
# Keep only rows with a non-null station_id. The filter is lazy, so %%time here only
# measures building the plan, not running it.
# NOTE(review): after the inner join on station_id the key should already be non-null,
# so this filter probably removes nothing — confirm.
weather_2020 = weather_2020.filter( weather_2020.station_id.isNotNull() )

In [None]:
%%time
# Number of records left after the join and filter; count() triggers the actual Spark job.
nb_weather_record_US = weather_2020.count()
nb_weather_record_US

In [None]:
# Fraction of all 2020 weather records that remain after the join with stations.
nb_weather_record_US/nb_weather_records


In [None]:
%%time
# Preview the first 10 joined records.
weather_2020.show(10)

## check possible values for measurement columns

### v2 : measurement flag

In [None]:
# Measurement flag ??
%%time
weather_2020.select("v2").distinct().count()

In [None]:
%%time
# Bring the distinct v2 values back to the driver as a list of Rows.
v2_distinct = weather_2020.select("v2").distinct().collect()

### v3 : Q flag (quality of measurement)

In [None]:
# Q flag (quality of measurement): collect the distinct v3 values and show how many there are.
v3_distinct = weather_2020.select("v3").distinct().collect()
len(v3_distinct)

In [None]:
# Number of records per Q flag value, ordered by ascending count.
stat_quality = weather_2020.groupBy("v3").count().orderBy("count").collect()

In [None]:
# Display the per-flag record counts.
stat_quality

In [None]:
def none_to_str(a):
    """Return ``a`` unchanged, or an empty string when ``a`` is None.

    Used as a sort-key helper so that null values can be compared with strings.
    """
    return "" if a is None else a

# Print the distinct Q flag rows in sorted order; the key maps a null first field
# to "" so it can be compared with the string values.
for item in sorted(v3_distinct, key = lambda a : none_to_str(a[0])):
    print(item)

### v4 : S flag (source of measurement)

In [None]:
# S flag (source of measurement): collect the distinct v4 values and show how many there are.
v4_distinct = weather_2020.select("v4").distinct().collect()
len(v4_distinct)

In [None]:
# Print the distinct S flag rows in sorted order, treating a null value as "".
for item in sorted(v4_distinct, key = lambda a : none_to_str(a[0])):
    print(item)

### v5 : hour ???

In [None]:
# v5: collect the distinct values and show how many there are.
# NOTE(review): presumably an hour / observation time — meaning not confirmed.
v5_distinct = weather_2020.select("v5").distinct().collect()
len(v5_distinct)

In [None]:
# Print the distinct v5 rows in sorted order, treating a null value as "".
for item in sorted(v5_distinct, key = lambda a : none_to_str(a[0])):
    print(item)

In [None]:
# Number of records per measured element, ordered by element name.
l_elements= weather_2020.groupBy("measured").count().orderBy("measured").collect()
l_elements

In [None]:
# The same per-element counts, re-sorted by ascending record count.
sorted(l_elements, key = lambda a :a["count"])

In [None]:
# Elements kept for the rest of the analysis: used later to filter the records
# and as the explicit list of pivot columns.
measurements = ["PRCP","TMAX","TMIN", "TOBS", "SNOW", "SNWD"]

In [None]:
# Re-display the schema of the joined weather data.
weather_2020.printSchema()

In [None]:
# Disabled: would persist the joined data as parquet, partitioned by date.
# NOTE(review): DataFrameWriter exposes partitionBy(), not partitionedBy() — fix before re-enabling.
#weather_2020.write.partitionedBy( "date").format("parquet").save("weather_2020_with_stations.parquet")

In [None]:
%%time
# Number of distinct dates present in the data.
weather_2020.select("date").distinct().count()

Remove the rows that failed quality control (rows whose Q flag `v3` is not null)

In [None]:
# Keep only the rows whose Q flag (v3) is null; rows carrying any flag value are dropped.
weather_2020 = weather_2020.filter( weather_2020["v3"].isNull())

Check that there is only 1 measurement per (date, station, metric)

In [None]:
# (date, station, element) groups that hold more than one record.
too_much_measures = (
    weather_2020
    .groupBy("date", "station_id", "measured")
    .count()
    .filter("count > 1")
)

In [None]:
%%time 
# Number of (date, station, element) groups with duplicates; 0 means one record per group.
too_much_measures.count()

In [None]:
# The 10 dates with the most records.
(
    weather_2020
    .groupBy("date")
    .count()
    .orderBy(F.col("count").desc())
    .show(10)
)

# Specialized vs generic stations

In [None]:
# Number of records per (station, element). The column is named nb_days, which is
# the number of days only if there is a single record per day.
groupby_station_element = (
    weather_2020
    .groupBy("station_id", "measured")
    .agg(F.count("date").alias("nb_days"))
)

In [None]:
# Schema of the per-(station, element) record counts.
groupby_station_element.printSchema()

In [None]:
# For each station, the smallest nb_days over the elements it reports, as a pandas
# DataFrame; the aggregated column is named "min(nb_days)".
df_stations_minnbmeasurements = (
    groupby_station_element
    .groupBy("station_id")
    .min("nb_days")
    .toPandas()
)

In [None]:
# Preview the per-station minimum counts.
df_stations_minnbmeasurements.head()

In [None]:
# Distribution across stations of the minimum per-element record count.
df_stations_minnbmeasurements.hist()

In [None]:
# For each station, the largest nb_days over the elements it reports, as a pandas DataFrame.
df_stations_maxnbmeasurements = (
    groupby_station_element
    .groupBy("station_id")
    .max("nb_days")
    .toPandas()
)

In [None]:
# Distribution across stations of the maximum per-element record count.
df_stations_maxnbmeasurements.hist()

**Conclusion: most of the stations are specialized in a few elements. For these elements, they output a measure almost every day. For the other elements, they almost never output a measure.**

In [None]:
# Number of SNOW records per date, ordered by date, as a pandas DataFrame.
nb_snowfall_perdate = (
    weather_2020
    .filter(weather_2020["measured"] == "SNOW")
    .groupBy("date")
    .count()
    .orderBy("date")
    .toPandas()
)

In [None]:
# Important: pass .values (a plain array) when an explicit index is given. Passing the
# column itself would make pandas align it on the new index instead of using it positionally.
snowfall_serie = pd.Series(
    data=nb_snowfall_perdate["count"].values,
    index=pd.to_datetime(nb_snowfall_perdate["date"]),
)

In [None]:
# Plot the number of SNOW records per date over time.
snowfall_serie.plot()

In [None]:
# Stations whose least-reported element still has more than 360 records;
# DataFrame.count() returns the number of non-null values per column.
df_stations_minnbmeasurements[ df_stations_minnbmeasurements["min(nb_days)"] > 360].count()

In [None]:
# Keep only the records whose element is one of the selected `measurements`.
select_measures = weather_2020.filter( weather_2020["measured"].isin(measurements ))

In [None]:
# Number of records left for the selected elements.
select_measures.count()

In [None]:
# Schema of the filtered records (same columns as the joined weather data).
select_measures.printSchema()

Check that there is only 1 measurement per (date, station, metric) among the selected elements

In [None]:
# (date, station, element) groups with more than one record among the selected elements.
too_much_measures = (
    select_measures
    .groupBy("date", "station_id", "measured")
    .count()
    .filter("count > 1")
)

In [None]:
# Number of duplicated (date, station, element) groups; 0 means one record per group.
too_much_measures.count()

In [None]:
# Reduce to one row per (date, station_id, measured), keeping only the value column v1.
# If a group had several rows, first() keeps whichever value Spark encounters first.
select_measures = (
    select_measures
    .select("station_id", "date", "measured", "v1")
    .groupBy("date", "station_id", "measured")
    .agg(F.first("v1").alias("v1"))
)

In [None]:
# Schema after the reduction: date, station_id, measured, v1.
select_measures.printSchema()

In [None]:
# One row per (station_id, date) with one column per element of `measurements`.
# The pivot values are passed explicitly, which fixes the set of output columns.
pivoted_weather = (
    select_measures
    .groupBy("station_id", "date")
    .pivot("measured", measurements)
    .agg(F.first("v1").alias("v1"))
)

In [None]:
# Number of (station_id, date) rows after pivoting.
pivoted_weather.count()

In [None]:
# Preview the first 10 pivoted rows.
pivoted_weather.show(10)

In [None]:
# Collect all pivoted rows of one example station to inspect them on the driver.
US10keit020 = pivoted_weather.filter(pivoted_weather.station_id == "US10keit020").collect()

In [None]:
# Display the collected rows for the example station.
US10keit020

check what type of measurement a station performs