In [2]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
# Load the project settings (e.g. the PATH section used in the following cells)
# from the INI-style file in the current working directory.
config = configparser.ConfigParser()
# read() returns the list of files it parsed; a file that cannot be opened is
# skipped silently, so an empty list here means nothing was loaded.
config.read("capstone.cfg")

['capstone.cfg']

In [4]:
# Make the configured project directory the working directory.
# NOTE(review): the relative jar path given to Spark in a later cell is
# presumably resolved against this directory — TODO confirm.
os.chdir(config["PATH"]["project"])

Create the Spark session. Add the PostgreSQL JDBC driver so that data can be loaded from the existing Postgres DB.


In [5]:
# Build (or reuse) the Spark session. The PostgreSQL JDBC jar is registered both
# as a session jar and on the driver classpath so the existing Postgres DB can be read.
spark = (
    SparkSession.builder
    .appName("US_weather")
    .config("spark.jars", "postgresql-42.2.18.jar")
    .config("spark.driver.extraClassPath", "postgresql-42.2.18.jar")
    .getOrCreate()
)

Connect to Postgres and read the `stations` table from the "covid" database.

In [6]:
# Read the `stations` table from Postgres over JDBC.
# Connection settings are taken from an optional [POSTGRES] section of the
# config file (keys: url, user, password) so that credentials do not have to
# live in the notebook. The fallbacks keep the original local-development
# values when that section or key is absent.
stations = (
    spark.read.format("jdbc")
    .option("url", config.get("POSTGRES", "url", fallback="jdbc:postgresql://localhost:5432/covid"))
    .option("dbtable", "stations")
    .option("user", config.get("POSTGRES", "user", fallback="sb"))
    .option("password", config.get("POSTGRES", "password", fallback="sb"))
    .load()
)

In [7]:
stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



In [8]:
stations.show(5)

+-----------+--------+---------+---------+-----+--------------------+
| station_id|latitude|longitude|elevation|state|        station_name|
+-----------+--------+---------+---------+-----+--------------------+
|US009052008| 43.7333| -96.6333|    482.0|   SD|SIOUX FALLS (ENVI...|
|US10RMHS145| 40.5268|-105.1113|   1569.1|   CO|RMHS 1.6 SSW     ...|
|US10adam001|  40.568| -98.5069|    598.0|   NE|JUNIATA 1.5 S    ...|
|US10adam002| 40.5093| -98.5493|    601.1|   NE|JUNIATA 6.0 SSW  ...|
|US10adam003| 40.4663| -98.6537|    615.1|   NE|HOLSTEIN 0.1 NW  ...|
+-----------+--------+---------+---------+-----+--------------------+
only showing top 5 rows



In [None]:
stations.count()

Load the worldwide weather records from the CSV file.

In [9]:
# Worldwide 2020 weather records. No header option is set, so the first line is
# treated as data; every column is read as a string under the explicit schema.
weather_path = os.path.join(config["PATH"]["project"], "DATA/WEATHER/2020.csv")
weather_2020 = (
    spark.read.format("csv")
    .schema(
        "station_id string, date string, measured string, "
        "v1 string, v2 string, v3 string, v4 string, v5 string"
    )
    .option("sep", ",")
    .load(weather_path)
)

In [10]:
weather_2020.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- v1: string (nullable = true)
 |-- v2: string (nullable = true)
 |-- v3: string (nullable = true)
 |-- v4: string (nullable = true)
 |-- v5: string (nullable = true)



In [11]:
%%time
# Row count of the full (worldwide) file; reused further down as the
# denominator of the share of rows that remain after the stations join.
nb_weather_records = weather_2020.count()
nb_weather_records

CPU times: user 1.63 ms, sys: 3.78 ms, total: 5.4 ms
Wall time: 9.2 s


34320857

In [None]:
weather_2020.show(10)

Join the weather records with the stations table, then filter to keep only US stations.

In [None]:
# Inner join on the shared key: weather rows whose station_id has no match in
# `stations` are dropped.
# NOTE: `weather_2020` is rebound here, so re-running this cell joins the
# already-joined frame with `stations` a second time.
weather_2020 = weather_2020.join(stations, on=["station_id"], how="inner")

In [None]:
weather_2020.printSchema()

In [None]:
# Keep only rows with a non-null station_id.
# NOTE(review): after an inner equi-join on station_id this looks like a no-op
# (null keys never match) — confirm before relying on it.
weather_2020 = weather_2020.where(F.col("station_id").isNotNull())

In [None]:
# Row count after the join with `stations` and the station_id filter.
nb_weather_record_US = weather_2020.count()
nb_weather_record_US

In [None]:
nb_weather_record_US/nb_weather_records


In [None]:
weather_2020.show(10)

In [None]:
# Measurement flag ??
weather_2020.select("v2").distinct().count()

In [None]:
v2_distinct = weather_2020.select("v2").distinct().collect()

In [None]:
# Q flag (quality of the measurement): collect its distinct values.
v3_distinct = (
    weather_2020
    .select("v3")
    .distinct()
    .collect()
)
len(v3_distinct)

In [None]:
# Number of rows per quality-flag value, smallest group first.
stat_quality = (
    weather_2020
    .groupBy("v3")
    .count()
    .orderBy("count")
    .collect()
)

In [None]:
stat_quality

In [None]:
def none_to_str(a):
    """Return ``a`` unchanged, or the empty string when ``a`` is None.

    Used as a sort-key helper so that null values can be ordered together
    with strings.
    """
    return "" if a is None else a

# Print the distinct v3 rows in sorted order; a null flag is ranked as "".
for item in sorted(
    v3_distinct,
    key=lambda row: none_to_str(row[0]),
):
    print(item)

In [None]:
# S flag (source of the measurement): collect its distinct values.
v4_distinct = (
    weather_2020
    .select("v4")
    .distinct()
    .collect()
)
len(v4_distinct)

In [None]:
# Print the distinct v4 rows in sorted order; a null flag is ranked as "".
for item in sorted(
    v4_distinct,
    key=lambda row: none_to_str(row[0]),
):
    print(item)

In [None]:
# v5: possibly the hour/time of the observation — meaning still to be confirmed.
v5_distinct = (
    weather_2020
    .select("v5")
    .distinct()
    .collect()
)
len(v5_distinct)

In [None]:
# Print the distinct v5 rows in sorted order; a null value is ranked as "".
for item in sorted(
    v5_distinct,
    key=lambda row: none_to_str(row[0]),
):
    print(item)

In [None]:
# Number of rows per measured element, ordered by element name.
l_elements = (
    weather_2020
    .groupBy("measured")
    .count()
    .orderBy("measured")
    .collect()
)
l_elements

In [None]:
sorted(l_elements, key = lambda a :a["count"])

In [None]:
# Values of the `measured` column kept for the analysis; also used as the
# explicit value list for the pivot further down.
# NOTE(review): these look like GHCN-Daily element codes (precipitation,
# max/min/observed temperature, snowfall, snow depth) — confirm against the data source.
measurements = ["PRCP","TMAX","TMIN", "TOBS", "SNOW", "SNWD"]

In [None]:
weather_2020.printSchema()

In [None]:
#weather_2020.write.partitionBy("date").format("parquet").save("weather_2020_with_stations.parquet")

In [None]:
weather_2020.select("date").distinct().count()

Remove the rows that failed quality control.

In [None]:
# Keep only rows whose quality flag (v3) is null.
# NOTE: `weather_2020` is rebound in place by this cell.
weather_2020 = weather_2020.where(F.col("v3").isNull())

In [None]:
# Restrict the rows to the elements listed in `measurements`.
select_measures = weather_2020.where(
    F.col("measured").isin(measurements)
)

In [None]:
select_measures.count()

In [None]:
select_measures.printSchema()

Check that there is only 1 measurement (per date, station, metric)

In [None]:
# (date, station, element) combinations that occur more than once.
too_much_measures = (
    select_measures
    .groupBy("date", "station_id", "measured")
    .count()
    .filter("count > 1")
)

In [None]:
too_much_measures.count()

In [None]:
# Collapse to a single value per (date, station, element).
# F.first keeps one v1 per group; Spark documents it as non-deterministic, so
# which value is kept when a group has several rows is not guaranteed.
select_measures = (
    select_measures
    .select("station_id", "date", "measured", "v1")
    .groupBy("date", "station_id", "measured")
    .agg(F.first("v1").alias("v1"))
)

In [None]:
select_measures.printSchema()

In [None]:
# Reshape to one row per (station_id, date) with one column per entry of
# `measurements`; passing the value list explicitly fixes the output columns.
pivoted_weather = (
    select_measures
    .groupBy("station_id", "date")
    .pivot("measured", measurements)
    .agg(F.first("v1").alias("v1"))
)

In [None]:
pivoted_weather.count()

In [None]:
pivoted_weather.show(10)

In [None]:
# Pull every pivoted row of one sample station to the driver for inspection.
US10keit020 = pivoted_weather.where(
    F.col("station_id") == "US10keit020"
).collect()

In [None]:
US10keit020

Check which types of measurement a station performs.