In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType
from collections import OrderedDict
import pandas as pd


In [2]:
# Load the project settings from capstone.cfg (resolved against the current
# working directory).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list here so a missing config fails
# immediately instead of surfacing later as a KeyError on config["PATH"].
loaded_config_files = config.read("capstone.cfg")
if not loaded_config_files:
    raise FileNotFoundError("capstone.cfg not found in " + os.getcwd())
loaded_config_files

['capstone.cfg']

In [3]:
# Read the project root from the [PATH] section of the config and make it the
# working directory of the kernel.
project_path = config["PATH"]["project"]
os.chdir(project_path)

Create the Spark session. Add the PostgreSQL JDBC driver so that data can be loaded from an existing Postgres database.


In [4]:
# Build (or reuse) the Spark session for this notebook. The PostgreSQL JDBC
# driver jar is registered both as a Spark jar and on the driver classpath so
# that an existing Postgres database can be read from.
spark = (
    SparkSession.builder
    .appName("preprocess_weather")
    .config("spark.jars", "postgresql-42.2.18.jar")
    .config("spark.driver.extraClassPath", "postgresql-42.2.18.jar")
    .getOrCreate()
)

In [5]:
# Read the 2020 weather CSV with an explicit schema; every column is loaded
# as a string.
path = os.path.join(project_path, "DATA", "WEATHER", "2020.csv.gz")
weather = spark.read.csv(
    path,
    schema=(
        "station_id string, date string, measured string, value string, "
        "measurement_flag string, quality_flag string, source_flag string, "
        "hour string"
    ),
)

In [6]:
# Print the column names and types of the loaded weather DataFrame.
weather.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- value: string (nullable = true)
 |-- measurement_flag: string (nullable = true)
 |-- quality_flag: string (nullable = true)
 |-- source_flag: string (nullable = true)
 |-- hour: string (nullable = true)



In [7]:
# Preview the first 10 rows of the loaded data.
weather.show(10)

+-----------+--------+--------+-----+----------------+------------+-----------+----+
| station_id|    date|measured|value|measurement_flag|quality_flag|source_flag|hour|
+-----------+--------+--------+-----+----------------+------------+-----------+----+
|AE000041196|20200101|    TMIN|  168|            null|        null|          S|null|
|AE000041196|20200101|    PRCP|    0|               D|        null|          S|null|
|AE000041196|20200101|    TAVG|  211|               H|        null|          S|null|
|AEM00041194|20200101|    PRCP|    0|            null|        null|          S|null|
|AEM00041194|20200101|    TAVG|  217|               H|        null|          S|null|
|AEM00041217|20200101|    TAVG|  205|               H|        null|          S|null|
|AEM00041218|20200101|    TMIN|  148|            null|        null|          S|null|
|AEM00041218|20200101|    TAVG|  199|               H|        null|          S|null|
|AFM00040938|20200101|    PRCP|   23|            null|        nul

In [8]:
# Drop every row that carries a quality flag: a NULL quality_flag means no
# quality check failed for that observation.
weather = weather.where(col("quality_flag").isNull())

In [9]:
# Number of rows remaining after the quality_flag filter.
weather.count()

34532211

In [10]:
# Restrict the data to stations located in the USA or one of its
# unincorporated territories, identified by the prefix of station_id:
#   US : United States
#   AQ : American Samoa
#   CQ : Northern Mariana Islands
#   GQ : Guam
#   RQ : Puerto Rico
#   VQ : Virgin Islands
l_states = ["US", "CQ", "GQ", "AQ", "RQ", "VQ"]
from functools import reduce
# OR together one "station_id starts with <prefix>" test per prefix, starting
# from a literal False so the fold is well defined even for an empty list.
state_in_us = reduce(
    lambda matched_so_far, matches_prefix: matched_so_far | matches_prefix,
    (col("station_id").startswith(prefix) for prefix in l_states),
    lit(False),
)
weather = weather.where(state_in_us)

In [11]:
# Number of rows remaining after restricting to US and territory stations.
weather.count()

25336215

In [12]:
# Restrict the data to the measurement types of interest
# (precipitation, temperature, snow, wind).
l_measurements = ["TMIN", "PRCP", "SNOW", "AWND"]
weather = weather.where(col("measured").isin(l_measurements))

In [13]:
# Number of rows remaining after keeping only the selected measurement types.
weather.count()


14716418

In [14]:
# For each (station, measurement type) pair, count how many values were
# recorded over the year.
weather_per_stations_per_measurement = (
    weather
    .groupBy("station_id", "measured")
    .count()
)

In [15]:
# Number of (station_id, measured) groups produced by the aggregation.
weather_per_stations_per_measurement.count()

58620

In [16]:
# Preview 10 rows of the per-station, per-measurement observation counts.
weather_per_stations_per_measurement.show(10)

+-----------+--------+-----+
| station_id|measured|count|
+-----------+--------+-----+
|US1ALBW0001|    SNOW|   24|
|US1ALMB0001|    PRCP|  365|
|US1AZPM0138|    SNOW|  302|
|US1AZPM0156|    PRCP|  366|
|US1CASD0060|    PRCP|  267|
|US1CASD0186|    PRCP|  204|
|US1CASR0034|    PRCP|  314|
|US1COBO0477|    PRCP|  338|
|US1CODG0229|    SNOW|   99|
|US1CODN0016|    PRCP|  364|
+-----------+--------+-----+
only showing top 10 rows



In [17]:
# Keep only the (station, measurement) pairs with more than min_nb_days
# recorded values. The comparison is strict, so a pair with exactly
# min_nb_days values is dropped.
min_nb_days = 350
weather_per_stations_per_measurement = weather_per_stations_per_measurement.where(
    col("count") > min_nb_days
)

In [18]:
# Number of (station_id, measured) pairs left after the min_nb_days filter.
weather_per_stations_per_measurement.count()

23556

In [19]:
# Distinct stations that keep at least one measurement type after the
# min_nb_days filter, followed by how many such stations there are.
significant_stations = (
    weather_per_stations_per_measurement
    .select(col("station_id"))
    .distinct()
)
significant_stations.count()

14017

In [20]:
# Print the column names and types of the filtered per-station aggregate.
weather_per_stations_per_measurement.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- count: long (nullable = false)



In [21]:
# Output: the retained station ids per measurement type, written as parquet
# and partitioned by the "measured" column.
out_path = os.path.join(project_path, "OUT_DATA", "filtered_stations")
(
    weather_per_stations_per_measurement
    .select("measured", "station_id")
    .write
    # The default save mode raises if out_path already exists, which makes a
    # second run of the notebook fail; overwrite so the cell can be re-run.
    .mode("overwrite")
    .partitionBy("measured")
    .format("parquet")
    .save(out_path)
)