In [1]:
# filter weather stations and weather data according to selected elements and station_ids

In [2]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType
from collections import OrderedDict
import pandas as pd


In [3]:
# Load the project settings from capstone.cfg, looked up in the current
# working directory. ConfigParser.read() returns the list of files it managed
# to parse, which is what this cell displays.
config = configparser.ConfigParser()
config.read(["capstone.cfg"])

['capstone.cfg']

In [4]:
# Resolve the project root to an absolute path BEFORE changing directory.
# If PATH.project were relative, changing directory first and then joining the
# same relative value in later cells would resolve it a second time, against
# the new working directory, and point at the wrong location.
project_path = os.path.abspath(config["PATH"]["project"])
os.chdir(project_path)

In [5]:
def create_spark_session():
    """Return the SparkSession for the "covid_DB" application.

    The session is obtained with ``getOrCreate``, so an already active
    session is reused instead of a new one being started.
    """
    builder = SparkSession.builder.appName("covid_DB")
    return builder.getOrCreate()

In [6]:
# Obtain the Spark session (created, or reused if one is already active) that
# the cells below use for all reads.
spark = create_spark_session()

In [7]:
# Weather observations for 2020 (gzip-compressed CSV, read without a header).
# Every column is declared as a string; no type conversion happens here.
weather_schema = (
    "station_id string, date string, measured string, value string, "
    "measurement_flag string, quality_flag string, source_flag string, hour string"
)
path = os.path.join(project_path, "DATA", "WEATHER", "2020.csv.gz")
weather = spark.read.schema(weather_schema).csv(path)

In [8]:
# Read the mapping between locations and weather stations (parquet dataset
# under OUT_DATA/map_locations_stations).
path = os.path.join(project_path, "OUT_DATA", "map_locations_stations")
map_locations_to_stations = spark.read.format("parquet").load(path)

In [9]:
# load all stations, with GPS location
# The stations file is fixed-width (fields are extracted by character position
# further down), not comma-separated. Reading it as plain text keeps each line
# intact even if it contains a comma or a quote character, which the CSV
# reader would treat as a delimiter / quoting. The single column is renamed to
# "_c0" so downstream code keeps working unchanged.
raw_stations = spark.read.text(
    os.path.join(project_path, "DATA", "WEATHER", "ghcnd-stations.txt")
).toDF("_c0")

In [10]:
# parse raw stations into columns
@udf(MapType(StringType(), StringType()))
def ParseStationsUDF(line):
    """Split one fixed-width line of ghcnd-stations.txt into named string fields.

    Column layout (1-based, inclusive) from the GHCN-Daily readme, with the
    equivalent 0-based Python slice:

        ID          1-11   -> [0:11]
        LATITUDE   13-20   -> [12:20]
        LONGITUDE  22-30   -> [21:30]
        ELEVATION  32-37   -> [31:37]
        STATE      39-40   -> [38:40]
        NAME       42-71   -> [41:71]

    Args:
        line: one raw line of the stations file, or None.

    Returns:
        A dict mapping field name to the whitespace-stripped field text, or
        None when ``line`` is None (the map column is then null and every
        field extracted from it is null as well).
    """
    if line is None:
        return None

    return {
        "station_id": line[0:11].strip(),
        # The latitude field starts at index 12. Starting at 13 would drop the
        # minus sign of latitudes <= -10 (e.g. "-22.1000" -> "22.1000").
        "latitude": line[12:20].strip(),
        "longitude": line[21:30].strip(),
        "elevation": line[31:37].strip(),
        "state": line[38:40].strip(),
        # The name field ends at column 71; the GSN / HCN / WMO fields that
        # follow it are not part of the station name.
        "station_name": line[41:71].strip(),
    }

# Output column name -> Spark SQL type, in the order the columns should appear.
fields = OrderedDict()
fields["station_id"] = "string"
fields["latitude"] = "float"
fields["longitude"] = "float"
fields["elevation"] = "float"
fields["state"] = "string"
fields["station_name"] = "string"

# One SQL expression per field: pull the value out of the "parsed" map column,
# cast it to its target type and give it the field's name.
exprs = [
    f"CAST(parsed['{name}'] AS {sql_type}) AS {name}"
    for name, sql_type in fields.items()
]

# Parse each raw line into a map column, then project it into typed columns.
parsed_stations = raw_stations.withColumn("parsed", ParseStationsUDF("_c0"))
stations = parsed_stations.selectExpr(*exprs)

In [11]:
# Keep the station column names; they are reused below to project the result
# of the join with the location/station mapping back to station columns only.
col_stations = stations.columns
print(col_stations)

['station_id', 'latitude', 'longitude', 'elevation', 'state', 'station_name']


In [12]:
# keep only the stations referenced by the location/station mapping
# A left-semi join only filters `stations`: each station is emitted at most
# once, however many mapping rows carry its station_id. The mapping also has a
# `measured` column, so a station may appear on several rows, and an inner
# join would then write duplicate station rows.
selected_stations = (
    stations
    .join(map_locations_to_stations, on=["station_id"], how="left_semi")
    .select(*col_stations)
)

In [13]:
# Persist the selected stations as parquet under OUT_DATA/weather_stations.
# "overwrite" mode replaces whatever a previous run left at that path instead
# of failing because the path already exists.
out_path = os.path.join(project_path, "OUT_DATA", "weather_stations")
selected_stations.write.parquet(out_path, mode="overwrite")

AnalysisException: 'path file:/home/user/CODE/BIG_DATA/CAPSTONE_PROJECT/covid-analysis/OUT_DATA/weather_stations already exists.;'

In [14]:
# Distinct (measured, station_id) pairs taken from the location/station
# mapping, so that the join with the weather data does not multiply rows.
unique_stations = (
    map_locations_to_stations
    .select("measured", "station_id")
    .dropDuplicates()
)

In [15]:
# Keep only observations whose quality_flag is null, restricted to the
# (station_id, measured) pairs listed in unique_stations.
unflagged_weather = weather.where(F.col("quality_flag").isNull())
selected_weather = (
    unflagged_weather
    .join(unique_stations, on=["station_id", "measured"])
    .select("measured", "station_id", "date", "value")
)
    

In [16]:
# Row count of selected_weather:
#   with duplicates:         4761368
#   with duplicates removed: 3051578
#selected_weather.count()

3051578

In [17]:
# Persist the selected weather observations as parquet under
# OUT_DATA/weather_data, replacing any output left by a previous run.
out_path = os.path.join(project_path, "OUT_DATA", "weather_data")
selected_weather.write.parquet(out_path, mode="overwrite")