In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType, FloatType
from pyspark.sql import DataFrame, Window
from collections import OrderedDict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [2]:
# Load the project settings. ConfigParser.read() silently skips files it cannot
# open and returns the list of files it actually parsed, so fail early with a
# clear message instead of a later, opaque KeyError on config["PATH"].
config = configparser.ConfigParser()
if not config.read("capstone.cfg"):
    raise FileNotFoundError("Configuration file 'capstone.cfg' not found or unreadable")

# Directory holding the local source files, and directory holding the staged
# parquet data sets read and written by this notebook.
local_data_dir = config["PATH"]["LOCAL_DATA_DIR"]
output_path = config["PATH"]["STAGE_DATA_DIR"]

# Export the AWS credentials from the config file as environment variables.
os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['SECRET_ACCESS_KEY']

In [3]:
def create_spark_session():
    """Return the notebook's SparkSession, creating it if none exists yet.

    The session is named "covid_DB", requests the hadoop-aws 2.7.0 package and
    receives the S3A access/secret keys read from the module-level ``config``.
    """
    settings = (
        ("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"),
        ("fs.s3a.access.key", config['AWS']['ACCESS_KEY_ID']),
        ("fs.s3a.secret.key", config['AWS']['SECRET_ACCESS_KEY']),
    )

    builder = SparkSession.builder.appName("covid_DB")
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.getOrCreate()

In [4]:
# Create (or reuse) the SparkSession used by the rest of the notebook.
spark = create_spark_session()

In [5]:
# Load the per-county COVID CSV (first line is the header). No schema is given,
# so every column is read as a string at this point.
df_covid = spark.read.csv(
    os.path.join(local_data_dir, "COVID", "us-counties.csv"),
    header=True,
)

In [6]:
# Inspect the schema as loaded from the CSV (before any type conversion).
df_covid.printSchema()

root
 |-- date: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- fips: string (nullable = true)
 |-- cases: string (nullable = true)
 |-- deaths: string (nullable = true)



In [7]:
# Convert the string columns to proper types: "date" becomes a date and the
# "cases" / "deaths" counters become integers.
df_covid = (
    df_covid
    .withColumn("date", F.to_date(F.col("date")))
    .withColumn("cases", F.col("cases").cast("int"))
    .withColumn("deaths", F.col("deaths").cast("int"))
)

In [8]:
# Confirm the conversions: "date" should now be a date, "cases"/"deaths" integers.
df_covid.printSchema()

root
 |-- date: date (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- fips: string (nullable = true)
 |-- cases: integer (nullable = true)
 |-- deaths: integer (nullable = true)



In [9]:
# Preview the first rows of the typed data.
df_covid.show()

+----------+-----------+----------+-----+-----+------+
|      date|     county|     state| fips|cases|deaths|
+----------+-----------+----------+-----+-----+------+
|2020-01-21|  Snohomish|Washington|53061|    1|     0|
|2020-01-22|  Snohomish|Washington|53061|    1|     0|
|2020-01-23|  Snohomish|Washington|53061|    1|     0|
|2020-01-24|       Cook|  Illinois|17031|    1|     0|
|2020-01-24|  Snohomish|Washington|53061|    1|     0|
|2020-01-25|     Orange|California|06059|    1|     0|
|2020-01-25|       Cook|  Illinois|17031|    1|     0|
|2020-01-25|  Snohomish|Washington|53061|    1|     0|
|2020-01-26|   Maricopa|   Arizona|04013|    1|     0|
|2020-01-26|Los Angeles|California|06037|    1|     0|
|2020-01-26|     Orange|California|06059|    1|     0|
|2020-01-26|       Cook|  Illinois|17031|    1|     0|
|2020-01-26|  Snohomish|Washington|53061|    1|     0|
|2020-01-27|   Maricopa|   Arizona|04013|    1|     0|
|2020-01-27|Los Angeles|California|06037|    1|     0|
|2020-01-2

In [10]:
# Total number of rows in the COVID data set.
df_covid.count()

1170376

In [15]:
# How many rows have no fips code at all?
df_covid.filter(F.col("fips").isNull()).count()

10721

In [16]:
# Load the staged "map_locations_stations" parquet data set from the staging directory.
map_fips_stations = spark.read.parquet( os.path.join(output_path, "map_locations_stations"))

In [17]:
# Inspect the columns of the location/station mapping.
map_fips_stations.printSchema()

root
 |-- location_id: long (nullable = true)
 |-- station_id: string (nullable = true)
 |-- distance: double (nullable = true)
 |-- measured: string (nullable = true)



In [21]:
# Load the staged "nyt_locations_geography" parquet data set and inspect its
# columns; it is joined to the COVID rows below to obtain a location_id.
fpath = os.path.join(output_path, "nyt_locations_geography")
nyt_locations = spark.read.parquet(fpath)
nyt_locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- location_id: long (nullable = true)
 |-- state: string (nullable = true)



In [26]:
# Show the (county, state, fips) combinations of the COVID data that have no
# counterpart in nyt_locations. The comparison is null-safe on every key, so a
# null fips on both sides counts as a match.
(
    df_covid
    .join(
        nyt_locations,
        on=[
            df_covid["fips"].eqNullSafe(nyt_locations["fips"]),
            df_covid["county"].eqNullSafe(nyt_locations["county"]),
            df_covid["state"].eqNullSafe(nyt_locations["state"]),
        ],
        how="left_anti",
    )
    .select("county", "state", "fips")
    .distinct()
    .show()
)

+--------------------+------+-----+
|              county| state| fips|
+--------------------+------+-----+
|Yakutat plus Hoon...|Alaska|02998|
|Bristol Bay plus ...|Alaska|02997|
+--------------------+------+-----+



In [29]:
# Keep only the rows whose (fips, county, state) triple exists in nyt_locations
# (inner join, null-safe on every key) and attach the matching location_id.
df_covid_filter = (
    df_covid.alias("covid")
    .join(
        nyt_locations.alias("loc"),
        on=[
            df_covid["fips"].eqNullSafe(nyt_locations["fips"]),
            df_covid["county"].eqNullSafe(nyt_locations["county"]),
            df_covid["state"].eqNullSafe(nyt_locations["state"]),
        ],
    )
    .select("date", "location_id", "covid.fips", "covid.county", "covid.state", "deaths", "cases")
)
                                             

In [30]:
# Derive daily cases and daily deaths from the cumulative counters.
# Within each location, ordered by date, lag() fetches the value of the previous
# reported row (0 for the location's first row); the daily figure is the
# difference between the current and that previous value.
# The lag offset and default are passed positionally because the keyword naming
# the offset differs between PySpark releases (`count` in 2.x, `offset` in 3.x).
# NOTE: when a cumulative counter decreases from one row to the next, the
# resulting daily value is negative; a null counter yields a null daily value.
w = Window.partitionBy("location_id").orderBy("date")
df_covid_daily = (
    df_covid_filter
    .withColumn("deaths_prev", F.lag("deaths", 1, 0).over(w))
    .withColumn("cases_prev", F.lag("cases", 1, 0).over(w))
    .withColumn("daily_deaths", col("deaths") - col("deaths_prev"))
    .withColumn("daily_cases", col("cases") - col("cases_prev"))
)

In [31]:
# Check the columns added by the daily computation (*_prev and daily_*).
df_covid_daily.printSchema()

root
 |-- date: date (nullable = true)
 |-- location_id: long (nullable = true)
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- deaths: integer (nullable = true)
 |-- cases: integer (nullable = true)
 |-- deaths_prev: integer (nullable = true)
 |-- cases_prev: integer (nullable = true)
 |-- daily_deaths: integer (nullable = true)
 |-- daily_cases: integer (nullable = true)



In [32]:
# Persist the daily figures per location as parquet, one partition per date,
# replacing whatever was previously stored under "covid_per_county".
out_path = os.path.join(output_path, "covid_per_county")

(
    df_covid_daily
    .select("date", "location_id", "daily_cases", "daily_deaths")
    .write
    .parquet(out_path, mode="overwrite", partitionBy="date")
)

In [42]:
# Most recent date present in the daily data (a global aggregate returns exactly one row).
max_date = df_covid_daily.select(F.max("date")).first()[0]

In [44]:
# Display the most recent date found in the data.
max_date

datetime.date(2021, 3, 29)

In [50]:
# For every location_id keep the row(s) carrying its most recent date; this
# snapshot is used to filter out data for subsequent loads.
# rank() gives the same rank to ties, so several rows sharing the latest date
# for one location would all be kept.
w = Window.partitionBy("location_id").orderBy(F.col("date").desc())
columns = list(df_covid_filter.columns)
last_data = (
    df_covid_filter
    .withColumn("rank", F.rank().over(w))
    .where(F.col("rank") == 1)
    .select(*columns)
)

In [55]:
# Store the per-location snapshot as parquet, replacing any previous version.
out_path = os.path.join(output_path, "last_data")
last_data.write.parquet(out_path, mode="overwrite")

In [39]:
# Number of rows kept in last_data.
last_data.count()

3272

In [52]:
# Verify last_data exposes the same columns as df_covid_filter (the helper "rank" column is not kept).
last_data.printSchema()

root
 |-- date: date (nullable = true)
 |-- location_id: long (nullable = true)
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- deaths: integer (nullable = true)
 |-- cases: integer (nullable = true)

