In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType, FloatType
from pyspark.sql import DataFrame, Window
from collections import OrderedDict
import pandas as pd
import numpy as np

In [2]:
# Load the notebook settings; the "project" entry of the [PATH] section is the
# project root directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot find and returns the list
# of files it actually parsed. Check that list so a missing file fails with a
# clear message instead of an opaque KeyError on config["PATH"] below.
if not config.read("capstone.cfg"):
    raise FileNotFoundError(
        "capstone.cfg not found in " + os.getcwd()
        + " (this cell changes the working directory, so re-running it looks"
        " for the file in the project directory)"
    )

project_path = config["PATH"]["project"]
# Work from the project directory from here on.
os.chdir(project_path)


In [3]:
def create_spark_session():
    """Return the SparkSession used by this notebook.

    The session is obtained through the builder's ``getOrCreate``, so an
    already-active session is reused instead of a second one being started.
    The application name is "covid_DB".
    """
    builder = SparkSession.builder.appName("covid_DB")
    return builder.getOrCreate()

In [4]:
# Start (or reuse) the Spark session used to load the data below.
spark = create_spark_session()

In [11]:
# Load the county-level COVID CSV; its first line holds the column names.
df_covid = spark.read.csv(
    os.path.join(project_path, "DATA", "COVID", "us-counties.csv"),
    header=True,
)

In [13]:
# No schema is supplied or inferred when the CSV is read, so every column
# (including cases and deaths) comes back as a string.
df_covid.printSchema()

root
 |-- date: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- fips: string (nullable = true)
 |-- cases: string (nullable = true)
 |-- deaths: string (nullable = true)



In [14]:
# Preview the first rows of the raw data.
df_covid.show()

+----------+-----------+----------+-----+-----+------+
|      date|     county|     state| fips|cases|deaths|
+----------+-----------+----------+-----+-----+------+
|2020-01-21|  Snohomish|Washington|53061|    1|     0|
|2020-01-22|  Snohomish|Washington|53061|    1|     0|
|2020-01-23|  Snohomish|Washington|53061|    1|     0|
|2020-01-24|       Cook|  Illinois|17031|    1|     0|
|2020-01-24|  Snohomish|Washington|53061|    1|     0|
|2020-01-25|     Orange|California|06059|    1|     0|
|2020-01-25|       Cook|  Illinois|17031|    1|     0|
|2020-01-25|  Snohomish|Washington|53061|    1|     0|
|2020-01-26|   Maricopa|   Arizona|04013|    1|     0|
|2020-01-26|Los Angeles|California|06037|    1|     0|
|2020-01-26|     Orange|California|06059|    1|     0|
|2020-01-26|       Cook|  Illinois|17031|    1|     0|
|2020-01-26|  Snohomish|Washington|53061|    1|     0|
|2020-01-27|   Maricopa|   Arizona|04013|    1|     0|
|2020-01-27|Los Angeles|California|06037|    1|     0|
|2020-01-2

In [19]:
# Total number of rows in the raw data.
df_covid.count()

1170376

In [16]:
# How many rows have no FIPS code?
df_covid.filter(F.col("fips").isNull()).count()

10721

In [21]:
# Load the previously written FIPS-to-station mapping (parquet).
map_fips_stations = spark.read.parquet(
    os.path.join(project_path, "OUT_DATA/", "stations_per_fips")
)

In [24]:
# Inspect the columns of the FIPS-to-station mapping.
map_fips_stations.printSchema()

root
 |-- location_id: long (nullable = true)
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- fips_state: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- station_state: string (nullable = true)
 |-- angle: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- measured: string (nullable = true)



In [22]:
# Distinct (fips, county, state) combinations present in the COVID data.
nyt_locations = (
    df_covid
    .select(col("fips"), col("county"), col("state"))
    .distinct()
)

In [23]:
# Number of distinct (fips, county, state) combinations in the COVID data.
nyt_locations.count()

3274

In [27]:
# Reduce the station mapping to one row per location (dropping the
# station-level columns), then count the locations it knows about.
map_locations = (
    map_fips_stations
    .select("location_id", "fips", "county", "fips_state")
    .distinct()
)
map_locations.count()

3272

In [31]:
# Count the COVID locations that have a counterpart in the station mapping.
# eqNullSafe treats NULL <=> NULL as a match, so locations with a missing
# fips can still pair up on county and state.
(
    nyt_locations
    .join(
        map_locations,
        on=(
            nyt_locations["fips"].eqNullSafe(map_locations["fips"])
            & nyt_locations["county"].eqNullSafe(map_locations["county"])
            & nyt_locations["state"].eqNullSafe(map_locations["fips_state"])
        ),
    )
    .count()
)

3272

In [32]:
# List the COVID locations with no counterpart in the station mapping
# (left anti join keeps only the left-side rows that found no match).
(
    nyt_locations
    .join(
        map_locations,
        on=(
            nyt_locations["fips"].eqNullSafe(map_locations["fips"])
            & nyt_locations["county"].eqNullSafe(map_locations["county"])
            & nyt_locations["state"].eqNullSafe(map_locations["fips_state"])
        ),
        how="left_anti",
    )
    .show()
)

+-----+--------------------+------+
| fips|              county| state|
+-----+--------------------+------+
|02997|Bristol Bay plus ...|Alaska|
|02998|Yakutat plus Hoon...|Alaska|
+-----+--------------------+------+



In [68]:
# Keep only the COVID rows whose location exists in the station mapping and
# attach that location's location_id (inner join, so unmatched rows are dropped).
# The join condition is written against the two aliased frames that are actually
# being joined ("covid" and "loc") rather than against map_fips_stations, the
# parent frame that map_locations was derived from.
# eqNullSafe lets rows with a NULL fips still match on county and state.
df_covid_filter = (
    df_covid.alias("covid")
    .join(
        map_locations.alias("loc"),
        col("covid.fips").eqNullSafe(col("loc.fips"))
        & col("covid.county").eqNullSafe(col("loc.county"))
        & col("covid.state").eqNullSafe(col("loc.fips_state")),
    )
    .select("date", "location_id", "covid.fips", "covid.county", "covid.state", "deaths", "cases")
)
                                             

In [78]:
# Derive daily cases and daily deaths from the cumulative counts:
# first add a column with the lagged value (the previous row for the same
# location, ordered by date), then take the difference between the current
# and the lagged values.
# The lag offset (1) and default (0) are passed positionally because the
# keyword for the offset was renamed (count -> offset) between PySpark
# releases; positional arguments work with either name.
# A location's first row has no predecessor, so its lagged value defaults to 0.
# NOTE: deaths and cases are string columns at this point, so the subtraction
# is carried out on implicitly cast values and yields doubles (e.g. 45.0).
w = Window.partitionBy("location_id").orderBy("date")
df_covid_daily = (
    df_covid_filter
    .withColumn("deaths_prev", F.lag("deaths", 1, 0).over(w))
    .withColumn("cases_prev", F.lag("cases", 1, 0).over(w))
    .withColumn("daily_deaths", col("deaths") - col("deaths_prev"))
    .withColumn("daily_cases", col("cases") - col("cases_prev"))
)

In [79]:
# Sanity check: inspect the lagged and daily values for Los Angeles County
# (fips 06037) on December dates.
(
    df_covid_daily
    .filter(col("fips") == "06037")
    .filter(F.month(col("date")) == 12)
    .show()
)

+----------+-----------+-----+-----------+----------+------+------+-----------+----------+------------+-----------+
|      date|location_id| fips|     county|     state|deaths| cases|deaths_prev|cases_prev|daily_deaths|daily_cases|
+----------+-----------+-----+-----------+----------+------+------+-----------+----------+------------+-----------+
|2020-12-01|17179869218|06037|Los Angeles|California|  7700|408515|       7655|    401034|        45.0|     7481.0|
|2020-12-02|17179869218|06037|Los Angeles|California|  7740|414304|       7700|    408515|        40.0|     5789.0|
|2020-12-03|17179869218|06037|Los Angeles|California|  7782|422000|       7740|    414304|        42.0|     7696.0|
|2020-12-04|17179869218|06037|Los Angeles|California|  7842|430713|       7782|    422000|        60.0|     8713.0|
|2020-12-05|17179869218|06037|Los Angeles|California|  7886|439538|       7842|    430713|        44.0|     8825.0|
|2020-12-06|17179869218|06037|Los Angeles|California|  7909|449982|     

In [None]:
# To smooth out reporting anomalies (e.g. no data reported on weekends), compute a rolling sum over the trailing 7 days.


In [87]:
# Add an integer day index: the number of days elapsed since 2019-01-01.
# It gives range-based windows a numeric column to order by.
df_covid_daily = df_covid_daily.withColumn(
    "days",
    F.datediff(F.col("date"), F.to_date(F.lit("2019-01-01"))),
)

In [88]:
# Inspect Los Angeles County (fips 06037) for March 2020, including the new day index.
(
    df_covid_daily
    .filter(col("fips") == "06037")
    .filter(F.month(col("date")) == 3)
    .filter(F.year(col("date")) == 2020)
    .show(40)
)

+----------+-----------+-----+-----------+----------+------+-----+-----------+----------+------------+-----------+----+
|      date|location_id| fips|     county|     state|deaths|cases|deaths_prev|cases_prev|daily_deaths|daily_cases|days|
+----------+-----------+-----+-----------+----------+------+-----+-----------+----------+------------+-----------+----+
|2020-03-01|17179869218|06037|Los Angeles|California|     0|    1|          0|         1|         0.0|        0.0| 425|
|2020-03-02|17179869218|06037|Los Angeles|California|     0|    1|          0|         1|         0.0|        0.0| 426|
|2020-03-03|17179869218|06037|Los Angeles|California|     0|    1|          0|         1|         0.0|        0.0| 427|
|2020-03-04|17179869218|06037|Los Angeles|California|     0|    7|          0|         1|         0.0|        6.0| 428|
|2020-03-05|17179869218|06037|Los Angeles|California|     0|   11|          0|         7|         0.0|        4.0| 429|
|2020-03-06|17179869218|06037|Los Angele

In [89]:
# Trailing 7-day window per location: a range frame over the integer "days"
# column covering day indexes from 6 before the current row's up to the
# current row's own.
# NOTE: this rebinds `w`, which earlier held the date-ordered lag window.
w = (
    Window
    .partitionBy("location_id")
    .orderBy("days")
    .rangeBetween(-6, Window.currentRow)
)

In [90]:
# Sum of daily deaths over the window currently bound to `w`.
df_covid_daily = df_covid_daily.withColumn(
    "week_deaths",
    F.sum(F.col("daily_deaths")).over(w),
)

In [91]:
# Pull Los Angeles County (fips 06037) for March 2020 into pandas to inspect week_deaths.
(
    df_covid_daily
    .filter(col("fips") == "06037")
    .filter(F.month(col("date")) == 3)
    .filter(F.year(col("date")) == 2020)
    .toPandas()
)

Unnamed: 0,date,location_id,fips,county,state,deaths,cases,deaths_prev,cases_prev,daily_deaths,daily_cases,days,week_deaths
0,2020-03-01,17179869218,6037,Los Angeles,California,0,1,0,1,0.0,0.0,425,0.0
1,2020-03-02,17179869218,6037,Los Angeles,California,0,1,0,1,0.0,0.0,426,0.0
2,2020-03-03,17179869218,6037,Los Angeles,California,0,1,0,1,0.0,0.0,427,0.0
3,2020-03-04,17179869218,6037,Los Angeles,California,0,7,0,1,0.0,6.0,428,0.0
4,2020-03-05,17179869218,6037,Los Angeles,California,0,11,0,7,0.0,4.0,429,0.0
5,2020-03-06,17179869218,6037,Los Angeles,California,0,13,0,11,0.0,2.0,430,0.0
6,2020-03-07,17179869218,6037,Los Angeles,California,0,14,0,13,0.0,1.0,431,0.0
7,2020-03-08,17179869218,6037,Los Angeles,California,0,14,0,14,0.0,0.0,432,0.0
8,2020-03-09,17179869218,6037,Los Angeles,California,0,19,0,14,0.0,5.0,433,0.0
9,2020-03-10,17179869218,6037,Los Angeles,California,0,20,0,19,0.0,1.0,434,0.0
