In [170]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType, DateType
from collections import OrderedDict
import pandas as pd
import numpy as np

In [2]:
# Load the project settings. ConfigParser.read() silently skips files it
# cannot open, so fail here with a clear message rather than with a KeyError
# on the first lookup.
config = configparser.ConfigParser()
if not config.read("capstone.cfg"):
    raise FileNotFoundError("capstone.cfg not found in the current working directory")

# Resolve the project directory to an absolute path BEFORE changing into it:
# a relative value would otherwise be re-interpreted against the new working
# directory when it is later joined with data file names.
project_path = os.path.abspath(config["PATH"]["project"])
os.chdir(project_path)


In [3]:
def create_spark_session():
    """Return the SparkSession for this notebook, creating it if none exists.

    The session is registered under the application name "covid_DB".
    """
    builder = SparkSession.builder.appName("covid_DB")
    return builder.getOrCreate()

In [4]:
# Session object used by the read and createDataFrame calls below.
spark = create_spark_session()

In [139]:
# Read the CDC extract from the project's DATA folder. multiLine is enabled,
# so a JSON record is allowed to span several lines of the file.
cdc_json_path = os.path.join(project_path, "DATA", "covid_by_pop_group.json")
df_cdc_raw = spark.read.json(cdc_json_path, multiLine=True)

In [140]:
# Number of records in the raw CDC extract.
df_cdc_raw.count()

95735

In [141]:
# Column names and types as inferred by the JSON reader (no schema was supplied).
df_cdc_raw.printSchema()

root
 |-- age_group: string (nullable = true)
 |-- cdc_case_earliest_dt: string (nullable = true)
 |-- count: string (nullable = true)
 |-- race_ethnicity_combined: string (nullable = true)
 |-- sex: string (nullable = true)



In [142]:
# Preview the first five raw records.
df_cdc_raw.show(5)

+-------------+--------------------+-----+-----------------------+-------+
|    age_group|cdc_case_earliest_dt|count|race_ethnicity_combined|    sex|
+-------------+--------------------+-----+-----------------------+-------+
|    80+ Years|2021-01-07T00:00:...|    1|                Unknown|Missing|
|    80+ Years|2020-12-20T00:00:...|   24|    White, Non-Hispanic|Unknown|
|30 - 39 Years|2020-12-07T00:00:...|   30|                Unknown|Missing|
|    80+ Years|2020-12-27T00:00:...|   55|    Asian, Non-Hispanic|   Male|
|20 - 29 Years|2020-12-09T00:00:...|  663|    Black, Non-Hispanic|   Male|
+-------------+--------------------+-----+-----------------------+-------+
only showing top 5 rows



## Create dimension tables

In [110]:
# Age-group dimension. Decade brackets are keyed by their lower bound; the
# two non-informative labels get ids well outside that range.
age_group_rows = [
    (0, '0 - 9 Years'),
    (10, '10 - 19 Years'),
    (20, '20 - 29 Years'),
    (30, '30 - 39 Years'),
    (40, '40 - 49 Years'),
    (50, '50 - 59 Years'),
    (60, '60 - 69 Years'),
    (70, '70 - 79 Years'),
    (80, '80+ Years'),
    (1000, 'Missing'),
    (2000, 'NA'),
]
dim_age = spark.createDataFrame(
    data=age_group_rows,
    schema=["age_group_id", "age_group"],
)

In [111]:
# Column types Spark inferred for the age dimension.
dim_age.printSchema()

root
 |-- age_group_id: long (nullable = true)
 |-- age_group: string (nullable = true)



In [32]:
# Display the age dimension rows.
dim_age.show()

+------------+-------------+
|age_group_id|    age_group|
+------------+-------------+
|           0|  0 - 9 Years|
|          10|10 - 19 Years|
|          20|20 - 29 Years|
|          30|30 - 39 Years|
|          40|40 - 49 Years|
|          50|50 - 59 Years|
|          60|60 - 69 Years|
|          70|70 - 79 Years|
|          80|    80+ Years|
|        1000|      Missing|
|        2000|           NA|
+------------+-------------+



In [23]:
# Combined race/ethnicity labels, listed in alphabetical order.
l_race_ethnicity = [
    'American Indian/Alaska Native, Non-Hispanic',
    'Asian, Non-Hispanic',
    'Black, Non-Hispanic',
    'Hispanic/Latino',
    'Missing',
    'Multiple/Other, Non-Hispanic',
    'NA',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic',
    'Unknown',
    'White, Non-Hispanic',
]

In [24]:
# Keep only the text before the first comma of each combined label
# (labels without a comma are kept whole).
l_race = [label.partition(",")[0] for label in l_race_ethnicity]
l_race

['American Indian/Alaska Native',
 'Asian',
 'Black',
 'Hispanic/Latino',
 'Missing',
 'Multiple/Other',
 'NA',
 'Native Hawaiian/Other Pacific Islander',
 'Unknown',
 'White']

In [123]:
# Race/ethnicity dimension. The Hispanic_Latino flag is left NULL for the
# labels that carry no ethnicity information.
race_ethnicity_rows = [
    (0, 'American Indian/Alaska Native', False),
    (1, 'Asian', False),
    (2, 'Black', False),
    (3, 'Native Hawaiian/Other Pacific Islander', False),
    (4, 'White', False),
    (5, 'Hispanic/Latino', True),
    (10, 'Multiple/Other', None),
    (1000, 'Missing', None),
    (2000, 'NA', None),
    (3000, 'Unknown', None),
]
dim_race_ethnicity = spark.createDataFrame(
    data=race_ethnicity_rows,
    schema=["race_ethnicity_id", "race", "Hispanic_Latino"],
)

In [124]:
# Column types Spark inferred for the race/ethnicity dimension.
dim_race_ethnicity.printSchema()

root
 |-- race_ethnicity_id: long (nullable = true)
 |-- race: string (nullable = true)
 |-- Hispanic_Latino: boolean (nullable = true)



In [125]:
# Display the race/ethnicity dimension rows (long strings are truncated by show()).
dim_race_ethnicity.show()

+-----------------+--------------------+---------------+
|race_ethnicity_id|                race|Hispanic_Latino|
+-----------------+--------------------+---------------+
|                0|American Indian/A...|          false|
|                1|               Asian|          false|
|                2|               Black|          false|
|                3|Native Hawaiian/O...|          false|
|                4|               White|          false|
|                5|     Hispanic/Latino|           true|
|               10|      Multiple/Other|           null|
|             1000|             Missing|           null|
|             2000|                  NA|           null|
|             3000|             Unknown|           null|
+-----------------+--------------------+---------------+



In [36]:
# Sex dimension. The three non-informative labels use the ids 1000, 2000
# and 3000.
sex_rows = [
    (0, 'Female'),
    (1, 'Male'),
    (2, 'Other'),
    (1000, 'Missing'),
    (2000, 'NA'),
    (3000, 'Unknown'),
]
dim_sex = spark.createDataFrame(data=sex_rows, schema=["sex_id", "sex"])

In [37]:
# Column types Spark inferred for the sex dimension.
dim_sex.printSchema()

root
 |-- sex_id: long (nullable = true)
 |-- sex: string (nullable = true)



In [38]:
# Display the sex dimension rows.
dim_sex.show()

+------+-------+
|sex_id|    sex|
+------+-------+
|     0| Female|
|     1|   Male|
|     2|  Other|
|  1000|Missing|
|  2000|     NA|
|  3000|Unknown|
+------+-------+



In [163]:
# Distinct case dates (still as strings), newest first.
df_dates = (
    df_cdc_raw
    .select("cdc_case_earliest_dt")
    .distinct()
    .orderBy(F.desc("cdc_case_earliest_dt"))
)

In [164]:
# Number of distinct case dates.
df_dates.count()

441

In [166]:
# First five rows; the frame was sorted in descending order.
df_dates.show(5)

+--------------------+
|cdc_case_earliest_dt|
+--------------------+
|2021-03-16T00:00:...|
|2021-03-15T00:00:...|
|2021-03-14T00:00:...|
|2021-03-13T00:00:...|
|2021-03-12T00:00:...|
+--------------------+
only showing top 5 rows



In [173]:
# Add a typed `date` column next to the original string column.
df_dates = df_dates.withColumn(
    "date",
    F.col("cdc_case_earliest_dt").cast("date"),
)

In [174]:
# Show the source string beside the parsed date to eyeball the cast.
df_dates.show()

+--------------------+----------+
|cdc_case_earliest_dt|      date|
+--------------------+----------+
|2021-03-16T00:00:...|2021-03-16|
|2021-03-15T00:00:...|2021-03-15|
|2021-03-14T00:00:...|2021-03-14|
|2021-03-13T00:00:...|2021-03-13|
|2021-03-12T00:00:...|2021-03-12|
|2021-03-11T00:00:...|2021-03-11|
|2021-03-10T00:00:...|2021-03-10|
|2021-03-09T00:00:...|2021-03-09|
|2021-03-08T00:00:...|2021-03-08|
|2021-03-07T00:00:...|2021-03-07|
|2021-03-06T00:00:...|2021-03-06|
|2021-03-05T00:00:...|2021-03-05|
|2021-03-04T00:00:...|2021-03-04|
|2021-03-03T00:00:...|2021-03-03|
|2021-03-02T00:00:...|2021-03-02|
|2021-03-01T00:00:...|2021-03-01|
|2021-02-28T00:00:...|2021-02-28|
|2021-02-27T00:00:...|2021-02-27|
|2021-02-26T00:00:...|2021-02-26|
|2021-02-25T00:00:...|2021-02-25|
+--------------------+----------+
only showing top 20 rows



In [51]:
@udf(StringType())
def parse_race_ethnicity(line):
    """Return the part of a combined race/ethnicity label before the first comma.

    Labels without a comma are returned unchanged. A NULL input yields NULL
    instead of raising inside the Python worker (``None`` has no ``split``).
    """
    if line is None:
        return None
    return line.split(",")[0]


In [70]:
# Schema of the raw frame, shown again for reference.
df_cdc_raw.printSchema()

root
 |-- age_group: string (nullable = true)
 |-- cdc_case_earliest_dt: string (nullable = true)
 |-- count: string (nullable = true)
 |-- race_ethnicity_combined: string (nullable = true)
 |-- sex: string (nullable = true)



In [143]:
# Replace the combined label by its first comma-separated part, in place.
df_cdc_raw = df_cdc_raw.withColumn(
    "race_ethnicity_combined",
    parse_race_ethnicity(col("race_ethnicity_combined")),
)

In [144]:
# Distinct values of the parsed column, collected to pandas for display.
df_cdc_raw.select("race_ethnicity_combined").distinct().toPandas()

Unnamed: 0,race_ethnicity_combined
0,
1,Native Hawaiian/Other Pacific Islander
2,Hispanic/Latino
3,Unknown
4,Missing
5,White
6,Black
7,Asian
8,Multiple/Other
9,American Indian/Alaska Native


## Check that the dataframe does not contain null values

In [145]:
# query : rows with null values (in any columns)
cond_null = " OR ".join( [ f"'{a}' IS NULL" for a in df_cdc_raw.columns])
cond_null

"'age_group' IS NULL OR 'cdc_case_earliest_dt' IS NULL OR 'count' IS NULL OR 'race_ethnicity_combined' IS NULL OR 'sex' IS NULL"

In [146]:
# check that no value is null
cdc_null = df_cdc_raw.filter( cond_null)

In [147]:
# Number of rows matched by the null filter.
cdc_null.count()

0

In [148]:
# Schema of the raw frame before the type conversions below.
df_cdc_raw.printSchema()

root
 |-- age_group: string (nullable = true)
 |-- cdc_case_earliest_dt: string (nullable = true)
 |-- count: string (nullable = true)
 |-- race_ethnicity_combined: string (nullable = true)
 |-- sex: string (nullable = true)



In [175]:
# convert text date column into DateType
df_cdc = df_cdc_raw.withColumn("cdc_case_earliest_dt", col("cdc_case_earliest_dt").cast(DateType()))

In [176]:
# join with dimension tables to get fact table
df_cdc = df_cdc.join( dim_age, on = "age_group", how = "left_outer")\
                    .withColumnRenamed("race_ethnicity_combined", "race")\
                    .join(dim_race_ethnicity, on = "race", how = "left_outer")\
                    .join(dim_sex, on = "sex", how = "left_outer")\
                    .select("cdc_case_earliest_dt","sex_id","age_group_id", "race_ethnicity_id", "count")

In [177]:
# Row count of the fact table; compare with the raw count to verify the joins did not add or drop rows.
df_cdc.count()

95735

In [178]:
# Schema of the fact table after the joins.
df_cdc.printSchema()

root
 |-- cdc_case_earliest_dt: date (nullable = true)
 |-- sex_id: long (nullable = true)
 |-- age_group_id: long (nullable = true)
 |-- race_ethnicity_id: long (nullable = true)
 |-- count: string (nullable = true)



In [179]:
# check that there are no NULL values, ie that no rows contain values not in dimensions tables
df_cdc.filter( " OR ".join( [ f"'{a}' IS NULL" for a in df_cdc.columns] ) )\
        .count()

0

In [181]:
# write to jdbc
table_name = "covid_per_popgroup"
df_cdc.write\
    .format("jdbc")\
    .mode("overwrite")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", table_name)\
    .option("user","postgres")\
    .option("password", "postgres")\
    .save()

In [136]:
# Persist the age dimension. The "overwrite" mode matches the fact-table
# write and keeps the cell re-runnable: without an explicit mode the save
# fails as soon as the table already exists.
# Credentials are taken from PGUSER / PGPASSWORD when set, with the previous
# local defaults as fallback.
table_name = "dim_age_group"
dim_age.write\
    .format("jdbc")\
    .mode("overwrite")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", table_name)\
    .option("user", os.environ.get("PGUSER", "postgres"))\
    .option("password", os.environ.get("PGPASSWORD", "postgres"))\
    .save()

In [137]:
# Persist the sex dimension. The "overwrite" mode matches the fact-table
# write and keeps the cell re-runnable: without an explicit mode the save
# fails as soon as the table already exists.
# Credentials are taken from PGUSER / PGPASSWORD when set, with the previous
# local defaults as fallback.
table_name = "dim_sex"
dim_sex.write\
    .format("jdbc")\
    .mode("overwrite")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", table_name)\
    .option("user", os.environ.get("PGUSER", "postgres"))\
    .option("password", os.environ.get("PGPASSWORD", "postgres"))\
    .save()

In [138]:
# Persist the race/ethnicity dimension. The "overwrite" mode matches the
# fact-table write and keeps the cell re-runnable: without an explicit mode
# the save fails as soon as the table already exists.
# Credentials are taken from PGUSER / PGPASSWORD when set, with the previous
# local defaults as fallback.
table_name = "dim_race_ethnicity"
dim_race_ethnicity.write\
    .format("jdbc")\
    .mode("overwrite")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", table_name)\
    .option("user", os.environ.get("PGUSER", "postgres"))\
    .option("password", os.environ.get("PGPASSWORD", "postgres"))\
    .save()