# Demographics Data Processing

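The staging script for the Demographics Data Processing Spark job. It loads the US cities demographics CSV and writes three Parquet datasets: a city/state lookup table, the state dimension table, and the demographics fact table.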

In [1]:
# import libraries
import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

# Spark Job
---

In [2]:
def initialize_spark():
    """
    Builds (or reuses) the spark session for this job

    Returns
    -------
    spark session
        The session obtained from the builder for the app name
        "demographics-data-processing"
    """
    # name the application on the shared builder
    session_builder = SparkSession.builder.appName("demographics-data-processing")

    # hand back whatever session the builder resolves to
    return session_builder.getOrCreate()

In [3]:
def load_demo_data(spark, input_fp):
    """
    Reads the demographics file into a spark dataframe using a fixed schema

    Params
    ------
    spark: spark session
        An initialized spark session
    input_fp: str
        The location of the demographics file

    Returns
    -------
    spark dataframe
        The demographics data with the twelve columns declared below
    """
    # (column name, column type) pairs, in the order the schema declares them
    column_types = [
        ("City", StringType),
        ("State", StringType),
        ("Median Age", DoubleType),
        ("Male Population", IntegerType),
        ("Female Population", IntegerType),
        ("Total Population", IntegerType),
        ("Number of Veterans", IntegerType),
        ("Foreign-born", IntegerType),
        ("Average Household Size", DoubleType),
        ("State Code", StringType),
        ("Race", StringType),
        ("Count", IntegerType),
    ]
    demo_file_schema = StructType(
        [StructField(col_name, col_type()) for col_name, col_type in column_types]
    )

    # the file is read as a semicolon-separated csv with a header row
    reader_options = {"header": True, "sep": ";", "schema": demo_file_schema}
    return spark.read.csv(input_fp, **reader_options)

In [4]:
def create_city_state_lookup(demo, preprocessed_fp):
    """
    Writes the distinct city / state / state code combinations as a lookup table

    Params
    ------
    demo: spark dataframe
        The demographics data
    preprocessed_fp: str
        The location to store the lookup table
    """
    lookup_fp = preprocessed_fp + "state_city_lookup_demo/"

    # one row per distinct city, state and state code; intended for mapping
    # state codes onto the temperature data
    lookup = (
        demo
        .select("City", "State", "State Code")
        .dropDuplicates()
        # "State Code" becomes state_id before writing, to avoid running into an error
        .withColumnRenamed("State Code", "state_id")
    )

    # rows are appended to whatever is already stored at the target folder
    lookup.write.parquet(lookup_fp, mode="append")

In [5]:
def create_state_dim(demo, output_fp):
    """
    Writes the state dimension table: one row per distinct state / state code

    Params
    ------
    demo: spark dataframe
        The demographics data
    output_fp: str
        The location to store the dimension table in
    """
    # distinct state name / state code pairs taken from the demographics data
    states = demo.select("State", "State Code").dropDuplicates()

    # give the columns their dimension-table names
    state_dim = (
        states
        .withColumnRenamed("State Code", "state_id")
        .withColumnRenamed("State", "state_name")
    )

    # rows are appended to whatever is already stored at the target folder
    state_dim.write.parquet(output_fp + "dim_state/", mode="append")

In [6]:
def create_fact_demographics(demo, output_fp):
    """
    Creates the fact demographics table, one row per state code

    The city-level figures are de-duplicated and summed per state code, the
    per-race counts are pivoted into one column per race, both are joined on
    the state code, the columns are renamed and the result is appended as
    parquet under ``output_fp + "fact_demographics/"``.

    Params
    ------
    demo: spark dataframe
        The demographics data
    output_fp: str
        The location to store the fact table in
    """
    # identifiers and per-city averages; every other column is summed per state code
    not_summed = ["City", "State", "State Code", "Median Age", "Average Household Size"]

    # remove the race data from the table and store it separately
    race_demo = demo.select("City", "State Code", "Race", "Count")

    # drop the race data and deduplicate
    city_demo = demo.drop("Race", "Count").dropDuplicates()

    # calculate total number of households
    city_demo = city_demo.withColumn(
        "total_number_of_households",
        city_demo["Total Population"] / city_demo["Average Household Size"]
    )

    # population of only those rows whose household count could be computed.
    # sum() skips null household counts, so dividing the *full* state population
    # by the summed households would overstate the average household size
    # whenever a row has no usable "Average Household Size"
    city_demo = city_demo.withColumn(
        "pop_with_known_hh",
        F.when(city_demo["total_number_of_households"].isNotNull(), city_demo["Total Population"])
    )

    # roll everything we need up to a state code level to begin creating the fact table
    exprs = {x: "sum" for x in city_demo.columns if x not in not_summed}
    fact_demo = city_demo.groupBy("State Code").agg(exprs)

    # now for the race demo table, aggregate to a state level with a column for each race
    fact_race_demo = race_demo.groupBy("State Code").pivot("Race").agg({"Count": "sum"})

    # merge the main demo table and the race-wise table
    fact_demo = fact_demo.join(
        fact_race_demo,
        on="State Code",
        how="left"
    )

    # rename all columns (the "sum(...)" names are the ones produced by the dict-style agg)
    fact_demo_col_names = {
        "State Code": "state_id",
        "sum(Total Population)": "total_pop",
        "sum(Female Population)": "female_pop",
        "sum(Number of Veterans)": "veteran_pop",
        "sum(Foreign-born)": "foreign_pop",
        "sum(Male Population)": "male_pop",
        "sum(total_number_of_households)": "total_hh",
        "sum(pop_with_known_hh)": "pop_with_known_hh",
        "American Indian and Alaska Native": "native_pop",
        "Asian": "asian_pop",
        "Black or African-American": "black_pop",
        "Hispanic or Latino": "hispanic_pop",
        "White": "white_pop"
    }

    for k, v in fact_demo_col_names.items():
        fact_demo = fact_demo.withColumnRenamed(k, v)

    # calculate average household size at a state level, using the population
    # that matches the rows counted in total_hh
    fact_demo = fact_demo.withColumn(
        "avg_hh_size",
        fact_demo["pop_with_known_hh"] / fact_demo["total_hh"]
    )

    # the helper column is only needed for the ratio above
    fact_demo = fact_demo.drop("pop_with_known_hh")

    # write to output folder
    fact_demo.write.parquet(output_fp + "fact_demographics/", "append")

In [7]:
def main(input_fp="../data/us-cities-demographics.csv",
         preprocessed_fp="../data/preprocessed_files/",
         output_fp="../data/output_files/"):
    """
    The main function that orchestrates the spark job

    Params
    ------
    input_fp: str
        The location of the demographics file
    preprocessed_fp: str
        The folder the city/state lookup table is written under
    output_fp: str
        The folder the dimension and fact tables are written under
    """
    spark = initialize_spark()

    try:
        # run the spark job
        demo = load_demo_data(spark, input_fp)
        create_city_state_lookup(demo, preprocessed_fp)
        create_state_dim(demo, output_fp)
        create_fact_demographics(demo, output_fp)
    finally:
        # stop the session even when a step fails, so it is not left running
        spark.stop()

In [8]:
# run the spark job defined above end to end
main()

22/04/07 02:29:16 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.1.9 instead (on interface wlo1)
22/04/07 02:29:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/07 02:29:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/07 02:29:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/07 02:29:17 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/04/07 02:29:17 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
                                                                                

# Testing
---

In [16]:
# read back the tables written by the spark job
spark = initialize_spark()

# each path mirrors the folder the corresponding create_* function writes to;
# the state dimension is stored under "dim_state/"
fact_demo = spark.read.parquet("../data/output_files/fact_demographics/")
state_dim = spark.read.parquet("../data/output_files/dim_state/")
state_city_lookup = spark.read.parquet("../data/preprocessed_files/state_city_lookup_demo/")

In [17]:
# check facts: preview 10 rows of the fact table as a pandas dataframe
fact_demo.limit(10).toPandas()

Unnamed: 0,state_id,total_pop,female_pop,veteran_pop,foreign_pop,male_pop,total_hh,native_pop,asian_pop,black_pop,hispanic_pop,white_pop,avg_hh_size
0,AZ,4499542,2272087,264505,682313,2227455,1639724.0,129708,229183,296222,1508157,3591611,2.744086
1,SC,533657,272713,33463,27744,260944,217023.3,3705,13355,175064,29863,343764,2.458985
2,LA,1300595,673597,69771,83419,626998,528498.9,8263,38739,602377,87133,654578,2.460923
3,MN,1422403,720246,64894,215873,702157,582725.8,25242,151544,216731,103229,1050239,2.440947
4,NJ,1428908,723172,30195,477028,705736,496067.8,11350,116844,452202,600437,615083,2.880469
5,DC,672228,352523,25963,95117,319705,300101.8,6130,35072,328786,71129,285402,2.24
6,OR,1436509,729066,78948,185753,707443,576566.8,38597,117279,72150,201498,1235819,2.491488
7,VA,2363622,1203148,229766,269254,1160474,946353.3,26160,167784,771569,216760,1428158,2.497611
8,RI,413562,210746,18607,87365,202816,161297.8,6369,24245,55556,109226,287304,2.563965
9,KY,929877,477394,56025,66488,452483,385575.8,7772,32667,202749,50478,705790,2.411658


In [18]:
# check state_dim: preview 10 rows of the state dimension as a pandas dataframe
state_dim.limit(10).toPandas()

Unnamed: 0,state_name,state_id
0,Mississippi,MS
1,Utah,UT
2,South Dakota,SD
3,Kentucky,KY
4,California,CA
5,Nebraska,NE
6,New Hampshire,NH
7,Delaware,DE
8,Minnesota,MN
9,North Carolina,NC


In [19]:
# check state_city_lookup: preview 10 rows of the lookup table as a pandas dataframe
state_city_lookup.limit(10).toPandas()

Unnamed: 0,City,State,state_id
0,Rockville,Maryland,MD
1,Delray Beach,Florida,FL
2,Jersey City,New Jersey,NJ
3,Gulfport,Mississippi,MS
4,Cincinnati,Ohio,OH
5,Urban Honolulu,Hawaii,HI
6,Alhambra,California,CA
7,South Jordan,Utah,UT
8,Caguas,Puerto Rico,PR
9,Boca Raton,Florida,FL
