# Shailja Somani, John Vincent Deniega, Muris Saab
# ADS 508 - Team 2
# Spring 2024

## Initialize SageMaker variables: Direct copy from

Coyne, S. (2024, February 27). Update 01_Copy_TSV_To_S3.ipynb [Jupyter Notebook]. GitHub. Retrieved March 15, 2024, from https://github.com/MADS508/labs/blob/main/04_ingest/01_Copy_TSV_To_S3.ipynb.

In [7]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Import PyAthena

Coyne, S. (2022, October 9). 02_Create_Athena_Database.ipynb [Jupyter Notebook]. GitHub. Retrieved March 15, 2024, from https://github.com/MADS508/labs/commits/main/04_ingest/02_Create_Athena_Database.ipynb.

In [8]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

## Create Athena Database

Coyne, S. (2022, October 9). 02_Create_Athena_Database.ipynb [Jupyter Notebook]. GitHub. Retrieved March 15, 2024, from https://github.com/MADS508/labs/commits/main/04_ingest/02_Create_Athena_Database.ipynb.

In [9]:
database_name = "covid"
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS covid


## Verify Athena Database

Coyne, S. (2022, October 9). 02_Create_Athena_Database.ipynb [Jupyter Notebook]. GitHub. Retrieved March 15, 2024, from https://github.com/MADS508/labs/commits/main/04_ingest/02_Create_Athena_Database.ipynb.

In [10]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,covid
1,default


## Copy from Public S3 Bucket into My Private Bucket

In [67]:
# Check what is in public S3 bucket for our team
s3_public_path_csv = "s3://ads508-team2-spring24/"
!aws s3 ls $s3_public_path_csv

2024-03-16 23:56:13     592336 01_MD_COVID-19_-_Cases_by_ZIP_Code.csv
2024-03-17 22:02:37   61876919 02_SDOH_2020_ZIPCODE_1_0.csv
2024-03-16 23:56:16    3660534 03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv
2024-03-17 22:09:43     164479 04_Maryland_Census_Data_-_ZIP_Code_Tabulation_Areas.csv
2024-03-17 22:02:39    2564885 05_ZIPCodetoZCTACrosswalk2022UDS.csv


In [57]:
# Copy each file from the Public S3 bucket to its own folder in my private S3 bucket 
# We keep them all in their own folders because Athena external table creation takes in a full S3 folder name, not a file name

# Maryland covid cases file - list path names to file in public & where to copy in private
s3_public_path_md_covid = "s3://ads508-team2-spring24/01_MD_COVID-19_-_Cases_by_ZIP_Code.csv"
s3_priv_path_md_covid = "s3://{}/ads508-team2-spring24/md_covid_cases/01_MD_COVID-19_-_Cases_by_ZIP_Code.csv".format(bucket)

# Actually copy over
!aws s3 cp $s3_public_path_md_covid $s3_priv_path_md_covid

copy: s3://ads508-team2-spring24/01_MD_COVID-19_-_Cases_by_ZIP_Code.csv to s3://sagemaker-us-east-1-590183834230/ads508-team2-spring24/md_covid_cases/01_MD_COVID-19_-_Cases_by_ZIP_Code.csv


In [58]:
# Check that copied properly 
!aws s3 ls $s3_priv_path_md_covid

2024-03-17 21:50:06     592336 01_MD_COVID-19_-_Cases_by_ZIP_Code.csv


In [64]:
# SDOH file - list path names to file in public & where to copy in private
s3_public_path_sdoh = "s3://ads508-team2-spring24/02_SDOH_2020_ZIPCODE_1_0.csv"
s3_priv_path_sdoh = "s3://{}/ads508-team2-spring24/sdoh_2020_data/02_SDOH_2020_ZIPCODE_1_0.csv".format(bucket)

# Actually copy over
!aws s3 cp $s3_public_path_sdoh $s3_priv_path_sdoh

# Check that copied properly 
!aws s3 ls $s3_priv_path_sdoh

copy: s3://ads508-team2-spring24/02_SDOH_2020_ZIPCODE_1_0.csv to s3://sagemaker-us-east-1-590183834230/ads508-team2-spring24/sdoh_2020_data/02_SDOH_2020_ZIPCODE_1_0.csv
2024-03-17 22:05:35   61876919 02_SDOH_2020_ZIPCODE_1_0.csv


In [65]:
# CDC PLACES data file - list path names to file in public & where to copy in private
s3_public_path_cdc = "s3://ads508-team2-spring24/03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv"
s3_priv_path_cdc = "s3://{}/ads508-team2-spring24/cdc_places_data/03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv".format(bucket)

# Actually copy over
!aws s3 cp $s3_public_path_cdc $s3_priv_path_cdc

# Check that copied properly 
!aws s3 ls $s3_priv_path_cdc

copy: s3://ads508-team2-spring24/03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv to s3://sagemaker-us-east-1-590183834230/ads508-team2-spring24/cdc_places_data/03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv
2024-03-17 22:06:21    3660534 03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv


In [68]:
# MD Census data file - list path names to file in public & where to copy in private
s3_public_path_md_census = "s3://ads508-team2-spring24/04_Maryland_Census_Data_-_ZIP_Code_Tabulation_Areas.csv"
s3_priv_path_md_census = "s3://{}/ads508-team2-spring24/md_census_data/04_Maryland_Census_Data_-_ZIP_Code_Tabulation_Areas.csv".format(bucket)

# Actually copy over
!aws s3 cp $s3_public_path_md_census $s3_priv_path_md_census

# Check that copied properly 
!aws s3 ls $s3_priv_path_md_census

copy: s3://ads508-team2-spring24/04_Maryland_Census_Data_-_ZIP_Code_Tabulation_Areas.csv to s3://sagemaker-us-east-1-590183834230/ads508-team2-spring24/md_census_data/04_Maryland_Census_Data_-_ZIP_Code_Tabulation_Areas.csv
2024-03-17 22:10:09     164479 04_Maryland_Census_Data_-_ZIP_Code_Tabulation_Areas.csv


In [69]:
# Zip Code to ZTCA Crosswalk data file - list path names to file in public & where to copy in private
s3_public_path_xwalk = "s3://ads508-team2-spring24/05_ZIPCodetoZCTACrosswalk2022UDS.csv"
s3_priv_path_xwalk = "s3://{}/ads508-team2-spring24/crosswalk_data/05_ZIPCodetoZCTACrosswalk2022UDS.csv".format(bucket)

# Actually copy over
!aws s3 cp $s3_public_path_xwalk $s3_priv_path_xwalk

# Check that copied properly 
!aws s3 ls $s3_priv_path_xwalk

copy: s3://ads508-team2-spring24/05_ZIPCodetoZCTACrosswalk2022UDS.csv to s3://sagemaker-us-east-1-590183834230/ads508-team2-spring24/crosswalk_data/05_ZIPCodetoZCTACrosswalk2022UDS.csv
2024-03-17 22:11:13    2564885 05_ZIPCodetoZCTACrosswalk2022UDS.csv


In [19]:
MDCovid = pd.read_csv("dataset_csvs/01_MD_COVID-19_-_Cases_by_ZIP_Code.csv")
SDOH = pd.read_excel("dataset_csvs/02_SDOH_2020_ZIPCODE_1_0.xlsx") # Troubleshoot why selecting a sheet slows this down
CDC = pd.read_csv("dataset_csvs/03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv")
MDCensus = pd.read_csv("dataset_csvs/04_Maryland_Census_Data_-_ZIP_Code_Tabulation_Areas_(ZCTAs).csv")
MDCrosswalk = pd.read_excel("dataset_csvs/05_ZIPCodetoZCTACrosswalk2022UDS.xlsx")

# Use to verify data ingest, but commented out to save on computing costs on notebook rerun
#MDCovid.head(5), SDOH.head(5), CDC.head(5), MDCensus.head(5), MDCrosswalk.head(5)

## Create Athena Tables/Parquet Tables from S3 CSV Files

Coyne, S. (2022, October 9). 03_Register_S3_TSV_With_Athena.ipynb [Jupyter Notebook]. GitHub. Retrieved March 15, 2024, from https://github.com/MADS508/labs/blob/main/04_ingest/03_Register_S3_TSV_With_Athena.ipynb. \
Coyne, S. (2022, October 9). 04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb [Jupyter Notebook]. GitHub. Retrieved March 17, 2024, from https://github.com/MADS508/labs/blob/main/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb.

In [40]:
# Set S3 path to parquet data
s3_path_parquet = "s3://{}/ads508-team2-spring24/parquet/".format(bucket)

### Create string variables to hold values for respective tables and the database for SQL statement

In [51]:
MDCovid_table = "MDCovid"
SDOH_table = "SDOH"
CDC_table = "CDC"
MDCensus_table = "MDCensus"
MDCrosswalk_table = "MDCrosswalk"
database_name = "covid"

### MD Covid by Zip Code Table 

In [60]:
# Build SQL statement to create MD Covid by Zip Code Athena Table
table_name = MDCovid_table
path = "s3://{}/ads508-team2-spring24/md_covid_cases/".format(bucket)

'''
NOTE: The covid counts in this table are running totals, not just new cases.
However, as stated in our Assignment 2.1, to avoid the additional confounding variable of time, we will just train 
     & test our model on one week of data at a time. Thus, it's not important how the numbers for each time point are 
     calculated, as long as it remains consistent for that time point. 
There is data for every day, but we don't want to explode our this table that much, so we'll just push the running
    total for the first of each month to our Athena table. 
'''
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         ZIP_CODE int,
         total01_01_2022 int,
         total02_01_2022 int,
         total03_01_2022 int,
         total04_01_2022 int,
         total05_01_2022 int,
         total06_01_2022 int,
         total07_01_2022 int,
         total08_01_2022 int,
         total09_01_2022 int,
         total10_01_2022 int,
         total11_01_2022 int,
         total12_01_2022 int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS covid.MDCovid(
         ZIP_CODE int,
         total01_01_2022 int,
         total02_01_2022 int,
         total03_01_2022 int,
         total04_01_2022 int,
         total05_01_2022 int,
         total06_01_2022 int,
         total07_01_2022 int,
         total08_01_2022 int,
         total09_01_2022 int,
         total10_01_2022 int,
         total11_01_2022 int,
         total12_01_2022 int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-590183834230/ads508-team2-spring24/md_covid_cases/'
TBLPROPERTIES ('skip.header.line.count'='1')


In [61]:
# Actually create table (execute statement) & verify created properly
pd.read_sql(statement, conn)

In [62]:
# Check that worked
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,mdcovid


### Social Determinants of Health Database Table

In [10]:
table_name = SDOH_table
path = SDOH_path

#TO DO: Figure out how to handle the 327 columns. S3?
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         OBJECTID int,
         ZIP_CODE int,
)""".format(
    database_name, table_name, path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS covid.SDOH(
         OBJECTID int,
         ZIP_CODE int,
)


### PLACES: Local Data for Better Health Table

In [11]:
table_name = CDC_table
path = CDC_path

#TO DO: Figure out how to handle total{date} columns for schema. Columnar?
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         YEAR YEAR,
         LocationName int,
         DataSource string,
         Category string,
         Measure string,
         Data_Value_Unit string,
         Data_Value_Type string,
         Data_Value DECIMAL(3,1),
         Data_Value_Footnote_Symbol string,
         Data_Value_Footnote string,
         Low_Confidence_Limit DECIMAL(3,1),
         High_Confidence_Limit DECIMAL(3,1),
         TotalPopulation int,
         Geolocation string,
         LocationID int,
         CategoryID string,
         MeasureId string,
         DataValueTypeID string,
         Short_Question_Text string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS covid.CDC(
         YEAR YEAR,
         LocationName int,
         DataSource string,
         Category string,
         Measure string,
         Data_Value_Unit string,
         Data_Value_Type string,
         Data_Value DECIMAL(3,1),
         Data_Value_Footnote_Symbol string,
         Data_Value_Footnote string,
         Low_Confidence_Limit DECIMAL(3,1),
         High_Confidence_Limit DECIMAL(3,1),
         TotalPopulation int,
         Geolocation string,
         LocationID int,
         CategoryID string,
         MeasureId string,
         DataValueTypeID string,
         Short_Question_Text string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://ads508-team2-spring24/03_PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release.csv'
TBLPROPERTIES ('skip.header.line.count'='1')


### Maryland Census Data Table

In [12]:
table_name = MDCensus_table
path = MDCensus_path

#TO DO: Figure out how to handle total{date} columns for schema. Columnar?
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         OBJECTID_1 int,
         ZCTA5CE10 int,
         FIRST_STAT int,
         FIRST_GEOI int,
         FIRST_CLAS CHAR(2),
         FIRST_MTFC string,
         FIRST FUNC CHAR(1),
         ZCTA5N int,
         STATE int,
         AREALAND int,
         AREAWATR int,
         POP100 int,
         HU100 int,
         NHW int,
         NHB int,
         NHAI int,
         NHA int,
         NHNH int,
         NHO int,
         NHT int,
         HISP int,
         PNHW DECIMAL(3,1),
         PNHB DECIMAL(3,1),
         PNHAI DECIMAL(2,1),
         PNHA DECIMAL(2,1),
         PNHNH DECIMAL(2,1),
         PNHT DECIMAL(2,1),
         PHISP DECIMAL(2,1),
         POP65_ int,
         PCTPOP65_ DECIMAL(3,1),
         MEDAGE DECIMAL(3,1),
         VACNS DECIMAL(2,1),
         PVACNS DECIMAL(2,1),
         PHOWN DECIMAL(3,1),
         PWOMORT DECIMAL(3,1),
         PRENT DECIMAL(3,1),
         PLT18SP DECIMAL(3,1),
         REPORT_2_P string,
         REPORT_9_P string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS covid.MDCensus(
         OBJECTID_1 int,
         ZCTA5CE10 int,
         FIRST_STAT int,
         FIRST_GEOI int,
         FIRST_CLAS CHAR(2),
         FIRST_MTFC string,
         FIRST FUNC CHAR(1),
         ZCTA5N int,
         STATE int,
         AREALAND int,
         AREAWATR int,
         POP100 int,
         HU100 int,
         NHW int,
         NHB int,
         NHAI int,
         NHA int,
         NHNH int,
         NHO int,
         NHT int,
         HISP int,
         PNHW DECIMAL(3,1),
         PNHB DECIMAL(3,1),
         PNHAI DECIMAL(2,1),
         PNHA DECIMAL(2,1),
         PNHNH DECIMAL(2,1),
         PNHT DECIMAL(2,1),
         PHISP DECIMAL(2,1),
         POP65_ int,
         PCTPOP65_ DECIMAL(3,1),
         MEDAGE DECIMAL(3,1),
         VACNS DECIMAL(2,1),
         PVACNS DECIMAL(2,1),
         PHOWN DECIMAL(3,1),
         PWOMORT DECIMAL(3,1),
         PRENT DECIMAL(3,1),
         PLT18SP DECIMAL(3,1),
         REPORT_2_P string

### ZTCA to Zip Code Crosswalk Table
#### https://www.arcgis.com/home/item.html?id=dc123f738bf846779c49db6472f82a4b
#### https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html

In [13]:
table_name = MDCrosswalk_table
path = MDCrosswalk_path

#TO DO: Figure out how to handle total{date} columns for schema. Columnar?
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         ZIP_CODE int,
         PO_NAME string,
         STATE string,
         ZIP_TYPE string,
         zcta int,
         zip_join_type string
)""".format(
    database_name, table_name, path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS covid.MDCrosswalk(
         ZIP_CODE int,
         PO_NAME string,
         STATE string,
         ZIP_TYPE string,
         zcta int,
         zip_join_type string
)


## Release Resources: Direct copy from

Coyne, S. (2024, February 27). Update 01_Copy_TSV_To_S3.ipynb [Jupyter Notebook]. GitHub. https://github.com/MADS508/labs/blob/main/04_ingest/01_Copy_TSV_To_S3.ipynb

In [14]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [15]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>