## Update variables

In [None]:
# list all tables in spark db: print one table name per line

from pyspark.sql import SparkSession

catalog_tables = spark.catalog.listTables()
for table_name in (entry.name for entry in catalog_tables):
    print(table_name)

In [None]:
# drop tables
#
# Removes the derived table tb2 so that the cell which recreates it can run again.
# `if exists` keeps this cell from raising when tb2 has not been created yet
# (for example on the first run in a fresh workspace).

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
sqlContext.sql('drop table if exists tb2')

In [26]:
# Storage account and container that hold the source files; both values are
# interpolated into the abfss:// paths used by later cells.
storage_account_name, container_name = 'asastgsusa4bhqynzjymxma', 'raw'
#sql_pool_name = 'SQLPool01'

StatementMeta(SparkPool01, 10, 1, Finished, Available)

## Load datasets

In [27]:
# load 2018-2021 SDUD data from ADLS2 into Spark dataframes and stack them


def _read_sdud_csv(year):
    """Read one year's SDUD CSV (header row, inferred schema) into a Spark dataframe."""
    csv_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/SDUD/State_Drug_Utilization_Data_{year}.csv"
    reader = spark.read.format('csv').options(header='true', inferschema='true')
    return reader.load(csv_path)


df_18 = _read_sdud_csv(2018)
df_19 = _read_sdud_csv(2019)
df_20 = _read_sdud_csv(2020)
df_21 = _read_sdud_csv(2021)

# union() appends rows and matches columns by position, not by name
df = df_18.union(df_19).union(df_20).union(df_21)

StatementMeta(SparkPool01, 10, 2, Finished, Available)

## Explore the data

In [101]:
# show the first 10 rows of the combined dataframe

preview_rows = df.limit(10)
display(preview_rows)

StatementMeta(SparkPool01, 10, 77, Finished, Available)

SynapseWidget(Synapse.DataFrame, 260e77e6-310b-49a7-8eda-f8a110133285)

In [29]:
# report the dataframe's dimensions as a (row count, column count) tuple

row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

StatementMeta(SparkPool01, 10, 4, Finished, Available)

(14743821, 20)

### Print schema and modify

In [30]:
# current schema: column names and types as inferred when the CSVs were read

df.printSchema()

StatementMeta(SparkPool01, 10, 5, Finished, Available)

root
 |-- Utilization Type: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Labeler Code: integer (nullable = true)
 |-- Product Code: integer (nullable = true)
 |-- Package Size: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Suppression Used: boolean (nullable = true)
 |-- Units Reimbursed: string (nullable = true)
 |-- Number of Prescriptions: string (nullable = true)
 |-- Total Amount Reimbursed: string (nullable = true)
 |-- Medicaid Amount Reimbursed: string (nullable = true)
 |-- Non Medicaid Amount Reimbursed: string (nullable = true)
 |-- Quarter Begin: string (nullable = true)
 |-- Quarter Begin Date: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- NDC: long (nullable = true)

In [31]:
# cast the "Number of Prescriptions" column from string to int so it can be summed

rx_count_column = "Number of Prescriptions"
rx_count_as_int = df[rx_count_column].cast('int')
df1 = df.withColumn(rx_count_column, rx_count_as_int)

StatementMeta(SparkPool01, 10, 6, Finished, Available)

In [32]:
# rename columns: replace the space-separated CSV headers with underscore names

from functools import reduce  # import retained from the original cell; no longer used here

oldColumns = df1.schema.names
# New names are applied by position, so the order must match df1's column order.
# "Supression_Used" keeps its existing spelling so the saved table schema does not change.
newColumns = ["Utilization_Type", "State", "Labeler_Code", "Product_Code",
       "Package_Size", "Year", "Quarter", "Product_Name", "Supression_Used",
       "Units_Reimbursed", "Number_of_Prescriptions",
       "Total_Amount_Reimbursed", "Medicaid_Amount_Reimbursed",
       "Non_Medicaid_Amount_Reimbursed", "Quarter_Begin", "Quarter_Begin_Date",
       "Latitude", "Longitude", "Location", "NDC"]

# Fail with a readable message if the source files gain or lose a column,
# instead of renaming only part of the schema or raising an IndexError.
if len(oldColumns) != len(newColumns):
    raise ValueError(
        f"Expected {len(newColumns)} columns to rename but the dataframe has {len(oldColumns)}: {oldColumns}"
    )

# toDF renames every column positionally in one step, replacing the previous
# chain of one withColumnRenamed call per column.
df2 = df1.toDF(*newColumns)

StatementMeta(SparkPool01, 10, 7, Finished, Available)

In [33]:
# new schema: confirms the renamed columns and the integer Number_of_Prescriptions

df2.printSchema()


StatementMeta(SparkPool01, 10, 8, Finished, Available)

root
 |-- Utilization_Type: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Labeler_Code: integer (nullable = true)
 |-- Product_Code: integer (nullable = true)
 |-- Package_Size: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Supression_Used: boolean (nullable = true)
 |-- Units_Reimbursed: string (nullable = true)
 |-- Number_of_Prescriptions: integer (nullable = true)
 |-- Total_Amount_Reimbursed: string (nullable = true)
 |-- Medicaid_Amount_Reimbursed: string (nullable = true)
 |-- Non_Medicaid_Amount_Reimbursed: string (nullable = true)
 |-- Quarter_Begin: string (nullable = true)
 |-- Quarter_Begin_Date: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- NDC: long (nullable = true)

### Create Spark table for downstream querying

In [41]:
# Persist the cleaned dataframe as Spark table tb1 for the SQL queries below.
# Overwrite mode lets this cell be re-run; the default save mode raises an
# error when tb1 already exists.
df2.write.mode("overwrite").saveAsTable("tb1")

StatementMeta(SparkPool01, 10, 16, Finished, Available)

In [34]:
# temporary view

#df2.createOrReplaceTempView("df3")

StatementMeta(SparkPool01, 10, 9, Finished, Available)

### Get top prescriptions by product from 2018-2021

In [42]:
# top Rx by product - 2018-2021 (no year filter, so every loaded year is included)
# Rows whose Product_Name is 'UNKNOWN' are excluded; only the top 25 products are shown.

display(spark.sql("select Product_Name, sum(Number_of_Prescriptions) \
    from tb1 \
    where Product_Name <> 'UNKNOWN' \
    group by Product_Name \
    order by sum(Number_of_Prescriptions) desc").limit(25))

StatementMeta(SparkPool01, 10, 17, Finished, Available)

SynapseWidget(Synapse.DataFrame, 167cad35-c5a9-4ec7-8f62-ea8fd93c8eae)

### Note that Albuterol (respiratory inhaler) had one of the highest prescription counts from 2018 to 2021.  It was a common drug used to treat COVID-19 symptoms, such as breathing difficulties.  Let's try to find additional trends by looking a little closer at the top prescriptions in 2020.

In [43]:
# top Rx by product - 2020 (top 50, excluding 'UNKNOWN' products)
# NOTE: Year is an integer column but is compared with the string '2020' here,
# so the filter relies on Spark SQL's implicit type coercion.

display(spark.sql("select Product_Name, sum(Number_of_Prescriptions) \
    from tb1 \
    where Year = '2020' \
    and Product_Name <> 'UNKNOWN' \
    group by Product_Name \
    order by sum(Number_of_Prescriptions) desc").limit(50))

StatementMeta(SparkPool01, 10, 18, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6f396b6f-d3a4-4fb5-9215-c826224e4e3d)

### In addition to using Albuterol to treat common COVID-19 symptoms, Corticosteroids (e.g., Dexamethasone, Prednisone, Methylprednisolone, Hydrocortisone) were also used to control inflammation in severe cases*.  Let's look at these drug trends from 2018-2021.  

In [125]:
# Rx totals by product, year and quarter (Albuterol + Corticosteroids) - 2018-2021
# Products are selected by Product_Name prefix; there is no year filter, so all loaded years appear.

# Methylprednisolone - Brand Names: A-Methapred, DEPO-Medrol, SOLU-Medrol. See https://www.drugs.com/mtm/methylprednisolone-injection.html
# Hydrocortisone - Brand Names: Solu-CORTEF.  See https://www.drugs.com/mtm/solu-cortef-injection.html

display(spark.sql("select Product_Name, Year, Quarter, sum(Number_of_Prescriptions) \
                   from tb1 \
                   where Product_Name LIKE 'ALBUTEROL%' \
                   OR \
                   Product_Name LIKE ('DEXAMETHA%') \
                   OR \
                   Product_Name LIKE ('PREDNISON%') \
                   OR \
                   Product_Name LIKE ('A-METHA%') \
                   OR \
                   Product_Name LIKE ('SOLU-MEDR%') \
                   OR \
                   Product_Name LIKE ('DEPO-MEDR%') \
                   OR \
                   Product_Name LIKE ('SOLU-CORTE%') \
                   group by Product_Name, Year, Quarter \
                   order by Year asc, Quarter asc, Product_Name asc, sum(Number_of_Prescriptions) desc"))

StatementMeta(SparkPool01, 10, 101, Finished, Available)

SynapseWidget(Synapse.DataFrame, d7e803da-1bc9-4c3d-82bf-0a780b8a9796)

### Albuterol + Corticosteroid trends for 2020 only.

In [116]:
# top Rx by product by year quarter (Albuterol + Corticosteroids) - 2020

# The product predicates are wrapped in parentheses because AND binds tighter
# than OR in SQL. Without them the Year = '2020' filter applied only to the
# Albuterol predicate, and the corticosteroid rows were returned for every year.
display(spark.sql("select Product_Name, Year, Quarter, sum(Number_of_Prescriptions) \
                  from tb1 \
                  where Year = '2020' \
                  and ( \
                        Product_Name LIKE ('ALBUTER%') \
                        OR \
                        Product_Name LIKE ('DEXAMETH%') \
                        OR \
                        Product_Name LIKE ('PREDNISON%') \
                        OR \
                        Product_Name LIKE ('A-METHA%') \
                        OR \
                        Product_Name LIKE ('SOLU-MEDR%') \
                        OR \
                        Product_Name LIKE ('DEPO-MEDR%') \
                        OR \
                        Product_Name LIKE ('SOLU-CORTE%') \
                  ) \
                  group by Product_Name, Year, Quarter \
                  order by Product_Name asc, sum(Number_of_Prescriptions) desc"))

StatementMeta(SparkPool01, 10, 92, Finished, Available)

SynapseWidget(Synapse.DataFrame, 7f3fa73a-cc1c-43b4-b0bf-e301773b87be)

## Pre-process the data

### We've noted some interesting Rx trends among Albuterol + Corticosteroid drugs throughout 2020.   Note the drop in Albuterol Rx from Q1 to Q4.  Prednisone was also more widely prescribed vs other Corticosteroids.  We will consolidate these product groups for cleaner reporting.

In [117]:
# check for similar Albuterol / Corticosteroid Product_Names
# Lists every distinct (Product_Name, NDC) pair whose name starts with one of the
# prefixes below, to see which spellings need to be consolidated.

display(spark.sql("select DISTINCT Product_Name, NDC \
                    from tb1 \
                    where Product_Name LIKE ('ALBUTER%') \
                        OR \
                        Product_Name LIKE ('DEXAMETHA%') \
                        OR \
                        Product_Name LIKE ('PREDNISON%') \
                        OR \
                        Product_Name LIKE ('A-METHA%') \
                        OR \
                        Product_Name LIKE ('SOLU-MEDR%') \
                        OR \
                        Product_Name LIKE ('DEPO-MEDR%') \
                        OR \
                        Product_Name LIKE ('SOLU-CORTE%') \
                    order by Product_Name asc" ))

StatementMeta(SparkPool01, 10, 93, Finished, Available)

SynapseWidget(Synapse.DataFrame, 0c7a8226-b67b-4741-8c64-e78fd2710c3f)

### Consolidate Product Names

In [152]:
# read the Spark table back into a dataframe and consolidate Product_Name

from pyspark.sql.functions import when

df2 = spark.read.table("tb1")

# Map brand / variant spellings onto one generic name per drug.
# Every rule matches on the same prefix the exploratory LIKE queries use
# ('A-METHA%', 'DEPO-MEDR%', 'SOLU-MEDR%', 'SOLU-CORTE%'). The previous exact
# equality checks consolidated only one literal spelling per brand and left
# every other variant selected by those queries untouched.
# NOTE(review): Albuterol variants (LIKE 'ALBUTER%') are not consolidated here,
# although later queries filter on Product_Name == 'ALBUTEROL' - confirm whether
# they should be mapped as well.
product_name = df2.Product_Name
df3 = df2.withColumn('Product_Name',
      when(product_name.startswith("DEXAMETHA"), "DEXAMETHASONE")
      .when(product_name.startswith("PREDNISON"), "PREDNISONE")
      .when(product_name.startswith("A-METHA"), "METHYLPREDNISOLONE")
      .when(product_name.startswith("DEPO-MEDR"), "METHYLPREDNISOLONE")
      .when(product_name.startswith("SOLU-MEDR"), "METHYLPREDNISOLONE")
      .when(product_name.startswith("SOLU-CORTE"), "HYDROCORTISONE")
      # any other product keeps its original name
      .otherwise(product_name))

#display(df3)


StatementMeta(SparkPool01, 10, 128, Finished, Available)

In [155]:
# write back as Spark table
#
# Overwrite mode lets this cell be re-run without first dropping tb2; the
# default save mode raises an error when the table already exists.

df3.write.mode("overwrite").saveAsTable("tb2")

StatementMeta(SparkPool01, 10, 131, Finished, Available)

In [156]:
# view after operation: list which of the consolidated generic names are present in tb2

display(spark.sql("select DISTINCT Product_Name \
                   from tb2 \
                   where Product_Name =='ALBUTEROL' \
                         OR \
                         Product_Name =='DEXAMETHASONE' \
                         OR \
                         Product_Name == 'PREDNISONE' \
                         OR \
                         Product_Name == 'METHYLPREDNISOLONE' \
                         OR \
                         Product_Name == 'HYDROCORTISONE' \
                    order by Product_Name asc" ))

StatementMeta(SparkPool01, 10, 132, Finished, Available)

SynapseWidget(Synapse.DataFrame, 63afa080-684c-4b49-bdb0-7ca249691523)

### Re-run query (Albuterol + Corticosteroid trends for 2020 only.)

In [158]:
# run query again, now against the consolidated table tb2 (2020 only)

# The product predicates are wrapped in parentheses because AND binds tighter
# than OR in SQL. Without them the Year = '2020' filter applied only to
# ALBUTEROL, and the corticosteroid rows were returned for every year.
display(spark.sql("select Product_Name, Year, Quarter, sum(Number_of_Prescriptions) \
                   from tb2 \
                   where Year = '2020' \
                   and ( \
                         Product_Name =='ALBUTEROL' \
                         OR \
                         Product_Name =='DEXAMETHASONE' \
                         OR \
                         Product_Name == 'PREDNISONE' \
                         OR \
                         Product_Name == 'METHYLPREDNISOLONE' \
                         OR \
                         Product_Name == 'HYDROCORTISONE' \
                   ) \
                   group by Product_Name, Year, Quarter \
                   order by Product_Name asc, sum(Number_of_Prescriptions) desc"))

StatementMeta(SparkPool01, 10, 134, Finished, Available)

SynapseWidget(Synapse.DataFrame, bfd2c05e-6aec-4f67-b9f7-da3725c52f41)

In [None]:
# save to ADLS2 as parquet

# The original cell wrote `df_all_mod`, a name that is never defined in this
# notebook, so it raised NameError. df3 is the dataframe with consolidated
# product names that is also saved as table tb2.
# NOTE(review): confirm df3 is the frame intended for export.
df3.write.parquet(f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/SDUD/State_Drug_Utilization_Data_All.parquet")

StatementMeta(, , , Cancelled, )

## Write Spark dataframe into SQL Pool table

In [100]:
%%spark
// Scala cell (run through the %%spark magic): load Spark table tb2 as a Scala DataFrame

val df_scala = spark.table("tb2")

StatementMeta(SparkPool01, 10, 76, Finished, Available)

df_all_scala: org.apache.spark.sql.DataFrame = [Utilization_Type: string, State: string ... 18 more fields]


In [None]:
%%spark
// Write the Scala dataframe into a table in the dedicated SQL pool

import com.microsoft.spark.sqlanalytics.utils.Constants
import org.apache.spark.sql.SqlAnalyticsConnector._

val sql_pool_name = "SQLPool01"

// Fully qualified target: <pool>.<schema>.<table>
val target_table = s"$sql_pool_name.dbo.mergeSDUD"

df_scala.write.sqlanalytics(target_table, Constants.INTERNAL)


StatementMeta(, , , Cancelled, )