## Update variables

In [33]:
# ADLS Gen2 location of the SDUD files. Both values are interpolated into the
# abfss:// URIs used by the load and save cells.
container_name = 'raw'
storage_account_name = 'asastgsusa4bhqynzjymxma'
# The SQL pool name is intentionally not set here; the Scala cell that writes
# to the SQL pool declares its own sql_pool_name.

StatementMeta(SparkPool01, 2, 25, Finished, Available)

## Load datasets

In [34]:
# Load the 2018-2021 SDUD CSV files from ADLS Gen2 and stack them into one
# Spark DataFrame (a row-wise union, not a key-based join).

def _read_sdud_year(year):
    """Read one year's State Drug Utilization Data CSV into a Spark DataFrame.

    The file is read from the SDUD folder of the configured container, with the
    first row treated as the header and column types inferred by Spark.

    Args:
        year: Four-digit year that appears in the CSV file name (e.g. 2018).

    Returns:
        The Spark DataFrame loaded from that year's CSV file.
    """
    csv_uri = (
        f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
        f"/SDUD/State_Drug_Utilization_Data_{year}.csv"
    )
    csv_reader = spark.read.format('csv').options(header='true', inferschema='true')
    return csv_reader.load(csv_uri)


# The per-year names are kept so any cell that refers to them still works.
df_18 = _read_sdud_year(2018)
df_19 = _read_sdud_year(2019)
df_20 = _read_sdud_year(2020)
df_21 = _read_sdud_year(2021)

# DataFrame.union matches columns by position, not by name, so the four files
# are assumed to share the same column order.
df_all = df_18
for yearly_df in (df_19, df_20, df_21):
    df_all = df_all.union(yearly_df)

StatementMeta(SparkPool01, 2, 26, Finished, Available)

## Explore the data

In [35]:
# Render the combined DataFrame with the notebook's display() helper for a
# first visual inspection.

display(df_all)

StatementMeta(SparkPool01, 2, 27, Finished, Available)

SynapseWidget(Synapse.DataFrame, af24c74f-b30b-4543-b373-096add47ea78)

In [36]:
# Report the shape of the combined data as a (row count, column count) tuple.

n_rows = df_all.count()
n_cols = len(df_all.columns)
print((n_rows, n_cols))

StatementMeta(SparkPool01, 2, 28, Finished, Available)

(14743821, 20)

In [37]:
# Print the column names, types and nullability of df_all as a schema tree.

df_all.printSchema()

StatementMeta(SparkPool01, 2, 29, Finished, Available)

root
 |-- Utilization Type: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Labeler Code: integer (nullable = true)
 |-- Product Code: integer (nullable = true)
 |-- Package Size: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Suppression Used: boolean (nullable = true)
 |-- Units Reimbursed: string (nullable = true)
 |-- Number of Prescriptions: string (nullable = true)
 |-- Total Amount Reimbursed: string (nullable = true)
 |-- Medicaid Amount Reimbursed: string (nullable = true)
 |-- Non Medicaid Amount Reimbursed: string (nullable = true)
 |-- Quarter Begin: string (nullable = true)
 |-- Quarter Begin Date: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- NDC: long (nullable = true)

In [38]:
# Rename every column to an underscore-separated name (no spaces).

from functools import reduce  # kept from the original cell; the rename below no longer needs it

oldColumns = df_all.schema.names
# Replacement names, listed in the same order as the columns of df_all.
# NOTE(review): "Supression_Used" is a misspelling of "Suppression_Used". The
# spelling is kept because it becomes the column name in the saved outputs;
# confirm nothing depends on it before correcting it.
newColumns = ["Utilization_Type", "State", "Labeler_Code", "Product_Code",
       "Package_Size", "Year", "Quarter", "Product_Name", "Supression_Used",
       "Units_Reimbursed", "Number_of_Prescriptions",
       "Total_Amount_Reimbursed", "Medicaid_Amount_Reimbursed",
       "Non_Medicaid_Amount_Reimbursed", "Quarter_Begin", "Quarter_Begin_Date",
       "Latitude", "Longitude", "Location", "NDC"]

# Fail with a clear message if the list above is out of step with the data,
# instead of renaming only part of the columns.
if len(newColumns) != len(oldColumns):
    raise ValueError(
        f"Expected {len(oldColumns)} column names but newColumns has {len(newColumns)}"
    )

# toDF assigns the new names positionally in a single call.
df_all_mod = df_all.toDF(*newColumns)

StatementMeta(SparkPool01, 2, 30, Finished, Available)

In [39]:
# Print the schema of df_all_mod to check the renamed columns.

df_all_mod.printSchema()

StatementMeta(SparkPool01, 2, 31, Finished, Available)

root
 |-- Utilization_Type: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Labeler_Code: integer (nullable = true)
 |-- Product_Code: integer (nullable = true)
 |-- Package_Size: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Supression_Used: boolean (nullable = true)
 |-- Units_Reimbursed: string (nullable = true)
 |-- Number_of_Prescriptions: string (nullable = true)
 |-- Total_Amount_Reimbursed: string (nullable = true)
 |-- Medicaid_Amount_Reimbursed: string (nullable = true)
 |-- Non_Medicaid_Amount_Reimbursed: string (nullable = true)
 |-- Quarter_Begin: string (nullable = true)
 |-- Quarter_Begin_Date: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- NDC: long (nullable = true)

In [44]:
# Save the renamed data to ADLS Gen2 as Parquet.
# mode("overwrite") replaces any existing output at this path. With the default
# save mode the write raises when the path already exists, so the cell could
# not be run a second time.

parquet_uri = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/SDUD/State_Drug_Utilization_Data_All.parquet"
df_all_mod.write.mode("overwrite").parquet(parquet_uri)

StatementMeta(SparkPool01, 2, 37, Finished, Available)

## Save and re-load as parquet

In [None]:
%%pyspark
#df_test = df_all_mod.toPandas()
#df_test.to_parquet('test.parquet')

df_all_mod.write.parquet("abfss://sdudsynapsefilesystem@sdudsynapseadls2.dfs.core.windows.net/State_Drug_Utilization_Data_20182019.parquet")


StatementMeta(, , , Cancelled, )

In [None]:
%%pyspark
df = spark.read.load('abfss://sdudsynapsefilesystem@sdudsynapseadls2.dfs.core.windows.net/State_Drug_Utilization_Data_20182019.parquet', format='parquet')
display(df.limit(10))

#df = spark.read.format('parquet').options(header='true', inferSchema='true').load('abfss://sdudsynapsefilesystem@sdudsynapseadls2.dfs.core.windows.net/State_Drug_Utilization_Data_2018.parquet')

StatementMeta(, , , Cancelled, )

## Write Spark dataframe into SQL Pool table

In [42]:
# Register df_all_mod as the temp view "df_all_mod_temp" (replacing any view of
# that name) so the Scala cell can query it through spark.sql.

df_all_mod.createOrReplaceTempView("df_all_mod_temp")

StatementMeta(SparkPool01, 2, 35, Finished, Available)

In [43]:
%%spark
// Scala cell (%%spark magic): load the df_all_mod_temp view into a Scala DataFrame.

val tempViewQuery = "SELECT * FROM df_all_mod_temp"
val df_all_scala = spark.sql(tempViewQuery)

StatementMeta(SparkPool01, 2, 36, Finished, Available)

df_all_scala: org.apache.spark.sql.DataFrame = [Utilization_Type: string, State: string ... 18 more fields]


In [None]:
%%spark
// Write df_all_scala to the dbo.DFTest table of the SQL pool through the
// Synapse SQL Analytics connector, using the Constants.INTERNAL table type.
import com.microsoft.spark.sqlanalytics.utils.Constants
import org.apache.spark.sql.SqlAnalyticsConnector._

val sql_pool_name = "SQLPool01"
val targetTable = s"$sql_pool_name.dbo.DFTest"

df_all_scala.write.sqlanalytics(targetTable, Constants.INTERNAL)


StatementMeta(, , , Cancelled, )