In [0]:
# Import all required libraries and create the SparkSession
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp
from pyspark.sql import functions as F
from pyspark.sql.types import *
# Obtain the SparkSession used by every cell below; getOrCreate() reuses an
# already-active session if there is one, otherwise it builds a new one with
# the application name "UseCase_pricing".
spark = SparkSession.builder.appName("UseCase_pricing").getOrCreate()


In [0]:
# spark = SparkSession.builder.\
#     appName("Check").\
#     config("spark.executor.instances", "4").config("spark.executor.memory", "4g").config("spark.executor.cores", "2").getOrCreate()  

In [0]:
def make_connection(storage_account_name,storage_account_access_key):
    """Register a storage account access key on the active Spark session.

    Sets the Spark configuration entry
    ``fs.azure.account.key.<storage_account_name>.blob.core.windows.net``
    to the supplied access key.

    Args:
        storage_account_name: Name of the storage account; used to build the
            configuration key.
        storage_account_access_key: Access key stored under that key.

    Returns:
        True once the configuration entry has been set. The function never
        returns False: any failure is reported by raising.

    Raises:
        Exception: With the message "Not able to make connection with storage
            account" if building the key or setting the configuration fails.
            The underlying error is chained as ``__cause__`` so the real
            reason is still visible in the traceback.
    """
    try:
        config_key = "fs.azure.account.key." + storage_account_name + ".blob.core.windows.net"
        spark.conf.set(config_key, storage_account_access_key)
    except Exception as e:
        # Chain the original error instead of discarding it.
        raise Exception("Not able to make connection with storage account") from e
    return True

In [0]:
# Explicit schema applied when reading the pricing JSON files (passed to
# create_df), so Spark does not have to infer the structure.
#
# Layout (every field is declared nullable):
#   formatVersion, disclaimer, offerCode, version, publicationDate : string
#   products : map<string, struct>
#       sku, productFamily : string
#       attributes : struct of strings
#           (servicecode, location, locationType, group, usagetype,
#            operation, regionCode, servicename)
#   terms : struct
#       OnDemand : map<string, map<string, struct>>
#           sku, effectiveDate : string
#           priceDimensions : map<string, struct>
#               description, beginRange, endRange, unit : string
#               pricePerUnit : map<string, string>  (values may be null)
custom_schema= StructType([StructField("formatVersion",StringType(),True),
                        StructField("disclaimer",StringType(),True),
                        StructField("offerCode",StringType(),True),
                        StructField("version",StringType(),True),
                        StructField("publicationDate",StringType(),True),
                        StructField("products",MapType(StringType(),StructType([StructField("sku",StringType(),True),
                                                                                StructField("productFamily",StringType(),True),
                                                                                StructField("attributes",StructType([
                                                                                    StructField("servicecode",StringType(),True),
                                                                                    StructField("location",StringType(),True),
                                                                                    StructField("locationType",StringType(),True),
                                                                                    StructField("group",StringType(),True),
                                                                                    StructField("usagetype",StringType(),True),
                                                                                    StructField("operation",StringType(),True),
                                                                                    StructField("regionCode",StringType(),True),
                                                                                    StructField("servicename",StringType(),True)
                                                                                                                    
                                                                                    ]),True)
]),True)),
                            StructField('terms', StructType([StructField('OnDemand', MapType(StringType(), MapType(StringType(), StructType([StructField('sku', StringType(), True),StructField('effectiveDate', StringType(), True),StructField('priceDimensions', MapType(StringType(), StructType([StructField('description', StringType(), True),StructField('beginRange', StringType(), True),StructField('endRange', StringType(), True),StructField('unit', StringType(), True),StructField('pricePerUnit', MapType(StringType(), StringType(), True), True)])), True)]))), True)]))])

In [0]:
#Create Dataframe using file path
def create_df(path,custom_schema):
    """Load a multi-line JSON source into a DataFrame with a fixed schema.

    Args:
        path: Location of the JSON data, handed to ``spark.read.json``.
        custom_schema: Schema to apply instead of letting Spark infer one.

    Returns:
        The DataFrame produced by the reader.
    """
    return spark.read.json(path, schema=custom_schema, multiLine=True)


In [0]:
#Explode the columns of the raw_df
def explode_pt(raw_df):
    """Flatten the nested ``products`` and ``terms`` columns of ``raw_df``.

    Args:
        raw_df: DataFrame with a ``products`` map column and a ``terms``
            struct column, as declared by ``custom_schema``.

    Returns:
        A ``(products_df, terms_df)`` tuple:

        * ``products_df`` - one row per entry of the ``products`` map, with
          columns ``sku``, ``productFamily`` and every field of ``attributes``.
        * ``terms_df`` - one row per ``pricePerUnit`` entry reached through
          ``terms.OnDemand`` -> inner map -> ``priceDimensions``, with columns
          ``sku``, ``description``, ``beginRange``, ``endRange``, ``unit``,
          ``currency`` and ``priceperunit``.
    """
    # Exploding a map yields "key" and "value" columns; only the value struct
    # is needed, so it is expanded with "value.*".
    products_df = (
        raw_df
        .select(F.explode("products"))
        .select("value.*")
        .select("sku", "productFamily", "attributes.*")
    )

    terms_df = (
        raw_df
        .select("terms.*")
        # Spelled exactly as in the schema ("OnDemand") so the lookup does not
        # depend on Spark's case-insensitive column resolution being enabled.
        .select(F.explode("OnDemand"))
        .select(F.explode("value"))
        .select("value.*")
        .select("sku", F.explode("priceDimensions"))
        .select("sku", "value.*")
        .select(
            "sku", "description", "beginRange", "endRange", "unit",
            # Map key becomes the currency code, map value the price.
            F.explode("pricePerUnit").alias("currency", "priceperunit"),
        )
    )
    return products_df, terms_df

In [0]:
#Make final_df with intermediate df's
def final_dataframe(df1,df2):
    """Join the products and terms frames and shape the output columns.

    Args:
        df1: Products DataFrame (must contain ``sku``).
        df2: Terms DataFrame (must contain ``sku``).

    Returns:
        The join of ``df1`` and ``df2`` on ``sku``, projected to the output
        column list with camelCase columns renamed to snake_case, plus a
        ``timestamp`` column filled from ``current_timestamp()``.
    """
    joined = df1.join(df2, "sku")

    # (source column, output name); None means the column keeps its name.
    column_plan = [
        ("sku", None),
        ("productFamily", "product_family"),
        ("servicecode", None),
        ("location", None),
        ("locationType", "location_type"),
        ("group", None),
        ("usagetype", None),
        ("operation", None),
        ("regionCode", "region_code"),
        ("servicename", None),
        ("description", None),
        ("beginRange", "begin_range"),
        ("endRange", "end_range"),
        ("unit", None),
        ("currency", None),
        ("priceperunit", None),
    ]
    projection = [
        source if renamed is None else F.col(source).alias(renamed)
        for source, renamed in column_plan
    ]

    return joined.select(*projection).withColumn("timestamp", current_timestamp())

In [0]:
#Write final df into postgres table
def write_to_postgres(final_df,database_url,postgres_table,properties):
    """Write ``final_df`` to a table over JDBC, replacing existing contents.

    The write uses ``mode="overwrite"``. Failures are not propagated: the
    exception is printed and a failure message is returned instead.

    Args:
        final_df: DataFrame to persist.
        database_url: JDBC URL of the target database.
        postgres_table: Name of the destination table.
        properties: JDBC connection properties (user, password, driver, ...).

    Returns:
        "Successfully loaded into postgres" when the write completes,
        otherwise "Failed to load into Postgres".
    """
    try:
        final_df.write.jdbc(
            url=database_url,
            table=postgres_table,
            mode="overwrite",
            properties=properties,
        )
    except Exception as write_error:
        print(write_error)
        outcome = "Failed to load into Postgres"
    else:
        outcome = "Successfully loaded into postgres"
    return outcome


In [0]:
# Connection settings and credentials all come from the 'usecaseScope'
# secret scope, so nothing sensitive is hard-coded in the notebook.
database_url = dbutils.secrets.get('usecaseScope','database_url')
properties = {
    "user": dbutils.secrets.get('usecaseScope','user'),
    "password": dbutils.secrets.get('usecaseScope','password'),
    "driver": "org.postgresql.Driver"}
container_name = dbutils.secrets.get('usecaseScope','container_name')
storage_account_name = dbutils.secrets.get('usecaseScope','storage_account_name')
access_key = dbutils.secrets.get('usecaseScope','storage_account_access_key')

# Substring looked for in the lower-cased file path -> table that receives
# the file's rows. Entries are checked in insertion order, so "glue" takes
# precedence over "lambda" when a path contains both.
SERVICE_TABLES = {
    "glue": "glue_pricing",
    "lambda": "lambda_pricing",
}

# Errors are left to propagate unchanged; re-wrapping them in a bare
# Exception only hides their type.
status = make_connection(storage_account_name,access_key)
if status:
    container_url = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/json_data/"
    # service_code    =   dbutils.widgets.get("dropdown_param")
    files = dbutils.fs.ls(container_url)

    for file_info in files:
        path_lower = file_info.path.lower()
        postgres_table = next(
            (table for marker, table in SERVICE_TABLES.items() if marker in path_lower),
            None,
        )
        if postgres_table is None:
            # Without this guard an unrecognised file would either hit an
            # undefined table name or overwrite the previous file's table.
            print(f"Skipping {file_info.path}: no target table is configured for this file")
            continue

        raw_df = create_df(file_info.path, custom_schema)
        df1, df2 = explode_pt(raw_df)
        final_df = final_dataframe(df1, df2)
        display(final_df)
        load = write_to_postgres(final_df, database_url, postgres_table, properties)
        print(load)



sku,product_family,servicecode,location,location_type,group,usagetype,operation,region_code,servicename,description,begin_range,end_range,unit,currency,priceperunit,timestamp
6GCTGHJ7U5FXSYAE,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,Data Brew free interactive sessions,APS2-DBrew-FreeSessions,Session,ap-southeast-2,AWS Glue,$0 for AWS Glue DataBrew interactive sessions under free trial in Asia Pacific (Sydney),0,inf,Sessions,USD,0.0,2023-11-03T07:27:22.863+0000
QFEPXC4GZZNVY6VA,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,Data catalog crawler run,APS2-Crawler-DPU-Hour,CrawlerRun,ap-southeast-2,AWS Glue,$0.44 per Data Processing Unit-Hour for AWS Glue crawler,0,inf,DPU-Hour,USD,0.44,2023-11-03T07:27:22.863+0000
GR6BUK2D8AMYADSU,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,Data catalog storage,APS2-Catalog-Storage,Storage,ap-southeast-2,AWS Glue,"$1 per 100,000 objects per month for AWS Glue Data Catalog storage",0,inf,Obj-Month,USD,1e-05,2023-11-03T07:27:22.863+0000
AKY8T7VH5B3H2TSG,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,ETL Job run,APS2-ETL-DPU-Hour,Jobrun,ap-southeast-2,AWS Glue,$0.44 per Data Processing Unit-Hour for AWS Glue ETL job,0,inf,DPU-Hour,USD,0.44,2023-11-03T07:27:22.863+0000
9X78P4VGKHXDATD7,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,Data Brew Job run,APS2-DBrew-Node-Hour,Job,ap-southeast-2,AWS Glue,$0.48 per node hour for AWS Glue DataBrew jobs in Asia Pacific (Sydney),0,inf,Node-hour,USD,0.48,2023-11-03T07:27:22.863+0000
NMKBXCV78RBJAQDA,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,ETL Development endpoint,APS2-DEVED-DPU-Hour,StartDevEndpoint,ap-southeast-2,AWS Glue,$0.44 per Data Processing Unit-Hour for AWS Glue development endpoints,0,inf,DPU-Hour,USD,0.44,2023-11-03T07:27:22.863+0000
7TGZ8PWFZ395R4V4,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,ETL Job run,APS2-ETL-Flex-DPU-Hour,FlexJobrun,ap-southeast-2,AWS Glue,$0.29 per Data Processing Unit-Hour for AWS Glue Flex ETL job,0,inf,DPU-Hour,USD,0.29,2023-11-03T07:27:22.863+0000
J69NQ2RHUW8ZPUQZ,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,ETL Interactive Session,APS2-GlueInteractiveSession-DPU-Hour,GlueInteractiveSession,ap-southeast-2,AWS Glue,$0.44 per Data Processing Unit-Hour for AWS Glue interactive sessions and job notebooks,0,inf,DPU-Hour,USD,0.44,2023-11-03T07:27:22.863+0000
7TDQ6HMSMSSR4P8H,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,Data Brew interactive sessions,APS2-DBrew-Sessions,Session,ap-southeast-2,AWS Glue,$1 per session for AWS Glue DataBrew interactive sessions in Asia Pacific (Sydney),0,inf,Sessions,USD,1.0,2023-11-03T07:27:22.863+0000
6PDFTXNWT277TXP4,AWS Glue,AWSGlue,Asia Pacific (Sydney),AWS Region,Data catalog requests,APS2-Catalog-Request,Request,ap-southeast-2,AWS Glue,"$1 per 1,000,000 requests for AWS Glue Data Catalog request",0,inf,Request,USD,1e-06,2023-11-03T07:27:22.863+0000


Successfully loaded into postgres


sku,product_family,servicecode,location,location_type,group,usagetype,operation,region_code,servicename,description,begin_range,end_range,unit,currency,priceperunit,timestamp
RZ8N2TAQDKPDSHPC,Serverless,AWSLambda,Any,AWS Region,AWS-Lambda-Processed-Bytes,Global-Lambda-Streaming-Response-Processed-Bytes,,,AWS Lambda,Global rate for Lambda-Streaming-Response-Processed-Bytes,0,100.0,Processed-Gigabytes,USD,0.0,2023-11-03T07:27:37.971+0000
WJCZC8VHP7VMDTAY,Serverless,AWSLambda,EU (Ireland),AWS Region,AWS-Lambda-Processed-Bytes,EU-Lambda-Streaming-Response-Processed-Bytes,,eu-west-1,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in EU (Ireland),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
S3EVS4ABD7UCUWX2,Serverless,AWSLambda,Canada (Central),AWS Region,AWS-Lambda-Processed-Bytes,CAN1-Lambda-Streaming-Response-Processed-Bytes,,ca-central-1,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in Canada (Central),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
CR34UD6VD8B2XFDN,Serverless,AWSLambda,US West (N. California),AWS Region,AWS-Lambda-Processed-Bytes,USW1-Lambda-Streaming-Response-Processed-Bytes,,us-west-1,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in US West (N. California),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
NZGRMXV87W8HQUH8,Serverless,AWSLambda,US West (Oregon),AWS Region,AWS-Lambda-Processed-Bytes,USW2-Lambda-Streaming-Response-Processed-Bytes,,us-west-2,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in US West (Oregon),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
T6J2KVMXT793PX6R,Serverless,AWSLambda,EU (Frankfurt),AWS Region,AWS-Lambda-Processed-Bytes,EUC1-Lambda-Streaming-Response-Processed-Bytes,,eu-central-1,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in EU (Frankfurt),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
5GNXVH3GA89GG5SQ,Serverless,AWSLambda,South America (Sao Paulo),AWS Region,AWS-Lambda-Processed-Bytes,SAE1-Lambda-Streaming-Response-Processed-Bytes,,sa-east-1,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in South America (Sao Paulo),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
93ZY797THXB9X3R6,Serverless,AWSLambda,EU (Paris),AWS Region,AWS-Lambda-Processed-Bytes,EUW3-Lambda-Streaming-Response-Processed-Bytes,,eu-west-3,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in EU (Paris),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
4PVUXX66YAUQK74P,Serverless,AWSLambda,EU (London),AWS Region,AWS-Lambda-Processed-Bytes,EUW2-Lambda-Streaming-Response-Processed-Bytes,,eu-west-2,AWS Lambda,$0.008 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in EU (London),0,inf,Processed-Gigabytes,USD,0.008,2023-11-03T07:27:37.971+0000
J9AU7QRJMT2NDX3H,Serverless,AWSLambda,Africa (Cape Town),AWS Region,AWS-Lambda-Processed-Bytes,AFS1-Lambda-Streaming-Response-Processed-Bytes,,af-south-1,AWS Lambda,$0.010608 per GiB for of Lambda-Streaming-Response-Processed-Bytes usage in Africa (Cape Town),0,inf,Processed-Gigabytes,USD,0.010608,2023-11-03T07:27:37.971+0000


Successfully loaded into postgres
