# **Creating Schema in Catalog**

In [0]:
%sql
-- Create the `sai` schema inside the `kusha_solutions` catalog; does nothing if it already exists.
CREATE SCHEMA IF NOT EXISTS kusha_solutions.sai

In [0]:
%sql
-- Inspect the schema: its description first, then the volumes and tables it holds.
-- NOTE(review): with several statements in one SQL cell the notebook may render only the
-- last result set; confirm, and split into separate cells if all three outputs are needed.
DESCRIBE SCHEMA kusha_solutions.sai;
SHOW VOLUMES IN kusha_solutions.sai;
SHOW TABLES IN kusha_solutions.sai;

In [0]:
%sql
-- Create the volume named `volume` in kusha_solutions.sai; does nothing if it already exists.
CREATE VOLUME IF NOT EXISTS kusha_solutions.sai.volume

In [0]:
# Recursively copy everything under the viresh/dlt_poc path into this schema's volume.
source_path = "/Volumes/kusha_solutions/viresh/dlt_poc"
target_path = "/Volumes/kusha_solutions/sai/volume"

# recurse=True copies the whole directory tree rather than a single file.
dbutils.fs.cp(source_path, target_path, recurse=True)

In [0]:
# List the top-level entries of the target volume and render them as a table.
volume_listing = dbutils.fs.ls("/Volumes/kusha_solutions/sai/volume")
display(volume_listing)

# **Reading files**

In [0]:
from pyspark.sql.functions import *

# Step 1: Load every JSON file under DIVISIONS. The "multiline" option lets a single
# JSON record span several lines. No explicit schema is passed, so Spark infers it.
divisions_reader = spark.read.option("multiline", "true")
df_divisionsraw = divisions_reader.json("/Volumes/kusha_solutions/sai/volume/DIVISIONS/*")

# Step 2: Print the inferred schema to see the structure of the JSON.
df_divisionsraw.printSchema()

# Step 3: Tag each row with the ingestion time and the path of the file it was read from.
df_divisionsraw_meta = (
    df_divisionsraw
    .withColumn("divisions_ingestion_timestamp", current_timestamp())
    .withColumn("source_file", col("_metadata.file_path"))
)

# Step 4: Render the enriched DataFrame.
display(df_divisionsraw_meta)


In [0]:
from pyspark.sql.types import *
# Explicit schema for the PARTICIPANT_ATTRIBUTES JSON files.
#
# Every participant attribute is a nullable string, so the attribute struct is generated
# from an ordered list of names rather than one StructField literal per attribute.
# The order of this list is the field order of the resulting struct.
# NOTE(review): "SMSLandlineMxg" looks like it may be a typo of "...Msg" — confirm against
# the source JSON before changing it; a name that does not match the data reads as null.
participant_attribute_names = [
    "DDI",
    "DNIS",
    "Skill",
    "Whisper",
    "custCLI",
    "DNISName",
    "Location",
    "Priority",
    "TIQCheck",
    "TIQValue",
    "scriptId",
    "DNIS_Name",
    "DNIS_code",
    "CountryCode",
    "HoldMusic",
    "ComfortMsg1",
    "CustomerANI",
    "CBEWTSetting",
    "NoSurveyOptIn",
    "ScheduleGroup",
    "ScreenPopName",
    "Closed_Message",
    "FoundQueueName",
    "FoundSkillName",
    "SMSLandlineMxg",
    "SMSNoMobMsgIVR",
    "SMSOptInPrompt",
    "Business_Status",
    "SMSMsgMobileIVR",
    "Survey_Workflow",
    "Welcome_Message",
    "Callback_Enabled",
    "LandlineNoInptHD",
    "NT_Login_Matched",
    "NoMobOffPromptSD",
    "Emergency_Message",
    "SMSOOHMobileAudio",
    "SMSDeflectionOffer",
    "SMSOOHNoMobileAudio",
    "External_Xfer_Number",
    "Callback_Sunday_End_Time",
    "Callback_Weekday_End_Time",
    "WhatsDeflectionSuccessMsg",
    "Callback_Saturday_End_Time",
    "Callback_Sunday_Start_Time",
    "Callback_Weekday_Start_Time",
    "Log_SurveyWorkflowStartTime",
    "Callback_Saturday_Start_Time",
    "LegId",
    "ivr_Skills",
    "ivr_Priority",
    "Log_LegWorkflowStart",
    "Log_ConversationCheck",
    "Log_LegWorkflowComplete",
    "NTLogin",
    "Customer_Id",
    "Agent_NT_Login",
    "Workstation_Id",
]

# Struct holding one nullable string field per attribute name above.
participant_attributes_type = StructType(
    [StructField(attribute_name, StringType(), True) for attribute_name in participant_attribute_names]
)

# One element of the top-level "participantData" array.
participant_data_type = StructType([
    StructField("participantPurpose", StringType(), True),
    StructField("participantAttributes", participant_attributes_type, True),
    StructField("participantId", StringType(), True),
    StructField("sessionIds", ArrayType(StringType()), True),
])

# Top-level record layout; all fields are nullable.
schema = StructType([
    StructField("organizationId", StringType(), True),
    StructField("conversationId", StringType(), True),
    StructField("startTime", StringType(), True),
    StructField("endTime", StringType(), True),
    StructField("divisionIds", ArrayType(StringType()), True),
    StructField("participantData", ArrayType(participant_data_type), True),
    StructField("_type", StringType(), True),
])
# Load every JSON file under PARTICIPANT_ATTRIBUTES using the explicit `schema`;
# the "multiline" option lets a single JSON record span several lines.
participants_reader = spark.read.schema(schema).option("multiline", "true")
df_participants = participants_reader.json("/Volumes/kusha_solutions/sai/volume/PARTICIPANT_ATTRIBUTES/*")

# Print the applied schema.
df_participants.printSchema()

# Tag each row with the ingestion time and the path of the file it was read from.
df_participants_meta = (
    df_participants
    .withColumn("participants_ingestion_timestamp", current_timestamp())
    .withColumn("source_file", col("_metadata.file_path"))
)

# Render the enriched DataFrame.
display(df_participants_meta)

In [0]:

# Explicit schema for the CONVERSATION_JOBS JSON files, assembled innermost struct first.
# All fields are nullable.

# One element of sessions[].mediaEndpointStats.
conversation_media_endpoint_stats_type = StructType([
    StructField("codecs", ArrayType(StringType()), True),
    StructField("eventTime", StringType(), True),
    StructField("maxLatencyMs", LongType(), True),
    StructField("minMos", DoubleType(), True),
    StructField("minRFactor", DoubleType(), True),
    StructField("receivedPackets", LongType(), True),
])

# One element of sessions[].metrics.
conversation_metric_type = StructType([
    StructField("emitDate", StringType(), True),
    StructField("name", StringType(), True),
    StructField("value", LongType(), True),
])

# One element of sessions[].segments.
conversation_segment_type = StructType([
    StructField("conference", BooleanType(), True),
    StructField("queueId", StringType(), True),
    StructField("segmentEnd", StringType(), True),
    StructField("segmentStart", StringType(), True),
    StructField("segmentType", StringType(), True),
    StructField("disconnectType", StringType(), True),
    StructField("wrapUpCode", StringType(), True),
])

# Plain string fields of a session, in schema order; the array-typed fields follow them.
conversation_session_string_names = [
    "ani",
    "direction",
    "dnis",
    "DDI",
    "edgeId",
    "mediaType",
    "peerId",
    "protocolCallId",
    "provider",
    "remote",
    "remoteNameDisplayable",
    "sessionDnis",
    "sessionId",
]

# One element of participants[].sessions.
conversation_session_type = StructType(
    [StructField(field_name, StringType(), True) for field_name in conversation_session_string_names]
    + [
        StructField("destinationAddresses", ArrayType(StringType()), True),
        StructField("mediaEndpointStats", ArrayType(conversation_media_endpoint_stats_type), True),
        StructField("metrics", ArrayType(conversation_metric_type), True),
        StructField("segments", ArrayType(conversation_segment_type), True),
    ]
)

# The participants[].attributes struct.
conversation_participant_attributes_type = StructType([
    StructField("VDN", StringType(), True),
    StructField("Log_LegWorkflowStart", StringType(), True),
    StructField("NTLogin", StringType(), True),
    StructField("DNIS", StringType(), True),
    StructField("LegId", StringType(), True),
])

# One element of the top-level "participants" array.
conversation_participant_type = StructType([
    StructField("participantId", StringType(), True),
    StructField("purpose", StringType(), True),
    StructField("userId", StringType(), True),
    StructField("externalContactId", StringType(), True),
    StructField("participantName", StringType(), True),
    StructField("sessions", ArrayType(conversation_session_type), True),
    StructField("attributes", conversation_participant_attributes_type, True),
])

# Top-level record layout.
schema2 = StructType([
    StructField("conversationEnd", StringType(), True),
    StructField("conversationId", StringType(), True),
    StructField("conversationStart", StringType(), True),
    StructField("divisionIds", ArrayType(StringType()), True),
    StructField("mediaStatsMinConversationMos", DoubleType(), True),
    StructField("mediaStatsMinConversationRFactor", DoubleType(), True),
    StructField("originatingDirection", StringType(), True),
    StructField("participants", ArrayType(conversation_participant_type), True),
])
# Step 2: Load every JSON file under CONVERSATION_JOBS using the explicit `schema2`;
# the "multiline" option lets a single JSON record span several lines.
conversations_reader = spark.read.schema(schema2).option("multiline", "true")
df_conversations = conversations_reader.json("/Volumes/kusha_solutions/sai/volume/CONVERSATION_JOBS/*")

# Step 3: Print the applied schema.
df_conversations.printSchema()

# Step 4: Tag each row with the ingestion time and the path of the file it was read from.
df_conversations_meta = (
    df_conversations
    .withColumn("Conversations_ingestion_timestamp", current_timestamp())
    .withColumn("source_file", col("_metadata.file_path"))
)

# Step 5: Render the enriched DataFrame.
display(df_conversations_meta)

In [0]:
# Row count of each enriched DataFrame, collected as (name, count) pairs and printed.
# Each .count() triggers a Spark job over the corresponding source files.
frames_to_count = [
    ("df_conversations_meta", df_conversations_meta),
    ("df_participants_meta", df_participants_meta),
    ("df_divisionsraw_meta", df_divisionsraw_meta),
]
counts = [(frame_name, frame.count()) for frame_name, frame in frames_to_count]
print(counts)

# **Write bronze Tables**

In [0]:
# Persist the conversations DataFrame as a Delta table, replacing any existing contents.
# NOTE(review): this write uses mode "overwrite" while the PARTICIPANT_ATTRIBUTES and
# DIVISIONS bronze writes in this notebook use "append" — confirm the difference is intended.
(
    df_conversations_meta.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("kusha_solutions.sai.CONVERSATION_JOBS_bronze")
)

In [0]:
# Persist the participants DataFrame by appending to the Delta table.
# NOTE(review): the read above loads every file in the folder on each run, so with mode
# "append" a re-run adds the same rows again — confirm this is intended (the
# CONVERSATION_JOBS write uses "overwrite").
(
    df_participants_meta.write
    .format("delta")
    .mode("append")
    .saveAsTable("kusha_solutions.sai.PARTICIPANT_ATTRIBUTES_bronze")
)

In [0]:
# Persist the divisions DataFrame by appending to the Delta table.
# NOTE(review): the read above loads every file in the folder on each run, so with mode
# "append" a re-run adds the same rows again — confirm this is intended (the
# CONVERSATION_JOBS write uses "overwrite").
(
    df_divisionsraw_meta.write
    .format("delta")
    .mode("append")
    .saveAsTable("kusha_solutions.sai.DIVISIONS_bronze")
)