In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, count, row_number, lit, current_timestamp, coalesce, max
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Storage locations for the silver (input) and gold (output) layers.
# Both keep a trailing slash so a table name can be appended directly.
mount_root = "/mnt/mock_prajwal/example"
silver_path = f"{mount_root}/silver/"
gold_path = f"{mount_root}/gold/"

In [0]:

# Read the DeviceInfo Delta table from the silver layer and preview it.
device_info_location = silver_path + "DeviceInfo"
delta_reader = spark.read.format("delta")
df = delta_reader.load(device_info_location)
display(df)


In [0]:
# Print the schema of the silver DeviceInfo DataFrame loaded above so the
# column names and types can be checked before building the dimension.
df.printSchema()

In [0]:
# `col`, `row_number` and `Window` are already imported in the notebook's
# first cell, so they are not re-imported here.

# Keep only the attributes that belong to the device dimension, one row per
# business key (device_id).
# NOTE(review): when a device_id repeats with differing attribute values,
# dropDuplicates does not define which row survives — confirm silver is
# already unique on device_id, or add an explicit tie-break before this step.
device_attrs = df.select(
    col("device_id"),
    col("customer_id"),
    col("imei_number"),
    col("device_brand"),
    col("device_model"),
    col("os_type"),
    col("network_capability"),
    col("Purchase_Date"),
).dropDuplicates(['device_id'])

# A window with an ordering but no partitioning numbers the rows 1..N in
# device_id order. Spark evaluates such a window in a single partition, which
# is acceptable for a small dimension but will not scale to large inputs.
window_spec = Window.orderBy("device_id")

# Surrogate key for the dimension. Built from a separately named input so
# that df_selected is assigned exactly once in this cell.
df_selected = device_attrs.withColumn("device_key", row_number().over(window_spec))

display(df_selected)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Expected layout of the Dim_Device dimension, mirroring the DDL below.
# device_key is declared non-nullable here to agree with `INT NOT NULL` in the
# CREATE TABLE statement.
# NOTE(review): `schema` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
schema = StructType([
    StructField("device_id", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("imei_number", StringType(), True),
    StructField("device_brand", StringType(), True),
    StructField("device_model", StringType(), True),
    StructField("os_type", StringType(), True),
    StructField("network_capability", StringType(), True),
    StructField("Purchase_Date", DateType(), True),
    StructField("device_key", IntegerType(), False)
])

# Create the schema if it does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS Prajwal_Telecom")

# Derive the table location from gold_path so the registered table and the
# path-based write/read cells cannot drift apart.
dim_device_location = gold_path + "Dim_Device"

# Create the table with the specified schema
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS Prajwal_Telecom.Dim_Device (
        device_id INT,
        customer_id INT,
        imei_number STRING,
        device_brand STRING,
        device_model STRING,
        os_type STRING,
        network_capability STRING,
        Purchase_Date DATE,
        device_key  INT NOT NULL
    )
        USING DELTA 
    LOCATION "{dim_device_location}"
""")

In [0]:
# Replace the contents of the gold Dim_Device Delta table with the freshly
# built dimension rows.
dim_device_path = gold_path + "Dim_Device"
(
    df_selected.write
    .format("delta")
    .mode("overwrite")
    .save(dim_device_path)
)

In [0]:
# Validation: compare row counts between the silver source and the gold
# dimension written from it.

# Load silver layer table
silver_df = spark.read.format("delta").load(silver_path + "DeviceInfo")

# Load gold layer table (path built from gold_path rather than hard-coded, so
# it follows the configuration cell)
gold_df = spark.read.format("delta").load(gold_path + "Dim_Device")

# Record count for silver layer
silver_count = silver_df.count()

# The dimension is built with dropDuplicates(['device_id']), so the raw silver
# count only matches gold when silver has no repeated device_id. The
# deduplicated count is the figure gold is expected to equal.
silver_distinct_count = silver_df.dropDuplicates(["device_id"]).count()

# Record count for gold layer
gold_count = gold_df.count()

# Display counts
display(
    spark.createDataFrame(
        [(silver_count, silver_distinct_count, gold_count)],
        ["Silver Layer Count", "Silver Distinct device_id Count", "Gold Layer Count"],
    )
)