# Ingest and Save


In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from datetime import date

data = [
    (1,"Rajesh","North","Laptop","Electronics",1,55000,"2024-01-12"),
    (2,"Sneha","West","Refrigerator","Electronics",1,32000,"2024-02-05"),
    (3,"Anil","South","Shampoo","Personal Care",5,150,"2024-01-17"),
    (4,"Divya","North","Mobile","Electronics",2,20000,"2024-03-22"),
    (5,"Vikram","East","Washing Machine","Electronics",1,28000,"2024-02-28"),
    (6,"Preeti","West","Sneakers","Fashion",2,4000,"2024-01-31"),
    (7,"Aman","South","TV","Electronics",1,45000,"2024-02-15"),
    (8,"Isha","North","Notebook","Stationery",10,60,"2024-01-10"),
    (9,"Kunal","East","Pencil","Stationery",20,10,"2024-03-05"),
    (10,"Tanvi","West","Face Cream","Personal Care",3,200,"2024-03-19")
]
columns = ["transaction_id","customer_name","region","product","category","quantity","unit_price","date"]

df = spark.createDataFrame(data, columns)


parquet_path = "/tmp/parquet/sales_transactions"
delta_path = "/tmp/delta/sales_transactions"

df.write.mode("overwrite").parquet(parquet_path)

df.write.format("delta").mode("overwrite").save(delta_path)

df.createOrReplaceTempView("sales_transactions")


# Data Transformation 

In [0]:
df_transformed = df \
    .withColumn("total_amount", F.col("quantity") * F.col("unit_price")) \
    .withColumn("month", F.month(F.col("date"))) \
    .withColumn("date_formatted", F.date_format(F.col("date"), "dd-MMM-yyyy")) \
    .withColumn("is_high_value", F.col("total_amount") > 30000)

df_transformed.show()


df_transformed.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(delta_path)
df_transformed.createOrReplaceTempView("sales_transactions")


+--------------+-------------+------+---------------+-------------+--------+----------+----------+------------+-----+--------------+-------------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|      date|total_amount|month|date_formatted|is_high_value|
+--------------+-------------+------+---------------+-------------+--------+----------+----------+------------+-----+--------------+-------------+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|2024-01-12|       55000|    1|   12-Jan-2024|         true|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|2024-02-05|       32000|    2|   05-Feb-2024|         true|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|2024-01-17|         750|    1|   17-Jan-2024|        false|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|2024-03-22|       40000|    3| 

# Aggregations & Insights

In [0]:

spark.sql("""
SELECT region, COUNT(*) AS transaction_count
FROM sales_transactions
GROUP BY region
""").show()

spark.sql("""
SELECT category, SUM(total_amount) AS total_sales
FROM sales_transactions
GROUP BY category
ORDER BY total_sales DESC
LIMIT 3
""").show()

spark.sql("""
SELECT month, SUM(total_amount) AS revenue
FROM sales_transactions
GROUP BY month
ORDER BY month
""").show()

spark.sql("""
SELECT customer_name, total_amount
FROM sales_transactions
ORDER BY total_amount DESC
LIMIT 1
""").show()


spark.sql("""
SELECT SUM(total_amount) AS q1_sales
FROM sales_transactions
WHERE month IN (1,2,3)
""").show()


+------+-----------------+
|region|transaction_count|
+------+-----------------+
|  West|                3|
| North|                3|
| South|                2|
|  East|                2|
+------+-----------------+

+-------------+-----------+
|     category|total_sales|
+-------------+-----------+
|  Electronics|     200000|
|      Fashion|       8000|
|Personal Care|       1350|
+-------------+-----------+

+-----+-------+
|month|revenue|
+-----+-------+
|    1|  64350|
|    2| 105000|
|    3|  40800|
+-----+-------+

+-------------+------------+
|customer_name|total_amount|
+-------------+------------+
|       Rajesh|       55000|
+-------------+------------+

+--------+
|q1_sales|
+--------+
|  210150|
+--------+



#Update & Delete Scenarios

In [0]:
from datetime import date
from delta.tables import DeltaTable
import pyspark.sql.functions as F

delta_table = DeltaTable.forPath(spark, delta_path)

delta_table.update(
    condition="category = 'Stationery'",
    set={"unit_price": "unit_price * 1.1"}
)

delta_table.delete("quantity < 3")

today_str = date.today().strftime("%Y-%m-%d")

new_data = [
    (
        11,                         
        "Meera",                  
        "North",                   
        "Headphones",                
        "Electronics",         
        1,                         
        5000,                     
        today_str,               
        5000,                  
        int(today_str.split("-")[1]), 
        today_str,                
        False                    
    )
]


new_columns = df_transformed.columns
df_new = spark.createDataFrame(new_data, schema=df_transformed.schema)


df_new.write.format("delta").mode("append").save(delta_path)


df_updated = spark.read.format("delta").load(delta_path)
df_updated.createOrReplaceTempView("sales_transactions")

df_updated.show()


+--------------+-------------+------+----------+-------------+--------+----------+----------+------------+-----+--------------+-------------+
|transaction_id|customer_name|region|   product|     category|quantity|unit_price|      date|total_amount|month|date_formatted|is_high_value|
+--------------+-------------+------+----------+-------------+--------+----------+----------+------------+-----+--------------+-------------+
|             8|         Isha| North|  Notebook|   Stationery|      10|        86|2024-01-10|         600|    1|   10-Jan-2024|        false|
|             9|        Kunal|  East|    Pencil|   Stationery|      20|        14|2024-03-05|         200|    3|   05-Mar-2024|        false|
|             3|         Anil| South|   Shampoo|Personal Care|       5|       150|2024-01-17|         750|    1|   17-Jan-2024|        false|
|            10|        Tanvi|  West|Face Cream|Personal Care|       3|       200|2024-03-19|         600|    3|   19-Mar-2024|        false|
|     

#Partitioning & Optimization

In [0]:

df_updated.write.format("delta").mode("overwrite").partitionBy("region").save("/tmp/delta/sales_transactions_by_region")


df_updated.write.format("delta").mode("overwrite").partitionBy("month").save("/tmp/delta/sales_transactions_by_month")

