In [1]:

!pip install pyspark==3.5.0 delta-spark==3.1.0


Collecting pyspark==3.5.0
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting delta-spark==3.1.0
  Downloading delta_spark-3.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading delta_spark-3.1.0-py3-none-any.whl (21 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425346 sha256=4bc0f640857a6ae64bb1d7a6c9b4813f1078ff0a41e48ba425a3764d7749cfa9
  Stored in directory: /root/.cache/pip/wheels/38/df/61/8c121f50c3cffd77f8178180dd232d90b3b99d1bd61fb6d6be
Successfully built pyspark
Installing collected packages: pyspark, delta-spark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.1
    Uninstalling pyspark-3.5.1:
      Successfully uninstalled

In [2]:

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

builder = (
    SparkSession.builder.appName("SmartEnergyMonitoringETL")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()
print(" Spark session created with Delta support")


 Spark session created with Delta support


In [3]:
from google.colab import files
uploaded = files.upload()


Saving devices.csv to devices.csv
Saving energy_logs.csv to energy_logs.csv
Saving rooms.csv to rooms.csv


In [7]:
logs_df = spark.read.csv("energy_logs.csv", header=True, inferSchema=True)
devices_df = spark.read.csv("devices.csv", header=True, inferSchema=True)
rooms_df = spark.read.csv("rooms.csv", header=True, inferSchema=True)

print(" Raw logs loaded")
logs_df.show(5)
print(" Raw devices loaded")
devices_df.show(5)
print(" Raw rooms loaded")
rooms_df.show(5)

 Raw logs loaded
+------+---------+-------------------+----------+
|log_id|device_id|          timestamp|energy_kwh|
+------+---------+-------------------+----------+
|     1|        1|2025-07-22 06:30:00|      0.55|
|     2|        2|2025-07-22 07:15:00|       1.8|
|     3|        3|2025-07-22 08:45:00|      2.25|
+------+---------+-------------------+----------+

 Raw devices loaded
+---------+------------+-------+--------+
|device_id|        name|room_id|  status|
+---------+------------+-------+--------+
|        1|  Smart Lamp|      1|  active|
|        2|Air Purifier|      2|inactive|
|        3|Water Heater|      3|  active|
+---------+------------+-------+--------+

 Raw rooms loaded
+-------+----------+
|room_id|      name|
+-------+----------+
|      1|Study Room|
|      2|Guest Room|
|      3|  Bathroom|
+-------+----------+



In [10]:
from pyspark.sql.functions import col


logs_df = logs_df.withColumn("timestamp", col("timestamp").cast("timestamp")) \
                 .filter(col("energy_kwh").isNotNull())

print(" Cleaned logs")
logs_df.show(5)

 Cleaned logs
+------+---------+-------------------+----------+
|log_id|device_id|          timestamp|energy_kwh|
+------+---------+-------------------+----------+
|     1|        1|2025-07-22 06:30:00|      0.55|
|     2|        2|2025-07-22 07:15:00|       1.8|
|     3|        3|2025-07-22 08:45:00|      2.25|
+------+---------+-------------------+----------+



In [12]:
from pyspark.sql.functions import date_trunc, sum as _sum

daily_df = logs_df.groupBy("device_id", date_trunc("day", col("timestamp")).alias("date")) \
                  .agg(_sum("energy_kwh").alias("daily_energy"))

print(" Daily summary")
daily_df.show(5)


weekly_df = logs_df.groupBy("device_id", date_trunc("week", col("timestamp")).alias("week")) \
                   .agg(_sum("energy_kwh").alias("weekly_energy"))

print("Weekly summary")
weekly_df.show(5)


 Daily summary
+---------+-------------------+------------+
|device_id|               date|daily_energy|
+---------+-------------------+------------+
|        1|2025-07-22 00:00:00|        0.55|
|        3|2025-07-22 00:00:00|        2.25|
|        2|2025-07-22 00:00:00|         1.8|
+---------+-------------------+------------+

Weekly summary
+---------+-------------------+-------------+
|device_id|               week|weekly_energy|
+---------+-------------------+-------------+
|        3|2025-07-21 00:00:00|         2.25|
|        2|2025-07-21 00:00:00|          1.8|
|        1|2025-07-21 00:00:00|         0.55|
+---------+-------------------+-------------+



In [14]:

daily_df.write.format("delta").mode("overwrite").save("daily_summary_delta")
daily_df.coalesce(1).write.csv("daily_summary_csv", header=True, mode="overwrite")


weekly_df.write.format("delta").mode("overwrite").save("weekly_summary_delta")
weekly_df.coalesce(1).write.csv("weekly_summary_csv", header=True, mode="overwrite")

print(" ETL results saved in Delta & CSV formats")


 ETL results saved in Delta & CSV formats


In [15]:

daily_df.createOrReplaceTempView("daily_summary")
weekly_df.createOrReplaceTempView("weekly_summary")


high_usage = spark.sql("""
SELECT device_id, date, daily_energy
FROM daily_summary
WHERE daily_energy > 100
ORDER BY daily_energy DESC
""")

print(" High energy usage detected")
high_usage.show(10)


 High energy usage detected
+---------+----+------------+
|device_id|date|daily_energy|
+---------+----+------------+
+---------+----+------------+

