In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, when, hour


# Step 1: Initialize Spark Session

In [None]:
# Obtain the notebook's Spark entry point; getOrCreate() reuses an already
# running session when one exists, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .appName("DeviceLevelAggregation")
    .getOrCreate()
)
print("Spark session created")

Spark session created


# Step 2: Load CSV files

In [6]:
from google.colab import files

# Prompt for the three input CSVs (devices, energy_logs, rooms).
uploaded = files.upload()

# Colab never overwrites an existing file: when a name is already taken it
# stores the new upload as "<name> (1).csv" and leaves the old file in place.
# The cells below read the plain names ("devices.csv", ...), so without this
# step a re-upload would be silently ignored and stale data analysed.
# `uploaded` maps each file name to its raw bytes (keys are expected to be the
# original names; if they are the saved names this rewrite is a harmless no-op).
for original_name, payload in uploaded.items():
    with open(original_name, "wb") as fh:
        fh.write(payload)

Saving devices.csv to devices (1).csv
Saving energy_logs.csv to energy_logs (1).csv
Saving rooms.csv to rooms (1).csv


In [7]:
# Device catalogue: first row is the header, column types are inferred.
devices_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("devices.csv")
)

print("Devices Preview:")
devices_df.show(n=5)



Devices Preview:
+---------+------------+-------+--------+
|device_id|        name|room_id|  status|
+---------+------------+-------+--------+
|        1|  Smart Lamp|      1|  active|
|        2|Air Purifier|      2|inactive|
|        3|Water Heater|      3|  active|
+---------+------------+-------+--------+



In [8]:
# Raw energy readings: one row per log entry, header row present,
# column types inferred from the data.
energy_logs_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("energy_logs.csv")
)

print("Energy Logs Preview:")
energy_logs_df.show(n=5)

Energy Logs Preview:
+------+---------+-------------------+----------+
|log_id|device_id|          timestamp|energy_kwh|
+------+---------+-------------------+----------+
|     1|        1|2025-07-22 06:30:00|      0.55|
|     2|        2|2025-07-22 07:15:00|       1.8|
|     3|        3|2025-07-22 08:45:00|      2.25|
+------+---------+-------------------+----------+



In [9]:

# Room lookup table: header row present, column types inferred.
rooms_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("rooms.csv")
)
print("Rooms Preview:")
rooms_df.show(n=5)

Rooms Preview:
+-------+----------+
|room_id|      name|
+-------+----------+
|      1|Study Room|
|      2|Guest Room|
|      3|  Bathroom|
+-------+----------+



# Step 3: Convert timestamp column to proper timestamp

In [10]:
# Make sure `timestamp` has Spark's timestamp type so hour() can be applied
# to it in the next step (astype is an alias of cast).
energy_logs_df = energy_logs_df.withColumn(
    "timestamp",
    col("timestamp").astype("timestamp"),
)
print("Timestamp converted")
energy_logs_df.show(n=5)

Timestamp converted
+------+---------+-------------------+----------+
|log_id|device_id|          timestamp|energy_kwh|
+------+---------+-------------------+----------+
|     1|        1|2025-07-22 06:30:00|      0.55|
|     2|        2|2025-07-22 07:15:00|       1.8|
|     3|        3|2025-07-22 08:45:00|      2.25|
+------+---------+-------------------+----------+



# Step 4: Define peak (8 AM to 8 PM) and off-peak usage

In [11]:
# Label every reading by time of day: hours 08..19 inclusive (08:00 up to,
# but not including, 20:00) are "Peak"; everything else is "Off-Peak".
energy_logs_df = energy_logs_df.withColumn(
    "period",
    when(hour("timestamp").between(8, 19), "Peak").otherwise("Off-Peak"),
)
print(" Added Peak/Off-Peak period column")
energy_logs_df.show(n=5)

 Added Peak/Off-Peak period column
+------+---------+-------------------+----------+--------+
|log_id|device_id|          timestamp|energy_kwh|  period|
+------+---------+-------------------+----------+--------+
|     1|        1|2025-07-22 06:30:00|      0.55|Off-Peak|
|     2|        2|2025-07-22 07:15:00|       1.8|Off-Peak|
|     3|        3|2025-07-22 08:45:00|      2.25|    Peak|
+------+---------+-------------------+----------+--------+



# Step 5: Aggregate energy usage by device and period

In [12]:
# Total kWh for each (device, period) pair.
device_usage_df = (
    energy_logs_df
    .groupBy("device_id", "period")
    .agg(_sum(col("energy_kwh")).alias("total_energy_kwh"))
)
print(" Aggregated energy by device and period")
device_usage_df.show(n=10)

 Aggregated energy by device and period
+---------+--------+----------------+
|device_id|  period|total_energy_kwh|
+---------+--------+----------------+
|        2|Off-Peak|             1.8|
|        1|Off-Peak|            0.55|
|        3|    Peak|            2.25|
+---------+--------+----------------+




# Step 6: Pivot to get Peak and Off-Peak in columns

In [13]:
# Reshape to one row per device with an "Off-Peak" and a "Peak" column.
#
# The pivot values are listed explicitly so that BOTH columns always exist,
# even when the logs happen to contain readings from only one period; the
# next step adds col("Peak") + col("Off-Peak") and would otherwise fail on a
# missing column. Passing the values also saves Spark the extra pass it needs
# to discover the distinct labels. The list order matches the column order
# the value-less pivot produced ("Off-Peak" before "Peak").
pivot_df = (
    device_usage_df
    .groupBy("device_id")
    .pivot("period", ["Off-Peak", "Peak"])
    .sum("total_energy_kwh")
    .fillna(0)  # a device with no usage in a period gets 0 instead of null
)
print(" Pivoted data to show Peak and Off-Peak columns")
pivot_df.show(10)

 Pivoted data to show Peak and Off-Peak columns
+---------+--------+----+
|device_id|Off-Peak|Peak|
+---------+--------+----+
|        1|    0.55| 0.0|
|        3|     0.0|2.25|
|        2|     1.8| 0.0|
+---------+--------+----+



# Step 7: Calculate total energy and identify top devices

In [14]:
# Overall consumption per device is the sum of its two period columns;
# ranking by it in descending order puts the heaviest consumers first.
pivot_df = pivot_df.withColumn(
    "total_energy",
    pivot_df["Peak"] + pivot_df["Off-Peak"],
)
top_devices_df = pivot_df.orderBy("total_energy", ascending=False)
print("Calculated total energy and sorted top devices")
top_devices_df.show(n=10)

Calculated total energy and sorted top devices
+---------+--------+----+------------+
|device_id|Off-Peak|Peak|total_energy|
+---------+--------+----+------------+
|        3|     0.0|2.25|        2.25|
|        2|     1.8| 0.0|         1.8|
|        1|    0.55| 0.0|        0.55|
+---------+--------+----+------------+




# Step 8: Save result to CSV

In [16]:
# Write the ranking as a single CSV part file (coalesce(1)) with a header
# row, replacing any output left by a previous run. Spark writes a folder,
# not a single file, at this path.
output_path = "top_devices_by_usage"
(
    top_devices_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)
print(f" Output saved to folder: {output_path}")

 Output saved to folder: top_devices_by_usage
