In [17]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, sum as spark_sum
spark = SparkSession.builder.appName("EnergyAggregation").getOrCreate()



In [19]:
import pandas as pd

# Hand-written sample of device energy readings. The three lists are parallel:
# position i across them describes a single reading.
data = {
    "device_id": [1, 1, 2, 3, 4, 2, 3],
    "timestamp": [
        "2025-09-29 08:00:00",
        "2025-09-29 09:00:00",
        "2025-09-29 10:00:00",
        "2025-09-29 08:30:00",
        "2025-09-29 11:00:00",
        "2025-09-29 22:00:00",
        "2025-09-29 23:30:00",
    ],
    "energy_kwh": [0.2, 0.3, 1.5, 0.7, 0.9, 2.0, 1.2],
}

# Write the sample to disk (without the pandas index column) so that it can be
# loaded back through Spark's CSV reader.
csv_path = "spark_logs.csv"
df_pd = pd.DataFrame(data)
df_pd.to_csv(csv_path, index=False)

# The first CSV line supplies the column names; column types are inferred
# from the file contents rather than declared.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
)

In [21]:
# Add two derived columns to the readings:
#   hour   - hour-of-day extracted from the reading's timestamp
#   period - whether that hour lies in the 6..22 window, stored as a string
#            ("true"/"false") instead of a boolean
# `between` is inclusive at both ends, so readings during hour 22 fall inside
# the window.
# NOTE(review): "true" presumably stands for the peak period - confirm the
# intended peak hours, in particular whether hour 22 should be included.
df = (
    df
    .withColumn("hour", hour(col("timestamp")))
    .withColumn("period", col("hour").between(6, 22).cast("string"))
)

In [23]:
# Sum the energy readings for every (device, period) pair. No ordering is
# requested here, so rows are shown in whatever order Spark returns them.
period_energy = spark_sum("energy_kwh").alias("total_energy")
agg = df.groupBy("device_id", "period").agg(period_energy)

print("Device Usage (Peak vs Off-Peak):")
agg.show()

Device Usage (Peak vs Off-Peak):
+---------+------+------------+
|device_id|period|total_energy|
+---------+------+------------+
|        2|  true|         3.5|
|        1|  true|         0.5|
|        4|  true|         0.9|
|        3| false|         1.2|
|        3|  true|         0.7|
+---------+------+------------+



In [24]:
# Rank devices by their overall energy use, largest consumer first.
device_energy = spark_sum("energy_kwh").alias("total_energy")
highest_first = col("total_energy").desc()

per_device = df.groupBy("device_id").agg(device_energy)
top_devices = per_device.orderBy(highest_first)

print("Top Energy-Consuming Devices:")
top_devices.show()

Top Energy-Consuming Devices:
+---------+------------+
|device_id|total_energy|
+---------+------------+
|        2|         3.5|
|        3|         1.9|
|        4|         0.9|
|        1|         0.5|
+---------+------------+

