In [1]:
!pip install pyspark findspark lightgbm numpy pandas -q

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, array
from pyspark.sql.types import StructType, StructField, FloatType, DoubleType, ArrayType

# Create (or reuse, via getOrCreate) the Spark session for this notebook:
# local master with 2 threads and a 2 GB driver memory setting.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Energy_Consumption_Streaming")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

print("Spark Session Created")


Spark Session Created


In [2]:
# This code loads a trained LightGBM model and defines a prediction UDF that rebuilds
# the 26 features the model expects. It computes additional values such as quarter,
# active device count, power efficiency, weekend/peak-hour flags, and temperature
# category. Note that the peak-hour window and the temperature-category thresholds are
# approximations that still need to be checked against the training notebook. All
# inputs are combined in the feature order the LightGBM model is assumed to expect, and
# the predicted energy consumption is returned. Finally, the function is registered as
# a Spark UDF for use in streaming or batch pipelines.

import lightgbm as lgb
import pickle
import numpy as np
import math

# 1. Load Model
# If loading fails, lgb_model is left as None; predict_energy checks for that
# and returns its -2.0 sentinel instead of a prediction.
try:
    lgb_model = lgb.Booster(model_file='lightgbm_energy_model.txt')
    print(f"Model loaded. Expects {lgb_model.num_feature()} features.")
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and report the cause so a missing or corrupt model file is
    # visible here rather than only as -2.0 values downstream.
    lgb_model = None
    print(f"Model could not be loaded: {exc}")

# 2. Define Prediction UDF (Recalculating 26 Features)
def predict_energy(temp, line_voltage, voltage, apparent_power,
                   hour_sin, hour_cos, month_sin, month_cos,
                   tv, dryer, oven, fridge, microwave, weather_encoded,
                   hour, day_of_week, month, day):
    """Predict energy consumption for a single row using the global ``lgb_model``.

    The 18 raw inputs are converted to float, eight derived features are
    computed from them, and the resulting 26 values are passed to
    ``lgb_model.predict`` in the order of the ``features`` list below.

    Args:
        temp: Outside temperature; also drives the temperature bucket.
        line_voltage, voltage, apparent_power: Electrical readings.
        hour_sin, hour_cos, month_sin, month_cos: Cyclical time encodings.
        tv, dryer, oven, fridge, microwave: Device values; their sum is used
            as the active-device count.
        weather_encoded: Numeric weather label.
        hour, day_of_week, month, day: Calendar fields. ``day_of_week`` is
            treated as 0=Mon ... 6=Sun, which is what the simulator writes.

    Returns:
        The prediction as a float, or one of these sentinel codes:
            -2.0  the model is not loaded (``lgb_model`` is None)
            -3.0  at least one input is None
            -4.0  any exception was raised while building features or predicting
            -5.0  the assembled feature vector does not have 26 entries
    """
    try:
        if lgb_model is None:
            return -2.0

        raw_inputs = (temp, line_voltage, voltage, apparent_power,
                      hour_sin, hour_cos, month_sin, month_cos,
                      tv, dryer, oven, fridge, microwave, weather_encoded,
                      hour, day_of_week, month, day)

        # Every input is required: a null in any column is reported as a
        # missing input (-3.0) rather than falling through to float(None)
        # and being reported as a crash (-4.0).
        if any(value is None for value in raw_inputs):
            return -3.0

        (temp, line_voltage, voltage, apparent_power,
         hour_sin, hour_cos, month_sin, month_cos,
         tv, dryer, oven, fridge, microwave, weather_encoded,
         hour, day_of_week, month, day) = [float(value) for value in raw_inputs]

        # --- A. RECREATE ENGINEERED FEATURES ---

        # 1. Quarter (1-4) from the month number
        quarter = (month - 1) // 3 + 1

        # 2. Active devices: sum of the five device values
        active_devices = tv + dryer + oven + fridge + microwave

        # 3. Temperature x active devices interaction
        temp_x_active = temp * active_devices

        # 4. Power efficiency (apparent power / voltage); 0.0 when voltage is 0
        power_efficiency = apparent_power / voltage if voltage != 0 else 0.0

        # 5. Voltage variation (line voltage - voltage)
        voltage_variation = line_voltage - voltage

        # 6. Weekend flag. With 0=Mon ... 6=Sun, values 5 and 6 are Sat/Sun.
        is_weekend = 1.0 if day_of_week >= 5 else 0.0

        # 7. Peak-hours flag.
        # NOTE(review): 17-21 is an assumed window - confirm against the
        # training notebook.
        is_peak = 1.0 if 17 <= hour <= 21 else 0.0

        # 8. Temperature category: 0 below 10, 1 below 25, otherwise 2.
        # NOTE(review): approximation of the training-time encoder - confirm
        # the thresholds and label order against the training notebook.
        temp_cat = 0.0 if temp < 10 else (1.0 if temp < 25 else 2.0)

        # --- B. ASSEMBLE IN MODEL FEATURE ORDER ---
        features = [
            temp,
            tv, dryer, oven, fridge, microwave,
            line_voltage, voltage, apparent_power,
            hour, day_of_week, month, day,
            quarter,
            hour_sin, hour_cos, month_sin, month_cos,
            active_devices,
            temp_x_active,
            power_efficiency,
            voltage_variation,
            is_weekend,
            is_peak,
            weather_encoded,
            temp_cat,
        ]

        # Defensive check: guards against accidental edits to the list above.
        if len(features) != 26:
            return -5.0

        # --- C. PREDICT ---
        pred = lgb_model.predict([features])[0]
        return float(pred)

    except Exception:
        # Any failure is mapped to a sentinel so one bad row does not fail
        # the whole Spark task.
        return -4.0

# Wrap predict_energy as a Spark UDF whose result column is FloatType.
predict_energy_udf = udf(f=predict_energy, returnType=FloatType())
print("Robust UDF Registered (26 Features)")


Model loaded. Expects 26 features.
Robust UDF Registered (26 Features)


In [3]:
# This script simulates a streaming energy dataset by repeatedly sampling small batches
# from the full smart-home CSV and recreating all feature-engineering steps needed by
# the energy prediction UDF. It computes date/time fields, cyclical encodings, and a
# numeric weather label, then selects the exact set of columns required for prediction.
# Each processed batch is written as a CSV into a streaming directory every few seconds,
# allowing Spark to read and process the data as if it were arriving in real time.

import time
import os
import shutil
import pandas as pd
import numpy as np
from threading import Thread

# Config
SOURCE_CSV_PATH = 'smart_home_dataset_with_weather.csv'  # UPDATE THIS (CSV the simulator samples from)
input_dir = "/content/streaming_energy"  # directory the simulator writes batch files into

# Always begin with an empty drop directory: remove whatever an earlier run
# left behind, then recreate it.
if os.path.exists(input_dir):
    shutil.rmtree(input_dir)
os.makedirs(input_dir)

def stream_generator():
    """Simulate a stream by repeatedly dropping small CSV batches into ``input_dir``.

    Loads ``SOURCE_CSV_PATH`` once, then loops forever: samples 5-9 rows
    (fewer if the source is smaller), derives the date, cyclical and weather
    columns, keeps only the columns listed in ``cols``, and publishes the
    batch as ``batch_<n>.csv`` roughly every 3 seconds.

    Each batch is first written to a sibling ``<input_dir>_staging`` directory
    and then moved into ``input_dir`` with ``os.replace``, so a reader of
    ``input_dir`` never sees a partially written file.

    Returns:
        None. Returns early (after printing a message) if the source CSV
        cannot be read or has no rows; otherwise it does not return.
    """
    print("Simulator started...")
    try:
        full_df = pd.read_csv(SOURCE_CSV_PATH)
    except FileNotFoundError:
        print("CSV not found.")
        return
    except Exception as exc:
        # Parse errors, permission problems, etc. are not "not found".
        print(f"Could not read source CSV: {exc}")
        return

    if full_df.empty:
        print("Source CSV has no rows; nothing to stream.")
        return

    # Weather Mapping (Matches training logic)
    weather_map = {'clear': 0, 'cloudy': 1, 'rainy': 2, 'overcast': 3, 'sunny': 4}

    # Columns written to each batch file (ensure ALL UDF inputs are here)
    cols = [
        'Outside_Temperature_C', 'Line Voltage', 'Voltage', 'Apparent Power',
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave',
        'Weather_Type_encoded',
        'hour', 'day_of_week', 'month', 'day_of_month',
        'Energy Consumption (kWh)'
    ]

    # Sibling of input_dir, so the final move stays on the same filesystem.
    staging_dir = f"{input_dir}_staging"

    batch_id = 0
    while True:
        try:
            # Sample 5-9 rows, capped so a small source file does not make
            # DataFrame.sample raise on every iteration.
            batch_size = min(int(np.random.randint(5, 10)), len(full_df))
            raw = full_df.sample(batch_size).copy()

            raw['timestamp'] = pd.to_datetime(raw['Unix Timestamp'], unit='s')

            # 1. DATE FIELDS (needed for feature engineering in the UDF)
            raw['hour'] = raw['timestamp'].dt.hour
            raw['month'] = raw['timestamp'].dt.month
            raw['day_of_week'] = raw['timestamp'].dt.dayofweek  # 0=Mon, 6=Sun
            raw['day_of_month'] = raw['timestamp'].dt.day

            # 2. CYCLICAL FEATURES
            raw['hour_sin'] = np.sin(2 * np.pi * raw['hour'] / 24)
            raw['hour_cos'] = np.cos(2 * np.pi * raw['hour'] / 24)
            raw['month_sin'] = np.sin(2 * np.pi * raw['month'] / 12)
            raw['month_cos'] = np.cos(2 * np.pi * raw['month'] / 12)

            # 3. WEATHER ENCODING (labels missing from weather_map become 0)
            raw['Weather_Type_encoded'] = raw['Weather_Type'].map(weather_map).fillna(0)

            # 4. SELECT COLUMNS and make sure no NaNs are written
            out_df = raw[cols].copy()
            out_df = out_df.fillna(0)

            # 5. PUBLISH: write outside the watched directory, then move the
            # finished file in with a single rename.
            os.makedirs(staging_dir, exist_ok=True)
            staged_path = os.path.join(staging_dir, f"batch_{batch_id}.csv")
            out_df.to_csv(staged_path, index=False)
            os.replace(staged_path, f"{input_dir}/batch_{batch_id}.csv")

            batch_id += 1
            time.sleep(3)

        except Exception as e:
            # Best effort: report the problem and retry shortly.
            print(f"Simulator Error: {e}")
            time.sleep(1)

# Launch the simulator on a background daemon thread so this cell returns
# immediately and the thread does not keep the process alive on exit.
t = Thread(target=stream_generator, daemon=True)
t.start()
print("Simulator Running (Updated with Date Fields)")


Simulator started...
Simulator Running (Updated with Date Fields)


In [4]:
# This Spark streaming pipeline reads incoming CSV batches generated by the simulator
# using a schema that exactly matches the engineered feature columns. It applies the
# LightGBM-based UDF to compute predicted energy consumption for each row, then selects
# a few key fields—such as apparent power, a sample device flag, predicted consumption,
# and actual consumption—for easy monitoring. The results are written to an in-memory
# table so they can be queried live as the stream updates.

# Column names, in the same order as the simulator's batch files.
schema_cols = [
    # raw readings
    'Outside_Temperature_C', 'Line Voltage', 'Voltage', 'Apparent Power',
    # cyclical time encodings
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
    # device columns
    'Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave',
    # encoded weather label
    'Weather_Type_encoded',
    # calendar fields
    'hour', 'day_of_week', 'month', 'day_of_month',
    # observed value, kept for comparison with the prediction
    'Energy Consumption (kWh)',
]

# Every column is read as a nullable float.
schema = StructType(
    fields=[
        StructField(name=column_name, dataType=FloatType(), nullable=True)
        for column_name in schema_cols
    ]
)

# Read Stream: treat every CSV that appears in input_dir as new streaming input.
stream = (
    spark.readStream
    .schema(schema)
    .option("header", "true")
    .csv(input_dir)
)

# Predict: the column names below are listed in the positional order of
# predict_energy's parameters.
udf_input_columns = [
    "Outside_Temperature_C", "Line Voltage", "Voltage", "Apparent Power",
    "hour_sin", "hour_cos", "month_sin", "month_cos",
    "Television", "Dryer", "Oven", "Refrigerator", "Microwave",
    "Weather_Type_encoded",
    "hour", "day_of_week", "month", "day_of_month",
]
predictions = stream.withColumn(
    "Predicted_Energy_kWh",
    predict_energy_udf(*[col(column_name) for column_name in udf_input_columns]),
)

# Display: keep a handful of columns and append them to the in-memory table
# "energy_preds" so they can be queried with spark.sql.
monitor_view = predictions.select(
    col("Apparent Power"),
    col("Television"),
    col("Predicted_Energy_kWh").alias("Predicted Consumption"),
    col("Energy Consumption (kWh)").alias("Actual Consumption"),
)
query = (
    monitor_view.writeStream
    .format("memory")
    .queryName("energy_preds")
    .outputMode("append")
    .start()
)

print("Streaming Started")


Streaming Started


In [5]:
import time
from IPython.display import clear_output, display
import pandas as pd

MONITOR_SECONDS = 120  # total monitoring time (2 minutes)
POLL_SECONDS = 3       # pause between polls of the in-memory table
ROWS_TO_SHOW = 20      # number of most recent rows displayed per update

start = time.time()
last_count = 0

print(f"Monitoring started... (Updates every {POLL_SECONDS}s)")

while time.time() - start < MONITOR_SECONDS:
    if spark.catalog.tableExists("energy_preds"):
        # Cheap check first: only the row count is computed on every poll.
        current_count = spark.table("energy_preds").count()

        if current_count > last_count:
            # New rows arrived, so now it is worth pulling the table to the
            # driver. toPandas() collects every row, so it is skipped on
            # polls where nothing changed.
            pdf = spark.table("energy_preds").toPandas()
            # Rows may have been appended between the count and the fetch;
            # report the size of what is actually displayed.
            current_count = len(pdf)

            clear_output(wait=True)
            print(f"NEW BATCH RECEIVED! Total Rows: {current_count} (+{current_count - last_count})")
            print(f"Time Elapsed: {int(time.time() - start)}s")

            # Show the most recent rows
            display(pdf.tail(ROWS_TO_SHOW))

            last_count = current_count
        else:
            # Print a dot to show the loop is alive and waiting for data
            print(".", end="", flush=True)

    time.sleep(POLL_SECONDS)

print("\n Monitoring stopped.")


NEW BATCH RECEIVED! Total Rows: 315 (+34)
Time Elapsed: 117s


Unnamed: 0,Apparent Power,Television,Predicted Consumption,Actual Consumption
295,1620.0,1.0,62.064972,60.281746
296,1814.0,0.0,51.674053,23.568869
297,1999.940186,0.0,81.613487,84.253296
298,1744.371094,0.0,26.788591,18.168468
299,1546.0,0.0,98.997597,94.443642
300,1714.753296,0.0,45.935726,38.253044
301,1669.0,1.0,91.710129,100.176033
302,2439.266602,1.0,134.52562,130.670685
303,1684.365234,0.0,38.756252,24.993904
304,1757.0,1.0,34.28828,20.589474



 Monitoring stopped.


In [6]:
# Stop the streaming query that feeds the "energy_preds" in-memory table.
# This does not stop the simulator: it runs on a daemon thread in an endless
# loop and keeps writing batch files.
query.stop()

In [7]:
# if lgb_model:
#     print(f"Model expects {lgb_model.num_feature()} features")
#     print(f"Model feature names: {lgb_model.feature_name()}")
