In [1]:
!pip install pyspark findspark lightgbm numpy pandas -q

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, array
from pyspark.sql.types import StructType, StructField, FloatType, DoubleType, ArrayType

# Create (or reuse) the Spark session for this notebook: local mode with two
# worker threads and a 2 GB driver memory setting.
spark = (
    SparkSession.builder
    .appName("Energy_Consumption_Streaming")
    .master("local[2]")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

print("‚úì Spark Session Created")


‚úì Spark Session Created


In [2]:
import lightgbm as lgb
import pickle
import numpy as np

# 1. Load Model
# The Booster is read from a LightGBM text model file. The prediction UDF
# defined below depends on `lgb_model`, so a failed load must stop the notebook
# here: continuing would leave `lgb_model` undefined and every prediction would
# silently fall back to the -1.0 error sentinel.
try:
    lgb_model = lgb.Booster(model_file='lightgbm_energy_model.txt')
except Exception as e:
    print(f"‚ö†Ô∏è Error loading files: {e}")
    print("Make sure you uploaded 'lightgbm_energy_model.txt'!")
    raise  # surface the real failure instead of running on without a model
else:
    print("‚úì LightGBM Model loaded")

# NOTE(review): loading 'energy_scaler.pkl' is currently disabled, so no feature
# scaling is applied at prediction time — confirm this matches how the model was
# trained. If it is re-enabled, only unpickle files from a trusted source.

# 2. Define Features Order (CRITICAL)
# Based on your training notebook, these are the features used:
# [num_cols + cat_cols]
# Numerical: ['Outside_Temperature_C', 'Line Voltage', 'Voltage', 'Apparent Power', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
# Categorical: ['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave', 'Weather_Type_encoded']

# SCALER_FEATURES = [
#     'Outside_Temperature_C', 'Line Voltage', 'Voltage', 'Apparent Power',
#     'hour_sin', 'hour_cos', 'month_sin', 'month_cos'
# ]

# 3. Define Prediction UDF
def predict_energy(temp, line_voltage, voltage, apparent_power,
                   hour_sin, hour_cos, month_sin, month_cos,
                   tv, dryer, oven, fridge, microwave, weather_encoded):
    """Predict energy consumption for a single row with the loaded LightGBM model.

    The 14 arguments are converted to floats and passed to ``lgb_model.predict``
    as one row, in exactly the order of the parameter list (eight numerical
    features followed by six categorical/encoded ones).

    NOTE(review): the values are passed to the model unscaled. If the model was
    trained on scaled numerical features, the scaler has to be applied here
    first — confirm against the training notebook.

    Returns:
        float: the model's prediction, or the sentinel ``-1.0`` when the row
        could not be scored (a null/non-numeric input, or any error raised by
        the model). The reason is written to stderr, so a result column that
        contains only -1.0 can be diagnosed from the logs.
    """
    # Local import so the function does not rely on a module-level `import sys`.
    import sys

    named_inputs = (
        ("temp", temp),
        ("line_voltage", line_voltage),
        ("voltage", voltage),
        ("apparent_power", apparent_power),
        ("hour_sin", hour_sin),
        ("hour_cos", hour_cos),
        ("month_sin", month_sin),
        ("month_cos", month_cos),
        ("tv", tv),
        ("dryer", dryer),
        ("oven", oven),
        ("fridge", fridge),
        ("microwave", microwave),
        ("weather_encoded", weather_encoded),
    )

    try:
        # Null inputs would otherwise fail inside float() with an unhelpful
        # TypeError; name the offending inputs explicitly.
        missing = [name for name, value in named_inputs if value is None]
        if missing:
            raise ValueError(f"missing input value(s): {', '.join(missing)}")

        features = [float(value) for _, value in named_inputs]

        # LightGBM expects 2D input: one row holding the 14 features.
        pred = lgb_model.predict([features])[0]
        return float(pred)

    except Exception as exc:
        # Keep the -1.0 sentinel so the stream keeps running, but never hide
        # the cause of the failure.
        print(f"predict_energy failed: {type(exc).__name__}: {exc}", file=sys.stderr)
        return -1.0

# Wrap predict_energy as a Spark UDF whose result column has type FloatType.
predict_energy_udf = udf(f=predict_energy, returnType=FloatType())
print("‚úì Prediction UDF Registered")


‚úì LightGBM Model loaded
‚úì Prediction UDF Registered


In [3]:
import time
import os
import shutil
import pandas as pd
import numpy as np
from threading import Thread

# Simulator configuration
SOURCE_CSV_PATH = 'smart_home_dataset_with_weather.csv'  # UPDATE THIS to the dataset's location
input_dir = "/content/streaming_energy"  # directory the simulator writes batch files into

# Start from an empty directory: remove anything left from a previous run,
# then recreate it.
if os.path.exists(input_dir):
    shutil.rmtree(input_dir)
os.makedirs(input_dir)

def stream_generator():
    """Continuously write small feature-engineered CSV batches into ``input_dir``.

    Loads ``SOURCE_CSV_PATH`` once, then loops forever. Each iteration samples
    5-9 rows (fewer if the dataset is smaller), derives cyclical hour/month
    features from 'Unix Timestamp', encodes 'Weather_Type', and writes the 14
    model input columns to ``batch_<n>.csv``, then sleeps 3 seconds. Errors
    inside an iteration are printed and the loop retries after 1 second.

    Returns early (after printing the reason) only when the dataset cannot be
    loaded or contains no rows. Intended to run on a background thread.
    """
    print(f"üì° Simulator started...")
    try:
        full_df = pd.read_csv(SOURCE_CSV_PATH)
    except FileNotFoundError:
        print("CSV not found.")
        return
    except Exception as exc:
        # Parse or permission errors are not "not found"; report the real cause.
        print(f"Could not read {SOURCE_CSV_PATH}: {exc}")
        return

    if full_df.empty:
        print(f"{SOURCE_CSV_PATH} contains no rows; nothing to stream.")
        return

    # Weather encoding.
    # NOTE(review): this mapping is a placeholder guess, not the encoder fitted
    # during training. Replace it with the real label mapping; until then the
    # encoded weather feature may not match what the model learned.
    weather_map = {'clear': 0, 'cloudy': 1, 'rainy': 2, 'overcast': 3, 'sunny': 4}

    # Columns written to each batch file, in the order the prediction UDF
    # receives them. Loop-invariant, so defined once.
    cols = [
        'Outside_Temperature_C', 'Line Voltage', 'Voltage', 'Apparent Power',
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave',
        'Weather_Type_encoded'
    ]

    batch_id = 0
    while True:
        try:
            # Sample 5-9 rows, capped at the dataset size so that sampling
            # cannot fail on a small file.
            sample_size = min(np.random.randint(5, 10), len(full_df))
            raw = full_df.sample(sample_size).copy()

            # Feature engineering: cyclical encodings of hour and month.
            raw['timestamp'] = pd.to_datetime(raw['Unix Timestamp'], unit='s')
            raw['hour'] = raw['timestamp'].dt.hour
            raw['month'] = raw['timestamp'].dt.month

            raw['hour_sin'] = np.sin(2 * np.pi * raw['hour'] / 24)
            raw['hour_cos'] = np.cos(2 * np.pi * raw['hour'] / 24)
            raw['month_sin'] = np.sin(2 * np.pi * raw['month'] / 12)
            raw['month_cos'] = np.cos(2 * np.pi * raw['month'] / 12)

            # Weather types missing from weather_map fall back to 0, i.e. they
            # are treated the same as 'clear'.
            raw['Weather_Type_encoded'] = raw['Weather_Type'].map(weather_map).fillna(0)

            # Write under a hidden temporary name and then rename, so a reader
            # of the directory never sees a partially written batch file
            # (Spark's file listing skips names that start with '.').
            final_path = os.path.join(input_dir, f"batch_{batch_id}.csv")
            temp_path = os.path.join(input_dir, f".batch_{batch_id}.csv.tmp")
            raw[cols].to_csv(temp_path, index=False)
            os.replace(temp_path, final_path)

            batch_id += 1
            time.sleep(3)

        except Exception as e:
            # Best effort: report the problem and try again shortly.
            print(e)
            time.sleep(1)

# Launch the simulator on a daemon thread, so it runs in the background and
# does not keep the interpreter alive at exit.
t = Thread(target=stream_generator, daemon=True)
t.start()
print("‚úì Simulator Running")


üì° Simulator started...‚úì Simulator Running



In [4]:
# Column layout of the simulator's CSV batches, in file order.
schema_cols = [
    'Outside_Temperature_C',
    'Line Voltage',
    'Voltage',
    'Apparent Power',
    'hour_sin',
    'hour_cos',
    'month_sin',
    'month_cos',
    'Television',
    'Dryer',
    'Oven',
    'Refrigerator',
    'Microwave',
    'Weather_Type_encoded',
]

# Every column is read as a nullable float.
schema = StructType(
    [StructField(name, FloatType(), nullable=True) for name in schema_cols]
)

# Read every CSV file that appears in input_dir as streaming input, using the
# explicit schema and treating the first line of each file as a header.
stream = (
    spark.readStream
    .option("header", "true")
    .schema(schema)
    .csv(input_dir)
)

# Add the prediction column. schema_cols lists the columns in exactly the order
# of the UDF's parameters, so the arguments can be derived from it.
predictions = stream.withColumn(
    "Predicted_Energy_kWh",
    predict_energy_udf(*[col(name) for name in schema_cols]),
)

# Keep three columns for display and append them to the in-memory table
# "energy_preds", which can then be queried with spark.sql.
query = (
    predictions
    .select("Apparent Power", "Television", "Predicted_Energy_kWh")
    .writeStream
    .format("memory")
    .queryName("energy_preds")
    .outputMode("append")
    .start()
)

print("‚úì Streaming Started")


‚úì Streaming Started


In [5]:
import time
from IPython.display import clear_output, display
import pandas as pd

# For 90 seconds, redraw the last 10 rows of the "energy_preds" table,
# pausing 2 seconds between refreshes.
start = time.time()
while True:
    if time.time() - start >= 90:
        break
    clear_output(wait=True)
    table_ready = spark.catalog.tableExists("energy_preds")
    if table_ready:
        df = spark.sql("SELECT * FROM energy_preds")
        display(df.toPandas().tail(10))
    time.sleep(2)


Unnamed: 0,Apparent Power,Television,Predicted_Energy_kWh
311,1660.0,1.0,-1.0
312,1899.372803,0.0,-1.0
313,2154.472656,1.0,-1.0
314,1637.115479,1.0,-1.0
315,1943.30603,0.0,-1.0
316,2247.158691,0.0,-1.0
317,1700.101074,0.0,-1.0
318,1874.267578,1.0,-1.0
319,2359.470947,1.0,-1.0
320,1627.0,0.0,-1.0


In [6]:
# Stop the streaming query that feeds the "energy_preds" table. This does not
# stop the simulator thread, which keeps writing batch files in its own loop.
query.stop()