In [1]:
# 1. Install PySpark
!pip install pyspark findspark -q

# 2. Initialize Spark Session with MEMORY LIMITS
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, array
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType, ArrayType

# Build (or reuse, if one already exists) the notebook's Spark session.
# Master is "local[2]"; driver and executor memory are each set to 2g and the
# shuffle partition count to 2 so the session stays small and avoids OOM.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("IoT_SmartHouse_Streaming")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

print("‚úì Spark Session Created (Memory Optimized)")


‚úì Spark Session Created (Memory Optimized)


In [2]:
import pickle
import numpy as np
import pandas as pd

# 1. Load Models AND Scaler
# SECURITY NOTE: pickle.load() can execute arbitrary code embedded in the file.
# Only load artifacts that were produced by a trusted training run.
#
# Loading stays best-effort (the notebook keeps running on failure), but the
# handler no longer swallows KeyboardInterrupt/SystemExit and it reports the real
# cause instead of labelling every failure as "not found".
try:
    with open('appliance_models.pkl', 'rb') as f:
        rf_models = pickle.load(f)
    print("‚úì Models loaded")

    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    print("‚úì Scaler loaded")
except FileNotFoundError as exc:
    print("‚ö†Ô∏è CRITICAL: Models or Scaler not found. Prediction will be wrong!")
    print(f"    Missing file: {exc.filename}")
except Exception as exc:
    # e.g. corrupt pickle, incompatible library version, permission problem
    print(
        "CRITICAL: Models or Scaler could not be loaded "
        f"({type(exc).__name__}: {exc}). Prediction will be wrong!"
    )

# 2. Define UDF with Scaling
def predict_appliances(temp, hour_sin, hour_cos, month_sin, month_cos,
                       day_of_week, season, time_of_day, day_of_month,
                       apparent_power, weather_encoded_list):
    """Predict the state of the five appliances for a single sensor reading.

    The eight numeric inputs are passed through the module-level ``scaler``
    in the order
    [temp, day_of_week, day_of_month, apparent_power,
     hour_sin, hour_cos, month_sin, month_cos];
    ``season``, ``time_of_day`` and the weather flags are used unscaled.
    The model input is then assembled in the order
    [hour_sin, hour_cos, month_sin, month_cos, day_of_week, season,
     time_of_day, day_of_month, temp, *weather flags, apparent_power]
    and fed to every model in the module-level ``rf_models`` mapping.

    Returns:
        A list of five ints, one per appliance in the order Television,
        Dryer, Oven, Refrigerator, Microwave. ``[-1] * 5`` is returned when
        ``weather_encoded_list`` is None or when any step raises.
    """
    failure = [-1] * 5
    if weather_encoded_list is None:
        return failure

    # Positions of each quantity inside the row handed to / returned by the scaler.
    TEMP, DOW, DOM, POWER, H_SIN, H_COS, M_SIN, M_COS = range(8)

    try:
        unscaled = np.array([[
            temp, day_of_week, day_of_month, apparent_power,
            hour_sin, hour_cos, month_sin, month_cos
        ]])
        scaled = scaler.transform(unscaled)[0]

        # Scaled numerics + raw categorical codes + raw weather flags, as one row.
        model_input = np.array([
            scaled[H_SIN], scaled[H_COS], scaled[M_SIN], scaled[M_COS],
            scaled[DOW], season, time_of_day, scaled[DOM],
            scaled[TEMP],
            *weather_encoded_list,
            scaled[POWER],
        ]).reshape(1, -1)

        return [
            int(rf_models[device].predict(model_input)[0])
            for device in ('Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave')
        ]
    except Exception:
        # Any failure (missing model, bad input, scaler error) maps to the sentinel.
        return failure

# Wrap predict_appliances as a Spark UDF whose result type is array<int>
# (re-running this cell replaces any earlier registration held in predict_udf).
predict_udf = udf(predict_appliances, returnType=ArrayType(IntegerType()))
print("‚úì UDF updated with SCALING")


‚úì Models loaded
‚úì Scaler loaded
‚úì UDF updated with SCALING


In [3]:
import time
import os
import shutil
import pandas as pd
from threading import Thread

# SETUP
# SOURCE_CSV_PATH: CSV the simulator samples rows from.
# input_dir: directory the simulator writes batch files into and the streaming
#            reader later watches.
SOURCE_CSV_PATH = 'smart_home_dataset_with_weather.csv' # UPDATE THIS
input_dir = "/content/streaming_input"

# Start from an empty directory: delete whatever a previous run left behind,
# then recreate it. NOTE: this removes the whole tree at input_dir.
if os.path.exists(input_dir):
    shutil.rmtree(input_dir)
os.makedirs(input_dir)

def stream_data_generator(max_batches=None, interval_s=3):
    """Simulate an IoT feed by dropping small CSV batches into ``input_dir``.

    Reads the module-level ``SOURCE_CSV_PATH`` once, then repeatedly samples
    5-14 random rows (capped at the dataset size), derives the model features
    and writes them as ``batch_<n>.csv`` inside the module-level ``input_dir``.

    Each file is first written under a hidden temporary name and then moved
    into place with ``os.replace``. Spark's file source requires files to be
    placed atomically in the watched directory; writing the final file
    directly could let the stream pick up a half-written CSV.

    Args:
        max_batches: Stop after this many batches have been written.
            ``None`` (default) keeps producing forever, as before.
        interval_s: Seconds to sleep after each written batch (default 3).

    Returns:
        None. Returns early (after printing a message) when the source CSV
        cannot be read, is empty, or lacks a required column.
    """
    print(f"üì° Simulator started using: {SOURCE_CSV_PATH}")

    # Load Data
    try:
        full_df = pd.read_csv(SOURCE_CSV_PATH)
    except FileNotFoundError:
        print("‚ùå Error: CSV not found. Check path.")
        return
    except Exception as exc:
        # Parse errors, permission problems, etc. are not "file not found".
        print(f"Error: could not read {SOURCE_CSV_PATH}: {exc}")
        return

    # Fail once, up front, instead of printing the same error every second forever.
    source_cols = ['Unix Timestamp', 'Weather_Type', 'Outside_Temperature_C', 'Apparent Power']
    missing_cols = [name for name in source_cols if name not in full_df.columns]
    if missing_cols:
        print(f"Error: {SOURCE_CSV_PATH} is missing required columns: {missing_cols}")
        return
    if full_df.empty:
        print(f"Error: {SOURCE_CSV_PATH} has no rows to sample from.")
        return

    # THESE ARE THE 10 WEATHER COLUMNS YOUR MODEL EXPECTS
    required_weather_cols = [
        'weather_clear', 'weather_cloudy', 'weather_foggy', 'weather_overcast',
        'weather_partly_cloudy', 'weather_rainy', 'weather_snowy', 'weather_sunny',
        'weather_thunderstorm', 'weather_windy'
    ]

    # 4. Output columns in EXACT ORDER expected by the streaming schema
    final_cols = [
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'day_of_week', 'season', 'time_of_day', 'day_of_month',
        'Outside_Temperature_C'
    ] + required_weather_cols + ['Apparent Power']

    batch_id = 0
    while max_batches is None or batch_id < max_batches:
        try:
            # Sample Data (never ask for more rows than the dataset holds)
            batch_size = min(np.random.randint(5, 15), len(full_df))
            raw_batch = full_df.sample(batch_size).copy()

            # --- FEATURE ENGINEERING (Must match training exactly) ---
            raw_batch['timestamp'] = pd.to_datetime(raw_batch['Unix Timestamp'], unit='s')
            raw_batch['hour'] = raw_batch['timestamp'].dt.hour
            raw_batch['month'] = raw_batch['timestamp'].dt.month

            # 1. Cyclical
            raw_batch['hour_sin'] = np.sin(2 * np.pi * raw_batch['hour'] / 24)
            raw_batch['hour_cos'] = np.cos(2 * np.pi * raw_batch['hour'] / 24)
            raw_batch['month_sin'] = np.sin(2 * np.pi * raw_batch['month'] / 12)
            raw_batch['month_cos'] = np.cos(2 * np.pi * raw_batch['month'] / 12)

            # 2. Categorical
            raw_batch['day_of_week'] = raw_batch['timestamp'].dt.dayofweek
            raw_batch['day_of_month'] = raw_batch['timestamp'].dt.day

            # Season mapping: 0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov
            raw_batch['season'] = raw_batch['month'].apply(lambda m: 0 if m in [12,1,2] else 1 if m in [3,4,5] else 2 if m in [6,7,8] else 3)

            # Time of Day mapping: 0 = 05-11h, 1 = 12-16h, 2 = 17-20h, 3 = otherwise
            raw_batch['time_of_day'] = raw_batch['hour'].apply(lambda h: 0 if 5<=h<12 else 1 if 12<=h<17 else 2 if 17<=h<21 else 3)

            # 3. Weather Encoding
            # One 0/1 column per expected weather type, 1 where Weather_Type matches.
            # (Loop variable is not called `col` to avoid confusion with pyspark's col.)
            for weather_col in required_weather_cols:
                weather_type = weather_col.replace('weather_', '')
                raw_batch[weather_col] = (raw_batch['Weather_Type'] == weather_type).astype(int)

            output_df = raw_batch[final_cols]

            # Write batch atomically: the dot-prefixed temp file is skipped by
            # Spark's file listing, and os.replace makes the finished file
            # appear in one step.
            final_path = f"{input_dir}/batch_{batch_id}.csv"
            tmp_path = f"{input_dir}/.batch_{batch_id}.csv.tmp"
            output_df.to_csv(tmp_path, index=False)
            os.replace(tmp_path, final_path)

            batch_id += 1
            time.sleep(interval_s) # Slow down to prevent OOM

        except Exception as e:
            # Best-effort: report and retry shortly rather than killing the thread.
            print(f"Simulator Error: {e}")
            time.sleep(1)

# Start Simulator
# Run the generator on a background daemon thread so this cell returns
# immediately and the thread does not keep the process alive at exit.
t = Thread(target=stream_data_generator, daemon=True)
t.start()
print("‚úì Simulator running...")


üì° Simulator started using: smart_home_dataset_with_weather.csv
‚úì Simulator running...


In [4]:
# 1. Column layout of the CSV batches written by the simulator.
# The ten one-hot weather flags, in file order:
weather_cols_names = [
    'weather_clear',
    'weather_cloudy',
    'weather_foggy',
    'weather_overcast',
    'weather_partly_cloudy',
    'weather_rainy',
    'weather_snowy',
    'weather_sunny',
    'weather_thunderstorm',
    'weather_windy',
]

# Full header: time features, temperature, the weather flags, then power.
stream_cols = [
    'hour_sin', 'hour_cos',
    'month_sin', 'month_cos',
    'day_of_week', 'season', 'time_of_day', 'day_of_month',
    'Outside_Temperature_C',
    *weather_cols_names,
    'Apparent Power',
]

# 2. Schema: every streamed column is read as a nullable float.
schema = StructType([
    StructField(column_name, FloatType(), True)
    for column_name in stream_cols
])

# 3. Streaming reader over the CSV files in input_dir (files carry a header row).
iot_stream = (
    spark.readStream
    .schema(schema)
    .option("header", "true")
    .csv(input_dir)
)

# 4. Transform & Predict
# Column objects for the weather flags; they are packed into one array column
# because the UDF takes the flags as a single list argument.
weather_struct = [col(flag_name) for flag_name in weather_cols_names]

# The UDF arguments below follow the parameter order of predict_appliances.
predictions = (
    iot_stream
    .withColumn("weather_list", array(*weather_struct))
    .withColumn(
        "pred",
        predict_udf(
            col("Outside_Temperature_C"),
            col("hour_sin"),
            col("hour_cos"),
            col("month_sin"),
            col("month_cos"),
            col("day_of_week"),
            col("season"),
            col("time_of_day"),
            col("day_of_month"),
            col("Apparent Power"),
            col("weather_list"),
        ),
    )
)

# 5. Select Output
# Two raw readings plus one column per element of the prediction array.
final_stream = predictions.select(
    col("Outside_Temperature_C").alias("Temp"),
    col("Apparent Power").alias("Power"),
    *[
        col("pred")[position].alias(label)
        for position, label in enumerate(("TV", "Dryer", "Oven", "Fridge", "Micro"))
    ],
)

# 6. Start Query
# Stop any query still running from an earlier execution of this cell first,
# to free memory.
for active_query in spark.streams.active:
    active_query.stop()

# Append results to the in-memory sink, queryable as table "iot_predictions".
query = (
    final_stream.writeStream
    .format("memory")
    .queryName("iot_predictions")
    .outputMode("append")
    .start()
)

print("‚úì Streaming started.")


‚úì Streaming started.


In [5]:
import time
from IPython.display import clear_output, display

# Poll the in-memory results table for 90 seconds, redrawing the output area
# roughly every 2 seconds with the last 10 predictions.
start = time.time()
while (time.time() - start) < 90:
    clear_output(wait=True)

    if spark.catalog.tableExists("iot_predictions"):
        df = spark.sql("SELECT * FROM iot_predictions")
        # tail() returns a plain list of Row objects, so only those 10 rows
        # are turned into a pandas frame for display.
        pdf = df.tail(10)
        if len(pdf) == 0:
            print("Waiting for data...")
        else:
            disp_df = pd.DataFrame(pdf, columns=df.columns)
            print(f"üü¢ LIVE STREAMING: {int(time.time()-start)}s elapsed")
            display(disp_df)

    time.sleep(2)

print("Stopped.")


üü¢ LIVE STREAMING: 89s elapsed


Unnamed: 0,Temp,Power,TV,Dryer,Oven,Fridge,Micro
0,24.299999,1684.0,1,1,1,1,1
1,21.9,1909.0,1,1,1,1,1
2,7.1,1578.0,1,1,1,1,1
3,3.0,1689.0,1,1,1,1,1
4,9.4,1938.0,1,1,1,1,1
5,26.9,1892.854248,1,1,1,1,1
6,6.5,1916.223267,1,1,1,1,1
7,23.700001,1573.0,1,1,1,1,1
8,17.700001,1658.0,1,1,1,1,1
9,14.2,1982.0,1,1,1,1,1


Stopped.


In [6]:
# Stop the streaming query held in `query` (the display loop above only stops
# polling; it does not stop the query itself).
query.stop()
