## Experiments data manipulation

Import packages and set global variables

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import random
from datetime import timedelta

# Room ids that the experiment cells pick from when injecting drift.
room_ids = [
    1, 2, 3, 4, 6, 7, 8, 9, 20, 21,
    27, 28, 29, 30, 32, 33, 34, 35, 36, 37,
    38, 39, 40, 41, 311, 51, 56, 57, 59, 60,
    87, 88, 89, 90, 91, 92, 93, 94, 95, 104,
    105, 115, 116, 117, 297, 261,
]

# Sizing knobs (presumably: injections per drift type / rows per injection).
N_INJECTIONS_PER_TYPE = 10
ROWS_PER_INJECTION = 12

# Sensor columns of sensor_data_history that may be modified.
sensors = ["temperature", "airquality", "daylight", "light"]

def log_change(cursor, row_id, field, old_value, new_value, room_id, timestamp, drift_type):
    """Insert one record into the ``injected_drift`` table.

    Args:
        cursor: DB-API cursor used to run the INSERT.
        row_id: Accepted for call-site symmetry; it is not written to the table.
        field: Name of the modified sensor column.
        old_value: Value before the injection.
        new_value: Value after the injection.
        room_id: Room the modified reading belongs to.
        timestamp: Timestamp of the reading. Objects exposing ``isoformat``
            are rendered with a space between date and time; anything else
            is stored unchanged.
        drift_type: Label describing the kind of injected change.
    """
    stamp = timestamp.isoformat(sep=" ") if hasattr(timestamp, "isoformat") else timestamp
    record = (room_id, stamp, field, old_value, new_value, drift_type)
    insert_sql = """
        INSERT INTO injected_drift (room_id, timestamp, field, old_value, new_value, drift_type)
        VALUES (?, ?, ?, ?, ?, ?)
    """
    cursor.execute(insert_sql, record)


Experiment - Sudden change

In [None]:
# Sudden change: for a randomly chosen reading of a room, add a constant offset
# to one or more sensors on that reading and on every later reading of the
# room, and record each modified row in injected_drift.
conn = sqlite3.connect("exp6.db")
try:
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS injected_drift (
            room_id INTEGER,
            timestamp TEXT,
            field TEXT,
            old_value TEXT,
            new_value TEXT,
            drift_type TEXT
        )
    """)

    # Snapshot of the most recent day; only used to pick the event reading
    # (rowid / timestamp), never as the source of sensor values.
    timestamp_query = pd.read_sql("SELECT MAX(timestamp) as max_ts FROM sensor_data_history", conn)
    last_day = pd.to_datetime(timestamp_query['max_ts'][0]).normalize()
    df = pd.read_sql(f"""
        SELECT rowid, * FROM sensor_data_history
        WHERE DATE(timestamp) = DATE('{last_day.date()}')
          AND room_id IN ({','.join(map(str, room_ids))})
    """, conn, parse_dates=['timestamp'])

    for _ in range(N_INJECTIONS_PER_TYPE):
        room = random.choice(room_ids)
        subset = df[df['room_id'] == room]
        if len(subset) < 6:
            continue

        event_row = subset.sample(1).iloc[0]
        # Plain int so the comparison with rowids returned by sqlite3 is exact.
        event_rowid = int(event_row['rowid'])
        event_ts_str = event_row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')

        changed_sensors = random.sample(sensors, random.randint(1, len(sensors)))
        drift_amounts = {s: 10 for s in changed_sensors}

        for s, amt in drift_amounts.items():
            # Read the affected rows from the database *before* updating them.
            # The snapshot in `df` goes stale as soon as a room is selected a
            # second time, so old/new values are taken from the live table.
            cursor.execute(
                f"""
                SELECT rowid, timestamp, {s} AS old_val, {s} + ? AS new_val
                FROM sensor_data_history
                WHERE room_id = ? AND timestamp >= ?
                ORDER BY timestamp, rowid
                """, (amt, room, event_ts_str)
            )
            affected = cursor.fetchall()

            cursor.execute(
                f"""
                UPDATE sensor_data_history
                SET {s} = {s} + ?
                WHERE room_id = ? AND timestamp >= ?
                """, (amt, room, event_ts_str)
            )

            # Each modified row is logged exactly once: the event reading as
            # 'abrupt', every other shifted reading as 'propagated'.
            for rid, ts, old_val, new_val in affected:
                is_event = rid == event_rowid
                log_change(
                    cursor,
                    rid,
                    s,
                    str(old_val),
                    str(new_val),
                    room,
                    event_ts_str if is_event else str(ts),
                    'abrupt' if is_event else 'propagated'
                )

    conn.commit()
finally:
    # Always release the database file, even when an injection fails;
    # uncommitted changes are discarded by close().
    conn.close()


Experiment - Incremental change

In [None]:
# Incremental change: starting at a random reading of a room, ramp one sensor
# linearly up to its total drift over the following three hours, then keep the
# full offset on every later reading. Each modified row is logged.
conn = sqlite3.connect("exp8.db")
try:
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS injected_drift (
            room_id INTEGER,
            timestamp TEXT,
            field TEXT,
            old_value TEXT,
            new_value TEXT,
            drift_type TEXT
        )
    """)
    conn.commit()

    # Offset each sensor has reached once its ramp is complete.
    total_drifts = {
        'temperature': 10,
        'airquality': 200,
        'daylight': 50,
        'light': 50
    }

    df_meta = pd.read_sql("SELECT MAX(timestamp) AS max_ts FROM sensor_data_history", conn)
    last_day = pd.to_datetime(df_meta['max_ts'][0]).normalize()
    last_day_str = str(last_day.date())
    df_full = pd.read_sql(
        "SELECT rowid, room_id, timestamp, temperature, airquality, daylight, light FROM sensor_data_history"
        " WHERE DATE(timestamp)=DATE(?)",
        conn,
        params=(last_day_str,),
        parse_dates=['timestamp']
    )
    # Candidate rooms do not change between iterations, so compute them once.
    rooms = df_full['room_id'].unique().tolist()

    for _ in range(N_INJECTIONS_PER_TYPE):
        room = random.choice(rooms)
        # Re-read the room from the database on every iteration. Ramp rows are
        # written as absolute values, so reusing the initial snapshot would
        # overwrite (and mis-log) an earlier injection into the same room.
        df_room = pd.read_sql(
            "SELECT rowid, room_id, timestamp, temperature, airquality, daylight, light"
            " FROM sensor_data_history"
            " WHERE room_id = ? AND DATE(timestamp)=DATE(?)",
            conn,
            params=(room, last_day_str),
            parse_dates=['timestamp']
        ).sort_values('timestamp')
        if len(df_room) < 6:
            continue
        event_ts = df_room.sample(1).iloc[0]['timestamp']

        sensor = random.choice(list(total_drifts))
        drift_total = total_drifts[sensor]

        cutoff_ts = event_ts + pd.Timedelta(hours=3)
        cutoff_ts_str = cutoff_ts.strftime('%Y-%m-%d %H:%M:%S')

        # df_room is already time-ordered, so the window is too.
        in_window = (df_room['timestamp'] >= event_ts) & (df_room['timestamp'] <= cutoff_ts)
        window_rows = df_room[in_window]
        n = len(window_rows)
        if n == 0:
            continue

        # Equal increments so the last window row carries the full drift.
        step = drift_total / n

        for i, row in enumerate(window_rows.itertuples(), start=1):
            rid = row.rowid
            ts_str = row.timestamp.strftime('%Y-%m-%d %H:%M:%S')
            orig_val = getattr(row, sensor)
            new_val = orig_val + step * i
            log_change(cursor, rid, sensor, str(orig_val), str(new_val), room, ts_str, 'incremental')
            cursor.execute(
                f"UPDATE sensor_data_history SET {sensor}=? WHERE rowid=?",
                (new_val, rid)
            )
        conn.commit()

        # Readings after the ramp keep the complete offset.
        plateau_rows = df_room[df_room['timestamp'] > cutoff_ts]
        for row in plateau_rows.itertuples():
            ts_str = row.timestamp.strftime('%Y-%m-%d %H:%M:%S')
            orig_val = getattr(row, sensor)
            new_val = orig_val + drift_total
            log_change(cursor, row.rowid, sensor, str(orig_val), str(new_val), room, ts_str, 'propagated')
        cursor.execute(
            f"UPDATE sensor_data_history SET {sensor} = {sensor} + ? WHERE room_id = ? AND timestamp > ?",
            (drift_total, room, cutoff_ts_str)
        )
        conn.commit()
finally:
    # Release the database file even if an injection raised part-way through.
    conn.close()


Experiment - Reoccurring Change

In [None]:
# Reoccurring change: pick ROWS_PER_INJECTION random readings of a room on the
# most recent day and offset one or more sensors on exactly those readings.
# The offset grows linearly with the time elapsed since the earliest sampled
# reading and is capped at the sensor's total drift after three hours.
conn = sqlite3.connect("exp7.db")
try:
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS injected_drift (
            room_id INTEGER,
            timestamp TEXT,
            field TEXT,
            old_value TEXT,
            new_value TEXT,
            drift_type TEXT
        )
    """)

    last_day = pd.to_datetime(
        pd.read_sql("SELECT MAX(timestamp) as max_ts FROM sensor_data_history", conn)['max_ts'][0]
    ).normalize()
    last_day_str = str(last_day.date())

    # Maximum offset per sensor; constant, so defined once outside the loop.
    total_drifts = {
        'temperature': 10,
        'airquality': 200,
        'daylight': 50,
        'light': 50
    }

    for _ in range(N_INJECTIONS_PER_TYPE):
        room = random.choice(room_ids)
        # Load the room's current values on every iteration: rows are written
        # as absolute values, so a snapshot taken before the loop would undo
        # (and mis-log) an earlier injection when a room is drawn again.
        subset = pd.read_sql(
            "SELECT rowid, * FROM sensor_data_history"
            " WHERE room_id = ? AND DATE(timestamp)=DATE(?)",
            conn,
            params=(room, last_day_str),
            parse_dates=['timestamp']
        )
        # DataFrame.sample raises ValueError when asked for more rows than
        # exist, so skip rooms that cannot supply a full injection.
        if len(subset) < ROWS_PER_INJECTION:
            continue

        sample = subset.sample(ROWS_PER_INJECTION).sort_values('timestamp')
        event_ts = sample['timestamp'].iloc[0]

        changed_sensors = random.sample(sensors, random.randint(1, len(sensors)))

        for _, row in sample.iterrows():
            rowid = int(row['rowid'])
            row_ts = row['timestamp']
            dt_hours = (row_ts - event_ts).total_seconds() / 3600.0
            # Linear ramp over the first three hours, full drift afterwards.
            factor = dt_hours / 3.0 if dt_hours <= 3 else 1.0
            ts_str = row_ts.strftime('%Y-%m-%d %H:%M:%S')
            for s in changed_sensors:
                orig_val = row[s]
                new_val = orig_val + total_drifts[s] * factor
                log_change(
                    cursor,
                    rowid,
                    s,
                    str(orig_val),
                    str(new_val),
                    room,
                    ts_str,
                    'incremental'
                )
                cursor.execute(
                    f"UPDATE sensor_data_history SET {s} = ? WHERE rowid = ?",
                    (new_val, rowid)
                )
        conn.commit()

    conn.commit()
finally:
    # Release the database file even if an injection raised part-way through.
    conn.close()


Experiment - Gradual change

In [None]:
# Gradual change: for each selected room, offset one sensor inside 2-4 randomly
# placed time windows of the most recent day, then keep the offset on every
# reading after the latest window. Each modified row is logged once.
conn = sqlite3.connect("exp9.db")
try:
    cursor = conn.cursor()
    print("Connected to exp9.db and preparing drift-log table.")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS injected_drift (
            room_id INTEGER,
            timestamp TEXT,
            field TEXT,
            old_value TEXT,
            new_value TEXT,
            drift_type TEXT
        )
    """)
    conn.commit()
    print("Ensured injected_drift table exists.")

    total_drifts = {'temperature': 10, 'airquality': 200, 'daylight': 50, 'light': 50}
    sensors = list(total_drifts.keys())
    print(f"Drift amounts defined: {total_drifts}")

    ts_max = pd.read_sql("SELECT MAX(timestamp) AS max_ts FROM sensor_data_history", conn)['max_ts'][0]
    last_day = pd.to_datetime(ts_max).normalize()
    print(f"Last day detected: {last_day.date()}")
    df = pd.read_sql(
        f"SELECT rowid, room_id, timestamp, temperature, airquality, daylight, light"
        f" FROM sensor_data_history WHERE DATE(timestamp)=DATE('{last_day.date()}')",
        conn, parse_dates=['timestamp']
    ).sort_values(['room_id', 'timestamp'])
    print(f"Loaded {len(df)} rows of last-day sensor data.")

    # Rooms are drawn without replacement, so every room is processed at most
    # once and the snapshot in `df` stays valid for it.
    rooms = df['room_id'].unique().tolist()
    selected_rooms = random.sample(rooms, min(N_INJECTIONS_PER_TYPE, len(rooms)))
    print(f"Selected rooms: {selected_rooms}")

    for room in selected_rooms:
        df_room = df[df['room_id'] == room].copy()
        if len(df_room) < 60:
            print(f"Skipping room {room}, only {len(df_room)} rows")
            continue
        sensor = random.choice(sensors)
        drift_total = total_drifts[sensor]
        print(f"\nRoom {room}: drifting '{sensor}' by {drift_total} over windows")
        num_windows = random.randint(2, 4)

        latest_end_ts = None      # end of the window that finishes last in time
        drifted_rowids = set()    # rows already shifted by an earlier window

        for i in range(num_windows):
            start_idx = random.randint(0, len(df_room) - 30)
            start_ts = df_room.iloc[start_idx]['timestamp']
            span = random.randint(30, 90)
            end_ts = start_ts + pd.Timedelta(minutes=span)
            print(f" Window {i+1}: {start_ts} – {end_ts}")
            in_window = (df_room['timestamp'] >= start_ts) & (df_room['timestamp'] <= end_ts)
            window_df = df_room[in_window]
            print(f"  df has {len(window_df)} rows in window")
            for _, row in window_df.iterrows():
                rid = int(row['rowid'])
                # Windows are placed independently and may overlap; a row
                # covered twice is shifted and logged only once.
                if rid in drifted_rowids:
                    continue
                drifted_rowids.add(rid)
                old = row[sensor]
                new = old + drift_total
                ts_str = row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
                print(f"   -> row {rid} at {ts_str}: {old}->{new}")
                log_change(cursor, rid, sensor, str(old), str(new), room, ts_str, 'abrupt')
                cursor.execute(
                    f"UPDATE sensor_data_history SET {sensor}=? WHERE rowid=?",
                    (new, rid)
                )
            conn.commit()
            # Windows are generated in random order, so track the maximum end
            # time rather than the end of whichever window was drawn last.
            if latest_end_ts is None or end_ts > latest_end_ts:
                latest_end_ts = end_ts

        if latest_end_ts is not None:
            plateau_df = df_room[df_room['timestamp'] > latest_end_ts]
            print(f" Plateau: {len(plateau_df)} rows post-{latest_end_ts}")
            for _, row in plateau_df.iterrows():
                rid = int(row['rowid'])
                old = row[sensor]
                new = old + drift_total
                ts_str = row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
                print(f"   -> plateau row {rid} at {ts_str}: {old}->{new}")
                log_change(cursor, rid, sensor, str(old), str(new), room, ts_str, 'propagated')
                cursor.execute(
                    f"UPDATE sensor_data_history SET {sensor}=? WHERE rowid=?",
                    (new, rid)
                )
            conn.commit()
finally:
    # Release the database file even if an injection raised part-way through.
    conn.close()
print("Done.")


Experiment - Outliers

In [None]:
# Outliers: overwrite one sensor value of a randomly chosen reading with an
# extreme constant (9999 for daylight/light, -1000 otherwise) and log it.
conn = sqlite3.connect("exp5.db")
try:
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS injected_drift (
            room_id INTEGER,
            timestamp TEXT,
            field TEXT,
            old_value TEXT,
            new_value TEXT,
            drift_type TEXT
        )
    """)

    last_day = pd.to_datetime(
        pd.read_sql("SELECT MAX(timestamp) as max_ts FROM sensor_data_history", conn)['max_ts'][0]
    ).normalize()
    df = pd.read_sql(f"""
        SELECT rowid, * FROM sensor_data_history
        WHERE DATE(timestamp) = DATE('{last_day.date()}') AND room_id IN ({','.join(map(str, room_ids))})
    """, conn, parse_dates=['timestamp'])

    for _ in range(N_INJECTIONS_PER_TYPE):
        room = random.choice(room_ids)
        subset = df[df['room_id'] == room]
        if subset.empty:
            continue
        row = subset.sample(1).iloc[0]
        # A value pulled out of a mixed-dtype row may be a NumPy integer.
        # sqlite3 only binds built-in int as INTEGER, so without the cast the
        # rowid comparison below may match no row and the UPDATE would
        # silently change nothing.
        rowid = int(row['rowid'])
        sensor = random.choice(sensors)
        old_val = row[sensor]
        new_val = 9999 if sensor in ['daylight', 'light'] else -1000
        cursor.execute(f"""
            UPDATE sensor_data_history SET {sensor} = ?
            WHERE rowid = ?
        """, (new_val, rowid))
        log_change(cursor, rowid, sensor, str(old_val), str(new_val), int(row['room_id']), row['timestamp'], 'outlier')

    conn.commit()
finally:
    # Release the database file even if an injection raised part-way through.
    conn.close()
