<a href="https://colab.research.google.com/github/sudo-Oliver/Predictive-Analytics-Private/blob/main/notebooks/Preprocessing_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import gdown
import py7zr

### Load Data

In [15]:
# Download the raw sensor archive from Google Drive (the file ID is embedded in the URL)
#gdown.download("https://drive.google.com/uc?export=download&id=1ykDl_A5YRirIFUeKHCGJoaavaw6HiJqh", "sensor_data.7z", quiet=False)
# Extract the .7z archive into /content/data/raw
#!7z e "/content/sensor_data.7z" -o"/content/data/raw"

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1ykDl_A5YRirIFUeKHCGJoaavaw6HiJqh
From (redirected): https://drive.google.com/uc?export=download&id=1ykDl_A5YRirIFUeKHCGJoaavaw6HiJqh&confirm=t&uuid=6a2d1a30-e583-40e4-b1c6-c61a07f51a38
To: /content/sensor_data.7z
100%|██████████| 592M/592M [00:10<00:00, 59.1MB/s]



7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/                   1 file, 591843432 bytes (565 MiB)

Extracting archive: /content/sensor_data.7z
--
Path = /content/sensor_data.7z
Type = 7z
Physical Size = 591843432
Headers Size = 282
Method = LZMA2:24
Solid = +
Blocks = 1

  0%      0% - household_sensors_electric.csv                                       1% - household_sensors_electric.csv                                       2% - household_sensors_electric.csv                                       3% - household_sensors_

In [16]:
# Metadata tables (home, room, sensor, person), read straight from the project's GitHub repository
df_home, df_room, df_sensor, df_person = (
    pd.read_csv(metadata_url)
    for metadata_url in (
        'https://raw.githubusercontent.com/sudo-Oliver/Predictive-Analytics-Private/refs/heads/main/data/raw/metadata/home.csv',
        'https://raw.githubusercontent.com/sudo-Oliver/Predictive-Analytics-Private/refs/heads/main/data/raw/metadata/room.csv',
        'https://raw.githubusercontent.com/sudo-Oliver/Predictive-Analytics-Private/refs/heads/main/data/raw/metadata/sensor.csv',
        'https://raw.githubusercontent.com/sudo-Oliver/Predictive-Analytics-Private/refs/heads/main/data/raw/metadata/person.csv',
    )
)

# Sensor readings from /content/data/raw; `timestamp_local` is parsed to datetime while loading.
# The two room/tempprobe files are read with the pyarrow CSV engine.
_timestamp_columns = ['timestamp_local']
df_electric = pd.read_csv(
    '/content/data/raw/household_sensors_electric.csv',
    parse_dates=_timestamp_columns,
)
df_gas = pd.read_csv(
    '/content/data/raw/household_sensors_gas.csv',
    parse_dates=_timestamp_columns,
)
df_sensor_room = pd.read_csv(
    '/content/data/raw/room_and_appliance_sensors_room.csv',
    parse_dates=_timestamp_columns,
    engine="pyarrow",
)
df_tempprobe = pd.read_csv(
    '/content/data/raw/household_sensors_tempprobe.csv',
    parse_dates=_timestamp_columns,
    engine="pyarrow",
)

### Clean Electric data
1. Filter out the unreliable data period (April 17, 2018, 08:00-10:00)
2. Extract the home and room IDs
3. Drop unused columns
4. Interpolate missing values in `std_consumption`

In [17]:
def clean_electric_data(df_electric):
    """Clean the household electricity readings.

    Steps, in order:
      1. Remove rows whose ``timestamp_local`` lies in the unreliable window
         2018-04-17 08:00:00 to 10:00:00 (both ends inclusive).
      2. Derive ``homeid`` (digits of ``consumer_id``) and ``type`` / ``roomid``
         (letters / digits of ``room``); the extracted ids are strings.
      3. Drop ``consumer_id``, ``room`` and ``mean_consumption``.
      4. Fill missing ``std_consumption`` values by time-based interpolation,
         separately for each home.
      5. Prefix the remaining consumption columns with ``electric_`` and
         rename ``sensor`` to ``sensorid``.

    Parameters
    ----------
    df_electric : pandas.DataFrame
        Raw readings. Must contain ``timestamp_local`` (datetime),
        ``consumer_id``, ``room``, ``mean_consumption`` and
        ``std_consumption``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh RangeIndex and ``timestamp_local`` as the
        first column.
    """
    unreliable_start = pd.Timestamp('2018-04-17 08:00:00')
    unreliable_end = pd.Timestamp('2018-04-17 10:00:00')

    # Keep everything outside the unreliable window; copy so the column
    # assignments below never write into a view of the caller's frame.
    in_unreliable_window = df_electric['timestamp_local'].between(unreliable_start, unreliable_end)
    df_clean = df_electric.loc[~in_unreliable_window].copy()

    # Extract numeric part from consumer_id -> homeid
    df_clean['homeid'] = df_clean['consumer_id'].str.extract(r'(\d+)', expand=False)

    # Split room into its alphabetic type and numeric roomid
    df_clean[['type', 'roomid']] = df_clean['room'].str.extract(r'([a-zA-Z]+)(\d+)')

    df_clean = df_clean.drop(columns=['consumer_id', 'room', 'mean_consumption'])

    # Interpolate missing std_consumption values along time, one home at a time.
    # Interpolating over the whole frame would fill a gap in one home from the
    # readings of other homes that happen to sit at neighbouring timestamps.
    # dropna=False keeps rows whose homeid could not be extracted in the result.
    df_clean = df_clean.set_index('timestamp_local')
    df_clean['std_consumption'] = (
        df_clean.groupby('homeid', dropna=False, sort=False)['std_consumption']
        .transform(lambda readings: readings.interpolate(method='time'))
        .to_numpy()  # positional assignment; the timestamp index may contain duplicates
    )
    df_clean = df_clean.reset_index()

    # Rename columns
    return df_clean.rename(columns={
        'min_consumption': 'electric_min_consumption',
        'max_consumption': 'electric_max_consumption',
        'median_consumption': 'electric_median_consumption',
        'total_consumption_Wh': 'electric_total_consumption_Wh',
        'sensor': 'sensorid'
    })

# Run every electric cleaning step on the raw readings and preview the first rows of the result
df_electric_clean = clean_electric_data(df_electric)
df_electric_clean.head()



Unnamed: 0.1,timestamp_local,Unnamed: 0,sensorid,electric_min_consumption,electric_max_consumption,std_consumption,electric_median_consumption,electric_total_consumption_Wh,homeid,type,roomid
0,2017-03-07 15:00:00,0,4472,0.0,0.34,0.047283,0.251,0.233624,100,livingroom,1038
1,2017-03-07 16:00:00,1,4472,0.038,2.321,0.785224,0.243,0.677716,100,livingroom,1038
2,2017-03-07 17:00:00,2,4472,0.083,4.746,0.670984,0.404,0.567129,100,livingroom,1038
3,2017-03-07 18:00:00,3,4472,0.048,0.792,0.125662,0.299,0.312481,100,livingroom,1038
4,2017-03-07 19:00:00,4,4472,0.044,1.278,0.043239,0.226,0.20986,100,livingroom,1038


### Clean gas data

In [18]:
def clean_gas_data(df_gas):
    """Clean the household gas readings.

    Rows inside the unreliable window (2018-04-17 08:00:00 to 10:00:00,
    inclusive) are removed, ``homeid`` / ``roomid`` are taken from the digits
    in ``consumer_id`` / ``room``, the columns ``std_consumption``,
    ``consumer_id`` and ``room`` are dropped, and the consumption columns are
    prefixed with ``gas_`` (``sensor`` becomes ``sensorid``).

    The input frame is left untouched; the returned frame keeps the original
    row labels of the surviving rows.
    """
    window_start = pd.Timestamp('2018-04-17 08:00:00')
    window_end = pd.Timestamp('2018-04-17 10:00:00')

    # Rows outside the unreliable window
    inside_window = df_gas['timestamp_local'].between(window_start, window_end)
    gas = df_gas.loc[~inside_window]

    # Numeric ids pulled out of the textual identifiers (kept as strings)
    gas = gas.assign(
        homeid=gas['consumer_id'].str.extract(r'(\d+)', expand=False),
        roomid=gas['room'].str.extract(r'(\d+)', expand=False),
    )

    gas = gas.drop(columns=['std_consumption', 'consumer_id', 'room'])

    new_names = {
        'mean_consumption': 'gas_mean_consumption',
        'min_consumption': 'gas_min_consumption',
        'max_consumption': 'gas_max_consumption',
        'median_consumption': 'gas_median_consumption',
        'total_consumption_Wh': 'gas_total_consumption_Wh',
        'sensor': 'sensorid'
    }
    return gas.rename(columns=new_names)

# Clean the raw gas readings; the result is used by the merge step further down
df_gas_clean = clean_gas_data(df_gas)

### Household_sensors_tempprobe Preprocessing

In [None]:
def clean_data(df_tempprobe):
    """Clean the household temperature-probe readings.

    Removes rows inside the unreliable window (2018-04-17 08:00:00 to
    10:00:00, inclusive), derives ``homeid`` from ``consumer_id`` and
    ``type`` / ``roomid`` from ``room``, drops the mean/min/std/max
    temperature columns together with ``room`` and ``consumer_id``, encodes
    ``measured_entity`` as an integer code (labels not in the table become
    NaN) and renames ``sensor`` to ``sensorid``.

    The input frame is not modified.
    """
    # Text label -> numeric code for the measured pipe
    entity_codes = {
        'central-heating-flow': 1,
        'hot-water-cold-pipe': 2,
        'central-heating-return': 3,
        'hot-water-hot-pipe': 4
    }
    obsolete_columns = [
        'mean_temperature', 'min_temperature', 'std_temperature',
        'max_temperature', 'room', 'consumer_id',
    ]
    window = (pd.Timestamp('2018-04-17 08:00:00'), pd.Timestamp('2018-04-17 10:00:00'))

    # Keep only readings outside the unreliable window
    inside_window = df_tempprobe['timestamp_local'].between(*window)
    probe = df_tempprobe.loc[~inside_window]

    # room looks like "<letters><digits>": letters are the room type, digits the room id
    room_parts = probe['room'].str.extract(r'([a-zA-Z]+)(\d+)')

    probe = probe.assign(
        homeid=probe['consumer_id'].str.extract(r'(\d+)', expand=False),
        type=room_parts[0],
        roomid=room_parts[1],
        measured_entity=probe['measured_entity'].map(entity_codes),
    )

    return probe.drop(columns=obsolete_columns).rename(columns={'sensor': 'sensorid'})

# clean_data never modifies its argument, so the raw frame is passed directly
# instead of an extra full copy. Only the first rows are previewed rather than
# rendering the whole frame.
df_tempprobe_clean = clean_data(df_tempprobe)
df_tempprobe_clean.head()

### Room and appliance sensor preprocessing

In [None]:
def clean_room_app_data(df_sensor_room):
    """Clean the room and appliance sensor readings, keeping temperature rows only.

    Steps:
      1. Keep rows outside the unreliable window (2018-04-17 08:00:00 to
         10:00:00, inclusive) whose ``measurement`` equals ``"temperature"``
         (other measurements such as humidity are discarded).
      2. Derive integer ``homeid`` / ``roomid`` from the digits contained in
         ``consumer_id`` / ``room`` via ``extract_digits_numpy``.
      3. Drop the mean/min/max/std value columns plus ``consumer_id`` and ``room``.
      4. Rename ``sensor`` to ``sensorid``.

    Rows are filtered *before* the ids are extracted and before anything is
    copied, so the per-row digit extraction and the copy only touch the rows
    that are actually kept.

    Parameters
    ----------
    df_sensor_room : pandas.DataFrame
        Raw readings with at least ``timestamp_local``, ``measurement``,
        ``consumer_id``, ``room`` and the four ``*_value`` columns listed
        above. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned temperature readings (original row labels retained).
    """
    unreliable_start = pd.Timestamp('2018-04-17 08:00:00')
    unreliable_end = pd.Timestamp('2018-04-17 10:00:00')

    in_unreliable_window = df_sensor_room['timestamp_local'].between(unreliable_start, unreliable_end)
    is_temperature = df_sensor_room['measurement'] == "temperature"

    # Copy only the surviving rows so the assignments below do not write into
    # a view of the caller's frame.
    df_clean = df_sensor_room.loc[~in_unreliable_window & is_temperature].copy()

    # Extract numbers from consumer_id and room.
    # Done on the raw numpy arrays for runtime and memory reasons.
    df_clean['homeid'] = extract_digits_numpy(df_clean['consumer_id'].values)
    df_clean['roomid'] = extract_digits_numpy(df_clean['room'].values)

    # Drop columns
    df_clean = df_clean.drop(columns=['mean_value', 'min_value', 'max_value', 'std_value', 'consumer_id', 'room'])

    # Rename columns
    return df_clean.rename(columns={'sensor': 'sensorid'})

def extract_digits_numpy(arr):
    """Return the digits embedded in each string of ``arr`` as an integer array.

    For every element, all digit characters are concatenated in order of
    appearance (``"r2d2"`` -> ``22``) and the result is converted to ``int``.
    An element without any digit produces an empty string, which cannot be
    converted and makes the final conversion raise ``ValueError``.
    """
    digit_strings = []
    for text in arr:
        kept = [ch for ch in text if ch.isdigit()]
        digit_strings.append(''.join(kept))
    return np.array(digit_strings, dtype=int)

# Clean the raw room/appliance readings; the result is used by the merge step further down
df_sensor_room_clean = clean_room_app_data(df_sensor_room)

### Sensor.csv Preprocessing

In [None]:
# Only the sensor id and its status are needed downstream (status filter in the merge step)
sensor_columns = ['sensorid', 'status']
df_sensor = df_sensor.loc[:, sensor_columns]
df_sensor

### Prepare metadata for income_band, education

In [None]:
# NOTE: this cell overwrites df_home and df_person with reduced versions, so it
# can only be run once per load of the raw metadata (re-load before re-running).

# --- Income band per home ---------------------------------------------------
# Keep exactly one income_band label per homeid. first() is used instead of
# sum(): summing a string column concatenates the labels of duplicate homeids
# (e.g. "MissingMissing"), which would then fail to map below.
df_home = df_home[["homeid", "income_band"]].groupby('homeid', as_index=False).first()

# Ordinal rank of each income band (0 = missing, 15 = highest band).
# Despite the name, the values are ranks, not monetary midpoints.
income_midpoints = {
    "Missing": 0,
    "less than £10,800": 1,
    "£10,800 to £13,499": 2,
    "£13,500 to £16,199": 3,
    "£16,200 to £19,799": 4,
    "£19,800 to £23,399": 5,
    "£23,400 to £26,999": 6,
    "£27,000 to £32,399": 7,
    "£32,400 to £37,799": 8,
    "£37,800 to £43,199": 9,
    "£43,200 to £48,599": 10,
    "£48,600 to £53,999": 11,
    "£54,000 to £65,999": 12,
    "£66,000 to £77,999": 13,
    "£78,000 to £89,999": 14,
    "£90,000 or more": 15
}

# Labels that are not in the table above become NaN
df_home["income_band_mid"] = df_home["income_band"].map(income_midpoints)
df_home = df_home.drop(columns=['income_band'])

# --- Education per home -----------------------------------------------------
# Normalise the free-text labels so they match the keys of the mapping below
df_person['education'] = df_person['education'].str.strip().str.lower()
df_person['education'] = df_person['education'].fillna('unknown')

# Map qualifications to numerical values
qualification_mapping = {
    "phd": 9,  # Highest qualification
    "degree level qualification (or equivalent), e.g. bsc, ba, msc, ma": 8,
    "higher educational qualification below degree level": 7,
    "onc / national level btec": 6,
    "a-levels or highers": 5,
    "gcse grade d-g or cse grade 2-5 or standard grade level 4-6": 4,
    "o level or gcse equivalent (grade a-c) or o grade/cse equivalent (grade 1) or standard grade level 1": 3,
    "other qualifications": 2,
    "no formal qualifications": 1,
    "unknown": 0 } # Lowest qualification
df_person['education_map'] = df_person['education'].map(qualification_mapping)

df_person = df_person[["homeid", "education_map"]]

# Choose the highest education among the persons of each home
highest_education = df_person.groupby('homeid')['education_map'].max().reset_index()
df_person = df_person.drop(columns=['education_map'])

# One row per home. Merging highest_education back onto the per-person frame
# would yield one row per *person*, and the later merge on homeid would then
# duplicate every sensor reading once per household member.
person_education = highest_education.copy()

# Map home and person including education
df_home_edu_income = pd.merge(df_home, person_education, on='homeid', how='inner')
df_home_edu_income


### TODO
- Merge all dataframes on timestamp
- Join education and income data for every row on same homeid
- Delete data from sensor where sensor appears as faulty or offline

### Merge the Dataframes

In [None]:
# Cast homeid to int in every cleaned frame so that all merge keys share one dtype
# (several cleaning steps derive it by regex string extraction).
df_electric_clean['homeid'] = df_electric_clean['homeid'].astype(int)
df_gas_clean['homeid'] = df_gas_clean['homeid'].astype(int)
df_tempprobe_clean['homeid'] = df_tempprobe_clean['homeid'].astype(int)
df_sensor_room_clean['homeid'] = df_sensor_room_clean['homeid'].astype(int)

# 1. Attach the sensor status to the electric readings.
#    The inner join also discards readings whose sensorid is unknown to df_sensor.
df_electric_with_status = pd.merge(
    df_electric_clean,
    df_sensor[['sensorid', 'status']],
    on='sensorid',
    how='inner'
)

# 2. Keep readings from active sensors only; the status column is then redundant
df_electric_active = df_electric_with_status[df_electric_with_status['status'] == 'active'].copy()
df_electric_active = df_electric_active.drop(['status'], axis=1)

# 3. Same status join + filter for the other sensor families.
#    These frames still carry `status`; it is dropped right before each merge below.
df_gas_active = pd.merge(
    df_gas_clean,
    df_sensor[['sensorid', 'status']],
    on='sensorid',
    how='inner'
).query('status == "active"')

df_tempprobe_active = pd.merge(
    df_tempprobe_clean,
    df_sensor[['sensorid', 'status']],
    on='sensorid',
    how='inner'
).query('status == "active"')

df_sensor_room_active = pd.merge(
    df_sensor_room_clean,
    df_sensor[['sensorid', 'status']],
    on='sensorid',
    how='inner'
).query('status == "active"')

# From here on, use GPU RAM rather than system RAM
# 4. Merge all sensor data based on timestamp and homeid.
#
# The drop lists are built as ['status'] + (optional 'Unnamed: 0'). The
# parentheses matter: without them the conditional expression binds looser
# than `+`, so the whole list (including 'status') collapsed to [] whenever
# 'Unnamed: 0' was absent and `status` leaked into the merge.

# First merge electric and gas data
columns_to_drop_gas = ['status'] + (['Unnamed: 0'] if 'Unnamed: 0' in df_gas_active.columns else [])
df_combined = pd.merge(
    df_electric_active,
    df_gas_active.drop(columns_to_drop_gas, axis=1),
    on=['timestamp_local', 'homeid'],
    how='outer',
    suffixes=('_electric', '_gas')
)

# Then merge tempprobe data
columns_to_drop_tempprobe = ['status'] + (['Unnamed: 0'] if 'Unnamed: 0' in df_tempprobe_active.columns else [])
df_combined = pd.merge(
    df_combined,
    df_tempprobe_active.drop(columns_to_drop_tempprobe, axis=1),
    on=['timestamp_local', 'homeid'],
    how='outer',
    suffixes=('', '_tempprobe')
)

# Finally merge room sensor data
columns_to_drop_room = ['status'] + (['Unnamed: 0'] if 'Unnamed: 0' in df_sensor_room_active.columns else [])
df_combined = pd.merge(
    df_combined,
    df_sensor_room_active.drop(columns_to_drop_room, axis=1),
    on=['timestamp_local', 'homeid'],
    how='outer',
    suffixes=('', '_room')
)

# Ensure homeid is int
df_combined['homeid'] = df_combined['homeid'].astype(int)

# 5. Merge with education and income data (homes without metadata are dropped by the inner join)
final_df = pd.merge(
    df_combined,
    df_home_edu_income,
    on='homeid',
    how='inner'
)

# 6. Sort by home and timestamp for the LSTM
final_df = final_df.sort_values(['homeid', 'timestamp_local'])

# 7. Handle missing values, which is crucial for time series:
#    resample every home to an hourly grid and interpolate the gaps.
#
#    `homeid` is removed from each group before resampling. groupby().apply()
#    already adds it back as an index level, and keeping it as a column too
#    makes reset_index() fail with "cannot insert homeid, already exists".
#    errors='ignore' covers pandas versions that no longer pass the grouping
#    column to the applied function. '1h' replaces the deprecated '1H' alias.
#
#    NOTE(review): resample(...).interpolate() needs unique timestamps within a
#    home. The outer merges above may produce several rows per
#    (homeid, timestamp_local) - verify, and aggregate duplicates first if so.
final_df = final_df.groupby('homeid').apply(
    lambda x: x.drop(columns='homeid', errors='ignore')
               .set_index('timestamp_local')
               .resample('1h')
               .interpolate()
).reset_index()