In [46]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns


Load Data

In [36]:
BASE_PATH = Path('/Users/olli/Desktop/Uni/Predictive Analytics/Predicitve Analytics Private/Predictive-Analytics-Private/data/raw')


# Load individual dataframes
df_electric = pd.read_csv(BASE_PATH / 'sensor_data/household_sensors_electric.csv', parse_dates=['timestamp_local'])
df_gas = pd.read_csv(BASE_PATH / 'sensor_data/household_sensors_gas.csv', parse_dates=['timestamp_local'])
df_sensor_room = pd.read_csv(BASE_PATH / 'sensor_data/room_and_appliance_sensors_room.csv', parse_dates=['timestamp_local'])
#df_room_tempprobe = pd.read_csv(BASE_PATH / 'sensor_data/room_and_appliance_sensors_tempprobe.csv', parse_dates=['timestamp_local'])
df_tempprobe = pd.read_csv(BASE_PATH / 'sensor_data/household_sensors_tempprobe.csv', parse_dates=['timestamp_local'])

# Load metadata dataframes
df_home = pd.read_csv(BASE_PATH / 'metadata/home.csv')
df_room = pd.read_csv(BASE_PATH / 'metadata/room.csv')
df_sensor = pd.read_csv(BASE_PATH / 'metadata/sensor.csv')
df_person = pd.read_csv(BASE_PATH / 'metadata/person.csv')

# Info about loaded dataframes
for name, df in [('Electric', df_electric), ('Gas', df_gas), ('Room', df_sensor_room), 
                 ('Tempprobe', df_tempprobe)]:
    print(f"Loaded {name}: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"{name} columns: {df.columns.tolist()}\n")
    

Loaded Electric: 1529499 rows, 11 columns
Electric columns: ['Unnamed: 0', 'timestamp_local', 'consumer_id', 'room', 'sensor', 'mean_consumption', 'min_consumption', 'max_consumption', 'std_consumption', 'median_consumption', 'total_consumption_Wh']

Loaded Gas: 634755 rows, 11 columns
Gas columns: ['Unnamed: 0', 'timestamp_local', 'consumer_id', 'room', 'sensor', 'mean_consumption', 'min_consumption', 'max_consumption', 'std_consumption', 'median_consumption', 'total_consumption_Wh']

Loaded Room: 22898596 rows, 11 columns
Room columns: ['Unnamed: 0', 'timestamp_local', 'consumer_id', 'room', 'sensor', 'measurement', 'mean_value', 'min_value', 'max_value', 'std_value', 'median_value']

Loaded Tempprobe: 5719133 rows, 11 columns
Tempprobe columns: ['Unnamed: 0', 'timestamp_local', 'consumer_id', 'room', 'sensor', 'measured_entity', 'mean_temperature', 'min_temperature', 'max_temperature', 'std_temperature', 'median_temperature']



### Known Issues aus dem PDF bei Sensor Daten lösen
1. Filtering out unreliable data period (April 17, 2018, 08:00-10:00)

In [38]:
# Create mask for unreliable period
unreliable_start = pd.Timestamp('2018-04-17 08:00:00')
unreliable_end = pd.Timestamp('2018-04-17 10:00:00')

# Filter out unreliable data
df_electric = df_electric[~((df_electric['timestamp_local'] >= unreliable_start) & 
                          (df_electric['timestamp_local'] <= unreliable_end))]

# Verify filtering worked
print("Records in unreliable period:", 
      len(df_electric[(df_electric['timestamp_local'] >= unreliable_start) & 
                      (df_electric['timestamp_local'] <= unreliable_end)]))

Records in unreliable period: 0


In [39]:
def clean_electric_data(df_electric):
    # Create a copy of the dataframe
    df_clean = df_electric.copy()

    # Create mask for unreliable period
    unreliable_start = pd.Timestamp('2018-04-17 08:00:00')
    unreliable_end = pd.Timestamp('2018-04-17 10:00:00')

    # Filter out unreliable data
    df_clean = df_clean[~((df_clean['timestamp_local'] >= unreliable_start) & 
                            (df_clean['timestamp_local'] <= unreliable_end))]
    
    # Extract numeric part from consumer_id and rename to homeid
    df_clean['homeid'] = df_clean['consumer_id'].str.extract(r'(\d+)')
    df_clean = df_clean.drop(columns=['consumer_id'])
    
    # Split room into type and roomid
    df_clean[['type', 'roomid']] = df_clean['room'].str.extract(r'([a-zA-Z]+)(\d+)')
    df_clean = df_clean.drop(columns=['room'])
    
    # Remove mean_consumption column
    df_clean = df_clean.drop(columns=['mean_consumption'])
    
    # Interpolate missing values in std_consumption
    df_clean.set_index('timestamp_local', inplace=True)
    df_clean['std_consumption'] = df_clean['std_consumption'].interpolate(method='time')
    df_clean.reset_index(inplace=True)
    
    # Rename columns
    df_clean.rename(columns={
        'min_consumption': 'electric_min_consumption',
        'max_consumption': 'electric_max_consumption',
        'median_consumption': 'electric_median_consumption',
        'total_consumption_Wh': 'electric_total_consumption_Wh',
        'sensor': 'sensorid'
    }, inplace=True)
    
    return df_clean

# Apply all cleaning steps at once
df_electric_clean = clean_electric_data(df_electric)
df_electric_clean.head()



Unnamed: 0.1,timestamp_local,Unnamed: 0,sensorid,electric_min_consumption,electric_max_consumption,std_consumption,electric_median_consumption,electric_total_consumption_Wh,homeid,type,roomid
0,2017-03-07 15:00:00,0,4472,0.0,0.34,0.047283,0.251,0.233624,100,livingroom,1038
1,2017-03-07 16:00:00,1,4472,0.038,2.321,0.785224,0.243,0.677716,100,livingroom,1038
2,2017-03-07 17:00:00,2,4472,0.083,4.746,0.670984,0.404,0.567129,100,livingroom,1038
3,2017-03-07 18:00:00,3,4472,0.048,0.792,0.125662,0.299,0.312481,100,livingroom,1038
4,2017-03-07 19:00:00,4,4472,0.044,1.278,0.043239,0.226,0.20986,100,livingroom,1038


### Clean gas data

In [40]:
# Extract numbers from consumer_id and room
df_gas['homeid'] = df_gas['consumer_id'].str.extract(r'(\d+)')
df_gas['roomid'] = df_gas['room'].str.extract(r'(\d+)')

df_gas = df_gas.drop(columns=['std_consumption', 'consumer_id', 'room'])
df_gas.rename(columns={
    'mean_consumption': 'gas_mean_consumption',
    'min_consumption': 'gas_min_consumption',
    'max_consumption': 'gas_max_consumption',
    'median_consumption': 'gas_median_consumption',
    'total_consumption_Wh': 'gas_total_consumption_Wh',
    'sensor': 'sensorid'
    }, inplace=True)
df_gas

Unnamed: 0.1,Unnamed: 0,timestamp_local,sensorid,gas_mean_consumption,gas_min_consumption,gas_max_consumption,gas_median_consumption,gas_total_consumption_Wh,homeid,roomid
0,0,2016-09-20 09:00:00,1221,0.112000,0.112,0.112,0.112,0.224,47,654
1,1,2016-09-20 17:00:00,1221,0.112000,0.112,0.112,0.112,0.112,47,654
2,2,2016-09-20 18:00:00,1221,0.112000,0.112,0.112,0.112,1.568,47,654
3,3,2016-09-20 19:00:00,1221,0.112000,0.112,0.112,0.112,0.224,47,654
4,4,2016-09-20 20:00:00,1221,0.112000,0.112,0.112,0.112,1.344,47,654
...,...,...,...,...,...,...,...,...,...,...
634750,2870,2018-06-29 06:00:00,4391,0.125875,0.112,0.223,0.112,1.007,99,1028
634751,2871,2018-06-29 07:00:00,4391,0.151786,0.112,0.558,0.112,2.125,99,1028
634752,2872,2018-06-29 11:00:00,4391,0.112000,0.112,0.112,0.112,0.112,99,1028
634753,2873,2018-06-29 12:00:00,4391,0.112000,0.112,0.112,0.112,1.120,99,1028


### Household_sensors_tempprobe Preprocessing

In [49]:

def clean_data(df_tempprobe):
    # Extract numeric part from consumer_id
    df_tempprobe['homeid'] = df_tempprobe['consumer_id'].str.extract(r'(\d+)')
    # Split 'room' into 'roomid' and 'type'
    df_tempprobe[['type', 'roomid']] = df_tempprobe['room'].str.extract(r'([a-zA-Z]+)(\d+)')
    # Remove specified temperature columns
    df_tempprobe = df_tempprobe.drop(columns=['mean_temperature', 'min_temperature', 'std_temperature', 'max_temperature', 'room', 'consumer_id'])
    # Map text values to numbers
    mapping = {
        'central-heating-flow': 1,
        'hot-water-cold-pipe': 2,
        'central-heating-return': 3,
        'hot-water-hot-pipe': 4
    }
    # Apply mapping to the 'measured_entity' column
    df_tempprobe['measured_entity'] = df_tempprobe['measured_entity'].map(mapping)
    return df_tempprobe

df_tempprobe_clean = clean_data(df_tempprobe.copy())
df_tempprobe_clean.head()
df_tempprobe_clean

Unnamed: 0.1,Unnamed: 0,timestamp_local,sensor,measured_entity,median_temperature,homeid,type,roomid
0,0,2017-03-07 15:00:00,4450,3,25.5,100,kitchen,1037
1,1,2017-03-07 16:00:00,4450,3,19.0,100,kitchen,1037
2,2,2017-03-07 17:00:00,4450,3,17.5,100,kitchen,1037
3,3,2017-03-07 18:00:00,4450,3,17.5,100,kitchen,1037
4,4,2017-03-07 19:00:00,4450,3,17.0,100,kitchen,1037
...,...,...,...,...,...,...,...,...
5719128,9008,2018-06-30 20:00:00,4376,4,25.5,99,kitchen,1026
5719129,9009,2018-06-30 21:00:00,4376,4,27.3,99,kitchen,1026
5719130,9010,2018-06-30 22:00:00,4376,4,25.8,99,kitchen,1026
5719131,9011,2018-06-30 23:00:00,4376,4,25.5,99,kitchen,1026


### Room and appliance sensor preprocessing

In [43]:
# Extract numbers from consumer_id and room
df_sensor_room['homeid'] = df_sensor_room['consumer_id'].str.extract(r'(\d+)')
df_sensor_room['roomid'] = df_sensor_room['room'].str.extract(r'(\d+)')

df_sensor_room = df_sensor_room.drop(columns=['mean_value', 'min_value', 'max_value', 'std_value', 'consumer_id', 'room'])
df_sensor_room.rename(columns={'sensor': 'sensorid'}, inplace=True)

# Filter out all rows where measured entity is not temperature (such as humidity)
df_sensor_room = df_sensor_room.query('measurement == "temperature"')

### Sensor.csv Preprocessing 

In [44]:
df_sensor = df_sensor[['sensorid', 'status']]
df_sensor

Unnamed: 0,sensorid,status
0,1174,active
1,1175,active
2,1176,active
3,1177,active
4,1178,active
...,...,...
20076,21843,active
20077,21844,active
20078,21845,active
20079,21846,active


### Prepare metadata for income_band, education

In [58]:
df_person = pd.read_csv(BASE_PATH / 'metadata/person.csv')
df_home = pd.read_csv(BASE_PATH / 'metadata/home.csv')
# Prepare income_band data
df_home = df_home[["homeid", "income_band"]].groupby('homeid', as_index=False).sum()

income_midpoints = {
    "Missing": 0,
    "less than £10,800": 1,
    "£10,800 to £13,499": 2,
    "£13,500 to £16,199": 3,
    "£16,200 to £19,799": 4,
    "£19,800 to £23,399": 5,
    "£23,400 to £26,999": 6,
    "£27,000 to £32,399": 7,
    "£32,400 to £37,799": 8,
    "£37,800 to £43,199": 9,
    "£43,200 to £48,599": 10,
    "£48,600 to £53,999": 11,
    "£54,000 to £65,999": 12,
    "£66,000 to £77,999": 13,
    "£78,000 to £89,999": 14,
    "£90,000 or more": 15
}

df_home["income_band_mid"] = df_home["income_band"].map(income_midpoints)
df_home = df_home.drop(columns=['income_band'])

# Prepare education data per home
df_person['education'] = df_person['education'].str.strip().str.lower()
df_person['education'] = df_person['education'].fillna('unknown')

# Map qualifications to numerical values
qualification_mapping = {
    "phd": 9,  # Highest qualification
    "degree level qualification (or equivalent), e.g. bsc, ba, msc, ma": 8,
    "higher educational qualification below degree level": 7,
    "onc / national level btec": 6,
    "a-levels or highers": 5,
    "gcse grade d-g or cse grade 2-5 or standard grade level 4-6": 4,
    "o level or gcse equivalent (grade a-c) or o grade/cse equivalent (grade 1) or standard grade level 1": 3,
    "other qualifications": 2, 
    "no formal qualifications": 1,
    "unknown": 0 } # Lowest qualification
df_person['education_map'] = df_person['education'].map(qualification_mapping)

df_person = df_person[["homeid", "education_map"]]

# Choose highest education
highest_education = df_person.groupby('homeid')['education_map'].max().reset_index()
df_person.drop(columns=['education_map'], inplace=True)
person_education = pd.merge(df_person, highest_education, on='homeid', how='inner')

# Map home and person including education
df_home_edu_income = pd.merge(df_home, person_education, on='homeid', how='inner')
df_home_edu_income


Unnamed: 0,homeid,income_band_mid,education_map
0,47,0,8
1,47,0,8
2,59,15,8
3,59,15,8
4,59,15,8
...,...,...,...
611,334,1,9
612,334,1,9
613,334,1,9
614,335,14,8


### TODO
- Merge all dataframes on timestamp
- Join education and income data for every row on same homeid
- Delete data from sensor where sensor appears as faulty or offline