In [1]:
import pandas as pd
import numpy as np

# Load the datasets
# File names are kept in one place so they are easy to spot and change.
CONSUMPTION_CSV = '1a) consumption_and_tariff_data_hourly.csv'
SURVEY_CSV = '2) responses_to_selected_survey_questions_filtered.csv'

# low_memory=False makes pandas read the whole consumption file in one pass
# instead of inferring column dtypes chunk by chunk.
df_consumption = pd.read_csv(CONSUMPTION_CSV, low_memory=False)
df_survey = pd.read_csv(SURVEY_CSV)

### I opted to encode the variables (categorical and binary) right after extracting the features to allow for immediate observation/feedback on the data transformation. This approach makes the debugging process much easier compared to encoding them at the very end.

In [2]:
# Q01-Q11: appliances in survey order; the n-th entry is asked about in question Q0n/Qnn
appliances = [
    'Washing machine',
    'Tumble dryer',
    'Dishwasher',
    'Immersion water heater',
    'Electric oven',
    'Electric hob',
    'Ironing',
    'Electric shower',
    'Kettle',
    'Lighting',
    'Electric heater',
]

# Appliance label -> zero-padded question code ('Q01' ... 'Q11')
appliance_questions = {
    appliance: f'Q{number:02d}' for number, appliance in enumerate(appliances, start=1)
}


def encode_fixed_schedule(answer):
    """Encode one Q01-Q11 survey answer as a fixed-schedule flag.

    Returns NaN for the literal answer 'No reply', 0 for a missing value or
    the answer 'No', and 1 for any other answer.
    """
    if answer == 'No reply':
        return np.nan
    if pd.isnull(answer) or answer == 'No':
        return 0
    return 1


# Q01-Q11: add one '<appliance>_fixed_schedule' column per appliance
for appliance, question in appliance_questions.items():
    column = f'{appliance.lower().replace(" ", "_")}_fixed_schedule'
    df_survey[column] = df_survey[question].apply(encode_fixed_schedule)

In [3]:
# Q12-Q16: Timer switch questions and their corresponding appliances
# Appliances are listed in question order: the first belongs to Q12, the last to Q16.
timer_switch_appliances = [
    'Washer-dryer (combined)',
    'Washing machine',
    'Tumble dryer',
    'Dishwasher',
    'Electric space heating',
]

# Question code -> appliance label
timer_switch_questions = {
    f'Q{number}': appliance
    for number, appliance in enumerate(timer_switch_appliances, start=12)
}

'''
Note: Q17, the 'Electric water heating' variable has been removed from the analysis due to data quality issues. In this column, the 'We don't have this appliance' 
option did not receive a single response, resulting in an 89% missing response rate, making it impossible to distinguish between non-ownership and truly 
missing values.
'''

# Q12-Q16: Function to map timer switch responses
# Recognised answer text -> (ownership, timer_use) codes.
# ownership: 0 = appliance not owned, 1 = owned.
# timer_use: 0 = no timer / unsure / never used, 1 = sometimes used, 2 = often used.
_TIMER_RESPONSE_CODES = {
    "We don't have this appliance": (0, 0),
    "We don't have a timer function on this appliance": (1, 0),
    "I'm not sure if we have a timer on this appliance": (1, 0),
    "We have a timer on this appliance but we never use the timer": (1, 0),
    "We have a timer on this appliance and sometimes use the timer": (1, 1),
    "We have a timer on this appliance and often use the timer": (1, 2),
}


def map_timer_usage(x):
    """Translate one timer-switch survey answer into (ownership, timer_use).

    Missing values, the answer 'No reply', and any answer that is not one of
    the recognised response texts all yield (NaN, NaN).
    """
    if pd.isnull(x):
        return np.nan, np.nan
    # 'No reply' is deliberately absent from the table, so it falls through
    # to the same (NaN, NaN) default as any unrecognised answer.
    return _TIMER_RESPONSE_CODES.get(x, (np.nan, np.nan))

# Q12-Q16: Apply the function
# Each question yields two new survey columns: '<appliance>_ownership' and
# '<appliance>_timer_use', created in that order.
for question, appliance in timer_switch_questions.items():
    # Column prefix: lower-case, spaces -> underscores, parentheses removed
    appliance_name = appliance.lower()
    for old, new in ((" ", "_"), ("(", ""), (")", "")):
        appliance_name = appliance_name.replace(old, new)

    # One (ownership, timer_use) pair per respondent, then split into two sequences
    encoded_pairs = df_survey[question].apply(map_timer_usage)
    ownership_codes, timer_use_codes = zip(*encoded_pairs)

    df_survey[f'{appliance_name}_ownership'] = ownership_codes
    df_survey[f'{appliance_name}_timer_use'] = timer_use_codes

In [4]:
# Prepare columns for merging (attitudes survey)
# The column names below must match the ones created when the survey answers
# were encoded: fixed-schedule columns only replace spaces, while the timer
# columns also strip parentheses from the appliance label.
fixed_schedule_columns = [
    f'{appliance.lower().replace(" ", "_")}_fixed_schedule' for appliance in appliances
]
timer_appliance_names = [
    appliance.lower().replace(" ", "_").replace("(", "").replace(")", "")
    for appliance in timer_switch_questions.values()
]
attitudes_columns = (
    ['Household_id']
    + fixed_schedule_columns
    + [f'{name}_timer_use' for name in timer_appliance_names]
    + [f'{name}_ownership' for name in timer_appliance_names]
)

# Merge attitudes survey with consumption data
# Inner join: only households present in both datasets are kept.
# validate='many_to_one' makes pandas raise a MergeError if a Household_id
# occurs more than once in the survey; without it a duplicated survey row
# would silently duplicate every consumption row of that household.
df_merged = pd.merge(
    df_consumption,
    df_survey[attitudes_columns],
    left_on='household_id',
    right_on='Household_id',
    how='inner',
    validate='many_to_one',
)

# Drop the redundant Household_id column (the consumption data's
# 'household_id' already carries the same key)
df_merged = df_merged.drop(columns='Household_id')

# Export the merged data
df_merged.to_csv('3) household_energy_consumption_with_attitudinal_features.csv', index=False)
print("\nHousehold energy profile data has been exported to '3) household_energy_consumption_with_attitudinal_features.csv'")


Household energy profile data has been exported to '3) household_energy_consumption_with_attitudinal_features.csv'


In [5]:
# Quick sanity check on the merge result: overall size and how many distinct
# households survived the inner join.
merged_shape = df_merged.shape
household_count = df_merged['household_id'].nunique()

print(f"Merged data shape: {merged_shape}")
print(f"Number of households in merged data: {household_count}")

Merged data shape: (5326080, 25)
Number of households in merged data: 608
