In [40]:
import pandas as pd
import os
import sqlite3
import sys
sys.path.append('../scripts') #Add scripts folder to path

from utilities import (
    profile_dataframe_overview,
    summary_stats,
    check_constant_columns,
    check_empty_columns
)

### Cleaning raw thermostat data previously saved as a CSV for data processing. 

- Converting the `date` and `time` column into a proper `timestamp`
- Renaming column names for better readability and to be consistent with weather data
- Dropping the original `date` and `time` column
- Removing duplicate or fully empty rows as needed
- Fill or zero-out runtime columns
- Reordering columns for consistency
- Detecting outliers and tagging them for handling and visualizing later

The cleaned thermostat data is saved in `data/processed/thermostat_data_cleaned.csv` and is then ready for merging with the weather data.

In [41]:
# Input and output locations, relative to the notebook's working directory.
raw_path = "../data/raw/thermostat_combined.csv"
processed_dir = "../data/processed"
cleaned_path = os.path.join(processed_dir, "thermostat_data_cleaned.csv")

# Create the output directory if it does not exist yet (no error if it does).
os.makedirs(processed_dir, exist_ok=True)


In [42]:
# Load the raw thermostat export from `raw_path` into a DataFrame.
# The dataset contains thermostat runtime parameters such as timestamp,
# indoor temperature, humidity, setpoint temperature, HVAC mode and HVAC runtime.

thermostat_df = pd.read_csv(raw_path)

### BASIC OVERVIEW
### profile_dataframe_overview() 
This function provides a structured overview of any DataFrame. It displays:

- The number of rows and columns
- Column names and data types
- DataFrame info summary (including memory usage)
- Missing values per column (as percentages)
- Unique values per column
- Sample preview of the first and last N rows

In [43]:
# Show the structural overview of the raw frame (shape, column names, dtypes, ...)
# using the project helper imported from `utilities`.
profile_dataframe_overview(thermostat_df)


 Basic Structure
----------------------------------------
Rows: 122688, Columns: 21
Column Names: ['Date', 'Time', 'System Setting', 'System Mode', 'Calendar Event', 'Program Mode', 'Cool Set Temp (F)', 'Heat Set Temp (F)', 'Current Temp (F)', 'Humidity Set Point (%RH)', 'Current Humidity (%RH)', 'Outdoor Temp (F)', 'Wind Speed (km/h)', 'Cool Stage 1 (sec)', 'Heat Stage 1 (sec)', 'Fan (sec)', 'Humidifier (sec)', 'DM Offset', 'Thermostat Temperature (F)', 'Thermostat Humidity (%RH)', 'Thermostat Motion']

 Data Types
----------------------------------------
Date                           object
Time                           object
System Setting                 object
System Mode                    object
Calendar Event                 object
Program Mode                   object
Cool Set Temp (F)             float64
Heat Set Temp (F)             float64
Current Temp (F)              float64
Humidity Set Point (%RH)      float64
Current Humidity (%RH)        float64
Outdoor Temp (F)  

### DISPLAY STATS SUMMARY
### summary_stats()

This function returns standard summary statistics for all numeric columns in the DataFrame:

- count, mean, std
- min, 25th percentile, median (50%), 75th percentile, max

In [44]:
# Compute summary statistics with the project helper and keep the result in
# `summary_statistics` so it can be reused after being printed here.
summary_statistics = summary_stats(thermostat_df)
print(summary_statistics)


 Summary Statistics:
                               count    mean    std   min    25%    50%  \
Cool Set Temp (F)           122470.0   73.15   2.20  68.0   72.0   73.0   
Heat Set Temp (F)           122470.0   70.67   2.18  67.5   69.5   70.2   
Current Temp (F)            122470.0   71.45   2.37  62.1   69.7   71.7   
Humidity Set Point (%RH)    122470.0   17.95  18.00   0.0    0.0    0.0   
Current Humidity (%RH)      122470.0   52.26   7.70  27.0   46.0   54.0   
Outdoor Temp (F)            122550.0   56.84  19.19  -3.8   42.8   60.7   
Wind Speed (km/h)           122550.0    0.00   0.00   0.0    0.0    0.0   
Cool Stage 1 (sec)          122470.0   40.33  98.44   0.0    0.0    0.0   
Heat Stage 1 (sec)          122470.0   20.03  65.53   0.0    0.0    0.0   
Fan (sec)                   122470.0  278.18  32.59   0.0  255.0  300.0   
Humidifier (sec)            122470.0    0.75  13.49   0.0    0.0    0.0   
DM Offset                    47534.0    0.15   0.62  -2.6   -0.2    0.2   
The

### check_constant_columns

This function identifies columns where all rows have the same value (e.g., a column that always says "My Ecobee" or 0 values).

Such columns are usually not informative and can be dropped to simplify the dataset.

In [45]:
# Collect the columns flagged as constant by the project helper.
# NOTE(review): the helper appears to print its own "Constant Columns" report
# (it shows up in the cell output), so the extra print below stays disabled.
const_columns = check_constant_columns(thermostat_df)
#print("\n Constant Columns:", const_columns)


 Constant Columns: ['Wind Speed (km/h)']


### check_empty_columns

This function returns a list of columns that are completely empty (i.e., 100% null values).

These columns can typically be removed unless they will be populated later.

In [46]:
# Collect the columns flagged as completely empty by the project helper.
# NOTE(review): the helper appears to print its own "Empty Columns" report
# (it shows up in the cell output), so the extra print below stays disabled.
empty_columns = check_empty_columns(thermostat_df)
#print("\n Empty Columns:", empty_columns)


 Empty Columns: []


### Standardize Column Names

Clean up messy column names

In [49]:
# Step 1: normalise the raw headers - trim surrounding whitespace, lowercase,
# and use underscores instead of spaces. `regex=False` makes the replacement a
# literal-string one regardless of the installed pandas version's default.
thermostat_df.columns = (
    thermostat_df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

# Step 2: strip the unit decorations such as "(f)", "(%rh)" and "(km/h)" so
# every column name is a plain snake_case identifier.
column_renames = {
    'cool_set_temp_(f)': 'cool_set_temp_f',
    'heat_set_temp_(f)': 'heat_set_temp_f',
    'current_temp_(f)': 'current_temp_f',
    'humidity_set_point_(%rh)': 'humidity_set_point_rh',
    'current_humidity_(%rh)': 'current_humidity_rh',
    'outdoor_temp_(f)': 'outdoor_temp_f',
    'wind_speed_(km/h)': 'wind_speed_kmh',
    'cool_stage_1_(sec)': 'cool_stage_1_sec',
    'heat_stage_1_(sec)': 'heat_stage_1_sec',
    'fan_(sec)': 'fan_sec',
    'humidifier_(sec)': 'humidifier_sec',
    'thermostat_temperature_(f)': 'thermostat_temperature_f',
    'thermostat_humidity_(%rh)': 'thermostat_humidity_rh',
}

# Reassign instead of using inplace=True. rename() ignores mapping keys that
# are not present, so re-running this cell on already-renamed columns is a no-op.
thermostat_df = thermostat_df.rename(columns=column_renames)

print(thermostat_df.columns)

Index(['date', 'time', 'system_setting', 'system_mode', 'calendar_event',
       'program_mode', 'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion', 'timestamp'],
      dtype='object')


### Convert date and Time to Timestamp Format
The thermostat dataset includes separate `date` and `time` columns. For time-based analysis and merging with other data (e.g., weather), it is essential to combine them into a single `timestamp` column and convert it to proper datetime format.

The `date` and `time` columns are combined, converted to `datetime` format, and stored as `timestamp` for compatibility with the weather dataset.

If the datatype is not string, convert `date` and `time` to strings using `.astype(str)`.

To handle invalid date values, use the `errors='coerce'` argument, which replaces invalid dates with NaT (Not a Time).

To avoid warnings and slow, less consistent parsing, explicitly specify the datetime format:

format='%m/%d/%y %H:%M:%S' (the format string must match the layout of the raw `date` and `time` values)

In [55]:
# Combine 'date' and 'time' into a single datetime 'timestamp' column.
# The cell is safe to re-run: once 'date'/'time' have been replaced by
# 'timestamp' it says so, instead of reporting the inputs as missing.

# Layout of the combined "<date> <time>" string passed to the parser.
timestamp_format = '%m/%d/%y %H:%M:%S'

if 'date' in thermostat_df.columns and 'time' in thermostat_df.columns:
    # astype(str) keeps the concatenation valid even if a column was not read
    # as strings; anything that does not match the format becomes NaT.
    thermostat_df['timestamp'] = pd.to_datetime(
        thermostat_df['date'].astype(str) + ' ' + thermostat_df['time'].astype(str),
        format=timestamp_format,
        errors='coerce',
    )

    # errors='coerce' hides parse failures, so report how many rows were lost.
    # A count close to the row count usually means the format string is wrong.
    unparsed_rows = int(thermostat_df['timestamp'].isna().sum())
    if unparsed_rows:
        print(
            f"Warning: {unparsed_rows} of {len(thermostat_df)} rows could not be "
            f"parsed with format '{timestamp_format}' and were set to NaT."
        )

    # The original columns are redundant once 'timestamp' exists.
    thermostat_df = thermostat_df.drop(columns=['date', 'time'])
elif 'timestamp' in thermostat_df.columns:
    print("'timestamp' column already present. Skipping 'datetime' conversion.")
else:
    print("Column 'date' and 'time' not found. Skipping 'datetime' conversion.")

print(thermostat_df.columns)

Column 'date' and 'time' not found. Skipping 'datetime' conversion.
Index(['system_setting', 'system_mode', 'calendar_event', 'program_mode',
       'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion', 'timestamp'],
      dtype='object')


### Reorder Columns
Rearrange the columns to place `timestamp` as the first column (if needed), followed by the HVAC runtime variables. This improves readability and aligns with the weather dataset.

In [58]:
# Put 'timestamp' in the first position and keep every other column in its
# current relative order.
cols = [col for col in thermostat_df.columns if col != 'timestamp']
cols.insert(0, 'timestamp')
thermostat_df = thermostat_df.loc[:, cols]

print(thermostat_df.columns)

Index(['timestamp', 'system_setting', 'system_mode', 'calendar_event',
       'program_mode', 'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion'],
      dtype='object')
