In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../scripts') #Add scripts folder to path

from utilities import (
    profile_dataframe_overview,
    summary_stats,
    check_constant_columns,
    check_empty_columns
)

### Cleaning raw thermostat data previously saved as a CSV for data processing. 

- Converting the `date` and `time` column into a proper `timestamp`
- Renaming column names for better readability and to be consistent with weather data
- Dropping the original `date` and `time` column
- Removing duplicate or fully empty rows on need basis 
- Fill or zero-out runtime columns
- Reordering columns for consistency
- Detecting Outliers and tagging them for hadling and visualizing later

The cleaned weather data is saved in `data/processed/thermostat_data_cleaned.csv` and will be ready for merging with weather data.

In [2]:
# Define input/output paths. Create directory if doesn't exists already

raw_path = "../data/raw/thermostat_combined.csv"
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)
cleaned_path = os.path.join(processed_dir, "thermostat_data_cleaned_final.csv")


In [3]:
# Load raw thermostat data . This dataset contains thermostat runtime parameters such as timestamp, indoor temperature, humidity, setpoint temperature, HVAC mode, HVAC Runtime.

thermostat_df_cleaned = pd.read_csv(raw_path)
#print("Current Working Directory:", os.getcwd())
#print(thermostat_df_cleaned)

### BASIC OVERVIEW
### profile_dataframe_overview() 
This function provides a structured overview of any DataFrame. It displays:

- The number of rows and columns
- Column names and data types
- DataFrame info summary (including memory usage)
- Missing values per column (as percentages)
- Unique values per column
- Sample preview of the first and last N rows

In [4]:
profile_dataframe_overview(thermostat_df_cleaned)


 Basic Structure
----------------------------------------
Rows: 122688, Columns: 21
Column Names: ['Date', 'Time', 'System Setting', 'System Mode', 'Calendar Event', 'Program Mode', 'Cool Set Temp (F)', 'Heat Set Temp (F)', 'Current Temp (F)', 'Humidity Set Point (%RH)', 'Current Humidity (%RH)', 'Outdoor Temp (F)', 'Wind Speed (km/h)', 'Cool Stage 1 (sec)', 'Heat Stage 1 (sec)', 'Fan (sec)', 'Humidifier (sec)', 'DM Offset', 'Thermostat Temperature (F)', 'Thermostat Humidity (%RH)', 'Thermostat Motion']

 Data Types
----------------------------------------
Date                           object
Time                           object
System Setting                 object
System Mode                    object
Calendar Event                 object
Program Mode                   object
Cool Set Temp (F)             float64
Heat Set Temp (F)             float64
Current Temp (F)              float64
Humidity Set Point (%RH)      float64
Current Humidity (%RH)        float64
Outdoor Temp (F)  

### DISPLAY STATS SUMMARY
### summary_stats()

This function returns standard summary statistics for all numeric columns in the DataFrame:

- count, mean, std
- min, 25th percentile, median (50%), 75th percentile, max

In [5]:
summary_statistics = summary_stats(thermostat_df_cleaned)
print(summary_statistics)


 Summary Statistics:


                               count    mean    std   min    25%    50%  \
Cool Set Temp (F)           122470.0   73.15   2.20  68.0   72.0   73.0   
Heat Set Temp (F)           122470.0   70.67   2.18  67.5   69.5   70.2   
Current Temp (F)            122470.0   71.45   2.37  62.1   69.7   71.7   
Humidity Set Point (%RH)    122470.0   17.95  18.00   0.0    0.0    0.0   
Current Humidity (%RH)      122470.0   52.26   7.70  27.0   46.0   54.0   
Outdoor Temp (F)            122550.0   56.84  19.19  -3.8   42.8   60.7   
Wind Speed (km/h)           122550.0    0.00   0.00   0.0    0.0    0.0   
Cool Stage 1 (sec)          122470.0   40.33  98.44   0.0    0.0    0.0   
Heat Stage 1 (sec)          122470.0   20.03  65.53   0.0    0.0    0.0   
Fan (sec)                   122470.0  278.18  32.59   0.0  255.0  300.0   
Humidifier (sec)            122470.0    0.75  13.49   0.0    0.0    0.0   
DM Offset                    47534.0    0.15   0.62  -2.6   -0.2    0.2   
Thermostat Temperature (F

### check_constant_columns

This function identifies columns where all rows have the same value (e.g., a column that always says "My Ecobee" or 0 values).

Such columns are usually not informative and can be dropped to simplify the dataset.

In [6]:
const_columns = check_constant_columns(thermostat_df_cleaned)
#print("\n Constant Columns:", const_columns)


 Constant Columns: ['Wind Speed (km/h)']


### check_empty_columns

This function returns a list of columns that are completely empty (i.e., 100% null values).

These columns typically be removed unless or populate them later.

In [7]:
empty_columns = check_empty_columns(thermostat_df_cleaned)
#print("\n Empty Columns:", empty_columns)


 Empty Columns: []


### Standardize Column Names

- Clean up messy column names

In [8]:
thermostat_df_cleaned.columns = thermostat_df_cleaned.columns.str.strip().str.lower().str.replace(' ', '_')

thermostat_df_cleaned.rename(columns={
    'cool_set_temp_(f)': 'cool_set_temp_f',
    'heat_set_temp_(f)': 'heat_set_temp_f',
    'current_temp_(f)': 'current_temp_f',
    'humidity_set_point_(%rh)': 'humidity_set_point_rh',
    'current_humidity_(%rh)': 'current_humidity_rh',
    'outdoor_temp_(f)': 'outdoor_temp_f',
    'wind_speed_(km/h)': 'wind_speed_kmh',
    'cool_stage_1_(sec)': 'cool_stage_1_sec',
    'heat_stage_1_(sec)': 'heat_stage_1_sec',
    'fan_(sec)': 'fan_sec',
    'humidifier_(sec)': 'humidifier_sec',
    'thermostat_temperature_(f)': 'thermostat_temperature_f',
    'thermostat_humidity_(%rh)': 'thermostat_humidity_rh'
}, inplace=True)

print(thermostat_df_cleaned.columns)

Index(['date', 'time', 'system_setting', 'system_mode', 'calendar_event',
       'program_mode', 'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion'],
      dtype='object')


### Convert date and Time to Timestamp Format
The thermostat dataset includes separate `date` and `time` columns. For time-based analysis and merging with other data (e.g., weather), it is essential to combine them into a single `timestamp` column and convert it to proper datetime format.

The `date` and `time` column is combined and then converted to `datetime` format and renamed as `timestamp` for compatibility with the weather dataset.

If datatype is not String, convert `date` and `time` to String using .astype(str)

To handle invalid date values, use errors=`coerce` argument, to replace invalid dates to NaT (Not a Time)

In order to avoid warning, slow and less consitent parsing, explicitly specify datetime format.

format = `%m/%d/%y %H:%M:%S`

In [9]:
# Combine 'date' and 'time' into 'timestamp' only if both columns are present

if 'date' in thermostat_df_cleaned.columns and 'time' in thermostat_df_cleaned.columns:
    thermostat_df_cleaned['timestamp'] = pd.to_datetime(
        thermostat_df_cleaned['date'] + ' ' + thermostat_df_cleaned['time'], format='%m/%d/%y %H:%M:%S', errors='coerce'
        )
    # Drop original 'date' and 'time' columns safely
    thermostat_df_cleaned.drop(columns=['date', 'time'], inplace=True)
else:
  print("Column 'date' and 'time' not found. Skipping 'datetime' conversion.")

print(thermostat_df_cleaned.columns)

Index(['system_setting', 'system_mode', 'calendar_event', 'program_mode',
       'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion', 'timestamp'],
      dtype='object')


### Reorder Columns
Rearrange the columns to place `timestamp` as the first column (if needed), followed by the HVAC runtime variables. This improves readability and aligns with the weather dataset.

In [10]:
# Reorder columns - timestamp first
cols = ['timestamp'] + [col for col in thermostat_df_cleaned.columns if col != 'timestamp']
thermostat_df_cleaned = thermostat_df_cleaned[cols]


In [11]:

print(thermostat_df_cleaned.columns)
print(thermostat_df_cleaned.dtypes)

Index(['timestamp', 'system_setting', 'system_mode', 'calendar_event',
       'program_mode', 'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion'],
      dtype='object')
timestamp                   datetime64[ns]
system_setting                      object
system_mode                         object
calendar_event                      object
program_mode                        object
cool_set_temp_f                    float64
heat_set_temp_f                    float64
current_temp_f                     float64
humidity_set_point_rh              float64
current_humidity_rh                float64
outdoor_temp_f                     float64
wind_speed_kmh                     float64
cool_stage_1_sec                   float64

### Clean Categorical Columns (All lower case)

To improve consistency and make filtering, grouping and plotting easier

In [12]:
cat_cols = ['system_mode', 'calendar_event', 'program_mode', 'system_setting']

for col in cat_cols:
    if col in thermostat_df_cleaned.columns:
        thermostat_df_cleaned[col] = thermostat_df_cleaned[col].astype(str).str.strip().str.lower()

thermostat_df_cleaned

Unnamed: 0,timestamp,system_setting,system_mode,calendar_event,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,dm_offset,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
0,2024-05-01 00:00:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,,73.0,57.0,0.0
1,2024-05-01 00:05:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,,73.0,57.0,0.0
2,2024-05-01 00:10:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,270.0,0.0,,73.0,57.0,0.0
3,2024-05-01 00:15:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,,73.0,57.0,0.0
4,2024-05-01 00:20:00,cool,compressorcooloff,,sleep,74.5,69.5,72.9,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,,72.9,57.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,,74.5,61.0,0.0
122684,2025-06-30 23:40:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,300.0,0.0,,74.6,61.0,0.0
122685,2025-06-30 23:45:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,270.0,0.0,,74.6,61.0,0.0
122686,2025-06-30 23:50:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,,74.5,61.0,0.0


### Missing Data Analysis & Dropping Non-Essential Colunms

- Before dropping, I created copy of entire dataframe

####  1. `calendar_event`
- Represents calendar-based scheduling events such as "vacation" or "smartAway".
- Not required for my core analysis, which focuses on runtime patterns, setpoint behavior, and weather correlation.
- Has ~20.75% missing data.
- Not aligned with my current objectives,and hence dropping.

#### 2. `dm_offset`
- Stands for "Device Management Offset" — an internal correction applied to sensor readings.
- Not user-controlled or visible in the interface.
- Has ~61.26% missing data and no analytical value for runtime, weather, or setpoint correlation.
- Dropped due to irrelevance to all core features and high missing percentage.

#### 3. remove rows where only `timestamp` has value and rest are nan

In [13]:
# Create a backup copy
thermostat_df_full = thermostat_df_cleaned.copy()

In [14]:
# Drop non-essential columns from main DataFrame
thermostat_df_cleaned.drop(columns=['calendar_event', 'dm_offset'], inplace=True)

In [15]:
#thermostat_df_full

In [16]:
#missing_pct = thermostat_df_cleaned.isnull().mean() * 100
#print(missing_pct)

In [17]:
# Display rows where only timestamp has avalue

# Ensure timestamp is not null
non_null_timestamp = thermostat_df_cleaned['timestamp'].notnull()
#print(non_null_timestamp)

# Check if all other columns are null or blank
other_cols_blank = thermostat_df_cleaned.drop(columns=['timestamp']).isna().all(axis=1)
#print(other_cols_blank)

# Filter rows where only timestamp has a value
only_timestamp_rows = thermostat_df_cleaned[non_null_timestamp & other_cols_blank]

# Display the result
only_timestamp_rows

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion


In [18]:
#Drop rows where all columns except 'timestamp' are blank

# thermostat_df = thermostat_df_cleaned[~thermostat_df_cleaned.drop(columns=['timestamp']).isnull().all(axis=1)]


### Remove Rows with Only Weather Data

During initial check, a small number of rows were found to contain values **only** in the `timestamp`, `outdoor_temp_f`, and `wind_speed_kmh` columns, while all other thermostat-related fields were missing.

These rows likely represent partial or incorrectly logged data and could interfere with forward-filling or time-based analysis.

To maintain data integrity:
- Identified 218 such rows using a column-wise null check.
- Then dropped them from the dataset before applying any forward-filling or missing value imputation.

This ensures that only valid thermostat readings are retained in the cleaned dataset.

In [19]:
thermostat_df_cleaned.loc[503, 'cool_set_temp_f']
thermostat_df_cleaned.loc[503, 'system_setting']


'nan'

In [20]:
thermostat_df_cleaned.isna().sum()

timestamp                     0
system_setting                0
system_mode                   0
program_mode                  0
cool_set_temp_f             218
heat_set_temp_f             218
current_temp_f              218
humidity_set_point_rh       218
current_humidity_rh         218
outdoor_temp_f              138
wind_speed_kmh              138
cool_stage_1_sec            218
heat_stage_1_sec            218
fan_sec                     218
humidifier_sec              218
thermostat_temperature_f    218
thermostat_humidity_rh      218
thermostat_motion           218
dtype: int64

In [21]:
# Define thermostat-related columns (excluding timestamp, outdoor_temp_f, wind_speed_kmh,'program_mode','system_mode', 'system_setting')
thermo_cols = thermostat_df_cleaned.columns.difference(['timestamp', 'outdoor_temp_f', 'wind_speed_kmh', 'program_mode',
       'system_mode', 'system_setting'])
print(thermo_cols)

Index(['cool_set_temp_f', 'cool_stage_1_sec', 'current_humidity_rh',
       'current_temp_f', 'fan_sec', 'heat_set_temp_f', 'heat_stage_1_sec',
       'humidifier_sec', 'humidity_set_point_rh', 'thermostat_humidity_rh',
       'thermostat_motion', 'thermostat_temperature_f'],
      dtype='object')


In [22]:
# Create a mask for rows where all key thermostat-field related columns are NaN

rows_to_drop_mask = thermostat_df_cleaned[thermo_cols].isna().all(axis=1)
#rows_to_drop_mask


In [23]:
# Confirm count
print("Rows to drop:", rows_to_drop_mask.sum())

Rows to drop: 218


### Log or Save Dropped Rows Before Deletion

- Just for safer side, Im making a copy of rows-to_drop_mask rows into a .csv file

In [24]:
dropped_rows = thermostat_df_cleaned[rows_to_drop_mask]
dropped_rows.to_csv("../data/raw/dropped_thermostat_rows.csv", index=False)

In [25]:
# Final step is to drop the 218 rows
thermostat_df_cleaned = thermostat_df_cleaned[~rows_to_drop_mask]

In [26]:
# Checking to see whether 218 rows exists

thermostat_df_cleaned.isna().sum()

timestamp                     0
system_setting                0
system_mode                   0
program_mode                  0
cool_set_temp_f               0
heat_set_temp_f               0
current_temp_f                0
humidity_set_point_rh         0
current_humidity_rh           0
outdoor_temp_f              138
wind_speed_kmh              138
cool_stage_1_sec              0
heat_stage_1_sec              0
fan_sec                       0
humidifier_sec                0
thermostat_temperature_f      0
thermostat_humidity_rh        0
thermostat_motion             0
dtype: int64

### Making a copy before filling

In [27]:
# making a copy before filling
thermostat_df_before_fill = thermostat_df_cleaned.copy()
#thermostat_df_before_fill

###  Handling Missing Values in Thermostat Data

After removing rows with only timestamp values and dropping non-essential columns, a small percentage of missing data remains in several sensor and runtime columns (approximately 0.18%).

Since the data is time series and logged at 5-minute intervals, forward-filling (`ffill`) is an appropriate strategy to maintain continuity. This method fills missing values with the most recent valid entry, which is especially useful for sensor readings and environmental metrics.

Before applying forward-fill, sort  data chronologically by `timestamp`.


In [28]:
# Sort data by timestamp for chronological order
thermostat_df_cleaned = thermostat_df_cleaned.copy()
thermostat_df_cleaned.sort_values('timestamp', inplace=True)
#thermostat_df_cleaned

In [29]:
# Forward-fill missing values in all columns
#thermostat_df.fillna(method='ffill', inplace=True)
thermostat_df_cleaned.ffill(inplace=True)

In [30]:
missing_pct1 = thermostat_df_cleaned.isnull().mean() * 100
print(missing_pct1)

timestamp                   0.0
system_setting              0.0
system_mode                 0.0
program_mode                0.0
cool_set_temp_f             0.0
heat_set_temp_f             0.0
current_temp_f              0.0
humidity_set_point_rh       0.0
current_humidity_rh         0.0
outdoor_temp_f              0.0
wind_speed_kmh              0.0
cool_stage_1_sec            0.0
heat_stage_1_sec            0.0
fan_sec                     0.0
humidifier_sec              0.0
thermostat_temperature_f    0.0
thermostat_humidity_rh      0.0
thermostat_motion           0.0
dtype: float64


In [31]:
# check if there are any missing (NaN) values in the entire DataFrame.

thermostat_df_cleaned.isnull().values.any()

np.False_

In [32]:
# Missing per column
thermostat_df_cleaned.isnull().sum() 

timestamp                   0
system_setting              0
system_mode                 0
program_mode                0
cool_set_temp_f             0
heat_set_temp_f             0
current_temp_f              0
humidity_set_point_rh       0
current_humidity_rh         0
outdoor_temp_f              0
wind_speed_kmh              0
cool_stage_1_sec            0
heat_stage_1_sec            0
fan_sec                     0
humidifier_sec              0
thermostat_temperature_f    0
thermostat_humidity_rh      0
thermostat_motion           0
dtype: int64

In [33]:
# Rows with at least one missing value
thermostat_df_cleaned[thermostat_df_cleaned.isnull().any(axis=1)]

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion


In [34]:
# count the number of duplicate rows

thermostat_df_cleaned.duplicated().sum()

np.int64(0)

In [35]:
# Check duplicates based on timestamp only

thermostat_df_cleaned.duplicated(subset=['timestamp']).sum()

np.int64(12)

In [36]:
# Finding duplicate timestamps
duplicate_ts = thermostat_df_cleaned[thermostat_df_cleaned.duplicated(subset=['timestamp'], keep=False)]
duplicate_ts.sort_values('timestamp')

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
53580,2024-11-03 01:00:00,cool,compressorcooloff,sleep,72.0,72.0,68.6,0.0,57.0,43.5,0.0,0.0,0.0,300.0,0.0,68.6,57.0,0.0
53592,2024-11-03 01:00:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,300.0,0.0,68.4,57.0,0.0
53581,2024-11-03 01:05:00,cool,compressorcooloff,sleep,72.0,72.0,68.6,0.0,57.0,43.5,0.0,0.0,0.0,270.0,0.0,68.6,57.0,0.0
53593,2024-11-03 01:05:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,270.0,0.0,68.4,57.0,0.0
53594,2024-11-03 01:10:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,255.0,0.0,68.4,57.0,0.0
53582,2024-11-03 01:10:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,255.0,0.0,68.7,57.0,0.0
53583,2024-11-03 01:15:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,300.0,0.0,68.7,57.0,0.0
53595,2024-11-03 01:15:00,cool,compressorcooloff,sleep,72.0,72.0,68.3,0.0,57.0,42.2,0.0,0.0,0.0,300.0,0.0,68.3,57.0,0.0
53584,2024-11-03 01:20:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,270.0,0.0,68.7,57.0,0.0
53596,2024-11-03 01:20:00,cool,compressorcooloff,sleep,72.0,72.0,68.3,0.0,57.0,42.2,0.0,0.0,0.0,270.0,0.0,68.3,57.0,0.0


### Handling Duplicate Timestamps in Data Due to DST (Fall Back)

- While checking for full_row duplicates, the code returned no duplicate values. But when checked based on `timestamps`, some timestamps are duplicated.
- 12 timestamps are repeated twice and only `current_temp_f` and `thermostat_temperature_f` column values differ slightly between rows logged at the same timestamp.  
- On further analysis, it was found out that the date **Nov 3, 2024** is the day DST ends. The reason for **duplicate timestamps** was clocks moved backward from 2:00 AM to 1:00 AM, creating a repeated hour (1:00 AM to 2:00 AM occurs twice). 
- Resolved this by dropping duplicates, retaining only the first occurrence

##### Outcome:
- Ensured accurate hourly aggregation without overcounting fan_sec values.
- Maintained data consistency across the DST transition without introducing artifacts.

### Handling Missing Hours Due to Daylight Saving Time (Spring Forward)
On **March 9, 2025**, clocks moved forward from 2:00 AM to 3:00 AM — meaning **the hour 2:00 AM to 2:59 AM does not exist** in the local time series.
- A natural one-hour gap in the data on 2025-03-09.
- Did not forward-fill or interpolate the missing hour. Preseved the missing hour as-is.

##### Why This Matters:
- Preserving missing time periods avoids creating artificial data.
- Demonstrates real-world sensor behavior, including offline hours, DST shifts, or device downtime.

In [47]:
# creating a backup copy
thermostat_df_with_dupes = thermostat_df_cleaned.copy()

In [48]:
# Remove duplicate timestamps and retain first occurarence

thermostat_df_cleaned = thermostat_df_cleaned.drop_duplicates(subset='timestamp', keep='first').copy()

In [49]:
# Check duplicates based on timestamp only

thermostat_df_cleaned.duplicated(subset=['timestamp']).sum()

np.int64(0)

In [50]:
# Finding duplicate timestamps
duplicate_ts = thermostat_df_cleaned[thermostat_df_cleaned.duplicated(subset=['timestamp'], keep=False)]
duplicate_ts.sort_values('timestamp')

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion


### Outlier Check (Summary Only)

Outlier detection was performed in the earlier version of this notebook (`clean_thermostat_data.ipynb`) using rule-based thresholds for temperature, humidity, and runtime columns.

However, for this final version:
- After careful analysis, I found out that I'm not going to use outliers for filtering, visualization or analysis. So removed the logic for detecting and tagging the `outlier`.
- We assume the cleaned dataset is reliable and all data points are retained unless explicitly invalid.
- Unused or redundant columns have been dropped in the final cleanup.

### Convert Categorical Columns to `category` Dtype

To optimize memory usage and improve processing efficiency, categorical string columns are explicitly converted to the `category` data type.

This is particularly useful for columns with repeated values such as:

- `system_setting`
- `system_mode`
- `program_mode`

Converting these to the `category` dtype:
- Reduces memory footprint
- Enables faster comparisons and groupby operations
- Helps pandas treat them as discrete labels rather than raw strings

This step is especially important before merging, analysis, or exporting the data to databases.

In [43]:
thermostat_categorical_cols = [
    'system_setting', 'system_mode', 'program_mode'
]
for col in thermostat_categorical_cols:
    thermostat_df_cleaned[col] = thermostat_df_cleaned[col].astype('category')

In [46]:
thermostat_df_cleaned.dtypes

timestamp                   datetime64[ns]
system_setting                    category
system_mode                       category
program_mode                      category
cool_set_temp_f                    float64
heat_set_temp_f                    float64
current_temp_f                     float64
humidity_set_point_rh              float64
current_humidity_rh                float64
outdoor_temp_f                     float64
wind_speed_kmh                     float64
cool_stage_1_sec                   float64
heat_stage_1_sec                   float64
fan_sec                            float64
humidifier_sec                     float64
thermostat_temperature_f           float64
thermostat_humidity_rh             float64
thermostat_motion                  float64
dtype: object

### Extracting Date and Time Details from Timestamp

To better understand patterns in HVAC usage over time, I extracted specific components of the `timestamp` column. This includes:

- The **date** (like 2025-08-04)
- The **hour** of the day (0 to 23)
- The **minute**
- The **day of the week** (like Monday or Friday)
- The **month name** (like August or December)

These new columns help me to explore trends based on time—such as which hours or days have more HVAC activity, or how behavior changes across seasons.

Since the `timestamp` column was already in the right datetime format, I didn’t need to convert it again. I just extracted what is needed for analysis.


In [None]:
# Extracting components from timestamp

thermostat_df_cleaned['date'] = thermostat_df_cleaned['timestamp'].dt.date
thermostat_df_cleaned['hour'] = thermostat_df_cleaned['timestamp'].dt.hour
thermostat_df_cleaned['minute'] = thermostat_df_cleaned['timestamp'].dt.minute
thermostat_df_cleaned['weekday'] = thermostat_df_cleaned['timestamp'].dt.day_name()
thermostat_df_cleaned['month'] = thermostat_df_cleaned['timestamp'].dt.month_name()

In [None]:
thermostat_df_cleaned['hour'].dtype

dtype('int32')

In [None]:
#thermostat_df_cleaned

### Dropping Unused Columns

To streamline the dataset and reduce unnecessary storage before saving and pushing to the database, I dropped a set of columns that are not currently used in my analysis or visualizations. These include:

- `humidity_set_point_rh`
- `wind_speed_kmh`
- `cool_stage_1_sec`
- `heat_stage_1_sec`
- `humidifier_sec`
- `thermostat_temperature_f`
- `thermostat_humidity_rh`
- `thermostat_motion`

These fields were either unused, or not directly relevant to the core goals of my project—like HVAC runtime, comfort trends, or indoor vs. outdoor temperature comparison. Keeping only the relevant data makes further processing faster and the database footprint smaller.

In [None]:
# create back up before dropping unused columns

thermostat_df_cleaned_before_drop = thermostat_df_cleaned.copy()
thermostat_df_cleaned_before_drop

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,...,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion,date,hour,minute,weekday,month
0,2024-05-01 00:00:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,...,300.0,0.0,73.0,57.0,0.0,2024-05-01,0,0,Wednesday,May
1,2024-05-01 00:05:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,...,255.0,0.0,73.0,57.0,0.0,2024-05-01,0,5,Wednesday,May
2,2024-05-01 00:10:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,...,270.0,0.0,73.0,57.0,0.0,2024-05-01,0,10,Wednesday,May
3,2024-05-01 00:15:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,...,300.0,0.0,73.0,57.0,0.0,2024-05-01,0,15,Wednesday,May
4,2024-05-01 00:20:00,cool,compressorcooloff,sleep,74.5,69.5,72.9,0.0,57.0,59.9,...,255.0,0.0,72.9,57.0,0.0,2024-05-01,0,20,Wednesday,May
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,...,255.0,0.0,74.5,61.0,0.0,2025-06-30,23,35,Monday,June
122684,2025-06-30 23:40:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,...,300.0,0.0,74.6,61.0,0.0,2025-06-30,23,40,Monday,June
122685,2025-06-30 23:45:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,...,270.0,0.0,74.6,61.0,0.0,2025-06-30,23,45,Monday,June
122686,2025-06-30 23:50:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,...,255.0,0.0,74.5,61.0,0.0,2025-06-30,23,50,Monday,June


In [None]:
# Drop Unused Columns

columns_to_drop = [
    "humidity_set_point_rh",
    "wind_speed_kmh",
    "cool_stage_1_sec",
    "heat_stage_1_sec",
    "humidifier_sec",
    "thermostat_temperature_f",
    "thermostat_humidity_rh",
    "thermostat_motion"
]

thermostat_df_cleaned.drop(columns=columns_to_drop, inplace=True)

In [None]:
thermostat_df_cleaned

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,current_humidity_rh,outdoor_temp_f,fan_sec,date,hour,minute,weekday,month
0,2024-05-01 00:00:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,57.0,59.9,300.0,2024-05-01,0,0,Wednesday,May
1,2024-05-01 00:05:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,57.0,59.9,255.0,2024-05-01,0,5,Wednesday,May
2,2024-05-01 00:10:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,57.0,59.9,270.0,2024-05-01,0,10,Wednesday,May
3,2024-05-01 00:15:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,57.0,59.9,300.0,2024-05-01,0,15,Wednesday,May
4,2024-05-01 00:20:00,cool,compressorcooloff,sleep,74.5,69.5,72.9,57.0,59.9,255.0,2024-05-01,0,20,Wednesday,May
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,61.0,75.0,255.0,2025-06-30,23,35,Monday,June
122684,2025-06-30 23:40:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,61.0,75.0,300.0,2025-06-30,23,40,Monday,June
122685,2025-06-30 23:45:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,61.0,75.0,270.0,2025-06-30,23,45,Monday,June
122686,2025-06-30 23:50:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,61.0,75.0,255.0,2025-06-30,23,50,Monday,June


### Categorizing Fan Runtime (Before Aggregation)

categorized the `fan_sec` values recorded at 5-minute intervals to understand the distribution of fan usage over time. This helps in analyzing patterns such as how indoor temperature varies based on short-duration fan activity.
To acheive this:
- created a new column `fan_runtime_category` using the `.apply()` function. This helped convert continuous `fan_sec` values into interpretable labels:
- **Low** for runtime < 10 minutes
- **Medium** for 10–30 minutes
- **High** for >30 minutes
- **Unknown** if value is missing

This is useful for visualizations like boxplots comparing temperature across different levels of fan usage.


In [None]:
# Define a function to categorize fan runtime
def categorize_fan_runtime(seconds):
    if seconds == 0:
        return 'No Runtime'
    elif seconds <= 180:
        return 'Low'
    elif seconds <= 300:
        return 'Medium'
    else:
        return 'High'

# Apply it to 5-min level data
thermostat_df_cleaned['fan_runtime_category'] = thermostat_df_cleaned['fan_sec'].apply(categorize_fan_runtime)


In [None]:
thermostat_df_cleaned[['fan_sec', 'fan_runtime_category']].head(10)

Unnamed: 0,fan_sec,fan_runtime_category
0,300.0,Medium
1,255.0,Medium
2,270.0,Medium
3,300.0,Medium
4,255.0,Medium
5,270.0,Medium
6,300.0,Medium
7,255.0,Medium
8,270.0,Medium
9,300.0,Medium


In [None]:
#thermostat_df_cleaned[thermostat_df_cleaned['fan_runtime_category'] == 'Low'].head(10)
#thermostat_df_cleaned.loc[thermostat_df_cleaned['fan_runtime_category'] == 'Low', ['fan_sec', 'fan_runtime_category']].head(10)

### Converting 5-Minute Data to Hourly Data

The raw thermostat data was originally collected every 5 minutes, which can be too detailed for high-level analysis. To make it easier to work with—and to match it better with the hourly weather data— I grouped the data by hour.


### 1 .Custom Aggregation
This methd applies diff agg functions to each column
- Temperature & Humidity values are averaged.
- Fan Runtime is summed to reflect total usage within the hour.
- Categorical Columns like system_mode, program_mode, and fan_runtime_category use the mode (most frequent value) to preserve interpretability.

This hourly version of the data is much easier to analyze, visualize, and compare with weather conditions.

In [None]:
# To ensure timestamp is datetime , sort chronologically and set timestamp index 

thermostat_df_cleaned['timestamp'] = pd.to_datetime(thermostat_df_cleaned['timestamp'], errors='coerce')
thermostat_df_cleaned = thermostat_df_cleaned.sort_values('timestamp')
thermostat_df_cleaned.set_index('timestamp', inplace=True)


In [None]:
# 2. Custom Aggregation
# type: ignore

thermostat_df_hourly_custom = thermostat_df_cleaned.resample('1h').agg({
    'current_temp_f': 'mean',
    'cool_set_temp_f': 'mean',
    'heat_set_temp_f': 'mean',
    'current_humidity_rh': 'mean',
    'fan_sec': 'sum',
    'system_mode': lambda x: x.mode()[0] if not x.mode().empty else None,
    'program_mode': lambda x: x.mode()[0] if not x.mode().empty else None,
    'fan_runtime_category': lambda x: x.mode()[0] if not x.mode().empty else None
}).reset_index()

In [None]:
thermostat_df_hourly_custom.columns

Index(['timestamp', 'current_temp_f', 'cool_set_temp_f', 'heat_set_temp_f',
       'current_humidity_rh', 'fan_sec', 'system_mode', 'program_mode',
       'fan_runtime_category'],
      dtype='object')

In [None]:
# Round only numeric columns after aggregation
numeric_cols_cust = ['current_temp_f', 'cool_set_temp_f', 'heat_set_temp_f', 'current_humidity_rh']
thermostat_df_hourly_custom[numeric_cols_cust] = thermostat_df_hourly_custom[numeric_cols_cust].round(2)

In [None]:
thermostat_df_hourly_custom.head()

Unnamed: 0,timestamp,current_temp_f,cool_set_temp_f,heat_set_temp_f,current_humidity_rh,fan_sec,system_mode,program_mode,fan_runtime_category
0,2024-05-01 00:00:00,72.88,74.5,69.5,57.0,3300.0,compressorcooloff,sleep,Medium
1,2024-05-01 01:00:00,72.43,74.5,69.5,57.0,3300.0,compressorcooloff,sleep,Medium
2,2024-05-01 02:00:00,72.05,74.5,69.5,57.0,3300.0,compressorcooloff,sleep,Medium
3,2024-05-01 03:00:00,71.59,74.5,69.5,57.0,3300.0,compressorcooloff,sleep,Medium
4,2024-05-01 04:00:00,71.17,74.5,69.5,57.0,3300.0,compressorcooloff,sleep,Medium


In [None]:
thermostat_df_hourly_custom.describe().round(2)

Unnamed: 0,timestamp,current_temp_f,cool_set_temp_f,heat_set_temp_f,current_humidity_rh,fan_sec
count,10224,10221.0,10221.0,10221.0,10221.0,10224.0
mean,2024-11-29 23:30:00,71.45,73.15,70.67,52.25,3331.96
min,2024-05-01 00:00:00,62.28,68.0,67.5,33.17,0.0
25%,2024-08-15 11:45:00,69.75,72.0,69.5,46.17,3300.0
50%,2024-11-29 23:30:00,71.69,73.0,70.17,54.5,3300.0
75%,2025-03-16 11:15:00,73.01,75.0,72.0,58.0,3300.0
max,2025-06-30 23:00:00,78.62,80.0,79.0,69.75,3600.0
std,,2.36,2.19,2.18,7.67,178.17


### Categorizing Fan Runtime (After Aggregation)

After aggregating the data to hourly intervals, the `fan_sec` column represents the **total fan runtime in seconds per hour**. Categorizing the `fan_sec` values into different runtime levels, makes the data easier to analyze and visualize

### Categorization Logic:
- **No Runtime**: `fan_sec == 0`
- **Low**: Fan ran for ≤ 25% of the hour (≤ 900 seconds)
- **Medium**: Fan ran for > 25% and ≤ 75% of the hour (900 < `fan_sec` ≤ 2700 seconds)
- **High**: Fan ran for > 75% of the hour (`fan_sec` > 2700 seconds)

### Outcome:
- This feature simplifies HVAC usage analysis.
- Helps classify time periods into actionable categories for energy optimization.
- Used later in visualizations like fan runtime trends, comfort analysis, and efficiency comparisons.

In [None]:
def categorize_hourly_fan_runtime(fan_sec, max_runtime=3600):
    if fan_sec == 0:
        return 'No Runtime'
    usage_pct = (fan_sec / max_runtime) * 100
    if usage_pct <= 25:
        return 'Low'
    elif usage_pct <= 75:
        return 'Medium'
    else:
        return 'High'

In [None]:
thermostat_df_hourly_custom['fan_runtime_category'] = thermostat_df_hourly_custom['fan_sec'].apply(categorize_hourly_fan_runtime)

In [None]:
thermostat_df_hourly_custom

Unnamed: 0,timestamp,current_temp_f,cool_set_temp_f,heat_set_temp_f,current_humidity_rh,fan_sec,system_mode,program_mode,fan_runtime_category
0,2024-05-01 00:00:00,72.88,74.5,69.5,57.00,3300.0,compressorcooloff,sleep,High
1,2024-05-01 01:00:00,72.43,74.5,69.5,57.00,3300.0,compressorcooloff,sleep,High
2,2024-05-01 02:00:00,72.05,74.5,69.5,57.00,3300.0,compressorcooloff,sleep,High
3,2024-05-01 03:00:00,71.59,74.5,69.5,57.00,3300.0,compressorcooloff,sleep,High
4,2024-05-01 04:00:00,71.17,74.5,69.5,57.00,3300.0,compressorcooloff,sleep,High
...,...,...,...,...,...,...,...,...,...
10219,2025-06-30 19:00:00,75.24,75.0,73.0,56.83,3150.0,compressorcooloff,home,High
10220,2025-06-30 20:00:00,75.00,75.0,73.0,55.92,3375.0,compressorcoolstage1on,home,High
10221,2025-06-30 21:00:00,74.68,75.0,73.0,58.58,3300.0,compressorcooloff,home,High
10222,2025-06-30 22:00:00,74.80,75.0,73.0,60.00,3300.0,compressorcooloff,sleep,High


In [None]:
thermostat_df_hourly_custom[['fan_sec', 'fan_runtime_category']].head(10)

Unnamed: 0,fan_sec,fan_runtime_category
0,3300.0,High
1,3300.0,High
2,3300.0,High
3,3300.0,High
4,3300.0,High
5,3300.0,High
6,3300.0,High
7,3300.0,High
8,3300.0,High
9,3300.0,High


In [None]:

#thermostat_df_hourly_custom.loc[thermostat_df_hourly_custom['fan_runtime_category'] == 'Low', ['fan_sec', 'fan_runtime_category']].head(10)

In [None]:
# reset index
#thermostat_df_cleaned.reset_index(inplace=True)

In [None]:
print(thermostat_df_hourly_custom.index.name)

None


In [None]:
# Save custom aggregation result
thermostat_df_hourly_custom.to_csv("../data/processed/thermostat_hourly_custom.csv", index=False, float_format="%.2f")

In [None]:
# Save cleaned version
thermostat_df_cleaned.to_csv(cleaned_path, index=False, float_format="%.2f")
print(f" Cleaned thermostat data saved to: {cleaned_path}")

 Cleaned thermostat data saved to: ../data/processed/thermostat_data_cleaned_final.csv
