In [2]:
import pandas as pd
import os
import sqlite3
import sys
sys.path.append('../scripts') #Add scripts folder to path

from utilities import (
    profile_dataframe_overview,
    summary_stats,
    check_constant_columns,
    check_empty_columns
)

### Cleaning raw thermostat data previously saved as a CSV for data processing. 

- Converting the `date` and `time` column into a proper `timestamp`
- Renaming column names for better readability and to be consistent with weather data
- Dropping the original `date` and `time` column
- Removing duplicate or fully empty rows on need basis 
- Fill or zero-out runtime columns
- Reordering columns for consistency
- Detecting Outliers and tagging them for hadling and visualizing later

The cleaned weather data is saved in `data/processed/thermostat_data_cleaned.csv` and will be ready for merging with weather data.

In [3]:
# Define input/output paths. Create directory if doesn't exists already

raw_path = "../data/raw/thermostat_combined.csv"
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)
cleaned_path = os.path.join(processed_dir, "thermostat_data_cleaned.csv")


In [4]:
# Load raw thermostat data . This dataset contains thermostat runtime parameters such as timestamp, indoor temperature, humidity, setpoint temperature, HVAC mode, HVAC Runtime.

thermostat_df = pd.read_csv(raw_path)
#print("Current Working Directory:", os.getcwd())
#print(thermostat_df)

### BASIC OVERVIEW
### profile_dataframe_overview() 
This function provides a structured overview of any DataFrame. It displays:

- The number of rows and columns
- Column names and data types
- DataFrame info summary (including memory usage)
- Missing values per column (as percentages)
- Unique values per column
- Sample preview of the first and last N rows

In [5]:
profile_dataframe_overview(thermostat_df)


 Basic Structure
----------------------------------------
Rows: 122688, Columns: 21
Column Names: ['Date', 'Time', 'System Setting', 'System Mode', 'Calendar Event', 'Program Mode', 'Cool Set Temp (F)', 'Heat Set Temp (F)', 'Current Temp (F)', 'Humidity Set Point (%RH)', 'Current Humidity (%RH)', 'Outdoor Temp (F)', 'Wind Speed (km/h)', 'Cool Stage 1 (sec)', 'Heat Stage 1 (sec)', 'Fan (sec)', 'Humidifier (sec)', 'DM Offset', 'Thermostat Temperature (F)', 'Thermostat Humidity (%RH)', 'Thermostat Motion']

 Data Types
----------------------------------------
Date                           object
Time                           object
System Setting                 object
System Mode                    object
Calendar Event                 object
Program Mode                   object
Cool Set Temp (F)             float64
Heat Set Temp (F)             float64
Current Temp (F)              float64
Humidity Set Point (%RH)      float64
Current Humidity (%RH)        float64
Outdoor Temp (F)  

### DISPLAY STATS SUMMARY
### summary_stats()

This function returns standard summary statistics for all numeric columns in the DataFrame:

- count, mean, std
- min, 25th percentile, median (50%), 75th percentile, max

In [6]:
summary_statistics = summary_stats(thermostat_df)
print(summary_statistics)


 Summary Statistics:
                               count    mean    std   min    25%    50%  \
Cool Set Temp (F)           122470.0   73.15   2.20  68.0   72.0   73.0   
Heat Set Temp (F)           122470.0   70.67   2.18  67.5   69.5   70.2   
Current Temp (F)            122470.0   71.45   2.37  62.1   69.7   71.7   
Humidity Set Point (%RH)    122470.0   17.95  18.00   0.0    0.0    0.0   
Current Humidity (%RH)      122470.0   52.26   7.70  27.0   46.0   54.0   
Outdoor Temp (F)            122550.0   56.84  19.19  -3.8   42.8   60.7   
Wind Speed (km/h)           122550.0    0.00   0.00   0.0    0.0    0.0   
Cool Stage 1 (sec)          122470.0   40.33  98.44   0.0    0.0    0.0   
Heat Stage 1 (sec)          122470.0   20.03  65.53   0.0    0.0    0.0   
Fan (sec)                   122470.0  278.18  32.59   0.0  255.0  300.0   
Humidifier (sec)            122470.0    0.75  13.49   0.0    0.0    0.0   
DM Offset                    47534.0    0.15   0.62  -2.6   -0.2    0.2   
The

### check_constant_columns

This function identifies columns where all rows have the same value (e.g., a column that always says "My Ecobee" or 0 values).

Such columns are usually not informative and can be dropped to simplify the dataset.

In [7]:
const_columns = check_constant_columns(thermostat_df)
#print("\n Constant Columns:", const_columns)


 Constant Columns: ['Wind Speed (km/h)']


### check_empty_columns

This function returns a list of columns that are completely empty (i.e., 100% null values).

These columns typically be removed unless or populate them later.

In [8]:
empty_columns = check_empty_columns(thermostat_df)
#print("\n Empty Columns:", empty_columns)


 Empty Columns: []


### Standardize Column Names

Clean up messy column names

In [9]:
thermostat_df.columns = thermostat_df.columns.str.strip().str.lower().str.replace(' ', '_')

thermostat_df.rename(columns={
    'cool_set_temp_(f)': 'cool_set_temp_f',
    'heat_set_temp_(f)': 'heat_set_temp_f',
    'current_temp_(f)': 'current_temp_f',
    'humidity_set_point_(%rh)': 'humidity_set_point_rh',
    'current_humidity_(%rh)': 'current_humidity_rh',
    'outdoor_temp_(f)': 'outdoor_temp_f',
    'wind_speed_(km/h)': 'wind_speed_kmh',
    'cool_stage_1_(sec)': 'cool_stage_1_sec',
    'heat_stage_1_(sec)': 'heat_stage_1_sec',
    'fan_(sec)': 'fan_sec',
    'humidifier_(sec)': 'humidifier_sec',
    'thermostat_temperature_(f)': 'thermostat_temperature_f',
    'thermostat_humidity_(%rh)': 'thermostat_humidity_rh'
}, inplace=True)

print(thermostat_df.columns)

Index(['date', 'time', 'system_setting', 'system_mode', 'calendar_event',
       'program_mode', 'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion'],
      dtype='object')


### Convert date and Time to Timestamp Format
The thermostat dataset includes separate `date` and `time` columns. For time-based analysis and merging with other data (e.g., weather), it is essential to combine them into a single `timestamp` column and convert it to proper datetime format.

The `date` and `time` column is combined and then converted to `datetime` format and renamed as `timestamp` for compatibility with the weather dataset.

If datatype is not String, convert `date` and `time` to String using .astype(str)

To handle invalid date values, use errors=`coerce` argument, to replace invalid dates to NaT (Not a Time)

In order to avoid warning, slow and less consitent parsing, explicitly specify datetime format.

format = `%m/%d/%y %H:%M:%S`

In [10]:
# Combine 'date' and 'time' into 'timestamp' only if both columns are present

if 'date' in thermostat_df.columns and 'time' in thermostat_df.columns:
    thermostat_df['timestamp'] = pd.to_datetime(
        thermostat_df['date'] + ' ' + thermostat_df['time'], format='%m/%d/%y %H:%M:%S', errors='coerce'
        )
    # Drop original 'date' and 'time' columns safely
    thermostat_df.drop(columns=['date', 'time'], inplace=True)
else:
  print("Column 'date' and 'time' not found. Skipping 'datetime' conversion.")

#print(thermostat_df.columns)

### Reorder Columns
Rearrange the columns to place `timestamp` as the first column (if needed), followed by the HVAC runtime variables. This improves readability and aligns with the weather dataset.

In [11]:
# Reorder columns - timestamp first
cols = ['timestamp'] + [col for col in thermostat_df.columns if col != 'timestamp']
thermostat_df = thermostat_df[cols]

#print(thermostat_df.columns)
#print(thermostat_df.dtypes)

### Conversion of datatype for thermostat_motion

It is a motion sensor flag (0 or 1) can be int or bool but not float

In [12]:
thermostat_df['thermostat_motion'] = thermostat_df['thermostat_motion'].fillna(0).astype(int)
#print(thermostat_df.dtypes)

### Clean Categorical Columns (All lower case)

To improve consistency and make filtering, grouping and plotting easier

In [13]:
cat_cols = ['system_mode', 'calendar_event', 'program_mode', 'system_setting']

for col in cat_cols:
    if col in thermostat_df.columns:
        thermostat_df[col] = thermostat_df[col].astype(str).str.strip().str.lower()

#thermostat_df

### Missing Data Analysis & Dropping Non-Essential Colunms

- Before dropping, I created copy of entire dataframe

####  1. `calendar_event`
- Represents calendar-based scheduling events such as "vacation" or "smartAway".
- Not required for my core analysis, which focuses on runtime patterns, setpoint behavior, and weather correlation.
- Has ~20.75% missing data.
- Not aligned with my current objectives,and hence dropping.

#### 2. `dm_offset`
- Stands for "Device Management Offset" — an internal correction applied to sensor readings.
- Not user-controlled or visible in the interface.
- Has ~61.26% missing data and no analytical value for runtime, weather, or setpoint correlation.
- Dropped due to irrelevance to all core features and high missing percentage.

#### 3. remove rows where only `timestamp` has value and rest are nan

In [14]:
# Create a backup copy
#thermostat_df_full = thermostat_df.copy()


# Drop non-essential columns from main DataFrame
thermostat_df.drop(columns=['calendar_event', 'dm_offset'], inplace=True)


In [15]:
#Drop rows where all columns except 'timestamp' are blank

thermostat_df = thermostat_df[~thermostat_df.drop(columns=['timestamp']).isnull().all(axis=1)]

missing_pct = thermostat_df.isnull().mean() * 100
print(missing_pct)

timestamp                   0.000000
system_setting              0.000000
system_mode                 0.000000
program_mode                0.000000
cool_set_temp_f             0.177686
heat_set_temp_f             0.177686
current_temp_f              0.177686
humidity_set_point_rh       0.177686
current_humidity_rh         0.177686
outdoor_temp_f              0.112480
wind_speed_kmh              0.112480
cool_stage_1_sec            0.177686
heat_stage_1_sec            0.177686
fan_sec                     0.177686
humidifier_sec              0.177686
thermostat_temperature_f    0.177686
thermostat_humidity_rh      0.177686
thermostat_motion           0.000000
dtype: float64


In [16]:
# making a copy before filling
thermostat_df_before_fill = thermostat_df.copy()
thermostat_df_before_fill


Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
0,2024-05-01 00:00:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,73.0,57.0,0
1,2024-05-01 00:05:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,73.0,57.0,0
2,2024-05-01 00:10:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,270.0,0.0,73.0,57.0,0
3,2024-05-01 00:15:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,73.0,57.0,0
4,2024-05-01 00:20:00,cool,compressorcooloff,sleep,74.5,69.5,72.9,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,72.9,57.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,74.5,61.0,0
122684,2025-06-30 23:40:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,300.0,0.0,74.6,61.0,0
122685,2025-06-30 23:45:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,270.0,0.0,74.6,61.0,0
122686,2025-06-30 23:50:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,74.5,61.0,0


In [17]:
missing_pct = thermostat_df_before_fill.isnull().mean() * 100
print(missing_pct)

timestamp                   0.000000
system_setting              0.000000
system_mode                 0.000000
program_mode                0.000000
cool_set_temp_f             0.177686
heat_set_temp_f             0.177686
current_temp_f              0.177686
humidity_set_point_rh       0.177686
current_humidity_rh         0.177686
outdoor_temp_f              0.112480
wind_speed_kmh              0.112480
cool_stage_1_sec            0.177686
heat_stage_1_sec            0.177686
fan_sec                     0.177686
humidifier_sec              0.177686
thermostat_temperature_f    0.177686
thermostat_humidity_rh      0.177686
thermostat_motion           0.000000
dtype: float64


###  Handling Missing Values in Thermostat Data

After removing rows with only timestamp values and dropping non-essential columns, a small percentage of missing data remains in several sensor and runtime columns (approximately 0.18%).

Since the data is time series and logged at 5-minute intervals, forward-filling (`ffill`) is an appropriate strategy to maintain continuity. This method fills missing values with the most recent valid entry, which is especially useful for sensor readings and environmental metrics.

Before applying forward-fill, sort  data chronologically by `timestamp`.


In [18]:

# Sort data by timestamp for chronological order
thermostat_df.sort_values('timestamp', inplace=True)
thermostat_df

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
0,2024-05-01 00:00:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,73.0,57.0,0
1,2024-05-01 00:05:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,73.0,57.0,0
2,2024-05-01 00:10:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,270.0,0.0,73.0,57.0,0
3,2024-05-01 00:15:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,73.0,57.0,0
4,2024-05-01 00:20:00,cool,compressorcooloff,sleep,74.5,69.5,72.9,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,72.9,57.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,74.5,61.0,0
122684,2025-06-30 23:40:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,300.0,0.0,74.6,61.0,0
122685,2025-06-30 23:45:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,270.0,0.0,74.6,61.0,0
122686,2025-06-30 23:50:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,74.5,61.0,0


In [19]:
# Forward-fill missing values in all columns
#thermostat_df.fillna(method='ffill', inplace=True)
thermostat_df.ffill(inplace=True)

In [20]:
thermostat_df

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
0,2024-05-01 00:00:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,73.0,57.0,0
1,2024-05-01 00:05:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,73.0,57.0,0
2,2024-05-01 00:10:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,270.0,0.0,73.0,57.0,0
3,2024-05-01 00:15:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,73.0,57.0,0
4,2024-05-01 00:20:00,cool,compressorcooloff,sleep,74.5,69.5,72.9,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,72.9,57.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,74.5,61.0,0
122684,2025-06-30 23:40:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,300.0,0.0,74.6,61.0,0
122685,2025-06-30 23:45:00,cool,compressorcooloff,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,270.0,0.0,74.6,61.0,0
122686,2025-06-30 23:50:00,cool,compressorcooloff,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,74.5,61.0,0


In [21]:
missing_pct1 = thermostat_df.isnull().mean() * 100
print(missing_pct1)

timestamp                   0.0
system_setting              0.0
system_mode                 0.0
program_mode                0.0
cool_set_temp_f             0.0
heat_set_temp_f             0.0
current_temp_f              0.0
humidity_set_point_rh       0.0
current_humidity_rh         0.0
outdoor_temp_f              0.0
wind_speed_kmh              0.0
cool_stage_1_sec            0.0
heat_stage_1_sec            0.0
fan_sec                     0.0
humidifier_sec              0.0
thermostat_temperature_f    0.0
thermostat_humidity_rh      0.0
thermostat_motion           0.0
dtype: float64


In [22]:
# check if there are any missing (NaN) values in the entire DataFrame.
thermostat_df.isnull().values.any()

np.False_

In [23]:
# Missing per column
thermostat_df.isnull().sum() 

timestamp                   0
system_setting              0
system_mode                 0
program_mode                0
cool_set_temp_f             0
heat_set_temp_f             0
current_temp_f              0
humidity_set_point_rh       0
current_humidity_rh         0
outdoor_temp_f              0
wind_speed_kmh              0
cool_stage_1_sec            0
heat_stage_1_sec            0
fan_sec                     0
humidifier_sec              0
thermostat_temperature_f    0
thermostat_humidity_rh      0
thermostat_motion           0
dtype: int64

In [24]:
# Rows with at least one missing value
thermostat_df[thermostat_df.isnull().any(axis=1)]

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion


In [25]:
# count the number of duplicate rows
thermostat_df.duplicated().sum()

np.int64(0)

In [26]:
# Check duplicates based on timestamp only
thermostat_df.duplicated(subset=['timestamp']).sum()

np.int64(12)

In [27]:
# Finding duplicate timestamps
duplicate_ts = thermostat_df[thermostat_df.duplicated(subset=['timestamp'], keep=False)]
duplicate_ts.sort_values('timestamp')

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
53592,2024-11-03 01:00:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,300.0,0.0,68.4,57.0,0
53580,2024-11-03 01:00:00,cool,compressorcooloff,sleep,72.0,72.0,68.6,0.0,57.0,43.5,0.0,0.0,0.0,300.0,0.0,68.6,57.0,0
53581,2024-11-03 01:05:00,cool,compressorcooloff,sleep,72.0,72.0,68.6,0.0,57.0,43.5,0.0,0.0,0.0,270.0,0.0,68.6,57.0,0
53593,2024-11-03 01:05:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,270.0,0.0,68.4,57.0,0
53594,2024-11-03 01:10:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,255.0,0.0,68.4,57.0,0
53582,2024-11-03 01:10:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,255.0,0.0,68.7,57.0,0
53583,2024-11-03 01:15:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,300.0,0.0,68.7,57.0,0
53595,2024-11-03 01:15:00,cool,compressorcooloff,sleep,72.0,72.0,68.3,0.0,57.0,42.2,0.0,0.0,0.0,300.0,0.0,68.3,57.0,0
53584,2024-11-03 01:20:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,270.0,0.0,68.7,57.0,0
53596,2024-11-03 01:20:00,cool,compressorcooloff,sleep,72.0,72.0,68.3,0.0,57.0,42.2,0.0,0.0,0.0,270.0,0.0,68.3,57.0,0


### Handling Duplicate Timestamps in Data

- While checking for full_row duplicates, the code returned no duplicate values. But when checked based on `timestamps`, some timestamps are duplicated.
- 12 timestamps are repeated twice and only `current_temp_f` and `thermostat_temperature_f` column values differ slightly between rows logged at the same timestamp.
- so for merging with weather data and building time-based visualization, in order to avoid misalign and duplicate charts, I decided to remove the duplicate row
- before doing , created a backup copy of full dataset 

In [28]:
thermostat_df_with_dupes = thermostat_df.copy()

In [29]:
# Remove duplicate timestamps and retain first occurarence

#thermostat_df = thermostat_df.drop_duplicates(subset='timestamp', keep='first')

In [30]:
thermostat_df = thermostat_df.drop_duplicates(subset='timestamp', keep='first').copy()

In [31]:
# Check duplicates based on timestamp only
thermostat_df.duplicated(subset=['timestamp']).sum()

np.int64(0)

In [32]:
duplicate_ts = thermostat_df[thermostat_df.duplicated(subset=['timestamp'], keep=False)]
duplicate_ts.sort_values('timestamp')

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion


### Outlier Detection

Identifying potential outliers using the Interquartile Range (IQR) method. This helps in detecting abnormal sensor reading or runtime anomalies values that might distort analysis.
Since the thermostat dataset contains multiple continuous numeric columns, **Interquartile Range (IQR)** method is applied to each relevant column individually.
- .quantile() takes a float value between 0 and 1 to specify the desired quantile.
- Q1 = .25 * (N-1) where N is number of datapoints in the list
- Q3 = .75 * (N-1)
1. interpolation:
- Q = Xi + f * (Xi+1 - Xi) 
- Xi - Value at lower index
- Xi+1 - Value at the upper index
- f - fractional distance between lower and upper index (e.g 0.25, .75)
2. Created a new boolean column for each targeted column, where:
        - `True` indicates an outlier
        - `False` indicates a normal value

In [33]:
numeric_cols_for_outliers = [
    'current_temp_f', 'current_humidity_rh',
    'cool_set_temp_f', 'heat_set_temp_f',
    'humidity_set_point_rh',
    'thermostat_temperature_f', 'thermostat_humidity_rh',
    'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec', 'humidifier_sec'
]

for col in numeric_cols_for_outliers:
    q1 = thermostat_df[col].quantile(0.25)
    q3 = thermostat_df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    thermostat_df[f'{col}_outlier'] = ~thermostat_df[col].between(lower_bound, upper_bound)

    
print(thermostat_df)

                 timestamp system_setting        system_mode program_mode  \
0      2024-05-01 00:00:00           cool  compressorcooloff        sleep   
1      2024-05-01 00:05:00           cool  compressorcooloff        sleep   
2      2024-05-01 00:10:00           cool  compressorcooloff        sleep   
3      2024-05-01 00:15:00           cool  compressorcooloff        sleep   
4      2024-05-01 00:20:00           cool  compressorcooloff        sleep   
...                    ...            ...                ...          ...   
122683 2025-06-30 23:35:00           cool  compressorcooloff        sleep   
122684 2025-06-30 23:40:00           cool  compressorcooloff        sleep   
122685 2025-06-30 23:45:00           cool  compressorcooloff        sleep   
122686 2025-06-30 23:50:00           cool  compressorcooloff        sleep   
122687 2025-06-30 23:55:00           cool  compressorcooloff        sleep   

        cool_set_temp_f  heat_set_temp_f  current_temp_f  \
0              

### Summarize the number of outliers in each column

- Helps to identify which variables hav the most frquent anamolies and whether further data handling is needed.

In [34]:
# Count total outliers per column

outlier_summary = {
    col: thermostat_df[f'{col}_outlier'].sum()
    for col in numeric_cols_for_outliers
}
pd.Series(outlier_summary).sort_values(ascending=False)

cool_stage_1_sec            19522
heat_stage_1_sec            13401
heat_set_temp_f              3496
fan_sec                      1570
current_temp_f                721
cool_set_temp_f               581
humidifier_sec                490
thermostat_temperature_f      257
current_humidity_rh             1
thermostat_humidity_rh          1
humidity_set_point_rh           0
dtype: int64

### Filtering Rows with Outliers – and Why `.copy()` Matters

- To analyze thermostat readings flagged as outliers, we filter the dataset to extract only those rows where one or more outlier conditions are `True`.

- Why .copy() is used:
When filtering a DataFrame like df[condition], pandas may return a view (not a full copy). This can lead to unexpected behavior or SettingWithCopyWarning when we try to modify the filtered result.

By appending .copy():

Ensure that the new DataFrame is an independent object

It is safe to manipulate (e.g., assign new columns, drop rows)

Avoid ambiguous behavior during further analysis

Rule of Thumb:

If planning to modify the filtered result — always use .copy() to ensure safety and avoid warnings.

In [35]:
outlier_cols = [col for col in thermostat_df.columns if col.endswith('_outlier')]
thermostat_outliers_df = thermostat_df[thermostat_df[outlier_cols].any(axis=1)].copy()
thermostat_outliers_df.sort_values('timestamp').head(10)

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,...,current_humidity_rh_outlier,cool_set_temp_f_outlier,heat_set_temp_f_outlier,humidity_set_point_rh_outlier,thermostat_temperature_f_outlier,thermostat_humidity_rh_outlier,cool_stage_1_sec_outlier,heat_stage_1_sec_outlier,fan_sec_outlier,humidifier_sec_outlier
242,2024-05-01 20:10:00,cool,compressorcooloff,home,74.7,70.2,75.1,0.0,58.0,75.6,...,False,False,False,False,False,False,True,False,False,False
243,2024-05-01 20:15:00,cool,compressorcoolstage1on,home,73.0,70.2,75.1,0.0,58.0,75.6,...,False,False,False,False,False,False,True,False,False,False
244,2024-05-01 20:20:00,cool,compressorcoolstage1on,home,73.0,70.2,74.9,0.0,57.0,75.6,...,False,False,False,False,False,False,True,False,False,False
245,2024-05-01 20:25:00,cool,compressorcoolstage1on,home,73.0,70.2,74.7,0.0,56.0,75.6,...,False,False,False,False,False,False,True,False,False,False
246,2024-05-01 20:30:00,cool,compressorcoolstage1on,home,73.0,70.2,74.7,0.0,55.0,73.5,...,False,False,False,False,False,False,True,False,False,False
247,2024-05-01 20:35:00,cool,compressorcoolstage1on,home,73.0,70.2,74.5,0.0,55.0,73.5,...,False,False,False,False,False,False,True,False,False,False
248,2024-05-01 20:40:00,cool,compressorcoolstage1on,home,73.0,70.2,74.5,0.0,54.0,73.5,...,False,False,False,False,False,False,True,False,False,False
249,2024-05-01 20:45:00,cool,compressorcoolstage1on,home,73.0,70.2,74.2,0.0,54.0,73.5,...,False,False,False,False,False,False,True,False,False,False
250,2024-05-01 20:50:00,cool,compressorcoolstage1on,home,73.0,70.2,74.0,0.0,54.0,73.5,...,False,False,False,False,False,False,True,False,False,False
251,2024-05-01 20:55:00,cool,compressorcoolstage1on,home,73.0,70.2,73.9,0.0,54.0,73.5,...,False,False,False,False,False,False,True,False,False,False


### Convert Category Dtype

In [36]:
thermostat_categorical_cols = [
    'system_setting', 'system_mode', 'program_mode'
]
for col in thermostat_categorical_cols:
    thermostat_df[col] = thermostat_df[col].astype('category')

In [37]:
# reset index
thermostat_df.reset_index(drop=True, inplace=True)

In [38]:
# Save cleaned version
thermostat_df.to_csv(cleaned_path, index=False, float_format="%.2f")
print(f" Cleaned thermostat data saved to: {cleaned_path}")

 Cleaned thermostat data saved to: ../data/processed/thermostat_data_cleaned.csv
