In [86]:
import pandas as pd
import numpy as np
import os
import sqlite3
import sys
sys.path.append('../scripts') #Add scripts folder to path

from utilities import (
    profile_dataframe_overview,
    summary_stats,
    check_constant_columns,
    check_empty_columns
)

### Cleaning raw thermostat data previously saved as a CSV for data processing. 

- Converting the `date` and `time` column into a proper `timestamp`
- Renaming column names for better readability and to be consistent with weather data
- Dropping the original `date` and `time` column
- Removing duplicate or fully empty rows on need basis 
- Fill or zero-out runtime columns
- Reordering columns for consistency
- Detecting Outliers and tagging them for hadling and visualizing later

The cleaned weather data is saved in `data/processed/thermostat_data_cleaned.csv` and will be ready for merging with weather data.

In [47]:
# Define input/output paths. Create directory if doesn't exists already

raw_path = "../data/raw/thermostat_combined.csv"
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)
cleaned_path = os.path.join(processed_dir, "thermostat_data_cleaned_final.csv")


In [46]:
# Load raw thermostat data . This dataset contains thermostat runtime parameters such as timestamp, indoor temperature, humidity, setpoint temperature, HVAC mode, HVAC Runtime.

thermostat_df_cleaned = pd.read_csv(raw_path)
#print("Current Working Directory:", os.getcwd())
#print(thermostat_df_cleaned)

### BASIC OVERVIEW
### profile_dataframe_overview() 
This function provides a structured overview of any DataFrame. It displays:

- The number of rows and columns
- Column names and data types
- DataFrame info summary (including memory usage)
- Missing values per column (as percentages)
- Unique values per column
- Sample preview of the first and last N rows

In [45]:
profile_dataframe_overview(thermostat_df_cleaned)


 Basic Structure
----------------------------------------
Rows: 122688, Columns: 21
Column Names: ['Date', 'Time', 'System Setting', 'System Mode', 'Calendar Event', 'Program Mode', 'Cool Set Temp (F)', 'Heat Set Temp (F)', 'Current Temp (F)', 'Humidity Set Point (%RH)', 'Current Humidity (%RH)', 'Outdoor Temp (F)', 'Wind Speed (km/h)', 'Cool Stage 1 (sec)', 'Heat Stage 1 (sec)', 'Fan (sec)', 'Humidifier (sec)', 'DM Offset', 'Thermostat Temperature (F)', 'Thermostat Humidity (%RH)', 'Thermostat Motion']

 Data Types
----------------------------------------
Date                           object
Time                           object
System Setting                 object
System Mode                    object
Calendar Event                 object
Program Mode                   object
Cool Set Temp (F)             float64
Heat Set Temp (F)             float64
Current Temp (F)              float64
Humidity Set Point (%RH)      float64
Current Humidity (%RH)        float64
Outdoor Temp (F)  

### DISPLAY STATS SUMMARY
### summary_stats()

This function returns standard summary statistics for all numeric columns in the DataFrame:

- count, mean, std
- min, 25th percentile, median (50%), 75th percentile, max

In [48]:
summary_statistics = summary_stats(thermostat_df_cleaned)
print(summary_statistics)


 Summary Statistics:
                               count    mean    std   min    25%    50%  \
Cool Set Temp (F)           122470.0   73.15   2.20  68.0   72.0   73.0   
Heat Set Temp (F)           122470.0   70.67   2.18  67.5   69.5   70.2   
Current Temp (F)            122470.0   71.45   2.37  62.1   69.7   71.7   
Humidity Set Point (%RH)    122470.0   17.95  18.00   0.0    0.0    0.0   
Current Humidity (%RH)      122470.0   52.26   7.70  27.0   46.0   54.0   
Outdoor Temp (F)            122550.0   56.84  19.19  -3.8   42.8   60.7   
Wind Speed (km/h)           122550.0    0.00   0.00   0.0    0.0    0.0   
Cool Stage 1 (sec)          122470.0   40.33  98.44   0.0    0.0    0.0   
Heat Stage 1 (sec)          122470.0   20.03  65.53   0.0    0.0    0.0   
Fan (sec)                   122470.0  278.18  32.59   0.0  255.0  300.0   
Humidifier (sec)            122470.0    0.75  13.49   0.0    0.0    0.0   
DM Offset                    47534.0    0.15   0.62  -2.6   -0.2    0.2   
The

### check_constant_columns

This function identifies columns where all rows have the same value (e.g., a column that always says "My Ecobee" or 0 values).

Such columns are usually not informative and can be dropped to simplify the dataset.

In [49]:
const_columns = check_constant_columns(thermostat_df_cleaned)
#print("\n Constant Columns:", const_columns)


 Constant Columns: ['Wind Speed (km/h)']


### check_empty_columns

This function returns a list of columns that are completely empty (i.e., 100% null values).

These columns typically be removed unless or populate them later.

In [50]:
empty_columns = check_empty_columns(thermostat_df_cleaned)
#print("\n Empty Columns:", empty_columns)


 Empty Columns: []


### Standardize Column Names

- Clean up messy column names

In [59]:
thermostat_df_cleaned.columns = thermostat_df_cleaned.columns.str.strip().str.lower().str.replace(' ', '_')

thermostat_df_cleaned.rename(columns={
    'cool_set_temp_(f)': 'cool_set_temp_f',
    'heat_set_temp_(f)': 'heat_set_temp_f',
    'current_temp_(f)': 'current_temp_f',
    'humidity_set_point_(%rh)': 'humidity_set_point_rh',
    'current_humidity_(%rh)': 'current_humidity_rh',
    'outdoor_temp_(f)': 'outdoor_temp_f',
    'wind_speed_(km/h)': 'wind_speed_kmh',
    'cool_stage_1_(sec)': 'cool_stage_1_sec',
    'heat_stage_1_(sec)': 'heat_stage_1_sec',
    'fan_(sec)': 'fan_sec',
    'humidifier_(sec)': 'humidifier_sec',
    'thermostat_temperature_(f)': 'thermostat_temperature_f',
    'thermostat_humidity_(%rh)': 'thermostat_humidity_rh'
}, inplace=True)

print(thermostat_df_cleaned.columns)

Index(['date', 'time', 'system_setting', 'system_mode', 'calendar_event',
       'program_mode', 'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion'],
      dtype='object')


### Convert date and Time to Timestamp Format
The thermostat dataset includes separate `date` and `time` columns. For time-based analysis and merging with other data (e.g., weather), it is essential to combine them into a single `timestamp` column and convert it to proper datetime format.

The `date` and `time` column is combined and then converted to `datetime` format and renamed as `timestamp` for compatibility with the weather dataset.

If datatype is not String, convert `date` and `time` to String using .astype(str)

To handle invalid date values, use errors=`coerce` argument, to replace invalid dates to NaT (Not a Time)

In order to avoid warning, slow and less consitent parsing, explicitly specify datetime format.

format = `%m/%d/%y %H:%M:%S`

In [60]:
# Combine 'date' and 'time' into 'timestamp' only if both columns are present

if 'date' in thermostat_df_cleaned.columns and 'time' in thermostat_df_cleaned.columns:
    thermostat_df_cleaned['timestamp'] = pd.to_datetime(
        thermostat_df_cleaned['date'] + ' ' + thermostat_df_cleaned['time'], format='%m/%d/%y %H:%M:%S', errors='coerce'
        )
    # Drop original 'date' and 'time' columns safely
    thermostat_df_cleaned.drop(columns=['date', 'time'], inplace=True)
else:
  print("Column 'date' and 'time' not found. Skipping 'datetime' conversion.")

print(thermostat_df_cleaned.columns)

Index(['system_setting', 'system_mode', 'calendar_event', 'program_mode',
       'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion', 'timestamp'],
      dtype='object')


### Reorder Columns
Rearrange the columns to place `timestamp` as the first column (if needed), followed by the HVAC runtime variables. This improves readability and aligns with the weather dataset.

In [61]:
# Reorder columns - timestamp first
cols = ['timestamp'] + [col for col in thermostat_df_cleaned.columns if col != 'timestamp']
thermostat_df_cleaned = thermostat_df_cleaned[cols]


In [62]:

print(thermostat_df_cleaned.columns)
print(thermostat_df_cleaned.dtypes)

Index(['timestamp', 'system_setting', 'system_mode', 'calendar_event',
       'program_mode', 'cool_set_temp_f', 'heat_set_temp_f', 'current_temp_f',
       'humidity_set_point_rh', 'current_humidity_rh', 'outdoor_temp_f',
       'wind_speed_kmh', 'cool_stage_1_sec', 'heat_stage_1_sec', 'fan_sec',
       'humidifier_sec', 'dm_offset', 'thermostat_temperature_f',
       'thermostat_humidity_rh', 'thermostat_motion'],
      dtype='object')
timestamp                   datetime64[ns]
system_setting                      object
system_mode                         object
calendar_event                      object
program_mode                        object
cool_set_temp_f                    float64
heat_set_temp_f                    float64
current_temp_f                     float64
humidity_set_point_rh              float64
current_humidity_rh                float64
outdoor_temp_f                     float64
wind_speed_kmh                     float64
cool_stage_1_sec                   float64

### Clean Categorical Columns (All lower case)

To improve consistency and make filtering, grouping and plotting easier

In [63]:
cat_cols = ['system_mode', 'calendar_event', 'program_mode', 'system_setting']

for col in cat_cols:
    if col in thermostat_df_cleaned.columns:
        thermostat_df_cleaned[col] = thermostat_df_cleaned[col].astype(str).str.strip().str.lower()

thermostat_df_cleaned

Unnamed: 0,timestamp,system_setting,system_mode,calendar_event,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,dm_offset,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
0,2024-05-01 00:00:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,,73.0,57.0,0.0
1,2024-05-01 00:05:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,,73.0,57.0,0.0
2,2024-05-01 00:10:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,270.0,0.0,,73.0,57.0,0.0
3,2024-05-01 00:15:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,,73.0,57.0,0.0
4,2024-05-01 00:20:00,cool,compressorcooloff,,sleep,74.5,69.5,72.9,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,,72.9,57.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,,74.5,61.0,0.0
122684,2025-06-30 23:40:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,300.0,0.0,,74.6,61.0,0.0
122685,2025-06-30 23:45:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,270.0,0.0,,74.6,61.0,0.0
122686,2025-06-30 23:50:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,,74.5,61.0,0.0


### Missing Data Analysis & Dropping Non-Essential Colunms

- Before dropping, I created copy of entire dataframe

####  1. `calendar_event`
- Represents calendar-based scheduling events such as "vacation" or "smartAway".
- Not required for my core analysis, which focuses on runtime patterns, setpoint behavior, and weather correlation.
- Has ~20.75% missing data.
- Not aligned with my current objectives,and hence dropping.

#### 2. `dm_offset`
- Stands for "Device Management Offset" — an internal correction applied to sensor readings.
- Not user-controlled or visible in the interface.
- Has ~61.26% missing data and no analytical value for runtime, weather, or setpoint correlation.
- Dropped due to irrelevance to all core features and high missing percentage.

#### 3. remove rows where only `timestamp` has value and rest are nan

In [64]:
# Create a backup copy
thermostat_df_full = thermostat_df_cleaned.copy()

In [65]:
# Drop non-essential columns from main DataFrame
thermostat_df_cleaned.drop(columns=['calendar_event', 'dm_offset'], inplace=True)

In [None]:
#thermostat_df_full

Unnamed: 0,timestamp,system_setting,system_mode,calendar_event,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,dm_offset,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
0,2024-05-01 00:00:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,,73.0,57.0,0.0
1,2024-05-01 00:05:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,,73.0,57.0,0.0
2,2024-05-01 00:10:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,270.0,0.0,,73.0,57.0,0.0
3,2024-05-01 00:15:00,cool,compressorcooloff,,sleep,74.5,69.5,73.0,0.0,57.0,59.9,0.0,0.0,0.0,300.0,0.0,,73.0,57.0,0.0
4,2024-05-01 00:20:00,cool,compressorcooloff,,sleep,74.5,69.5,72.9,0.0,57.0,59.9,0.0,0.0,0.0,255.0,0.0,,72.9,57.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122683,2025-06-30 23:35:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,,74.5,61.0,0.0
122684,2025-06-30 23:40:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,300.0,0.0,,74.6,61.0,0.0
122685,2025-06-30 23:45:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.6,36.0,61.0,75.0,0.0,0.0,0.0,270.0,0.0,,74.6,61.0,0.0
122686,2025-06-30 23:50:00,cool,compressorcooloff,auto,sleep,75.0,73.0,74.5,36.0,61.0,75.0,0.0,0.0,0.0,255.0,0.0,,74.5,61.0,0.0


In [72]:
#missing_pct = thermostat_df_cleaned.isnull().mean() * 100
#print(missing_pct)

In [77]:
# Display rows where only timestamp has avalue

# Ensure timestamp is not null
non_null_timestamp = thermostat_df_cleaned['timestamp'].notnull()
#print(non_null_timestamp)

# Check if all other columns are null or blank
other_cols_blank = thermostat_df_cleaned.drop(columns=['timestamp']).isna().all(axis=1)
#print(other_cols_blank)

# Filter rows where only timestamp has a value
only_timestamp_rows = thermostat_df_cleaned[non_null_timestamp & other_cols_blank]

# Display the result
only_timestamp_rows

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion


In [None]:
#Drop rows where all columns except 'timestamp' are blank

# thermostat_df = thermostat_df_cleaned[~thermostat_df_cleaned.drop(columns=['timestamp']).isnull().all(axis=1)]


### Remove Rows with Only Weather Data

During initial check, a small number of rows were found to contain values **only** in the `timestamp`, `outdoor_temp_f`, and `wind_speed_kmh` columns, while all other thermostat-related fields were missing.

These rows likely represent partial or incorrectly logged data and could interfere with forward-filling or time-based analysis.

To maintain data integrity:
- Identified 218 such rows using a column-wise null check.
- Then dropped them from the dataset before applying any forward-filling or missing value imputation.

This ensures that only valid thermostat readings are retained in the cleaned dataset.

In [114]:
thermostat_df_cleaned.loc[503, 'cool_set_temp_f']
thermostat_df_cleaned.loc[503, 'system_setting']


'nan'

In [102]:
thermostat_df_cleaned.isna().sum()

timestamp                     0
system_setting                0
system_mode                   0
program_mode                  0
cool_set_temp_f             218
heat_set_temp_f             218
current_temp_f              218
humidity_set_point_rh       218
current_humidity_rh         218
outdoor_temp_f              138
wind_speed_kmh              138
cool_stage_1_sec            218
heat_stage_1_sec            218
fan_sec                     218
humidifier_sec              218
thermostat_temperature_f    218
thermostat_humidity_rh      218
thermostat_motion           218
dtype: int64

In [None]:
# Define thermostat-related columns (excluding timestamp, outdoor_temp_f, wind_speed_kmh,'program_mode','system_mode', 'system_setting')
thermo_cols = thermostat_df_cleaned.columns.difference(['timestamp', 'outdoor_temp_f', 'wind_speed_kmh', 'program_mode',
       'system_mode', 'system_setting'])
print(thermo_cols)

Index(['cool_set_temp_f', 'cool_stage_1_sec', 'current_humidity_rh',
       'current_temp_f', 'fan_sec', 'heat_set_temp_f', 'heat_stage_1_sec',
       'humidifier_sec', 'humidity_set_point_rh', 'thermostat_humidity_rh',
       'thermostat_motion', 'thermostat_temperature_f'],
      dtype='object')


In [115]:
# Create a mask for rows where all key thermostat-field related columns are NaN

rows_to_drop_mask = thermostat_df_cleaned[thermo_cols].isna().all(axis=1)
#rows_to_drop_mask


In [113]:
# Confirm count
print("Rows to drop:", rows_to_drop_mask.sum())

Rows to drop: 218


### Log or Save Dropped Rows Before Deletion

- Just for safer side, Im making a copy of rows-to_drop_mask rows into a .csv file

In [116]:
dropped_rows = thermostat_df_cleaned[rows_to_drop_mask]
dropped_rows.to_csv("../data/raw/dropped_thermostat_rows.csv", index=False)

In [117]:
# Final step is to drop the 218 rows
thermostat_df_cleaned = thermostat_df_cleaned[~rows_to_drop_mask]

In [None]:
# Checking to see whether 218 rows exists

thermostat_df_cleaned.isna().sum()

timestamp                     0
system_setting                0
system_mode                   0
program_mode                  0
cool_set_temp_f               0
heat_set_temp_f               0
current_temp_f                0
humidity_set_point_rh         0
current_humidity_rh           0
outdoor_temp_f              138
wind_speed_kmh              138
cool_stage_1_sec              0
heat_stage_1_sec              0
fan_sec                       0
humidifier_sec                0
thermostat_temperature_f      0
thermostat_humidity_rh        0
thermostat_motion             0
dtype: int64

### Making a copy before filling

In [125]:
# making a copy before filling
thermostat_df_before_fill = thermostat_df_cleaned.copy()
#thermostat_df_before_fill

###  Handling Missing Values in Thermostat Data

After removing rows with only timestamp values and dropping non-essential columns, a small percentage of missing data remains in several sensor and runtime columns (approximately 0.18%).

Since the data is time series and logged at 5-minute intervals, forward-filling (`ffill`) is an appropriate strategy to maintain continuity. This method fills missing values with the most recent valid entry, which is especially useful for sensor readings and environmental metrics.

Before applying forward-fill, sort  data chronologically by `timestamp`.


In [131]:
# Sort data by timestamp for chronological order
thermostat_df_cleaned = thermostat_df_cleaned.copy()
thermostat_df_cleaned.sort_values('timestamp', inplace=True)
#thermostat_df_cleaned

In [129]:
# Forward-fill missing values in all columns
#thermostat_df.fillna(method='ffill', inplace=True)
thermostat_df_cleaned.ffill(inplace=True)

In [132]:
missing_pct1 = thermostat_df_cleaned.isnull().mean() * 100
print(missing_pct1)

timestamp                   0.0
system_setting              0.0
system_mode                 0.0
program_mode                0.0
cool_set_temp_f             0.0
heat_set_temp_f             0.0
current_temp_f              0.0
humidity_set_point_rh       0.0
current_humidity_rh         0.0
outdoor_temp_f              0.0
wind_speed_kmh              0.0
cool_stage_1_sec            0.0
heat_stage_1_sec            0.0
fan_sec                     0.0
humidifier_sec              0.0
thermostat_temperature_f    0.0
thermostat_humidity_rh      0.0
thermostat_motion           0.0
dtype: float64


In [133]:
# check if there are any missing (NaN) values in the entire DataFrame.

thermostat_df_cleaned.isnull().values.any()

np.False_

In [134]:
# Missing per column
thermostat_df_cleaned.isnull().sum() 

timestamp                   0
system_setting              0
system_mode                 0
program_mode                0
cool_set_temp_f             0
heat_set_temp_f             0
current_temp_f              0
humidity_set_point_rh       0
current_humidity_rh         0
outdoor_temp_f              0
wind_speed_kmh              0
cool_stage_1_sec            0
heat_stage_1_sec            0
fan_sec                     0
humidifier_sec              0
thermostat_temperature_f    0
thermostat_humidity_rh      0
thermostat_motion           0
dtype: int64

In [135]:
# Rows with at least one missing value
thermostat_df_cleaned[thermostat_df_cleaned.isnull().any(axis=1)]

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion


In [136]:
# count the number of duplicate rows

thermostat_df_cleaned.duplicated().sum()

np.int64(0)

In [138]:
# Check duplicates based on timestamp only

thermostat_df_cleaned.duplicated(subset=['timestamp']).sum()

np.int64(12)

In [141]:
# Finding duplicate timestamps
duplicate_ts = thermostat_df_cleaned[thermostat_df_cleaned.duplicated(subset=['timestamp'], keep=False)]
duplicate_ts.sort_values('timestamp')

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,humidity_set_point_rh,current_humidity_rh,outdoor_temp_f,wind_speed_kmh,cool_stage_1_sec,heat_stage_1_sec,fan_sec,humidifier_sec,thermostat_temperature_f,thermostat_humidity_rh,thermostat_motion
53580,2024-11-03 01:00:00,cool,compressorcooloff,sleep,72.0,72.0,68.6,0.0,57.0,43.5,0.0,0.0,0.0,300.0,0.0,68.6,57.0,0.0
53592,2024-11-03 01:00:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,300.0,0.0,68.4,57.0,0.0
53581,2024-11-03 01:05:00,cool,compressorcooloff,sleep,72.0,72.0,68.6,0.0,57.0,43.5,0.0,0.0,0.0,270.0,0.0,68.6,57.0,0.0
53593,2024-11-03 01:05:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,270.0,0.0,68.4,57.0,0.0
53594,2024-11-03 01:10:00,cool,compressorcooloff,sleep,72.0,72.0,68.4,0.0,57.0,42.2,0.0,0.0,0.0,255.0,0.0,68.4,57.0,0.0
53582,2024-11-03 01:10:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,255.0,0.0,68.7,57.0,0.0
53583,2024-11-03 01:15:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,300.0,0.0,68.7,57.0,0.0
53595,2024-11-03 01:15:00,cool,compressorcooloff,sleep,72.0,72.0,68.3,0.0,57.0,42.2,0.0,0.0,0.0,300.0,0.0,68.3,57.0,0.0
53584,2024-11-03 01:20:00,cool,compressorcooloff,sleep,72.0,72.0,68.7,0.0,57.0,43.5,0.0,0.0,0.0,270.0,0.0,68.7,57.0,0.0
53596,2024-11-03 01:20:00,cool,compressorcooloff,sleep,72.0,72.0,68.3,0.0,57.0,42.2,0.0,0.0,0.0,270.0,0.0,68.3,57.0,0.0


### Handling Duplicate Timestamps in Data

- While checking for full_row duplicates, the code returned no duplicate values. But when checked based on `timestamps`, some timestamps are duplicated.
- 12 timestamps are repeated twice and only `current_temp_f` and `thermostat_temperature_f` column values differ slightly between rows logged at the same timestamp.
- so for merging with weather data and building time-based visualization, in order to avoid misalign and duplicate charts, I decided to remove the duplicate row
- before doing , created a backup copy of full dataset 

In [140]:
# creating a backup copy
thermostat_df_with_dupes = thermostat_df_cleaned.copy()

In [142]:
# Remove duplicate timestamps and retain first occurarence

thermostat_df = thermostat_df_cleaned.drop_duplicates(subset='timestamp', keep='first').copy()

In [39]:
# Check duplicates based on timestamp only

thermostat_df.duplicated(subset=['timestamp']).sum()

np.int64(0)

In [144]:
# Finding duplicate timestamps
# duplicate_ts = thermostat_df[thermostat_df.duplicated(subset=['timestamp'], keep=False)]
# duplicate_ts.sort_values('timestamp')

### Outlier Check (Summary Only)

Outlier detection was performed in the earlier version of this notebook (`clean_thermostat_data.ipynb`) using rule-based thresholds for temperature, humidity, and runtime columns.

However, for this final version:
- After careful analysis, I found out that I'm not going to use outliers for filtering, visualization or analysis. So removed the logic for detecting and tagging the `outlier`.
- We assume the cleaned dataset is reliable and all data points are retained unless explicitly invalid.
- Unused or redundant columns have been dropped in the final cleanup.

### Convert Categorical Columns to `category` Dtype

To optimize memory usage and improve processing efficiency, categorical string columns are explicitly converted to the `category` data type.

This is particularly useful for columns with repeated values such as:

- `system_setting`
- `system_mode`
- `program_mode`

Converting these to the `category` dtype:
- Reduces memory footprint
- Enables faster comparisons and groupby operations
- Helps pandas treat them as discrete labels rather than raw strings

This step is especially important before merging, analysis, or exporting the data to databases.

In [145]:
thermostat_categorical_cols = [
    'system_setting', 'system_mode', 'program_mode'
]
for col in thermostat_categorical_cols:
    thermostat_df[col] = thermostat_df[col].astype('category')