In [3]:
import pandas as pd
import os
import sqlite3
import sys
# Make the helper modules in ../scripts importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path each time.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

from utilities import (
    profile_dataframe_overview,
    summary_stats,
    check_constant_columns,
    check_empty_columns
)

### Cleaning raw thermostat data previously saved as a CSV for data processing. 

- Converting the `date` and `time` columns into a proper `timestamp`
- Renaming column names for better readability and to be consistent with weather data
- Dropping the original `date` and `time` columns
- Removing duplicate or fully empty rows as needed
- Filling or zeroing out runtime columns
- Reordering columns for consistency
- Detecting outliers and tagging them for handling and visualization later

The cleaned thermostat data is saved in `data/processed/thermostat_data_cleaned.csv` and will be ready for merging with weather data.

In [4]:
# Input/output locations for this notebook. The processed-data directory is
# created if it does not already exist.

processed_dir = "../data/processed"
raw_path = "../data/raw/thermostat_combined.csv"
cleaned_path = os.path.join(processed_dir, "thermostat_data_cleaned.csv")

# exist_ok=True: no error is raised when the directory is already present.
os.makedirs(processed_dir, exist_ok=True)


In [6]:
# Load the raw thermostat data from raw_path into a DataFrame.
# This dataset contains thermostat runtime parameters such as timestamp, indoor
# temperature, humidity, setpoint temperature, HVAC mode, and HVAC runtime.

thermostat_df = pd.read_csv(raw_path)
# Debugging aids, left disabled (uncomment to check the working directory or
# to dump the frame):
#print("Current Working Directory:", os.getcwd())
#print(thermostat_df)

### BASIC OVERVIEW
### profile_dataframe_overview() 
This function provides a structured overview of any DataFrame. It displays:

- The number of rows and columns
- Column names and data types
- DataFrame info summary (including memory usage)
- Missing values per column (as percentages)
- Unique values per column
- Sample preview of the first and last N rows

In [7]:
# Show the structural overview of the thermostat data exactly as loaded,
# before any cleaning step has been applied.
profile_dataframe_overview(thermostat_df)


 Basic Structure
----------------------------------------
Rows: 127872, Columns: 21
Column Names: ['Date', 'Time', 'System Setting', 'System Mode', 'Calendar Event', 'Program Mode', 'Cool Set Temp (F)', 'Heat Set Temp (F)', 'Current Temp (F)', 'Humidity Set Point (%RH)', 'Current Humidity (%RH)', 'Outdoor Temp (F)', 'Wind Speed (km/h)', 'Cool Stage 1 (sec)', 'Heat Stage 1 (sec)', 'Fan (sec)', 'Humidifier (sec)', 'DM Offset', 'Thermostat Temperature (F)', 'Thermostat Humidity (%RH)', 'Thermostat Motion']

 Data Types
----------------------------------------
Date                           object
Time                           object
System Setting                 object
System Mode                    object
Calendar Event                 object
Program Mode                   object
Cool Set Temp (F)             float64
Heat Set Temp (F)             float64
Current Temp (F)              float64
Humidity Set Point (%RH)      float64
Current Humidity (%RH)        float64
Outdoor Temp (F)  

### DISPLAY STATS SUMMARY
### summary_stats()

This function returns standard summary statistics for all numeric columns in the DataFrame:

- count, mean, std
- min, 25th percentile, median (50%), 75th percentile, max

In [8]:
# Compute the summary statistics for the loaded thermostat data and keep the
# result in summary_statistics so it can be reused after being printed.
summary_statistics = summary_stats(thermostat_df)
print(summary_statistics)


 Summary Statistics:
                               count    mean     std   min    25%    50%  \
Cool Set Temp (F)           127654.0   73.20    2.18  68.0   72.0   73.0   
Heat Set Temp (F)           127654.0   70.76    2.19  67.5   69.5   70.2   
Current Temp (F)            127654.0   71.57    2.42  62.1   69.8   71.8   
Humidity Set Point (%RH)    127654.0   18.61   17.99   0.0    0.0   36.0   
Current Humidity (%RH)      127654.0   52.34    7.57  27.0   47.0   54.0   
Outdoor Temp (F)            127734.0   57.66   19.26  -3.8   43.5   61.7   
Wind Speed (km/h)           127734.0    0.00    0.00   0.0    0.0    0.0   
Cool Stage 1 (sec)          127654.0   44.16  102.30   0.0    0.0    0.0   
Heat Stage 1 (sec)          127654.0   19.21   64.30   0.0    0.0    0.0   
Fan (sec)                   127654.0  278.23   32.86   0.0  255.0  300.0   
Humidifier (sec)            127654.0    0.72   13.22   0.0    0.0    0.0   
DM Offset                    49710.0    0.17    0.61  -2.6   -0.2 

### check_constant_columns

This function identifies columns where all rows have the same value (e.g., a column that always says "My Ecobee" or 0 values).

Such columns are usually not informative and can be dropped to simplify the dataset.

In [10]:
# Collect the columns in which every row holds the same value.
const_columns = check_constant_columns(thermostat_df)
# NOTE(review): the captured cell output already lists the constant columns
# while this print is disabled - presumably the helper prints them itself;
# confirm before re-enabling to avoid duplicate output.
#print("\n Constant Columns:", const_columns)


 Constant Columns: ['Wind Speed (km/h)']


### check_empty_columns

This function returns a list of columns that are completely empty (i.e., 100% null values).

These columns can typically be removed unless you plan to populate them later.

In [12]:
# Collect the columns that are completely empty (100% null values).
empty_columns = check_empty_columns(thermostat_df)
# NOTE(review): the captured cell output already lists the empty columns
# while this print is disabled - presumably the helper prints them itself;
# confirm before re-enabling to avoid duplicate output.
#print("\n Empty Columns:", empty_columns)


 Empty Columns: []
