In [1]:
import pandas as pd
import os
import sqlite3
import sys
sys.path.append('../scripts') #Add scripts folder to path

from utilities import (
    profile_dataframe_overview,
    summary_stats,
    check_constant_columns,
    check_empty_columns
)

### Cleaning the raw weather data previously saved as a CSV for data processing. 

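- Converting the `time` column into a proper `datetime` column named `timestamp`
- Replacing the original `time` column with `timestamp` (it is renamed in place, so no separate `time` column remains)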
- Removing duplicate or fully empty rows
- Reordering columns for consistency

The cleaned weather data is saved in `data/processed/weather_data_cleaned_final.csv` and will be ready for merging with thermostat data.

In [2]:
# Locations of the raw input file and of the cleaned output file
raw_path = "../data/raw/weather_data_hourly.csv"
processed_dir = "../data/processed"
cleaned_path = os.path.join(processed_dir, "weather_data_cleaned_final.csv")

# Create the output folder up front so the final save does not fail on a missing directory
os.makedirs(processed_dir, exist_ok=True)


In [None]:
# Read the raw hourly weather export (external temperature, humidity and wind speed)
# from the path configured above into the working DataFrame.
weather_df_cleaned = pd.read_csv(filepath_or_buffer=raw_path)


### BASIC OVERVIEW
### profile_dataframe_overview() 
This function provides a structured overview of any DataFrame. It displays:

- The number of rows and columns
- Column names and data types
- DataFrame info summary (including memory usage)
- Missing values per column (as percentages)
- Unique values per column
- Sample preview of the first and last N rows

In [6]:
# Print a structured overview of the freshly loaded weather frame
# (shape, column names, dtypes, info summary, missing values; see the output below).
profile_dataframe_overview(weather_df_cleaned)


 Basic Structure
----------------------------------------
Rows: 10224, Columns: 4
Column Names: ['time', 'temperature_2m', 'relative_humidity_2m', 'windspeed_10m']

 Data Types
----------------------------------------
time                     object
temperature_2m          float64
relative_humidity_2m      int64
windspeed_10m           float64
dtype: object

 DataFrame Summary Info
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10224 entries, 0 to 10223
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   time                  10224 non-null  object 
 1   temperature_2m        10224 non-null  float64
 2   relative_humidity_2m  10224 non-null  int64  
 3   windspeed_10m         10224 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 319.6+ KB

 Missing Data Overview (%)
----------------------------------------
No missing values.

 Uniqu

###  DISPLAY STATS SUMMARY
###  summary_stats()

This function returns standard summary statistics for all numeric columns in the DataFrame:

- count, mean, std
- min, 25th percentile, median (50%), 75th percentile, max


In [7]:
# Compute summary statistics for the weather frame and keep the result for reference
summary_statistics = summary_stats(weather_df_cleaned)
# Show the returned statistics table
print(summary_statistics)


 Summary Statistics:
                        count   mean    std   min   25%   50%   75%    max
temperature_2m        10224.0  13.99  10.65 -20.3   6.4  16.3  22.1   35.6
relative_humidity_2m  10224.0  70.88  17.32  19.0  58.0  72.0  85.0  100.0
windspeed_10m         10224.0  11.09   5.66   0.0   7.1  10.1  14.4   55.7


### check_constant_columns

This function identifies columns where all rows have the same value (e.g., a column that always contains "My Ecobee" or is always 0).

Such columns are usually not informative and can be dropped to simplify the dataset.

In [9]:
# Collect the columns whose value never varies across rows.
# NOTE(review): the helper appears to print its findings itself (see the output below) - confirm in utilities.
const_columns = check_constant_columns(weather_df_cleaned)


 Constant Columns: []


### check_empty_columns

This function returns a list of columns that are completely empty (i.e., 100% null values).

These columns should typically be removed unless you plan to populate them later.

In [11]:
# Collect the columns that contain no values at all (100% null).
# NOTE(review): the helper appears to print its findings itself (see the output below) - confirm in utilities.
empty_columns = check_empty_columns(weather_df_cleaned)


 Empty Columns: []


In [13]:
# Report how many exact duplicate rows the loaded data contains.
duplicate_count = weather_df_cleaned.duplicated().sum()
print("Duplicate rows:", duplicate_count)

# Actually remove duplicate rows and rows in which every column is empty, so a
# future raw export that contains them cannot leak into the cleaned file.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
weather_df_cleaned = (
    weather_df_cleaned
    .drop_duplicates()
    .dropna(how="all")
)

Duplicate rows: 0


### Rename Columns for Consistency

To maintain consistency and ease merging with the thermostat dataset later, the columns are renamed using more descriptive and uniform names:

- `time` → `timestamp`
- `temperature_2m` → `outdoor_temp_c` 
- `relative_humidity_2m` → `outdoor_humidity`
- `windspeed_10m` → `wind_speed_kmh`


In [14]:
# Mapping from the raw weather column names to the descriptive names used
# when merging with the thermostat dataset.
column_renames = {
    'time': 'timestamp',
    'temperature_2m': 'outdoor_temp_c',  # still in °C here; converted to °F in a later cell
    'relative_humidity_2m': 'outdoor_humidity',
    'windspeed_10m': 'wind_speed_kmh',
}

# Reassign rather than mutate with inplace=True. Re-running the cell is harmless
# because rename() ignores mapping keys that are no longer present.
weather_df_cleaned = weather_df_cleaned.rename(columns=column_renames)

column_names = weather_df_cleaned.columns.tolist()
print(column_names)

['timestamp', 'outdoor_temp_c', 'outdoor_humidity', 'wind_speed_kmh']


### Convert Time to Timestamp Format
The `timestamp` column (renamed from `time` above) is converted to `datetime` format for compatibility with the thermostat dataset.

In [15]:
# Parse the 'timestamp' column (originally named 'time') into datetime values
parsed_timestamps = pd.to_datetime(weather_df_cleaned['timestamp'])
weather_df_cleaned['timestamp'] = parsed_timestamps

# Show the converted column to confirm the resulting dtype
print(weather_df_cleaned['timestamp'])

0       2024-05-01 00:00:00
1       2024-05-01 01:00:00
2       2024-05-01 02:00:00
3       2024-05-01 03:00:00
4       2024-05-01 04:00:00
                ...        
10219   2025-06-30 19:00:00
10220   2025-06-30 20:00:00
10221   2025-06-30 21:00:00
10222   2025-06-30 22:00:00
10223   2025-06-30 23:00:00
Name: timestamp, Length: 10224, dtype: datetime64[ns]


### Convert Temperature from Celsius to Fahrenheit
Since the thermostat dataset reports temperatures in Fahrenheit, convert the `outdoor_temp_c` column from Celsius to Fahrenheit ($F = C \times 9/5 + 32$, rounded to two decimals) and store the result as `outdoor_temp_f`. Then drop the `outdoor_temp_c` column.

In [19]:
# Convert the outdoor temperature from Celsius to Fahrenheit (F = C * 9/5 + 32),
# rounded to two decimals, and replace the Celsius column with the result.
# The cell is safe to re-run: once the Celsius column is gone, the existing
# Fahrenheit column is left untouched and the message says so explicitly.
if 'outdoor_temp_c' in weather_df_cleaned.columns:
    fahrenheit = (weather_df_cleaned['outdoor_temp_c'] * 9 / 5 + 32).round(2)
    weather_df_cleaned = (
        weather_df_cleaned
        .assign(outdoor_temp_f=fahrenheit)
        .drop(columns=['outdoor_temp_c'])
    )
elif 'outdoor_temp_f' in weather_df_cleaned.columns:
    # Conversion already happened in an earlier run of this cell
    print("Column 'outdoor_temp_f' already present. Temperature conversion already applied.")
else:
    # Neither column exists: the input does not have the expected temperature data
    print("Column 'outdoor_temp_c' not found. Skipping temperature conversion.")

print(weather_df_cleaned)

Column 'outdoor_temp_c' not found. Skipping temperature conversion.
                timestamp  outdoor_humidity  wind_speed_kmh  outdoor_temp_f
0     2024-05-01 00:00:00                75             6.1           58.46
1     2024-05-01 01:00:00                80             5.5           56.30
2     2024-05-01 02:00:00                83             4.8           55.04
3     2024-05-01 03:00:00                86             7.4           54.14
4     2024-05-01 04:00:00                88             9.3           54.14
...                   ...               ...             ...             ...
10219 2025-06-30 19:00:00                78            11.9           80.42
10220 2025-06-30 20:00:00                82             7.2           79.52
10221 2025-06-30 21:00:00                91             4.5           77.18
10222 2025-06-30 22:00:00                90             5.9           76.64
10223 2025-06-30 23:00:00                92             5.9           75.74

[10224 rows x 4 col

### Reorder Columns
Rearrange the columns to place `timestamp` as the first column (if needed), followed by the weather variables. This improves readability and aligns with the thermostat dataset.

In [20]:
# Put 'timestamp' in front; every other column keeps its current relative order
cols = ['timestamp', *(name for name in weather_df_cleaned.columns if name != 'timestamp')]
weather_df_cleaned = weather_df_cleaned[cols]

# Confirm the new column order
print(weather_df_cleaned.columns)

Index(['timestamp', 'outdoor_humidity', 'wind_speed_kmh', 'outdoor_temp_f'], dtype='object')


In [21]:
# Renumber the rows 0..n-1 and discard the previous index.
# Reassigning instead of inplace=True keeps the cell free of hidden in-place mutation.
weather_df_cleaned = weather_df_cleaned.reset_index(drop=True)

In [22]:
# Write the cleaned frame to the processed folder: the index is omitted and
# floating-point values are written with two decimal places.
weather_df_cleaned.to_csv(path_or_buf=cleaned_path, float_format="%.2f", index=False)
print(f" Cleaned weather data saved to: {cleaned_path}")

 Cleaned weather data saved to: ../data/processed/weather_data_cleaned_final.csv
