Clean the raw weather data

In [6]:
import pandas as pd
import os
import sys
sys.path.append('../scripts') #Add scripts folder to path

from utilities import (
    profile_dataframe_overview,
    summary_stats,
    value_counts_all_categoricals,
    check_constant_columns,
    check_empty_columns
)

This notebook cleans the raw weather data that was previously saved as a CSV, so that it is ready for further data processing. The cleaning steps are:

- Converting the `time` column into a proper `timestamp`
- Dropping the original `time` column
- Removing duplicate or fully empty rows
- Reordering columns for consistency

The cleaned weather data is saved in `data/processed/weather_data_cleaned.csv` and will be ready for merging with thermostat data.

In [None]:
# Where the raw export lives and where the cleaned copy will be written
raw_path = "../data/raw/weather_data_hourly.csv"
processed_dir = "../data/processed"
cleaned_path = os.path.join(processed_dir, "weather_data_cleaned.csv")

# Ensure the output folder exists up front (no error if it is already there)
os.makedirs(processed_dir, exist_ok=True)

# Read the raw hourly weather CSV into a DataFrame
df = pd.read_csv(raw_path)

Current Working Directory: /Volumes/Seagate/CodeU/Python/Data_analytics/thermostat_energy_analysis/notebooks
                   time  temperature_2m  relative_humidity_2m  windspeed_10m
0      2024-05-01T00:00            14.7                    75            6.1
1      2024-05-01T01:00            13.5                    80            5.5
2      2024-05-01T02:00            12.8                    83            4.8
3      2024-05-01T03:00            12.3                    86            7.4
4      2024-05-01T04:00            12.3                    88            9.3
...                 ...             ...                   ...            ...
10219  2025-06-30T19:00            26.9                    78           11.9
10220  2025-06-30T20:00            26.4                    82            7.2
10221  2025-06-30T21:00            25.1                    91            4.5
10222  2025-06-30T22:00            24.8                    90            5.9
10223  2025-06-30T23:00            24.3     

### profile_dataframe_overview()
This function provides a structured overview of any DataFrame. It displays:

- The number of rows and columns
- Column names and data types
- DataFrame info summary (including memory usage)
- Missing values per column (as percentages)
- Unique values per column
- Sample preview of the first and last N rows

In [7]:
# Run the profiling helper imported from utilities on the weather DataFrame
# to inspect its structure before the cleaning steps below
profile_dataframe_overview(df)


 Basic Structure
----------------------------------------
Rows: 10224, Columns: 4
Column Names: ['time', 'temperature_2m', 'relative_humidity_2m', 'windspeed_10m']

 Data Types
----------------------------------------
time                     object
temperature_2m          float64
relative_humidity_2m      int64
windspeed_10m           float64
dtype: object

 DataFrame Summary Info
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10224 entries, 0 to 10223
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   time                  10224 non-null  object 
 1   temperature_2m        10224 non-null  float64
 2   relative_humidity_2m  10224 non-null  int64  
 3   windspeed_10m         10224 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 319.6+ KB

 Missing Data Overview (%)
----------------------------------------
No missing values.

 Uniqu

In [None]:
# Basic cleaning: replace the raw 'time' column with a parsed datetime 'timestamp' column.
# The column checks keep this cell safe to re-run once 'time' has already been replaced.
if 'time' in df.columns:
    # Build the parsed column and drop the source column in one chain, rebinding
    # `df` rather than mutating the frame in place
    df = df.assign(timestamp=pd.to_datetime(df['time'])).drop(columns=['time'])
elif 'timestamp' not in df.columns:
    # Neither column exists: stop here with a clear message instead of letting
    # a later cell fail on the missing 'timestamp' column
    raise KeyError("Expected a 'time' or 'timestamp' column in the weather data")

In [None]:

# Put 'timestamp' in the leading position; all other columns keep their relative order
cols = ['timestamp', *df.columns[df.columns != 'timestamp']]
df = df.loc[:, cols]

In [None]:
# Remove exact duplicate rows first, then any row in which every value is NaN
df = df.drop_duplicates().dropna(how='all')

In [None]:

# Save the cleaned version to cleaned_path; index=False leaves the row index out of the CSV
df.to_csv(cleaned_path, index=False)
# Report the destination so the output location is visible in the notebook
print(f" Cleaned weather data saved to: {cleaned_path}")