In [10]:
import pandas as pd
import sqlite3
import os


In [None]:
# Connect to the SQLite database, failing fast if the file is missing.
# sqlite3.connect() would otherwise create a new, empty database at this path,
# so the existence check has to come first.

db_path = "../database/thermostat_analysis.db"

if not os.path.exists(db_path):
    # Raise instead of printing so that "Run All" stops here with a clear
    # message rather than in a later cell with a NameError on `conn`.
    raise FileNotFoundError(f"DB file not found. Please check the path: {db_path}")

conn = sqlite3.connect(db_path)
print("Connected to DB successfully.")

Connected to DB successfully.


In [None]:
# Sanity-check the connection by listing the tables defined in the database

cursor = conn.cursor()
tables = cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table';"
).fetchall()
print("Tables:", tables)

Tables: [('weather_data',), ('weather_data_final',), ('thermostat_data_final',)]


In [7]:
# SQL query: join thermostat readings (alias t) with weather readings (alias w)
# on identical timestamp values, ordered by the thermostat timestamp.
# Both tables expose an outdoor_temp_f column, so the weather one is aliased to
# weather_outdoor_temp_f to keep the two apart in the result.

query = """
SELECT 
    t.timestamp,
    t.system_setting,
    t.system_mode,
    t.program_mode,
    t.cool_set_temp_f,
    t.heat_set_temp_f,
    t.current_temp_f,
    t.current_humidity_rh,
    t.outdoor_temp_f,             
    t.fan_sec,
    t.fan_runtime_category,
    t.date,
    t.hour,
    t.minute,
    t.weekday,
    t.month,
    w.outdoor_temp_f AS weather_outdoor_temp_f,  
    w.outdoor_humidity
FROM 
    thermostat_data_final t
JOIN 
    weather_data_final w
ON 
    t.timestamp = w.timestamp
ORDER BY 
    t.timestamp;
"""

In [11]:
# Run the JOIN query on the open connection and load the result into a DataFrame

merged_df = pd.read_sql_query(sql=query, con=conn)

In [12]:
# Release the SQLite connection; no later cell in this section queries the DB

conn.close()

In [13]:
# Show the first five rows of the merged frame (head() defaults to n=5)

merged_df.head()

Unnamed: 0,timestamp,system_setting,system_mode,program_mode,cool_set_temp_f,heat_set_temp_f,current_temp_f,current_humidity_rh,outdoor_temp_f,fan_sec,fan_runtime_category,date,hour,minute,weekday,month,weather_outdoor_temp_f,outdoor_humidity
0,2024-05-01 00:00:00,cool,compressorcooloff,sleep,74.5,69.5,73.0,57.0,59.9,300.0,Medium,2024-05-01,0,0,Wednesday,May,58.46,75
1,2024-05-01 01:00:00,cool,compressorcooloff,sleep,74.5,69.5,72.6,57.0,58.1,300.0,Medium,2024-05-01,1,0,Wednesday,May,56.3,80
2,2024-05-01 02:00:00,cool,compressorcooloff,sleep,74.5,69.5,72.2,57.0,55.6,300.0,Medium,2024-05-01,2,0,Wednesday,May,55.04,83
3,2024-05-01 03:00:00,cool,compressorcooloff,sleep,74.5,69.5,71.8,57.0,53.9,300.0,Medium,2024-05-01,3,0,Wednesday,May,54.14,86
4,2024-05-01 04:00:00,cool,compressorcooloff,sleep,74.5,69.5,71.3,57.0,53.3,300.0,Medium,2024-05-01,4,0,Wednesday,May,54.14,88


In [15]:
# Save the merged DataFrame to CSV (index column omitted).
# The path is defined once so the write and the confirmation message cannot
# drift apart, and the target folder is created if it does not exist yet,
# because to_csv does not create missing directories.

output_path = "../data/processed/merged_thermostat_weather_data.csv"

os.makedirs(os.path.dirname(output_path), exist_ok=True)
merged_df.to_csv(output_path, index=False)

print(f"Merged data saved to: {output_path}")

Merged data saved to: ../data/processed/merged_thermostat_weather_data.csv


In [16]:
# Inspect the current dtype of the timestamp column (is it datetime yet?)

merged_df.dtypes["timestamp"]

dtype('O')

In [17]:
# The column currently holds strings (object dtype); parse it into datetime so
# that line plots and time-based filters operate on real timestamps

timestamp_text = merged_df["timestamp"]
merged_df["timestamp"] = pd.to_datetime(timestamp_text)

In [18]:
# Re-check the timestamp column's dtype after the conversion
merged_df.dtypes["timestamp"]

dtype('<M8[ns]')