In [None]:

import calendar
from copy import deepcopy
import csv
import datetime
import os

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


import matplotlib
import matplotlib.pyplot as plt

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

from pylab import rcParams

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy import stats

import seaborn as sns

import pytz

# Mount Google Drive at /content/drive/ (Google Colab only). The CSV files
# read later in this notebook are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive/')

!pip install influxdb
import influxdb

# InfluxDB connection settings.
#
# SECURITY: the password used to be written here in plain text. A secret must
# not live in a notebook (it leaks through sharing, exports and version
# control), so it is now taken from the INFLUXDB_PASSWORD environment variable,
# falling back to an interactive prompt when the variable is unset or empty.
# The password that was previously stored in this cell should be rotated.
from getpass import getpass

HOST = 'influx.linklab.virginia.edu'
PORT = 443
USERNAME = 'cps1f23'
PASSWORD = os.environ.get('INFLUXDB_PASSWORD') or getpass('InfluxDB password: ')
DATABASE = 'gateway-generic'

# TLS is enabled and the server certificate is verified.
client = influxdb.InfluxDBClient(HOST, PORT, USERNAME, PASSWORD, DATABASE, ssl=True, verify_ssl=True)

ValueError: mount failed

# Run CO2 clustering and Power classification without data smoothing.

In [None]:
# Folder (on the mounted Google Drive) that holds the per-room CSV files.
# Defined once so the location only has to be edited in a single place.
DATA_DIR = "/content/drive/MyDrive/2024_Fall/Smart and Healthy Buildings/SAHB Energy Occupancy Group"

# One CSV per room, named "<room number>_data_1.csv".
full_241_df = pd.read_csv(f"{DATA_DIR}/241_data_1.csv")
full_243_df = pd.read_csv(f"{DATA_DIR}/243_data_1.csv")
full_269_df = pd.read_csv(f"{DATA_DIR}/269_data_1.csv")
full_245_df = pd.read_csv(f"{DATA_DIR}/245_data_1.csv")

# Display name -> DataFrame; the cells below iterate over this mapping.
dataframes = {
    "Room 241": full_241_df,
    "Room 243": full_243_df,
    "Room 269": full_269_df,
    "Room 245": full_245_df,
}

In [None]:
def df_list(dataframes):
    """Add parsed-time columns to every DataFrame in ``dataframes``, in place.

    For each frame the ``"time"`` column is converted with ``pd.to_datetime``
    and a new ``"hour_of_day"`` column is filled with the hour component of
    that timestamp.

    Args:
        dataframes: mapping of a name to a DataFrame that has a ``"time"``
            column.

    Returns:
        None. The DataFrame objects held by ``dataframes`` are modified
        directly, so the mapping keeps pointing at the same (now updated)
        objects and nothing needs to be written back.
    """
    for df in dataframes.values():
        df["time"] = pd.to_datetime(df["time"])
        df["hour_of_day"] = df["time"].dt.hour

In [None]:
def create_histogram(df_name, data, data_type, data_unit):
    """Show a histogram of ``data`` with automatically chosen bins.

    Args:
        df_name: label of the data source; appended to the plot title.
        data: values to be binned.
        data_type: name of the plotted quantity (title and x-axis label).
        data_unit: unit shown in parentheses on the x-axis label.
    """
    title_text = f"Histogram of {data_type} data - {df_name}"
    x_label = f"{data_type} ({data_unit})"

    plt.hist(data, bins='auto')
    plt.title(title_text)
    plt.xlabel(x_label)
    plt.ylabel("Count")
    plt.show()

def graph_mean_std(df_name, data, data_type, data_unit):
    """Show a 30-bin histogram of ``data`` marked with its mean and sigma lines.

    A solid green line marks the mean; dashed red/blue lines mark the mean
    plus/minus one, two and three standard deviations (three-sigma rule).

    Args:
        df_name: label of the data source; appended to the plot title.
        data: values to be binned; also passed to ``np.mean`` and ``np.std``.
        data_type: name of the plotted quantity (title and x-axis label).
        data_unit: unit shown in parentheses on the x-axis label.
    """
    mean = np.mean(data).item()
    std_dev = np.std(data)

    fig, ax = plt.subplots()
    ax.hist(data, bins=30, edgecolor='black', alpha=0.7, label='Data')

    # Top of the y-range, read once after the histogram is drawn; the text
    # annotations below are placed at fixed fractions of it.
    y_top = ax.get_ylim()[1]

    for i in range(1, 4):
        upper = mean + i * std_dev
        lower = mean - i * std_dev
        # Only the first line of each colour carries a label, so the legend
        # gets a single red and a single blue entry.
        ax.axvline(upper, color='red', linestyle='--', label=f'+{i}σ' if i == 1 else '')
        ax.axvline(lower, color='blue', linestyle='--', label=f'-{i}σ' if i == 1 else '')
        ax.text(upper, y_top * 0.8, f'+{i}σ', color='red', ha='center')
        ax.text(lower, y_top * 0.8, f'-{i}σ', color='blue', ha='center')

    ax.axvline(mean, color='green', linestyle='-', label='Mean')
    ax.text(mean, y_top * 0.9, 'Mean', color='green', ha='center')

    ax.legend()

    # Title and axis labels; the y-axis label matches create_histogram.
    ax.set_title(f"{data_type} data - {df_name}")
    ax.set_xlabel(f"{data_type} ({data_unit})")
    ax.set_ylabel("Count")
    plt.show()


In [None]:
# For every room, show two views (plain histogram, then mean/sigma overview)
# of the power readings, followed by the same two views of the CO2 readings.
for df_name, df in dataframes.items():
    power_data = df['power']
    for draw in (create_histogram, graph_mean_std):
        draw(df_name, power_data, "Power", "W")

    co2_data = df['co2']
    for draw in (create_histogram, graph_mean_std):
        draw(df_name, co2_data, "CO2", "ppm")

In [None]:
# K-means with 2 clusters
def apply_kmeans(data, num_clusters=2):
    """Fit K-means to ``data`` and return the model together with its labels.

    Args:
        data: samples to cluster, in the form accepted by ``KMeans.fit``.
        num_clusters: number of clusters to form (default 2).

    Returns:
        Tuple ``(kmeans, labels)``: the fitted ``KMeans`` instance (seeded with
        ``random_state=0``) and its ``labels_`` attribute.
    """
    model = KMeans(n_clusters=num_clusters, random_state=0).fit(data)
    return model, model.labels_

def graph_k_means(df_name, data, data_type, kmeans, labels):
    """Plot clustered samples: a scatter over sample order and per-cluster histograms.

    Args:
        df_name: label of the data source; used in both plot titles.
        data: clustered values; indexed with a boolean mask built from
            ``labels``, so it must support that (e.g. a NumPy array).
        data_type: name of the plotted quantity (titles and axis labels).
        kmeans: fitted clustering model. Kept in the signature for existing
            callers; the plots themselves only need ``labels``.
        labels: cluster id of every sample, same length as ``data``.

    Note:
        The cluster ids are plotted exactly as given. Which id corresponds to
        the higher centroid is not decided here; a caller that needs that
        meaning has to derive it from the model's ``cluster_centers_``.
    """
    # The x-axis is the sample position (0..n-1), not a real timestamp.
    sample_index = np.arange(len(data))

    plt.figure(figsize=(10, 6))
    plt.scatter(sample_index, data, c=labels, cmap='viridis', marker='o', alpha=0.6)
    plt.xlabel('Time')
    plt.ylabel(f'{data_type} Levels')
    plt.title(f'K-means Clustering of {data_type} Levels Over Time for {df_name}')
    plt.colorbar(label='Cluster')
    plt.show()

    plt.figure(figsize=(10, 6))
    # One histogram per cluster id that actually occurs, in ascending order,
    # so the plot stays complete when more than two clusters are used.
    for cluster_id in np.unique(labels):
        plt.hist(data[labels == cluster_id], bins=30, alpha=0.6, label=f'Cluster {cluster_id}')
    plt.xlabel(f'{data_type} Levels')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of {data_type} Levels by Cluster for {df_name}')
    plt.legend()
    plt.show()

def run_random_forest(X_train, X_test, y_train, y_test, binary_labels):
    """Train a random-forest classifier and print its test-set metrics.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training targets.
        y_test: test targets.
        binary_labels: not used inside this function; kept so existing calls
            keep working.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Fixed hyper-parameters (described by the original author as the best
    # ones found previously); random_state pins the result.
    forest = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=2,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)

    # Overall accuracy first, then the per-class breakdown.
    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")

    print("Classification Report:")
    print(classification_report(y_test, predictions))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    return forest

In [None]:
# Number of consecutive samples aggregated by the rolling window.
ROLLING_WINDOW = 10

for df_name, df in dataframes.items():
    co2_data = df['co2']
    power_data = df['power']

    # Rolling-window MAXIMUM of each signal.
    # NOTE(review): the earlier comment and the "*_min" variable names refer to
    # the minimum, but the code calls .max(). The computation is left as it
    # was; confirm which statistic is actually intended.
    co2_min = co2_data.rolling(window=ROLLING_WINDOW).max()
    power_min = power_data.rolling(window=ROLLING_WINDOW).max()

    # The first ROLLING_WINDOW - 1 rows have no complete window, so skip them,
    # then reshape to a single-feature column for scaling and clustering.
    co2_min = co2_min[ROLLING_WINDOW - 1:].to_numpy().reshape(-1, 1)
    power_min = power_min[ROLLING_WINDOW - 1:].to_numpy().reshape(-1, 1)

    # fit_transform refits the scaler on every call, so each signal is
    # standardised with its own statistics.
    scaler = StandardScaler()
    co2_min_scaled = scaler.fit_transform(co2_min)
    power_min_scaled = scaler.fit_transform(power_min)

    kmeans, cluster_labels = apply_kmeans(co2_min_scaled, num_clusters=2)
    graph_k_means(df_name, co2_min_scaled, "CO2", kmeans, cluster_labels)

    # K-means numbers its clusters arbitrarily, so cluster id 1 does not
    # necessarily mean the same thing for every room. Map the cluster with the
    # higher CO2 centroid to 1 and the other to 0, giving the target a
    # consistent meaning.
    occupied_label = np.argmax(kmeans.cluster_centers_[:, 0])
    binary_labels = np.where(cluster_labels == occupied_label, 1, 0)

    X = power_min_scaled
    y = binary_labels
    # NOTE(review): train_test_split shuffles by default, and neighbouring rows
    # come from overlapping rolling windows, so near-duplicate samples can land
    # on both sides of the split. The reported accuracy may be optimistic;
    # consider a chronological split (shuffle=False).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_classifier = run_random_forest(X_train, X_test, y_train, y_test, binary_labels)