## Daewoo Steel Factory dataset (building-level electricity forecasting)

In [None]:
from datasets.data_loader import Dataset

# Load the "daewoo" entry described in datasets/data.json and pick out the
# table that the rest of this section works on.
d = Dataset("datasets/data.json")
dataset = d.get_data("daewoo")

table_name = "Steel_industry_data"
df = dataset[table_name]

df.head()

In [None]:
# Integer encodings for the two categorical columns.
#
# Built with comprehensions so that no loop variable leaks into the notebook
# namespace (a loop variable named `d` would overwrite the `Dataset` handle).
#
# NOTE(review): both encodings depend on the order in which labels first
# appear in `df` -- confirm that order if the input file ever changes.

# The second distinct WeekStatus label maps to 0, every other label to 1.
week_status = {
    label: 0 if position == 1 else 1
    for position, label in enumerate(df.WeekStatus.unique())
}

# Day labels are numbered 1, 2, 3, ... in order of first appearance.
day_number = {
    label: position
    for position, label in enumerate(df.Day_of_week.unique(), start=1)
}

In [None]:
# Columns kept for the analysis: timestamp, calendar labels, the electrical
# measurements, and the target ("Usage_kWh") last.
selected_columns = [
    "date",
    "Day_of_week",
    "WeekStatus",
    "NSM",
    "Leading_Current_Reactive_Power_kVarh",
    "Leading_Current_Power_Factor",
    "Lagging_Current_Reactive.Power_kVarh",
    "Lagging_Current_Power_Factor",
    "CO2(tCO2)",
    "Usage_kWh",
]

# Independent copy so that later edits never touch the loaded frame.
df_copy = df[selected_columns].copy()

df_copy.head()

In [None]:
import pandas as pd

# Parse the day-first timestamp strings and use them as the row index.
parsed_dates = pd.to_datetime(df["date"], format="%d/%m/%Y %H:%M")
df_copy = df_copy.assign(date=parsed_dates).set_index("date")
df_copy.head()

In [None]:
# Replace the text labels with their integer codes. The lookup is strict on
# purpose: a label missing from the encoding raises KeyError instead of
# silently turning into NaN.
for column, encoding in (("Day_of_week", day_number), ("WeekStatus", week_status)):
    df_copy[column] = df_copy[column].map(encoding.__getitem__)

# Collapse the records to one row per hour by averaging every column.
df_copy = df_copy.resample('H').mean()

In [None]:
# First and last timestamp covered by the hourly frame.
time_index = df_copy.index
time_index.min(), time_index.max()

In [None]:
# Correlation of every column with the target, rounded to 5 decimals.
correlation_matrix = df_copy.corr()
correlation_matrix.loc["Usage_kWh"].round(5)

In [None]:
import os

# Output directory for the figures produced in this notebook.
plots_dir = "plots"

# Try to create the directory and react to the outcome, rather than checking
# first: a separate exists() check can race with another process creating
# the same path in between.
try:
    os.mkdir(plots_dir)
except FileExistsError:
    print("Directory already exists.")
else:
    print("Directory created.")

In [None]:
import matplotlib.pyplot as plt

# Plot the first `n_weeks` weeks of hourly usage with the week-status flag
# overlaid. The title and the output file name follow `n_weeks`, so changing
# the window length keeps the labels truthful.
n_weeks = 10
idx = int(n_weeks * 7 * 24)  # hourly rows in the window: 7 days x 24 hours per week

plt.figure(figsize=(20, 6), dpi=128)
plt.plot(df_copy["Usage_kWh"].iloc[:idx], label="Usage in kWh")
# The flag is multiplied by 10, presumably so that it is visible on the same
# axis as the usage curve.
plt.plot(df_copy["WeekStatus"].iloc[:idx] * 10, label="Week status")
plt.legend()

plt.xlabel("Date")
plt.ylabel("Usage in kWh")
plt.title(f"Daewoo Steel Plant ({n_weeks}-week Electricity Usage in kWh)")

# Footnote explaining how to read the week-status trace.
fig = plt.gcf()
note_text = "** High signal in week status show WEEKDAY and low signal shows WEEKEND"
x_center = 0.25  # horizontal anchor of the note in figure coordinates (0 to 1)
y_below_xlabel = 0  # bottom edge of the figure; raise to move the note up
note = fig.text(x_center, y_below_xlabel, note_text, ha="center", va="bottom")

fname = f"{plots_dir}/{n_weeks}_week_usage_daewoo.png"
# Saving is switched off for this figure; uncomment the next line to write it.
# plt.savefig(fname, dpi=256, bbox_inches="tight")
plt.show()

In [None]:
# Scatter each feature (x axis, also used for the point colour) against the
# hourly usage (y axis) on a fixed 2 x 3 grid.
target = "Usage_kWh"
cols = [name for name in df_copy.columns if "Usage" not in name]

row, col = (2, 3)
fig, axes = plt.subplots(row, col, figsize=(20, 10))

# axes.flat walks the grid row by row. Only the first row * col entries of
# `cols` get a panel; any further entries are not drawn.
for k, ax in enumerate(axes.flat):
    feature = cols[k]
    ax.scatter(df_copy[feature], df_copy[target], c=df_copy[feature])
    ax.set_xlabel(feature)
    ax.set_ylabel("Usage in kWh")

plt.tight_layout()

fname = f"{plots_dir}/correlation_daewoo.png"
plt.savefig(fname, dpi=256, bbox_inches="tight")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# NOTE(review): this filter silences every warning for the rest of the
# session, not only the ones raised while drawing this figure.
warnings.filterwarnings("ignore")

row, col = (3, 3)
fig, axes = plt.subplots(row, col, figsize=(20, 18))
cols = df_copy.columns

# One 50-bin histogram with a KDE overlay per column, filling the grid row by
# row (axes.flat iterates in that order).
for k, ax in enumerate(axes.flat):
    sns.histplot(df_copy, x=cols[k], bins=50, ax=ax, kde=True)

fig.suptitle("Feature-wise distribution charts for Daewoo Steel Factory Dataset")
fig.tight_layout()
# Leave headroom above the panels so the super-title does not overlap them.
fig.subplots_adjust(top=0.95)

fname = f"{plots_dir}/distribution_daewoo.png"
plt.savefig(fname, dpi=512, bbox_inches="tight")
plt.show()

In [None]:
# Summary statistics for every column except the first two, written to
# "daewoo.csv" and displayed below.
feature_columns = df_copy.columns[2:]
desc = df_copy[feature_columns].describe()
desc.to_csv("daewoo.csv")

desc

## Lawrence Berkeley National Lab dataset (building-level heating forecasting)

### Preprocessing the dataset

In [None]:
from datasets.data_loader import Dataset

# Load the "lawber" entry described in datasets/data.json; it is a mapping
# from table name to DataFrame.
d = Dataset("datasets/data.json")
dataset = d.get_data("lawber")
print("Load successful")

preview_table = "data"
dataset[preview_table].head()

In [None]:
# set first column of each dataset as the time index of the dataframe
import pandas as pd

# For every table: rename the first column to "timestamp", parse it as
# datetimes, use it as the index, and drop every column whose name contains
# "qc_" or "Unnamed".
index_col = "timestamp"

for key, df in dataset.items():
    # Non-mutating calls: the frames handed out by the loader stay untouched.
    df = df.rename(columns={df.columns[0]: index_col})
    df[index_col] = pd.to_datetime(df[index_col])
    df = df.set_index(index_col)

    df_select_cols = [col for col in df.columns if ("qc_" not in col) and ("Unnamed" not in col)]
    # Explicit copy: later cells add columns to these frames, and a bare column
    # subset can be treated by pandas as a slice of its parent frame.
    df = df[df_select_cols].copy()
    # Replacing the value of an existing key while iterating is safe.
    dataset[key] = df

In [None]:
# List the remaining columns of every table, one table per section.
for key, frame in dataset.items():
    print(key, frame.columns)
    print("-------")

##### Aggregating data points column-wise to form building-level data 

In [None]:
# Build the building-level electricity table with exactly three columns.
ele_raw = dataset["ele"]

dataset["ele"] = ele_raw.assign(
    # Electricity is measured separately for the S and N wings, so add them.
    electricity=ele_raw["mels_S"] + ele_raw["mels_N"],
    # HVAC is measured separately for the S and N wings as well.
    hvac=ele_raw["hvac_S"] + ele_raw["hvac_N"],
).rename(
    # The south-wing lighting load is assumed to stand for the whole building.
    columns={"lig_S": "light"}
)[["electricity", "light", "hvac"]]
dataset["ele"].head()

In [None]:
# Report how many values are missing in each column ...
ele_frame = dataset["ele"]
print(ele_frame.isna().sum())

# ... then fill the gaps with pandas' default (linear) interpolation.
dataset["ele"] = ele_frame.interpolate()