# Generate Data

This Jupyter notebook generates the dso_data folder and its data files. It first generates the train set and then, in the second iteration, starts with the test set.

In [4]:
import os
import json
import time
from datetime import datetime
import threading

import numpy as np
import pandas as pd

from google.cloud import storage

def secure_mkdir(directory_path):
    """Create ``directory_path`` if it does not exist yet.

    Missing parent directories are created as well, and calling the function
    for a directory that already exists is a no-op.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory_path`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it handles
    # nested paths and avoids the check-then-create race.
    os.makedirs(directory_path, exist_ok=True)

def secure_listdir(path, rm_dirs=(".ipynb_checkpoints",)):
    """List the entries of ``path`` while leaving out unwanted names.

    Args:
        path: Directory whose entries are listed with ``os.listdir``.
        rm_dirs: Iterable of entry names to drop from the result. Defaults
            to ``.ipynb_checkpoints``.

    Returns:
        A new list with the entry names in the order ``os.listdir`` returned
        them, excluding every name found in ``rm_dirs``.
    """
    # The default is a tuple so no mutable object is shared between calls.
    excluded = set(rm_dirs)
    return [entry for entry in os.listdir(path) if entry not in excluded]

# Google Cloud settings read by upload_blob(). They are left empty here and
# must be filled in before enabling uploads.
API_KEY = ""
PROJECT_ID = ""
# When True, every generated file is also uploaded via upload_blob().
UPLOAD_2_GOOGLE_CLOUD = False

# Input folder (train.csv, building_metadata.csv) and output folder.
path = "data"
new_path = "dso_data"
secure_mkdir(new_path)

# First timestamp that counts as test period.
TEST_START = "2016-10-01 00:00:00"

# train.csv is read once. The timestamps are still strings at this point, so
# the split below is a string comparison against TEST_START.
# NOTE(review): this presumes the column uses the same "%Y-%m-%d %H:%M:%S"
# layout as TEST_START - confirm against the data file.
train = pd.read_csv(os.path.join(path, "train.csv"))
test = train[train["timestamp"] >= TEST_START].copy()
test_dates = pd.to_datetime(test["timestamp"]).dt.date.unique()

# `train` and `df` intentionally hold the complete file (train and test
# period) with parsed timestamps; the date-based cut is applied later on `df`.
train["timestamp"] = pd.to_datetime(train["timestamp"])
df = train.copy()

metadata = pd.read_csv(os.path.join(path, "building_metadata.csv"))
# Despite its name, this maps site_id -> list of building_ids of that site.
building2site = metadata.groupby("site_id")["building_id"].agg(list).to_dict()
building_ids = sorted(train["building_id"].unique())
meters = sorted(train["meter"].unique())
sites = sorted(metadata["site_id"].unique())

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name the file gets inside the bucket.

    A new storage client is built on every call from the module-level
    ``PROJECT_ID`` and ``API_KEY``. A confirmation line is printed once the
    upload call has returned.
    """
    client = storage.Client(
        project=PROJECT_ID,
        client_options={"api_key": API_KEY, "quota_project_id": PROJECT_ID},
    )
    target = client.bucket(bucket_name).blob(destination_blob_name)

    # if_generation_match=0 is a precondition on the upload request; per the
    # Cloud Storage documentation it makes the upload fail when an object with
    # this name already exists instead of overwriting it.
    target.upload_from_filename(source_file_name, if_generation_match=0)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Bucket that receives the generated files when UPLOAD_2_GOOGLE_CLOUD is set.
BUCKET_NAME = "dagashrae"

# Writes one folder per site and building below `new_path`:
#   * even building ids: one file per meter (meters 0 and 2 as CSV, all other
#     meters as JSON keyed by timestamp),
#   * odd building ids: a single meter_reading.csv with one column per meter.
for test_date in test_dates:
    # Training window: every reading strictly before the current test date.
    train = df[df["timestamp"].dt.date < test_date].copy()
    train["timestamp"] = train["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")
    for site in sites:
        site_path = os.path.join(new_path, str(site))
        for building_id in building2site[site]:
            building_path = os.path.join(site_path, str(building_id))
            # Nothing in the visible code creates the per-site/per-building
            # folders, so make sure they exist before writing into them.
            os.makedirs(building_path, exist_ok=True)
            # Select the building once instead of rescanning `train` per meter.
            building_rows = train[train["building_id"] == building_id]
            if building_id % 2 == 0:
                for meter in meters:
                    data = building_rows.loc[
                        building_rows["meter"] == meter,
                        ["timestamp", "meter_reading"],
                    ]
                    meter_file_path = os.path.join(building_path, f"{meter}")
                    if meter == 0 or meter == 2:
                        meter_file_path += ".csv"
                        data.to_csv(meter_file_path, index=False)
                    else:
                        meter_file_path += ".json"
                        with open(meter_file_path, "w") as file:
                            json.dump(
                                data.set_index("timestamp", drop=True).to_dict(orient="index"),
                                file,
                            )
                    if UPLOAD_2_GOOGLE_CLOUD:
                        upload_blob(BUCKET_NAME, meter_file_path, meter_file_path)
            else:
                # One frame per meter, each with its readings in a
                # "meter_<n>" column.
                meter_frames = [
                    building_rows.loc[
                        building_rows["meter"] == meter,
                        ["timestamp", "meter_reading"],
                    ].rename(columns={"meter_reading": f"meter_{meter}"})
                    for meter in meters
                ]
                # Left-join every further meter onto the first one; this works
                # for any number of meters instead of exactly four.
                # NOTE(review): because of how="left", timestamps missing for
                # the first meter are dropped even if other meters have
                # readings there - confirm this is intended.
                dso_2_data = meter_frames[0]
                for frame in meter_frames[1:]:
                    dso_2_data = dso_2_data.merge(frame, on="timestamp", how="left")
                meter_file_path = os.path.join(building_path, "meter_reading.csv")
                dso_2_data.to_csv(meter_file_path, index=False)
                if UPLOAD_2_GOOGLE_CLOUD:
                    upload_blob(BUCKET_NAME, meter_file_path, meter_file_path)
    # Only the first test date is processed: the loop stops after one pass.
    break