In [0]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data
from tqdm import tqdm_notebook, tnrange

from typing import Dict, Optional, Tuple, Union, Callable

In [0]:
WEATHER_NORM_COLS = ["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr", "sea_level_pressure", "wind_speed"]
BUILDINGS_NORM_COLS = ["square_feet", "floor_count"]

def load_weather(csvfile: str) -> pd.DataFrame:
    df = pd.read_csv(
        csvfile,
        parse_dates=["timestamp"],
        index_col=["site_id", "timestamp"],
        dtype={
            "site_id": "int32",
            "air_temperature": "float64",
            "cloud_coverage": "float64",
            "dew_temperature": "float64",
            "precip_depth_1_hr": "float64",
            "sea_level_pressure": "float64",
            "wind_direction": "float64",
            "wind_speed": "float64",
        },
    )
    df["wind_direction_x"] = np.cos(df["wind_direction"])
    df["wind_direction_y"] = np.sin(df["wind_direction"])
    df.drop(columns=["wind_direction"], inplace=True)
    return df


def interp_weather(
    df: pd.DataFrame, interpolate: str = "linear", **kwargs
) -> pd.DataFrame:
    df = df.reset_index()
    site_frames = []
    for site_id in df["site_id"].unique():
        site_frame = df[df["site_id"] == site_id]
        site_frame = (
            site_frame.set_index("timestamp")
            .sort_index()
            .interpolate(interpolate, **kwargs)
            .fillna(method="ffill")
            .fillna(method="bfill")
            .fillna(value=0.0)
        )
        site_frames.append(site_frame)
    weather_frame = pd.concat(site_frames)
    weather_frame.reset_index(inplace=True)
    weather_frame.set_index(["site_id", "timestamp"], inplace=True)
    return weather_frame


def weather_norm_params(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series]:
  df_norm = df[WEATHER_NORM_COLS]
  return df_norm.mean(), df_norm.std()


def load_buildings(csvfile: str) -> pd.DataFrame:
    df = pd.read_csv(
        csvfile,
        index_col=["building_id"],
        dtype={
            "site_id": "int32",
            "building_id": "int32",
            "primary_use": "category",
            "square_feet": "float64",
            "year_built": "float64",
            "floor_count": "float64",
        },
    )
    return pd.get_dummies(df, dtype=np.float64)


def impute_buildings(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in ["square_feet", "year_built", "floor_count"]:
        df[col].fillna(df[col].mean(), inplace=True)
    return df


def buildings_norm_params(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series]:
  df_norm = df[BUILDINGS_NORM_COLS]
  return df_norm.mean(), df_norm.std()


def normalize_frame(df: pd.DataFrame, norm_params: Tuple[pd.Series, pd.Series]) -> pd.DataFrame:
  df = df.copy()
  for col in norm_params[0].index:
    df[col] = (df[col] - norm_params[0][col]) / norm_params[1][col]
  return df


class ASHRAEDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        csvfile: str,
        buildings_df: pd.DataFrame,
        weather_df: pd.DataFrame,
        has_labels: bool = True,
        meter_filter: Optional[int] = None,
        drop_zero: bool = True,
    ):
        data = pd.read_csv(
            csvfile,
            parse_dates=["timestamp"],
            dtype={
                "buliding_id": "int32",
                "meter": "int32",
                "meter_reading": "float64",
            },
        )
        if meter_filter is not None:
          data.drop(data.index[data["meter"] != meter_filter], inplace=True)
        if has_labels and drop_zero:
          data.drop(data.index[data["meter_reading"] == 0.0], inplace=True)
        data = data.join(buildings_df, how="inner", on="building_id")
        data = data.join(weather_df, how="inner", on=["site_id", "timestamp"])
        meters = data["meter"]
        readings = (
            data["meter_reading"]
            if has_labels
            else pd.Series(np.zeros(len(data)), data.index, dtype="float64")
        )
        data.drop(
            columns=[
                "building_id",
                "meter",
                "timestamp",
                "site_id",
                "meter_reading",
            ],
            inplace=True,
            errors="ignore",
        )
        data.sort_index(inplace=True)

        self.data = data
        self.meters = meters
        self.readings = readings

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index: int) -> Tuple[np.int, torch.tensor, np.float64]:
        return (
            torch.tensor([self.meters.iloc[index]], dtype=torch.float64),
            torch.from_numpy(self.data.iloc[index].to_numpy(np.float64)),
            torch.tensor([self.readings.iloc[index]], dtype=torch.float64),
        )

In [0]:
buildings = impute_buildings(load_buildings("building_metadata.csv"))
buildings_norm_params_ = buildings_norm_params(buildings)
buildings_norm = normalize_frame(buildings, buildings_norm_params_)
del buildings

weather_train = interp_weather(load_weather("weather_train.csv"))
weather_train_norm_params = weather_norm_params(weather_train)
weather_train_norm = normalize_frame(weather_train, weather_train_norm_params)
del weather_train
weather_test = interp_weather(load_weather("weather_test.csv"))
weather_test_norm = normalize_frame(weather_test, weather_train_norm_params)
del weather_test

In [0]:
ds_train_0 = ASHRAEDataset("train.csv", buildings_norm, weather_train_norm, meter_filter=1)
# ds_test = ASHRAEDataset("test.csv", buildings_norm, weather_test_norm, has_labels=False)

In [0]:
def train_model(model: torch.nn.Module, ds: ASHRAEDatasetSubset):
  ds = torch.utils.data.Subset(ds, np.random.choice(len(ds), 50000, replace=False))
  train_len = int(0.8 * len(ds))
  valid_len = len(ds) - train_len
  ds_train, ds_valid = torch.utils.data.random_split(ds, (train_len, valid_len))
  train_loader = torch.utils.data.DataLoader(ds_train, batch_size=128, shuffle=True)
  valid_loader = torch.utils.data.DataLoader(ds_valid, batch_size=valid_len)

  optim = torch.optim.SGD(model.parameters(), lr=1e-8, weight_decay=1e-12)
  loss_fn = torch.nn.MSELoss(reduction="mean")
  for epoch in tnrange(10):
    model.train()
    for batch in tqdm_notebook(train_loader, leave=False):
      optim.zero_grad()
      meter_pred = model(batch[1])
      # print(meter_pred, batch[2])
      # print((batch[0] == 0).all())
      loss = loss_fn(meter_pred, batch[2])
      loss.backward()
      optim.step()

    model.eval()
    with torch.no_grad():
      meter_pred = model(batch[1])
      loss = loss_fn(meter_pred, batch[2])
      print(loss.item())

model = torch.nn.Sequential(
    torch.nn.Linear(27, 1000),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(1000, 1000),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(1000, 1),
)
train_model(model.double(), ds_train_0)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

865738.7929958139


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

1491567.0514819818


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

1731658.3318036436


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

3130888.9795902893


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

1846554738.9762638


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

491088.6749183607


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

2167240.074060734


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

811951.6297545192


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

661070.7853845909


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

1292742.1621844391
