# Data Preparation

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import sktime as skt
import xgboost as xgb
import matplotlib.pyplot as plt
# Notebook-wide plot styling: seaborn's "ticks" style with the "talk" context.
sns.set(style='ticks', context='talk')

In [34]:
def feat_cleanup_primary_use(dt):
    """Normalise the ``primary_use`` column to lower case.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing a string-valued ``primary_use`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``dt`` object (the column is overwritten in place), so the
        function can be used as a step in a ``.pipe`` chain.

    Raises
    ------
    KeyError
        If ``dt`` has no ``primary_use`` column.
    """
    # Bracket indexing rather than attribute access: attribute-style column
    # assignment is fragile because it can collide with DataFrame attributes.
    dt["primary_use"] = dt["primary_use"].str.lower()
    return dt

In [35]:
def feat_cleanup_categoricals(dt):
    """Turn the numeric-looking ID columns into prefixed string labels.

    ``building_id`` values become ``'bldg_<value>'`` and ``site_id`` values
    become ``'site_<value>'``. Both columns are overwritten in place and the
    same frame is returned so the function can be chained with ``.pipe``.

    Note: the prefix is prepended unconditionally, so running this twice on
    the same frame yields labels such as ``'bldg_bldg_1'``.
    """
    id_prefixes = (("building_id", "bldg_"), ("site_id", "site_"))
    for column, prefix in id_prefixes:
        as_text = dt[column].apply(str)
        dt[column] = prefix + as_text
    return dt

In [36]:
def feat_anomaly(dt):
    """Recode the optional ``anomaly`` column as a categorical label.

    When the column is present, each value is stringified, prefixed with
    ``'A'`` (e.g. ``0 -> 'A0'``) and stored back as a ``category`` dtype
    column. Frames without an ``anomaly`` column are returned untouched.
    The input frame is modified in place and returned.
    """
    if "anomaly" not in dt.columns:
        # Nothing to recode when the frame carries no anomaly column.
        return dt

    labelled = 'A' + dt["anomaly"].apply(str)
    dt["anomaly"] = labelled.astype("category")
    return dt

In [37]:
def feat_missingness(dt, sentinel=255):
    """Add boolean indicator columns flagging sentinel-coded values.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing ``cloud_coverage`` and ``year_built`` columns.
    sentinel : scalar, default 255
        Value compared against each column. The default reproduces the
        previously hard-coded 255.
        NOTE(review): presumably 255 is how the upstream data encodes
        "missing" for these columns; confirm against the data preparation.

    Returns
    -------
    pandas.DataFrame
        The same ``dt`` object with ``cloud_coverage_missing`` and
        ``year_built_missing`` boolean columns added (or overwritten).
    """
    for column in ("cloud_coverage", "year_built"):
        dt[column + "_missing"] = dt[column] == sentinel
    return dt

In [51]:
def process_data(fp, nrows=100_000):
    """Read a feature CSV and run it through the feature-cleanup pipeline.

    Steps, in order: read the file, lower-case the column names, apply the
    ``feat_*`` cleanup helpers, parse ``timestamp`` to datetimes, and convert
    ``site_id`` / ``building_id`` to categoricals.

    Parameters
    ----------
    fp : str or path-like
        CSV location, passed straight to ``pd.read_csv``.
    nrows : int or None, default 100_000
        Maximum number of data rows to read. The default keeps the
        100,000-row cap that was previously hard-coded (as the float ``1e5``);
        pass ``None`` to read the entire file.

    Returns
    -------
    pandas.DataFrame
        The processed frame, with a default integer index.
    """
    df = (
        pd.read_csv(fp, nrows=nrows)
        .rename(columns=str.lower)
        .pipe(feat_cleanup_primary_use)
        .pipe(feat_cleanup_categoricals)
        .pipe(feat_anomaly)
        .pipe(feat_missingness)
        .assign(
            timestamp=lambda x: pd.to_datetime(x['timestamp']),
            site_id=lambda x: pd.Categorical(x['site_id']),
            building_id=lambda x: pd.Categorical(x['building_id']),
        )
    )
    return df

In [52]:
# Load the processed training features, index the frame by timestamp,
# then print a column/dtype summary.
training_dat = (
    process_data("/home/jovyan/work/data-raw/train_features.csv")
    .set_index("timestamp")
)
training_dat.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100000 entries, 2016-01-01 00:00:00 to 2016-01-21 23:00:00
Data columns (total 58 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   building_id                    100000 non-null  category
 1   meter_reading                  91526 non-null   float64 
 2   anomaly                        100000 non-null  category
 3   site_id                        100000 non-null  category
 4   primary_use                    100000 non-null  object  
 5   square_feet                    100000 non-null  int64   
 6   year_built                     100000 non-null  int64   
 7   floor_count                    100000 non-null  int64   
 8   air_temperature                100000 non-null  float64 
 9   cloud_coverage                 100000 non-null  int64   
 10  dew_temperature                100000 non-null  float64 
 11  precip_depth_1_hr              100000 non-nu