# ASHRAE with fast.ai, Part 1: Preprocessing

This kernel performs the preprocessing of the training and test data for future use. 

Even with careful memory management, some operations on the large ASHRAE datasets are too demanding to run in a single kernel, so the work is spread across multiple notebooks. This kernel is the first part of a series that also includes:

- https://www.kaggle.com/michelezoccali/ashrae-with-fast-ai-part-2 (training)
- https://www.kaggle.com/michelezoccali/ashrae-with-fast-ai-part-3 (inference)

NB: Kaggle allows you to reference the static output of a kernel from other instances, as long as you remember to commit the notebook and save the output.

# Imports

In [None]:
import os
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import datetime

In [None]:
# Root directory of the competition input data; the loader functions below
# build their CSV paths from this module-level name.
path = '../input/ashrae-energy-prediction'

# Print the full path of every file found under `path`.
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Utilities

In [None]:
def reduce_mem(df):
    """Return a copy of ``df`` with every column downcast to a smaller dtype.

    Rules, applied per column according to the dtype name:

    * ``datetime*`` columns are left untouched;
    * ``object`` columns are replaced by their category codes, downcast to
      the smallest unsigned integer type that fits;
    * ``int*`` / ``uint*`` columns are downcast to unsigned when the minimum
      is non-negative, otherwise to the smallest signed integer type;
    * everything else is passed through ``pd.to_numeric(downcast='float')``.

    The input frame is not modified.
    """
    shrunk = df.copy()
    for name in shrunk.columns:
        series = shrunk[name]
        kind = series.dtype.name

        if kind.startswith("datetime"):
            continue

        if kind == "object":
            # Encode the strings as integer category codes before downcasting.
            codes = series.astype("category").cat.codes
            shrunk[name] = pd.to_numeric(codes, downcast="unsigned")
        elif kind.startswith(("int", "uint")):
            target = "unsigned" if series.min() >= 0 else "integer"
            shrunk[name] = pd.to_numeric(series, downcast=target)
        else:
            shrunk[name] = pd.to_numeric(series, downcast="float")
    return shrunk

def add_lag_features(weather_df, window=3):
    """Add per-site rolling min/mean/max columns for selected weather features.

    For each of ``air_temperature``, ``cloud_coverage``, ``dew_temperature``
    and ``precip_depth_1_hr`` three float16 columns are added, named
    ``{col}_min_lag{window}``, ``{col}_mean_lag{window}`` and
    ``{col}_max_lag{window}``. The statistics are computed over a rolling
    window of ``window`` rows within each ``site_id`` group (in the frame's
    current row order, with ``min_periods=0``).

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Must contain ``site_id`` and the four feature columns. Its index must
        be unique, because the results are aligned back on it.
    window : int, default 3
        Number of rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        The same ``weather_df`` object, modified in place with the new columns.
    """
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr']
    rolled = weather_df.groupby('site_id')[cols].rolling(window=window, min_periods=0)

    def _aligned(stat_df):
        # The grouped rolling result is indexed by (site_id, original label).
        # Dropping only the group level keeps the original row labels, so the
        # assignment below aligns each value with the row it was computed
        # from, even when rows of different sites are interleaved.
        return stat_df.reset_index(level=0, drop=True).astype(np.float16)

    lag_min = _aligned(rolled.min())
    lag_mean = _aligned(rolled.mean())
    lag_max = _aligned(rolled.max())

    for col in cols:
        weather_df[f'{col}_min_lag{window}'] = lag_min[col]
        weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
        weather_df[f'{col}_max_lag{window}'] = lag_max[col]
    return weather_df

def _delete_bad_sitezero(X, y):
    cond = (X.timestamp > '2016-05-20') | (X.site_id != 0) | (X.meter != 0)
    X = X[cond]
    y = y.reindex_like(X)
    return X.reset_index(drop=True), y.reset_index(drop=True)

def _extract_temporal(X, train=True):
    """Derive calendar features from ``X.timestamp`` and downcast the result.

    Adds ``hour`` and ``weekday`` columns, plus ``month`` when ``train`` is
    true. The columns are written onto ``X`` itself (the caller's frame is
    modified); the returned frame is the copy produced by ``reduce_mem``.
    """
    stamps = X.timestamp.dt
    derived = {'hour': stamps.hour, 'weekday': stamps.weekday}
    if train:
        # month is only needed to build the validation split and is meant to
        # be removed again before training
        derived['month'] = stamps.month
    # month and year cause overfit, could try other (holiday, business, etc.)
    for name, values in derived.items():
        X[name] = values
    return reduce_mem(X)

In [None]:
def load_data(source='train'):
    """Read ``{path}/{source}.csv`` and return it downcast via ``reduce_mem``.

    ``source`` must be ``'train'`` or ``'test'``; the ``timestamp`` column is
    parsed as datetimes.
    """
    assert source in ['train','test']
    csv_file = f'{path}/{source}.csv'
    raw = pd.read_csv(csv_file, parse_dates=['timestamp'])
    return reduce_mem(raw)

def load_building():
    """Read ``building_metadata.csv``, fill missing values with -1, downcast."""
    csv_file = f'{path}/building_metadata.csv'
    raw = pd.read_csv(csv_file)
    filled = raw.fillna(-1)
    return reduce_mem(filled)

def load_weather(source='train', fix_timezone=True, impute=True, add_lag=True):
    """Load the weather CSV for ``source`` and optionally clean and extend it.

    Parameters
    ----------
    source : {'train', 'test'}
        Selects ``{path}/weather_{source}.csv``.
    fix_timezone : bool, default True
        Subtract a per-site number of hours (hard-coded for site ids 0-15)
        from every timestamp.
        NOTE(review): presumably these are the sites' UTC offsets, used to
        align the weather timestamps with the meter readings - confirm.
    impute : bool, default True
        Reindex every site onto a complete hourly range (2016 for train,
        2017-2018 for test), linearly interpolate each column in both
        directions, and fill anything still missing with the column's median
        over all sites.
    add_lag : bool, default True
        Append rolling window-3 statistics via ``add_lag_features``.

    Returns
    -------
    pandas.DataFrame
        The weather frame after ``reduce_mem``.
    """
    assert source in ['train','test']
    df = pd.read_csv(f'{path}/weather_{source}.csv', parse_dates=['timestamp'])
    if fix_timezone:
        offsets = [5,0,9,6,8,0,6,6,5,7,8,6,0,7,6,6]
        offset_map = dict(enumerate(offsets))  # site_id -> hours to subtract
        df['timestamp'] = df.timestamp - pd.to_timedelta(df.site_id.map(offset_map), unit='h')
    if impute:
        # The target hourly range depends only on `source`, so build it once
        # instead of once per site.
        if source == 'train':
            new_idx = pd.date_range(start='2016-1-1', end='2016-12-31-23', freq='H')
        else:
            new_idx = pd.date_range(start='2017-1-1', end='2018-12-31-23', freq='H')

        site_dfs = []
        for site in df.site_id.unique():
            site_df = df[df.site_id == site].set_index('timestamp').reindex(new_idx)
            # Rows introduced by the reindex have no site_id yet.
            site_df['site_id'] = site
            for col in [c for c in site_df.columns if c != 'site_id']:
                site_df[col] = site_df[col].interpolate(limit_direction='both', method='linear')
                # A column with no observations at all for this site cannot be
                # interpolated; fall back to the median over all sites.
                site_df[col] = site_df[col].fillna(df[col].median())
            site_dfs.append(site_df)
        df = pd.concat(site_dfs)
        df['timestamp'] = df.index
        df = df.reset_index(drop=True)

    if add_lag:
        df = add_lag_features(df, window=3)

    return reduce_mem(df)

def merged_dfs(source='train', fix_timezone=True, impute=True, add_lag=False):
    """Join meter data with building metadata and weather for ``source``.

    The meter rows are left-joined to the building metadata on
    ``building_id`` and then to the weather data on ``site_id`` and
    ``timestamp``. ``fix_timezone``, ``impute`` and ``add_lag`` are forwarded
    to ``load_weather``.

    Returns ``(X, y)`` for ``source='train'``, where ``y`` is
    ``log1p(meter_reading)`` and ``X`` is the frame without that column;
    returns the merged frame itself for ``source='test'``.
    """
    meters = load_data(source=source)
    buildings = load_building()
    weather = load_weather(
        source=source,
        fix_timezone=fix_timezone,
        impute=impute,
        add_lag=add_lag,
    )

    df = meters.merge(buildings, on='building_id', how='left')
    df = df.merge(weather, on=['site_id','timestamp'], how='left')

    if source == 'test':
        return df
    if source != 'train':
        return None

    features = df.drop('meter_reading', axis=1)
    target = np.log1p(df.meter_reading)  # log-transform of target
    return features, target

# Training data

We try without lag features first, because it is unclear whether the TabularPandas instance can handle the added features without the extra overhead filling the RAM.

In [None]:
# Build the merged training features and the log1p-transformed target,
# without the rolling lag features.
X_train, y_train = merged_dfs(add_lag=False)
# Last expression of the cell: shows the first rows in the notebook output.
X_train.head()

In [None]:
# preprocessing: filter the site 0 / meter 0 rows up to 2016-05-20, then add
# the hour / weekday / month columns (train=True by default)
X_train, y_train = _delete_bad_sitezero(X_train, y_train)
X_train = _extract_temporal(X_train)

# remove timestamp and other unimportant features
# (`to_drop` is reused by the test-data cell further down)
to_drop = ['timestamp','sea_level_pressure','wind_direction','wind_speed']
X_train.drop(to_drop, axis=1, inplace=True)

# free the memory held by intermediate frames
gc.collect()

In [None]:
# Put features and target side by side in a single frame for saving.
df_train = pd.concat([X_train, y_train], axis=1)

# The separate pieces are no longer needed; release them before continuing.
del X_train, y_train
gc.collect()

# Print column dtypes and memory usage of the combined frame.
df_train.info()

An important part of the preprocessing involves careful type downcasting for memory management. Saving to a CSV file does not preserve dtypes, hence we save in HDF5 format. We can add multiple frames to the same HDF5 file as long as we specify a key for retrieval.

In [None]:
# Write the training frame to the HDF5 file under key 'train', then drop the
# in-memory reference.
df_train.to_hdf('preprocessing_no_lag.h5', index=False, key='train')
del df_train

# Test data

In [None]:
# Build the merged test frame (no lag features) and add the hour / weekday
# columns; train=False skips the month column.
X_test = merged_dfs(source='test', add_lag=False)
X_test = _extract_temporal(X_test, train=False)
# `to_drop` comes from the training cell and already contains 'timestamp',
# so it is passed as-is: the same columns are removed as for the training set.
X_test.drop(columns=to_drop, inplace=True)
gc.collect()

X_test.head()

In [None]:
# Print column dtypes and memory usage of the test frame.
X_test.info()

Now we add the test DataFrame to the same HDF5 file.

In [None]:
# Store the test frame under key 'test' in the same file used for the
# training frame.
X_test.to_hdf('preprocessing_no_lag.h5', index=False, key='test')

Let us verify that HDF5 files did indeed preserve dtypes.

In [None]:
# Read the 'test' frame back from disk and print its summary, to compare with
# the X_test.info() output above.
X_test2 = pd.read_hdf('preprocessing_no_lag.h5','test')
X_test2.info()

We are now ready to load `df_train` and `X_test` in any other kernel to start modelling! If you are interested, check out the kernel at:

- https://www.kaggle.com/michelezoccali/ashrae-with-fast-ai-part-2