# Data preparation using fastai TabularPandas

In [None]:
from fastai.tabular.all import *

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Train.csv exploration

In [None]:
# Load the meter readings, parsing the 'timestamp' column as dates on read.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is the default and the flag has no effect.
train = pd.read_csv("../input/ashrae-energy-prediction/train.csv", parse_dates=['timestamp'])

In [None]:
# Summarise column dtypes, non-null counts and memory usage of the raw readings.
train.info()

In [None]:
# Column names available in the readings table.
train.columns

In [None]:
# Peek at the first rows to see what a single reading record looks like.
train.head()

In [None]:
# Human-readable names for the integer meter codes 0-3
# (presumably the values found in the `meter` column).
meter_dict = dict(enumerate([
    'electricity',
    'chilled_water',
    'steam',
    'hot_water',
]))

### EDA

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Number of readings recorded for each distinct value of `meter`.
train['meter'].value_counts()

In [None]:
# Distribution of the raw meter readings.
# DataFrame.hist creates its own figure, so the size is passed via `figsize`;
# a separate plt.figure(...) call beforehand would only leave an empty figure.
train.hist(['meter_reading'], bins=50, figsize=(12, 10))

In [None]:
# percentage of meter readings = 0
is_zero_reading = train['meter_reading'].eq(0)
is_zero_reading.sum() / len(train) * 100

## Weather_train.csv exploration

In [None]:
# Load the weather table, parsing the 'timestamp' column as dates on read.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is the default and the flag has no effect.
weather_train = pd.read_csv("../input/ashrae-energy-prediction/weather_train.csv",
                            parse_dates=['timestamp'])

In [None]:
# Summarise column dtypes and non-null counts to spot missing weather values.
weather_train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric weather columns.
weather_train.describe()

In [None]:
# Column names available in the weather table.
weather_train.columns

In [None]:
# Peek at the first weather rows.
weather_train.head()

## Building_metadata.csv exploration

In [None]:
# Load the building metadata table; it is joined onto the readings by
# `building_id` in the merge step.
building_csv = '../input/ashrae-energy-prediction/building_metadata.csv'
building = pd.read_csv(building_csv)

In [None]:
# Summarise column dtypes and non-null counts of the building metadata.
building.info()

In [None]:
# Column names available in the building metadata table.
building.columns

In [None]:
# Peek at the first building rows.
building.head()

## Merge and Convert

In [None]:
# Attach building metadata to each reading, then the weather for that
# building's site and timestamp. Both are left joins, so readings without a
# matching building/weather row are kept (with missing values).
train = (
    train
    .merge(building, how='left', on='building_id')
    .merge(weather_train, how='left', on=['site_id', 'timestamp'])
)

In [None]:
# Check the merged frame: building and weather columns should now sit beside each reading.
train.head()

In [None]:
# (rows, columns) after the merges. With left joins the row count stays the
# same as before unless the right-hand tables contain duplicate keys.
train.shape

In [None]:
# Model the target on a log scale. log1p(x) equals log(1 + x) but stays
# accurate for readings close to zero, and still maps a reading of 0 to 0.
dep_var = 'meter_reading'
train[dep_var] = np.log1p(train[dep_var])

`add_datepart` is a fastai function that adds the date components of a datetime column (e.g. `timestampYear`, `timestampMonth`, `timestampWeek`) as new columns.

In [None]:
# Expand 'timestamp' into date-part feature columns; the cells below rely on
# the generated timestampYear, timestampMonth and timestampWeek columns.
train = add_datepart(train, 'timestamp')

In [None]:
# Inspect the columns after add_datepart to see the generated timestamp* features.
train.columns

We drop the timestampYear column because it has only 1 value (2016)

In [None]:
# Remove the year feature in place; `columns=` names the axis explicitly.
train.drop(columns='timestampYear', inplace=True)

In [None]:
# Confirm the column count went down by one after dropping timestampYear.
train.shape

In [None]:
# Re-check dtypes and non-null counts of the full feature frame.
train.info()

#### Here we convert the dataframe to a TabularPandas object

In [None]:
# Preprocessing steps handed to TabularPandas below. Per the fastai tabular
# docs, Categorify encodes categorical columns as integer codes and
# FillMissing imputes missing values in continuous columns.
procs = [Categorify, FillMissing]

In [None]:
# Look at the feature frame once more before it is split and processed.
train.head()

Our validation set should be the last month of the training data because the test set is comprised of dates later than the training data (2017), i.e., we should choose our validation set so that it measures how well we can forecast a building's meter readings for future timestamps.

In [None]:
# Time-based hold-out: rows with month <= 11 go to training, the remaining
# rows (December) to validation. Indices are positional row numbers.
cond = train['timestampMonth'] <= 11
in_train = cond.to_numpy()
train_idxs = np.flatnonzero(in_train)
valid_idxs = np.flatnonzero(~in_train)
splits = (list(train_idxs), list(valid_idxs))
len(train_idxs), len(valid_idxs)

In [None]:
# Force the week feature to a plain int64 column.
# NOTE(review): presumably add_datepart produced a different integer dtype
# that the following cont/cat split or TabularPandas does not handle — confirm.
train['timestampWeek'] = train['timestampWeek'].astype('int64')

In [None]:
# Let fastai partition the feature columns into continuous and categorical
# name lists; the dependent variable is passed so it is left out of both.
cont, cat = cont_cat_split(train, dep_var = dep_var)

In [None]:
# Columns that cont_cat_split classified as continuous.
cont

In [None]:
# Columns that cont_cat_split classified as categorical.
cat

In [None]:
# Wrap the frame in a TabularPandas: applies `procs`, records which columns
# are categorical/continuous, and uses `splits` for the train/valid partition.
to = TabularPandas(
    train,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    splits=splits,
)

In [None]:
# Sizes of the training and validation subsets of the TabularPandas object.
len(to.train), len(to.valid)

In [None]:
# Persist the processed TabularPandas object to the working directory,
# presumably so later sessions can reload it instead of redoing the preparation.
save_pickle('../working/to.pkl', to)

We need to convert the columns to less bulky datatypes so that we can train models on this large dataset. Each datatype is chosen to be the smallest one that can hold the column's values without changing them.

In [None]:
# Target dtype for each column, keyed by column name.
# NOTE(review): float16 keeps only about three significant decimal digits —
# confirm that rounding is acceptable for the columns mapped to it.
types = dict(
    building_id='int16',
    meter='int8',
    meter_reading='float64',
    site_id='int8',
    air_temperature='float16',
    cloud_coverage='int8',
    dew_temperature='float16',
    precip_depth_1_hr='int16',
    sea_level_pressure='float32',
    wind_direction='int16',
    wind_speed='float16',
    square_feet='int32',
    year_built='int16',
    floor_count='int8',
    timestampMonth='int8',
    timestampDay='int8',
    timestampDayofweek='int8',
    timestampDayofyear='int16',
    timestampWeek='int8',
)