In [1]:
# System imports
import copy
import json
import os
import numpy as np
import pandas as pd
import ashrae_scripts as scp
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Utilities
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def is_local_environment(directory='.'):
    """Tell whether the notebook is running from a local checkout.

    The notebook treats the presence of an entry named
    ``kernel-metadata.json`` in the working directory as the marker of a
    local run; anything else is assumed to be a Kaggle kernel.

    The listing is done with ``os.listdir`` instead of shell commands
    (``!ls -a`` / ``!dir /b``) so the check gives the same answer on Linux,
    macOS and Windows and does not shadow the built-in ``dir``.

    Parameters
    ----------
    directory : str
        Directory to inspect. Defaults to the current working directory.

    Returns
    -------
    bool
        True when ``kernel-metadata.json`` is listed in ``directory``.
    """
    return 'kernel-metadata.json' in os.listdir(directory)


output_compression_type = 'gzip'
if is_local_environment():
    # Local environment
    src = 'Local'
    compression_type = 'gzip'
    data_folder = '../../../data/'
    output_folder = '../../../data/'
else:
    # Kaggle environment
    src = 'Kaggle'
    compression_type = None
    data_folder = '../input/'
    output_folder = ''

# Input files produced by the upstream train/test preparation kernels.
df_path = data_folder + 'ashrae-train-1-0/' + 'df_train_filled.csv.gz'
df_test_path = data_folder + 'ashrae-test-1-0/' + 'df_test_filled.csv.gz'

print('Environment set to [{env}]'.format(env=src))

# List every file under the Kaggle input directory. os.walk yields nothing
# when the directory does not exist, so this is a no-op on a local run.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

Environment set to [Kaggle]
/kaggle/input/ashrae-train-1-0/df_train_filled.csv.gz
/kaggle/input/ashrae-train-1-0/__output__.json
/kaggle/input/ashrae-train-1-0/custom.css
/kaggle/input/ashrae-train-1-0/__results__.html
/kaggle/input/ashrae-train-1-0/__notebook__.ipynb
/kaggle/input/ashrae-test-1-0/df_test_filled.csv.gz
/kaggle/input/ashrae-test-1-0/__output__.json
/kaggle/input/ashrae-test-1-0/custom.css
/kaggle/input/ashrae-test-1-0/__results__.html
/kaggle/input/ashrae-test-1-0/__notebook__.ipynb


In [2]:
# Debug switch: when enabled only the first 1000 rows of each CSV are read;
# otherwise `rows` is None and the whole file is loaded.
debug = False
rows = 1000 if debug else None

In [3]:
# Column groups used to build the `usecols` lists for the train and test reads.
cols = [
    'building_id', 'site_id', 'meter', 'primary_use',
    'timestamp', 'square_feet', 'floor_count', 'year_built',
]
time_cols = ['month', 'day', 'hour', 'day_of_week']
bool_cols = ['weekend', 'night']
target_col = ['meter_reading']
weather_cols = [
    'precip_depth_1_hr', 'wind_direction', 'wind_speed', 'sea_level_pressure',
    'dew_temperature', 'air_temperature', 'cloud_coverage',
]
row_col = ['row_id']

# Dtypes
# 'timestamp' has no entry here; it is handled by `parse_dates` at read time.
dtype_cols = {
    'building_id': 'int16',
    'site_id': 'int8',
    'meter': 'int8',
    'primary_use': 'category',
    'square_feet': 'float64',
    'floor_count': 'int8',
    'year_built': 'int16',
}
dtype_time_cols = dict.fromkeys(time_cols, 'int8')
dtype_bool_cols = dict.fromkeys(bool_cols, 'int8')
dtype_target_col = {'meter_reading': 'float64'}
# NOTE(review): float16 keeps only about three significant decimal digits;
# confirm that is acceptable for every weather column (e.g. sea_level_pressure).
dtype_weather_cols = dict.fromkeys(weather_cols, 'float16')

In [4]:
# Columns to load for the training set (includes the target).
df_cols = [*cols, *time_cols, *bool_cols, *target_col, *weather_cols]

# Single dtype mapping for the read, merged from the per-group mappings.
# Later groups would override earlier ones on a key clash, as with
# successive dict.update calls.
dtype = {
    **dtype_cols,
    **dtype_time_cols,
    **dtype_bool_cols,
    **dtype_target_col,
    **dtype_weather_cols,
}

In [5]:
# Load the training set with explicit dtypes; `rows` limits the read to the
# first N rows when it is not None.
df = pd.read_csv(
    df_path,
    usecols=df_cols,
    dtype=dtype,
    parse_dates=['timestamp'],
    nrows=rows,
)

In [6]:
# Replace each categorical label column of the training set with integer codes.
# The same encoder object is refitted for every column in `label_cols`, so
# after the loop it only holds the classes of the last column processed.
label_cols = ['primary_use']
label_encoder = LabelEncoder()
for label_col in label_cols:
    encoded_values = label_encoder.fit_transform(df[label_col])
    df[label_col] = encoded_values

In [7]:
# Write the label-encoded training set as a gzip-compressed CSV in the
# current working directory, without the index column.
df.to_csv(
    'df_label_enc.csv.gz',
    compression='gzip',
    index=False,
)

In [8]:
# Drop the reference to the training frame and run a garbage-collection pass
# so its memory can be reclaimed. gc.collect() is the cell's last expression,
# so its return value is displayed as the cell output.
del df
gc.collect()

22

In [9]:
# Columns to load for the test set: no target column, plus the row id.
df_test_cols = [*cols, *time_cols, *bool_cols, *weather_cols, *row_col]

# Same dtype mapping as for the training read. The 'meter_reading' entry is
# kept even though that column is not listed in `df_test_cols`, and 'row_id'
# has no explicit dtype here.
dtype = {
    **dtype_cols,
    **dtype_time_cols,
    **dtype_bool_cols,
    **dtype_target_col,
    **dtype_weather_cols,
}

In [10]:
# Load the test set with explicit dtypes; `rows` limits the read to the
# first N rows when it is not None.
df_test = pd.read_csv(
    df_test_path,
    usecols=df_test_cols,
    dtype=dtype,
    parse_dates=['timestamp'],
    nrows=rows,
)

In [11]:
#np.iinfo('int16')
#np.finfo('float16')

In [12]:
# Encode the test-set label columns with the already-fitted encoder
# (transform only, no refit).
for label_col in label_cols:
    encoded_values = label_encoder.transform(df_test[label_col])
    df_test[label_col] = encoded_values

In [13]:
# Write the label-encoded test set as a gzip-compressed CSV in the
# current working directory, without the index column.
df_test.to_csv(
    'df_test_label_enc.csv.gz',
    compression='gzip',
    index=False,
)