In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

import gc

In [31]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# System imports
import copy
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Utilities
from sklearn.metrics import accuracy_score, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder

In [32]:
# List the working directory with the standard library instead of the
# platform-specific `!dir /b` / `!ls -a` shell escapes, one of which always
# fails depending on the OS. Both names are kept because the environment
# check in the next cell reads each of them.
_cwd_entries = os.listdir('.')
dir_win = list(_cwd_entries)
dir_linux = list(_cwd_entries)

In [33]:
# Choose the environment from the directory listings captured in the previous
# cell: a 'kernel-metadata.json' entry in either listing selects the local
# configuration, otherwise the Kaggle configuration is used.
is_local = any('kernel-metadata.json' in listing for listing in (dir_win, dir_linux))

if is_local:
    # Local environment
    src = 'Local'
    data_path = '../../../data/'
    debug = True
else:
    # Kaggle environment
    src = 'Kaggle'
    data_path = '../input/'
    # Previously `debug` was only assigned on the local branch, leaving it
    # undefined here unless a later cell happened to set it.
    debug = False

print('Environment set to [{env}]'.format(env=src))

Environment set to [Local]


In [34]:
# Force debug mode off regardless of what the environment-detection cell set,
# so the row limit chosen in the next cell is disabled (rows = None).
debug = False

In [35]:
# Row cap passed to the CSV readers: 1000 rows in debug mode, no cap otherwise.
rows = 1000 if debug else None

In [36]:
# Column -> dtype mapping handed to pd.read_csv so the frames are loaded with
# compact integer / half-precision float / categorical types.
init_dtype = dict(
    building_id='int16',
    meter='category',
    meter_reading='float16',
    primary_use='category',
    square_feet='int32',
    floor_count='int8',
    year_built='int16',
    site_id='int8',
    precip_depth_1_hr='float16',
    wind_direction='float16',
    wind_speed='float16',
    sea_level_pressure='float16',
    dew_temperature='float16',
    air_temperature='float16',
    cloud_coverage='float16',
)

In [37]:
# Read the training CSV with the compact dtypes, parsing `timestamp` as
# datetimes; `rows` caps the number of rows read (None reads everything).
df = pd.read_csv(
    data_path + 'ashraetrainfilled/df_train_filled.csv',
    nrows=rows,
    parse_dates=['timestamp'],
    dtype=init_dtype,
)

In [38]:
# df_sample = pd.read_csv(data_path + 'ashrae-energy-prediction/sample_submission.csv', nrows=1000)
# df_sample.head()

In [39]:
# Register a dtype for `row_id` (used later to build the submission), then
# read the test CSV with the same options as the training frame.
init_dtype['row_id'] = 'int32'
df_test = pd.read_csv(
    data_path + 'ashraetestfilled/df_test_filled.csv',
    nrows=rows,
    parse_dates=['timestamp'],
    dtype=init_dtype,
)

In [40]:
#np.iinfo('int16')
#np.finfo('float16')

In [41]:
model = RandomForestRegressor(n_estimators=100, random_state=5)


def _add_time_features(frame):
    """Add calendar features derived from ``frame['timestamp']`` in place.

    New columns: ``month``, ``day``, ``hour``, ``day_of_week`` (pandas
    ``dt.dayofweek`` values), ``weekend`` (day_of_week is 5 or 6) and
    ``night`` (hour >= 19 or hour < 7).

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a datetime ``timestamp`` column; modified in place.
    """
    stamp = frame['timestamp'].dt
    frame['month'] = stamp.month
    frame['day'] = stamp.day
    frame['hour'] = stamp.hour
    frame['day_of_week'] = stamp.dayofweek
    frame['weekend'] = frame['day_of_week'].isin([5, 6])
    # The original compared `day_of_week < 7`, which holds for every row
    # (dayofweek is 0-6) and made `night` a constant True column. Assumed
    # intent: the hours from 19:00 up to 07:00.
    frame['night'] = (frame['hour'] >= 19) | (frame['hour'] < 7)


# Apply identical feature engineering to the training and test frames.
for _frame in (df, df_test):
    _add_time_features(_frame)

In [42]:
# Downcast the integer calendar features to int8 on both frames.
df, df_test = (
    frame.astype(dict.fromkeys(('month', 'day', 'hour', 'day_of_week'), 'int8'))
    for frame in (df, df_test)
)

In [43]:
# Categorical columns to expand into one-hot indicator columns.
low_cardinality_cols = ['primary_use']

# Encoder configured for dense output and to ignore unknown categories.
onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training frame; the encoded array is wrapped in a DataFrame
# with default column labels.
OH_cols_df = pd.DataFrame(
    onehot.fit_transform(df[low_cardinality_cols])
)

In [44]:
# Encode the test frame with the already-fitted encoder (transform only).
OH_cols_df_test = pd.DataFrame(
    data=onehot.transform(df_test[low_cardinality_cols])
)

In [45]:
# Feature names reported by the encoder. They are computed here but not
# applied: the encoded columns keep their default labels.
column_names = onehot.get_feature_names(low_cardinality_cols)

# Align the encoded frame with the training frame's index, swap the raw
# categorical column out, and append the encoded columns side by side.
OH_cols_df.index = df.index
df.drop(columns=low_cardinality_cols, inplace=True)
df = pd.concat([df, OH_cols_df], axis='columns')

In [46]:
# Same replacement on the test frame: align indexes, remove the raw
# categorical column, then append the encoded columns.
OH_cols_df_test.index = df_test.index
df_test.drop(columns=low_cardinality_cols, inplace=True)
df_test = pd.concat([df_test, OH_cols_df_test], axis='columns')

In [47]:
# Every column except the target and the raw timestamp is a model feature.
target = 'meter_reading'
features = df.columns.tolist()
for excluded in (target, 'timestamp'):
    features.remove(excluded)

In [48]:
# Feature matrix and target vector taken from the training frame.
X = df[features]
y = df[target]

# 80/20 hold-out split. The split is now seeded (same seed value as the
# model) so the validation set is identical on every Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=5
)

In [None]:
# Fit on the full training frame (X, y); the commented-out line is the
# alternative that fits only on the X_train / y_train split.
#model.fit(X_train, y_train)
model.fit(X, y)

In [None]:
# Predict on the test frame using the same feature columns the model was fit on.
#y_pred = model.predict(X_val)
preds = model.predict(df_test[features])
# NOTE(review): predictions are multiplied by 1000 — presumably undoing a
# scaling applied to the pre-filled training target; confirm against the
# preprocessing that produced the input CSVs.
preds = preds * 1000

# Assemble the submission and write it to the working directory.
output = pd.DataFrame({'row_id': df_test.row_id,
                       'meter_reading': preds})
output.to_csv('submission.csv', index=False)

# Preview the submission. This was previously a mid-cell statement whose
# result was discarded; as the last expression of the cell it is rendered.
output.head()
# # Calculate accuracy
# #msle = mean_squared_log_error(y_val, y_pred)
# #print('{} Mean square log error: {:.2f}'.format(model.__class__.__name__, msle))

In [None]:
# rmsle = np.sqrt(msle)
# print('{} Root mean square log error: {:.2f}'.format(model.__class__.__name__, rmsle))