In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn import preprocessing
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
        

# Any results you write to the current directory are saved as output.

In [None]:
# Load the competition CSVs.
# The directory is named once so every read points at the same location.
DATA_DIR = "/kaggle/input/ashrae-energy-prediction"

# `Test` previously re-read building_metadata.csv (a copy-paste of the
# `Building` line below); it now reads the competition's test.csv.
# NOTE(review): test.csv is much larger than building_metadata.csv, so this
# raises peak memory for the cell — confirm the kernel has enough headroom.
Test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
Weather_train = pd.read_csv(os.path.join(DATA_DIR, "weather_train.csv"))
Train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
Weather_test = pd.read_csv(os.path.join(DATA_DIR, "weather_test.csv"))
Sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
Building = pd.read_csv(os.path.join(DATA_DIR, "building_metadata.csv"))

In [None]:
# Preview the first rows of the frame loaded from weather_test.csv.
Weather_test.head()

In [None]:
# Store log1p(meter_reading) next to the raw reading; the raw column is kept.
# NOTE(review): this column is a transform of the target, so any feature matrix
# later built from `Train` must exclude it — confirm downstream cells do.
Train['meter_reading_log1p'] = np.log1p(Train['meter_reading'])

In [None]:
# Inner join of building metadata (left) with the meter readings (right)
# on the shared building_id key.
Merge_build_train = pd.merge(
    Building,
    Train,
    how='inner',
    on='building_id',
)

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left alone.
    verbose : bool, default True
        When true, print the memory footprint after downcasting and the
        percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    * Range checks are inclusive, so a column whose extremes sit exactly on a
      dtype limit (e.g. -128..127) is stored in that dtype instead of being
      pushed to the next wider one.
    * Float columns may be narrowed to float16, which keeps only about three
      significant decimal digits; values are rounded accordingly.
    * Columns whose min/max compare false against every range (for example
      all-NaN or infinite floats) end up as float64; such integer columns are
      left untouched.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits is used.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast every loaded/merged frame. The calls run left to right, so the
# per-frame memory reports are printed in the same order as the names below.
(
    Merge_build_train,
    Test,
    Weather_train,
    Train,
    Weather_test,
    Building,
) = [
    reduce_mem_usage(frame)
    for frame in (Merge_build_train, Test, Weather_train, Train, Weather_test, Building)
]

In [None]:
# Attach the weather observations to each building/meter row; a left join
# keeps rows that have no matching (site_id, timestamp) weather record.
Merge = pd.merge(
    Merge_build_train,
    Weather_train,
    how="left",
    on=["site_id", "timestamp"],
)

In [None]:
# One subset of the merged frame per meter code (0, 1, 2, 3).
_meter_code = Merge["meter"]
Merge_meter0 = Merge.loc[_meter_code == 0]
Merge_meter1 = Merge.loc[_meter_code == 1]
Merge_meter2 = Merge.loc[_meter_code == 2]
Merge_meter3 = Merge.loc[_meter_code == 3]

In [None]:
# Feature columns shared by all four per-meter models, listed once.
_feature_cols = [
    'site_id', 'meter', 'building_id', 'square_feet', 'year_built',
    'floor_count', 'air_temperature', 'cloud_coverage',
    'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
    'wind_direction', 'wind_speed',
]
_target_col = 'meter_reading_log1p'

# For each meter subset: features and log1p target, both cast to float.
X0 = Merge_meter0[_feature_cols].astype(float)
y0 = Merge_meter0[_target_col].astype(float)

X1 = Merge_meter1[_feature_cols].astype(float)
y1 = Merge_meter1[_target_col].astype(float)

X2 = Merge_meter2[_feature_cols].astype(float)
y2 = Merge_meter2[_target_col].astype(float)

X3 = Merge_meter3[_feature_cols].astype(float)
y3 = Merge_meter3[_target_col].astype(float)

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_train, x_test, y_train, y_test = train_test_split(X0, y0, test_size=0.25, random_state=0)

# Feature scaling: statistics are fitted on the training split only and then
# applied unchanged to the hold-out split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

d_train = lgb.Dataset(x_train, label=y_train)

# The metric is declared once. The earlier 'binary_logloss' assignment was a
# classification metric on a regression objective and was overwritten by
# "rmse" before training, so it never took effect and has been removed.
params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'sub_feature': 0.5,
    'num_leaves': 40,
    'min_data': 50,
    'max_depth': 20,
}

# 100 boosting rounds on the scaled training split.
clf = lgb.train(params, d_train, 100)

In [None]:
# Hold-out Dataset from the scaled split created above; `reference=d_train`
# makes it reuse the training set's feature binning.
d_valid = lgb.Dataset(x_test, label=y_test, reference=d_train)

# The original message announced validation on a "second half", but no
# validation set was passed, so verbose_eval=200 had nothing to report.
# The hold-out split is now supplied and the message says what actually runs.
print("Building model on the scaled training split and validating on the hold-out split:")
model_half_1 = lgb.train(params, train_set=d_train, num_boost_round=1000, valid_sets=[d_valid], verbose_eval=200)

In [None]:
# Predict the hold-out split with the 100-round model `clf`.
y_pred = clf.predict(x_test)

In [None]:
# Element-wise difference between the predictions and the hold-out targets.
y_pred - y_test

In [None]:
# Positional 50/50 split of the meter-0 features and target.
_mid0 = X0.shape[0] // 2
X0_half_1, X0_half_2 = X0.iloc[:_mid0], X0.iloc[_mid0:]
print(X0_half_1.shape)
y0_half_1, y0_half_2 = y0.iloc[:_mid0], y0.iloc[_mid0:]
print(y0_half_1.shape)
categorical_features = ["building_id", "site_id", "meter"]

# Both halves become LightGBM datasets built with identical options.
_dataset_kwargs = dict(categorical_feature=categorical_features, free_raw_data=False)
d_half_1 = lgb.Dataset(X0_half_1, label=y0_half_1, **_dataset_kwargs)
d_half_2 = lgb.Dataset(X0_half_2, label=y0_half_2, **_dataset_kwargs)

# Each watchlist lists the training half first and the other half second.
watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

params = dict(
    objective="regression",
    boosting="gbdt",
    num_leaves=40,
    learning_rate=0.05,
    feature_fraction=0.85,
    reg_lambda=2,
    metric="rmse",
)

# Training options shared by the two folds.
_train_kwargs = dict(num_boost_round=1000, verbose_eval=200, early_stopping_rounds=200)

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, valid_sets=watchlist_1, **_train_kwargs)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, valid_sets=watchlist_2, **_train_kwargs)

Q: How much does it cost to cool a skyscraper in the summer?
A: A lot! And not just in dollars, but in environmental impact.

Thankfully, significant investments are being made to improve building efficiencies to reduce costs and emissions. The question is, are the improvements working? That’s where you come in. Under pay-for-performance financing, the building owner makes payments based on the difference between their real energy consumption and what they would have used without any retrofits. The latter values have to come from a model. Current methods of estimation are fragmented and do not scale well. Some assume a specific meter type or don’t work with different building types.

In this competition, you’ll develop accurate models of metered building energy usage in the following areas: chilled water, electric, hot water, and steam meters. The data comes from over 1,000 buildings over a three-year timeframe. With better estimates of these energy-saving investments, large scale investors and financial institutions will be more inclined to invest in this area to enable progress in building efficiencies.

Evaluation Metric

The evaluation metric for this competition is Root Mean Squared Logarithmic Error.

The RMSLE is calculated as

$$\epsilon = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(\log(p_i + 1) - \log(a_i + 1)\right)^2}$$

[Metrics](http://www.kaggle.com/c/ashrae-energy-prediction/overview/evaluation)

Where:

- $\epsilon$ is the RMSLE value (score),
- $n$ is the total number of observations in the (public/private) data set,
- $p_i$ is your prediction of the target for observation $i$,
- $a_i$ is the actual target for observation $i$, and
- $\log(x)$ is the natural logarithm of $x$.

In [None]:
def prepare_data(X, building_data, weather_data, test=False):
    """
    Preparing final dataset with all features.

    Joins the meter rows with building metadata (on ``building_id``) and
    weather observations (on ``site_id`` and ``timestamp``), then derives
    calendar features. The input frames are not modified; all work happens on
    the merged copy.

    Parameters
    ----------
    X : pandas.DataFrame
        Meter rows. Must contain ``building_id`` and ``timestamp``; also
        ``meter_reading`` when ``test`` is false, or ``row_id`` when it is true.
    building_data : pandas.DataFrame
        Building metadata containing ``building_id``, ``site_id`` and
        ``square_feet``.
    weather_data : pandas.DataFrame
        Weather rows containing ``site_id``, ``timestamp``,
        ``sea_level_pressure``, ``wind_direction`` and ``wind_speed``.
    test : bool, default False
        Selects the return shape (see below). Training data is additionally
        sorted by timestamp; test data keeps its incoming row order.

    Returns
    -------
    (X, y) when ``test`` is false
        Feature frame and ``log1p(meter_reading)``.
    (X, row_ids) when ``test`` is true
        Feature frame and the ``row_id`` column.

    Notes
    -----
    ``square_feet`` is replaced by ``log1p(square_feet)``; ``hour``,
    ``weekday`` and ``is_holiday`` are added; ``timestamp``,
    ``sea_level_pressure``, ``wind_direction`` and ``wind_speed`` are dropped.
    """
    # Imported here so the function does not rely on a later notebook cell
    # having executed `import gc` first.
    import gc

    X = X.merge(building_data, on="building_id", how="left")
    X = X.merge(weather_data, on=["site_id", "timestamp"], how="left")

    X["timestamp"] = pd.to_datetime(X["timestamp"], format="%Y-%m-%d %H:%M:%S")
    X["square_feet"] = np.log1p(X["square_feet"])

    if not test:
        # Chronological order for training rows, with a fresh 0..n-1 index.
        X = X.sort_values("timestamp").reset_index(drop=True)

    gc.collect()

    # Dates flagged as holidays, as ISO strings (2016 through 2019-01-01).
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                "2019-01-01"]

    X["hour"] = X["timestamp"].dt.hour
    X["weekday"] = X["timestamp"].dt.weekday
    # Compare the calendar date (as a string) against the holiday list.
    X["is_holiday"] = (X["timestamp"].dt.date.astype("str").isin(holidays)).astype(int)

    drop_features = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed"]
    X = X.drop(columns=drop_features)

    if test:
        row_ids = X["row_id"]
        return X.drop(columns="row_id"), row_ids

    y = np.log1p(X["meter_reading"])
    return X.drop(columns="meter_reading"), y

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the building-usage labels with integer codes; `le` keeps the fitted
# label-to-code mapping.
le = LabelEncoder()
le.fit(Building["primary_use"])
Building["primary_use"] = le.transform(Building["primary_use"])

In [None]:
# Column dtypes and non-null counts of the building metadata after encoding.
Building.info()

In [None]:
import gc

# `Train` gained a derived `meter_reading_log1p` column in an earlier cell.
# prepare_data only removes `meter_reading`, so passing `Train` unchanged would
# leave the log-transformed target inside the feature matrix (target leakage).
# Drop it first; errors="ignore" keeps the cell working if it is absent.
X_train, y_train = prepare_data(
    Train.drop(columns=["meter_reading_log1p"], errors="ignore"),
    Building,
    Weather_train,
)
assert "meter_reading_log1p" not in X_train.columns, "target leaked into features"

# Release the raw frames; they are not needed once the features are built.
del Train, Weather_train
gc.collect()

In [None]:
# Display the prepared training feature frame.
X_train

In [None]:
# Shape of the target returned by prepare_data.
y_train.shape

In [None]:
# Positional 50/50 split of the prepared features and target.
_mid = X_train.shape[0] // 2
_first, _second = slice(None, _mid), slice(_mid, None)

X_half_1 = X_train.iloc[_first]
X_half_2 = X_train.iloc[_second]

y_half_1 = y_train.iloc[_first]
y_half_2 = y_train.iloc[_second]

categorical_features = ["building_id", "site_id", "meter", "primary_use", "hour", "weekday"]

# Build the two LightGBM datasets with the same options.
d_half_1, d_half_2 = (
    lgb.Dataset(features, label=target, categorical_feature=categorical_features, free_raw_data=False)
    for features, target in ((X_half_1, y_half_1), (X_half_2, y_half_2))
)

# Each watchlist lists the training half first and the other half second.
watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

params = dict(
    objective="regression",
    boosting="gbdt",
    num_leaves=40,
    learning_rate=0.05,
    feature_fraction=0.85,
    reg_lambda=2,
    metric="rmse",
)

# Boosting options common to both folds.
_fit_options = dict(num_boost_round=1000, verbose_eval=200, early_stopping_rounds=200)

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, valid_sets=watchlist_1, **_fit_options)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, valid_sets=watchlist_2, **_fit_options)