# Libraries 

## Processing 

In [2]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

from IPython.core.display import display, HTML

## Scipy 

In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

## Data Import

In [None]:
import eli5
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# EDA 

In [14]:
myfavouritenumber = 42
seed = myfavouritenumber

## Data Import 

In [4]:
energy_train = pd.read_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\train.csv')
weather_train = pd.read_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\weather_train.csv')
building = pd.read_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\building_metadata.csv')
energy_test = pd.read_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\test.csv')
weather_test = pd.read_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\weather_test.csv')

## Energy 

In [None]:
energy.timestamp = pd.to_datetime(energy.timestamp)
energy.meter = energy.meter.astype('category')
energy.info()

In [None]:
#distribution of building counts
g1 = energy.building_id.value_counts()
plt.hist(g1.values,bins=100)
plt.xlabel('Total rows of a building id')
plt.ylabel('Number building ids')
plt.show()

In [None]:
energy['meter'].replace({0:"Electricity",1:"ChilledWater",2:"Steam",3:"HotWater"},inplace=True)

In [None]:
#distribution of meter
g2 = energy.meter.value_counts()
plt.bar(g2.index,g2.values)
plt.xlabel('Meter Type')
plt.ylabel('Count')
plt.show()

In [None]:
#check outlier
g3 = energy[['meter','meter_reading']]
g3['meter_reading'] = np.log1p(g3['meter_reading'])
sns.boxplot(x='meter',y='meter_reading',data=g3)
plt.plot();

In [None]:
energy.columns

In [None]:
sns.boxplot(energy[energy['meter'] == "Electricity"]['meter_reading'])
plt.title("Boxplot of Meter Reading Variable for the Meter Type: Electricity")
# We can see a few outliers here. 

In [None]:
sns.boxplot(energy[energy['meter'] == "ChilledWater"]['meter_reading'])
plt.title("Boxplot of Meter Reading Variable for the Meter Type: Electricity")
# Not many outliers here

In [None]:
sns.boxplot(energy[energy['meter'] == "HotWater"]['meter_reading'])
plt.title("Boxplot of Meter Reading Variable for the Meter Type: Electricity")
# We can see a single value that is way off from the rest.

In [None]:
sns.boxplot(energy[energy['meter'] == "Steam"]['meter_reading'])
plt.title("Boxplot of Meter Reading Variable for the Meter Type: Electricity") 
#No outlier

In [None]:
sns.distplot(np.log1p(energy['meter_reading']),kde=False)
plt.title("Distribution of Log of Meter Reading Variable")
# Lot of 0 values as can be seen from the distribution

## Weather 

In [None]:
weather.timestamp = pd.to_datetime(weather.timestamp)
weather.info()

In [None]:
h = weather.drop(['site_id','timestamp'],axis=1).corr()
plt.figure(figsize=(12,10))
sns.heatmap(h,annot=True,center=0,cmap='Blues');

In [None]:
cols = ['air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr','sea_level_pressure','wind_speed']
for ind,col in enumerate(weather[cols]):
    plt.figure(ind)
    sns.distplot(weather[col].dropna())

## Building 

In [None]:
building_info.info()

In [None]:
#distribution of building age
i1 = building_info.year_built.value_counts()
plt.bar(i1.index,i1.values)
plt.xlabel('Year Built')
plt.ylabel('Count')
plt.show()

In [None]:
i2 = building_info.primary_use.value_counts()
plt.barh(i2.index,i2.values)
plt.xlabel('Building Primary Use')
plt.ylabel('Count')
plt.show()

## Segment by Time 

In [None]:
#Data prep for Modeling
#Target variable: meter_reading
#Encode primary_use variables for modeling
#Convert the timestamp to datetime

def dt_parts(df,dt_col):
    if(df[dt_col].dtype=='O'):
        df[dt_col] = pd.to_datetime(df[dt_col])
    df['year'] = df[dt_col].dt.year.astype(np.int16)
    df['month'] = df[dt_col].dt.month.astype(np.int8)
    df['day'] = df[dt_col].dt.day.astype(np.int8)
    df['hour'] = df[dt_col].dt.hour.astype(np.int8)
    df['minute'] = df[dt_col].dt.minute.astype(np.int8)
    df['second'] = df[dt_col].dt.second.astype(np.int8)
    return df

#optimizing the column types to consume less space
def df_type_optimize(df):
    df['building_id'] = df['building_id'].astype(np.uint16)
    df['meter'] = df['meter'].astype(np.uint8)
    df['site_id'] = df['site_id'].astype(np.uint8)
    df['square_feet'] = df['square_feet'].astype(np.uint32)
    
    df['year_built'] = df['year_built'].astype(np.uint16)
    df['floor_count'] = df['floor_count'].astype(np.uint8)
    
    df['air_temperature'] = df['air_temperature'].astype(np.int16)
    df['cloud_coverage'] = df['cloud_coverage'].astype(np.int16)
    df['dew_temperature'] = df['dew_temperature'].astype(np.int16)
    df['precip_depth_1_hr'] = df['precip_depth_1_hr'].astype(np.int16)
    df['sea_level_pressure'] = df['sea_level_pressure'].astype(np.int16)
    df['wind_direction'] = df['wind_direction'].astype(np.int16)
    df['wind_speed'] = df['wind_speed'].astype(np.int16)
    
    return df

In [None]:
dt_parts(energy,'timestamp')
dt_parts(weather,'timestamp')

In [None]:
energy.groupby(['meter','month'])['meter_reading'].agg(['max','mean','median','count','std'])
# We can see that only Steam meter has very high meter_reading values as compared to other types of meters.
# We can see that the average electricity meter_reading does not vary much across the months.
# Average Hot Water meter_reading is relatively less from April to October Months.
# Average Steam meter_reading is way higher from March to June as compared to the other months.

In [None]:
plt.figure(figsize=(8,6))
building_info['primary_use'].value_counts().sort_values().plot(kind='bar')
plt.title("Count of Primary_Use Variable in the Metadata table")
plt.xlabel("Primary Use")
plt.ylabel("Count")
plt.xticks(rotation=90)
# Education, Office, Entertainment/Public Assembly, Public Services, Lodging/Residential form the bulk of Primary Use

In [None]:
sns.boxplot(building_info['square_feet'])

In [None]:
building_info['square_feet'] = np.log1p(building_info['square_feet'])

In [None]:
sns.distplot(building_info['square_feet'])
plt.title("Distribution of Square Feet variable of Metadata Table")
plt.xlabel("Area in Square Feet")
plt.ylabel("Frequency")
# Looks like a normal distribution distribution

In [None]:
building_info.groupby('primary_use')['square_feet'].agg(['mean','median','count']).sort_values(by='count')
# Parking has the highest average are although the count is less.
# Education has the highest count as can be seen in the countplot above.

In [None]:
building_info['year_built'].value_counts().sort_values().plot(kind='bar',figsize=(15,6))
plt.xlabel("Year Built")
plt.ylabel("Count")
plt.title("Distribution of Year Built Variable")

In [None]:
building_info['floor_count'].value_counts(dropna=False).sort_index().plot(kind='bar',figsize=(8,6))
plt.xlabel("Number of Floors")
plt.ylabel("Count of Buildings")
# Lot of missing values here as well
# Maximum number of floors is 26

In [None]:
building_info.groupby('floor_count')['square_feet'].agg(['count','mean','median']).sort_values(by='count')

In [None]:
building_info.groupby('primary_use')['square_feet'].agg(['count','mean','median']).sort_values(by='count')

In [None]:
building_info['year_built'].fillna(-999, inplace=True)
building_info['year_built'] = building_info['year_built'].astype('int16')
building_info['floor_count'].fillna(-999, inplace=True)
building_info['floor_count'] = building_info['floor_count'].astype('float32')

## Alternative-All 

In [5]:
df_train = energy_train
df_test = energy_test

building = building
le = LabelEncoder()
building.primary_use = le.fit_transform(building.primary_use)

weather_train = weather_train
weather_test = weather_test

weather_train.drop(["sea_level_pressure", "wind_direction", "wind_speed"], axis=1, inplace=True)
weather_test.drop(["sea_level_pressure", "wind_direction", "wind_speed"], axis=1, inplace=True)

weather_train = weather_train.groupby("site_id").apply(lambda group: group.interpolate(limit_direction="both"))
weather_test = weather_test.groupby("site_id").apply(lambda group: group.interpolate(limit_direction="both"))

df_train = df_train.merge(building, on="building_id")
df_train = df_train.merge(weather_train, on=["site_id", "timestamp"], how="left")
df_train = df_train[~((df_train.site_id==0) & (df_train.meter==0) & (df_train.building_id <= 104) & (df_train.timestamp < "2016-05-21"))]
df_train.reset_index(drop=True, inplace=True)

#fixing site_id 0
df_train.loc[df_train['site_id']==0, 'meter_reading'] = df_train.loc[df_train['site_id']==0, 'meter_reading'] * 0.2931

df_train.timestamp = pd.to_datetime(df_train.timestamp, format='%Y-%m-%d %H:%M:%S')
df_train["log_meter_reading"] = np.log1p(df_train.meter_reading)

df_test = df_test.merge(building, on="building_id")
df_test = df_test.merge(weather_test, on=["site_id", "timestamp"], how="left")
df_test.reset_index(drop=True, inplace=True)
df_test.timestamp = pd.to_datetime(df_test.timestamp, format='%Y-%m-%d %H:%M:%S')

del building, le
gc.collect()

0

In [6]:
# Code from https://www.kaggle.com/caesarlupum/ashrae-start-here-a-gentle-introduction 
# Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
                    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# function to calculate evaluation metric
def rmsle(y_true: pd.Series, y_predict: pd.Series) -> float:
    """
    Evaluate root mean squared log error
    :param y_true:
    :param y_predict:
    :return:
    """
    return np.sqrt(msle(y_true, y_predict))

In [7]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

df_train["hour"] = df_train.timestamp.dt.hour
df_train["weekday"] = df_train.timestamp.dt.weekday

df_test["hour"] = df_test.timestamp.dt.hour
df_test["weekday"] = df_test.timestamp.dt.weekday

Mem. usage decreased to 663.15 Mb (67.6% reduction)
Mem. usage decreased to 1312.28 Mb (67.0% reduction)


In [8]:
df_building_meter = df_train.groupby(["building_id", "meter"]).agg(mean_building_meter=("log_meter_reading", "mean"),
                                                             median_building_meter=("log_meter_reading", "median")).reset_index()

df_train = df_train.merge(df_building_meter, on=["building_id", "meter"])
df_test = df_test.merge(df_building_meter, on=["building_id", "meter"])

df_building_meter_hour = df_train.groupby(["building_id", "meter", "hour"]).agg(mean_building_meter=("log_meter_reading", "mean"),
                                                                                median_building_meter=("log_meter_reading", "median")).reset_index()

df_train = df_train.merge(df_building_meter_hour, on=["building_id", "meter", "hour"])
df_test = df_test.merge(df_building_meter_hour, on=["building_id", "meter", "hour"])

In [9]:
def create_lag_features(df, window):
    """
    Creating lag-based features looking back in time.
    """
    
    feature_cols = ["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr"]
    df_site = df.groupby("site_id")
    
    df_rolled = df_site[feature_cols].rolling(window=window, min_periods=0)
    
    df_mean = df_rolled.mean().reset_index().astype(np.float16)
    df_median = df_rolled.median().reset_index().astype(np.float16)
    df_min = df_rolled.min().reset_index().astype(np.float16)
    df_max = df_rolled.max().reset_index().astype(np.float16)
    df_std = df_rolled.std().reset_index().astype(np.float16)
    df_skew = df_rolled.skew().reset_index().astype(np.float16)
    
    for feature in feature_cols:
        df[f"{feature}_mean_lag{window}"] = df_mean[feature]
        df[f"{feature}_median_lag{window}"] = df_median[feature]
        df[f"{feature}_min_lag{window}"] = df_min[feature]
        df[f"{feature}_max_lag{window}"] = df_max[feature]
        df[f"{feature}_std_lag{window}"] = df_std[feature]
        df[f"{feature}_skew_lag{window}"] = df_std[feature]

    return df

In [21]:
weather_train.timestamp = pd.to_datetime(weather_train.timestamp)


NameError: name 'weather_train' is not defined

In [23]:
weather_test.timestamp = pd.to_datetime(weather_test.timestamp)

In [11]:
weather_train = create_lag_features(weather_train, 18)
weather_train.drop(["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr"], axis=1, inplace=True)

df_train = df_train.merge(weather_train, on=["site_id", "timestamp"], how="left")

del weather_train
gc.collect()

0

In [12]:
categorical_features = [
    "building_id",
    "primary_use",
    "meter",
    "weekday",
    "hour"
]

all_features = [col for col in df_train.columns if col not in ["timestamp", "site_id", "meter_reading", "log_meter_reading"]]

In [15]:
cv = 2
models = {}
cv_scores = {"site_id": [], "cv_score": []}

for site_id in tqdm(range(16), desc="site_id"):
    print(cv, "fold CV for site_id:", site_id)
    kf = KFold(n_splits=cv, random_state=seed)
    models[site_id] = []

    X_train_site = df_train[df_train.site_id==site_id].reset_index(drop=True)
    y_train_site = X_train_site.log_meter_reading
    y_pred_train_site = np.zeros(X_train_site.shape[0])
    
    score = 0

    for fold, (train_index, valid_index) in enumerate(kf.split(X_train_site, y_train_site)):
        X_train, X_valid = X_train_site.loc[train_index, all_features], X_train_site.loc[valid_index, all_features]
        y_train, y_valid = y_train_site.iloc[train_index], y_train_site.iloc[valid_index]

        dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        dvalid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)
    
        watchlist = [dtrain, dvalid]

        params = {"objective": "regression",
                  "num_leaves": 41,
                  "learning_rate": 0.049,
                  "bagging_freq": 5,
                  "bagging_fraction": 0.51,
                  "feature_fraction": 0.81,
                  "metric": "rmse"
                  }

        model_lgb = lgb.train(params, train_set=dtrain, num_boost_round=999, valid_sets=watchlist, verbose_eval=101, early_stopping_rounds=21)
        models[site_id].append(model_lgb)

        y_pred_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
        y_pred_train_site[valid_index] = y_pred_valid

        rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
        print("Site Id:", site_id, ", Fold:", fold+1, ", RMSE:", rmse)
        score += rmse / cv
        
        gc.collect()
    
    cv_scores["site_id"].append(site_id)
    cv_scores["cv_score"].append(score)
        
    print("\nSite Id:", site_id, ", CV RMSE:", np.sqrt(mean_squared_error(y_train_site, y_pred_train_site)), "\n")

HBox(children=(IntProgress(value=0, description='site_id', max=16, style=ProgressStyle(description_width='init…

2 fold CV for site_id: 0



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.640528	valid_1's rmse: 0.636799
Early stopping, best iteration is:
[119]	training's rmse: 0.621412	valid_1's rmse: 0.636539
Site Id: 0 , Fold: 1 , RMSE: 0.6357694905629656



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.471422	valid_1's rmse: 0.838763
Early stopping, best iteration is:
[85]	training's rmse: 0.48755	valid_1's rmse: 0.837274
Site Id: 0 , Fold: 2 , RMSE: 0.8379867462065811

Site Id: 0 , CV RMSE: 0.7437824386030453 

2 fold CV for site_id: 1



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.599865	valid_1's rmse: 0.977308
Early stopping, best iteration is:
[123]	training's rmse: 0.59147	valid_1's rmse: 0.975738
Site Id: 1 , Fold: 1 , RMSE: 0.9774293510378093



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.740553	valid_1's rmse: 0.758059
Early stopping, best iteration is:
[162]	training's rmse: 0.70489	valid_1's rmse: 0.75613
Site Id: 1 , Fold: 2 , RMSE: 0.7553552259600743

Site Id: 1 , CV RMSE: 0.8734787773779319 

2 fold CV for site_id: 2



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.732258	valid_1's rmse: 0.809412
Early stopping, best iteration is:
[107]	training's rmse: 0.72581	valid_1's rmse: 0.807701
Site Id: 2 , Fold: 1 , RMSE: 0.798330689926289



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.635472	valid_1's rmse: 0.935168
[202]	training's rmse: 0.584582	valid_1's rmse: 0.931301
Early stopping, best iteration is:
[266]	training's rmse: 0.563147	valid_1's rmse: 0.928987
Site Id: 2 , Fold: 2 , RMSE: 0.9255390519386634

Site Id: 2 , CV RMSE: 0.8642784352110446 

2 fold CV for site_id: 3



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.307539	valid_1's rmse: 0.440731
[202]	training's rmse: 0.281544	valid_1's rmse: 0.436894
[303]	training's rmse: 0.266953	valid_1's rmse: 0.435269
[404]	training's rmse: 0.256983	valid_1's rmse: 0.434391
Early stopping, best iteration is:
[407]	training's rmse: 0.256733	valid_1's rmse: 0.434369
Site Id: 3 , Fold: 1 , RMSE: 0.4345179614319258



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.32966	valid_1's rmse: 0.390334
Early stopping, best iteration is:
[108]	training's rmse: 0.326849	valid_1's rmse: 0.389689
Site Id: 3 , Fold: 2 , RMSE: 0.3900159254027245

Site Id: 3 , CV RMSE: 0.4128669860599957 

2 fold CV for site_id: 4



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.142263	valid_1's rmse: 0.265089
Early stopping, best iteration is:
[96]	training's rmse: 0.143932	valid_1's rmse: 0.264724
Site Id: 4 , Fold: 1 , RMSE: 0.26405785082977123



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.212279	valid_1's rmse: 0.315465
[202]	training's rmse: 0.188665	valid_1's rmse: 0.298443
Early stopping, best iteration is:
[216]	training's rmse: 0.186879	valid_1's rmse: 0.297918
Site Id: 4 , Fold: 2 , RMSE: 0.30966374580310735

Site Id: 4 , CV RMSE: 0.2877656894503315 

2 fold CV for site_id: 5



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.507374	valid_1's rmse: 0.70752
Early stopping, best iteration is:
[81]	training's rmse: 0.529147	valid_1's rmse: 0.706271
Site Id: 5 , Fold: 1 , RMSE: 0.7037725134708636



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.503428	valid_1's rmse: 0.64754
[202]	training's rmse: 0.448491	valid_1's rmse: 0.628246
[303]	training's rmse: 0.417892	valid_1's rmse: 0.618404
[404]	training's rmse: 0.397965	valid_1's rmse: 0.614329
[505]	training's rmse: 0.380305	valid_1's rmse: 0.609545
[606]	training's rmse: 0.367904	valid_1's rmse: 0.60632
[707]	training's rmse: 0.357034	valid_1's rmse: 0.603506
[808]	training's rmse: 0.347751	valid_1's rmse: 0.600737
[909]	training's rmse: 0.338945	valid_1's rmse: 0.598487
Did not meet early stopping. Best iteration is:
[999]	training's rmse: 0.331496	valid_1's rmse: 0.596372
Site Id: 5 , Fold: 2 , RMSE: 0.6129429913796911

Site Id: 5 , CV RMSE: 0.6599222914095949 

2 fold CV for site_id: 6



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.989478	valid_1's rmse: 1.1223
Early stopping, best iteration is:
[115]	training's rmse: 0.968779	valid_1's rmse: 1.121
Site Id: 6 , Fold: 1 , RMSE: 1.122429072412157



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
Early stopping, best iteration is:
[66]	training's rmse: 0.894874	valid_1's rmse: 1.40703
Site Id: 6 , Fold: 2 , RMSE: 1.3823946643109768

Site Id: 6 , CV RMSE: 1.2591388040845908 

2 fold CV for site_id: 7



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 1.33545	valid_1's rmse: 1.71594
Early stopping, best iteration is:
[84]	training's rmse: 1.38248	valid_1's rmse: 1.71341
Site Id: 7 , Fold: 1 , RMSE: 1.6537947613359545



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.985653	valid_1's rmse: 2.1427
Early stopping, best iteration is:
[132]	training's rmse: 0.945448	valid_1's rmse: 2.13486
Site Id: 7 , Fold: 2 , RMSE: 2.135872639795589

Site Id: 7 , CV RMSE: 1.9101026233882594 

2 fold CV for site_id: 8



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.43747	valid_1's rmse: 0.510902
[202]	training's rmse: 0.400086	valid_1's rmse: 0.507124
[303]	training's rmse: 0.376101	valid_1's rmse: 0.505912
Early stopping, best iteration is:
[322]	training's rmse: 0.372873	valid_1's rmse: 0.505469
Site Id: 8 , Fold: 1 , RMSE: 0.504086438419819



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.412876	valid_1's rmse: 0.655246
Early stopping, best iteration is:
[147]	training's rmse: 0.392149	valid_1's rmse: 0.652085
Site Id: 8 , Fold: 2 , RMSE: 0.6694145272396164

Site Id: 8 , CV RMSE: 0.5925447683742928 

2 fold CV for site_id: 9



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.874373	valid_1's rmse: 0.953378
[202]	training's rmse: 0.767887	valid_1's rmse: 0.91065
Early stopping, best iteration is:
[225]	training's rmse: 0.754711	valid_1's rmse: 0.90716
Site Id: 9 , Fold: 1 , RMSE: 0.8759140190786968



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.820918	valid_1's rmse: 1.04906
Early stopping, best iteration is:
[123]	training's rmse: 0.785783	valid_1's rmse: 1.03309
Site Id: 9 , Fold: 2 , RMSE: 1.010756912775909

Site Id: 9 , CV RMSE: 0.9457417222984216 

2 fold CV for site_id: 10



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 1.08908	valid_1's rmse: 1.1839
Early stopping, best iteration is:
[114]	training's rmse: 1.06961	valid_1's rmse: 1.17936
Site Id: 10 , Fold: 1 , RMSE: 1.1837774059454662



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.812435	valid_1's rmse: 1.44669
[202]	training's rmse: 0.753492	valid_1's rmse: 1.43685
[303]	training's rmse: 0.718702	valid_1's rmse: 1.43183
Early stopping, best iteration is:
[338]	training's rmse: 0.708055	valid_1's rmse: 1.43116
Site Id: 10 , Fold: 2 , RMSE: 1.4467305596718139

Site Id: 10 , CV RMSE: 1.3218087187280503 

2 fold CV for site_id: 11



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.602756	valid_1's rmse: 0.745377
Early stopping, best iteration is:
[93]	training's rmse: 0.611519	valid_1's rmse: 0.743517
Site Id: 11 , Fold: 1 , RMSE: 0.7415924611053343



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
Early stopping, best iteration is:
[30]	training's rmse: 0.692995	valid_1's rmse: 1.8768
Site Id: 11 , Fold: 2 , RMSE: 1.8768048628675664

Site Id: 11 , CV RMSE: 1.4269427089346949 

2 fold CV for site_id: 12



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.375046	valid_1's rmse: 0.419332
[202]	training's rmse: 0.326673	valid_1's rmse: 0.399352
[303]	training's rmse: 0.302727	valid_1's rmse: 0.393964
[404]	training's rmse: 0.285654	valid_1's rmse: 0.390238
Early stopping, best iteration is:
[456]	training's rmse: 0.276254	valid_1's rmse: 0.388712
Site Id: 12 , Fold: 1 , RMSE: 0.38777628176749607



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.330484	valid_1's rmse: 0.438018
[202]	training's rmse: 0.274905	valid_1's rmse: 0.421333
[303]	training's rmse: 0.24366	valid_1's rmse: 0.417539
[404]	training's rmse: 0.223276	valid_1's rmse: 0.415843
[505]	training's rmse: 0.20943	valid_1's rmse: 0.415203
Early stopping, best iteration is:
[578]	training's rmse: 0.199969	valid_1's rmse: 0.414358
Site Id: 12 , Fold: 2 , RMSE: 0.41487096231796045

Site Id: 12 , CV RMSE: 0.40155217048045017 

2 fold CV for site_id: 13



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.866766	valid_1's rmse: 1.22354
[202]	training's rmse: 0.776637	valid_1's rmse: 1.20035
Early stopping, best iteration is:
[275]	training's rmse: 0.750923	valid_1's rmse: 1.1935
Site Id: 13 , Fold: 1 , RMSE: 1.1889348458158573



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.859059	valid_1's rmse: 1.20044
[202]	training's rmse: 0.766175	valid_1's rmse: 1.18246
Early stopping, best iteration is:
[196]	training's rmse: 0.769084	valid_1's rmse: 1.17979
Site Id: 13 , Fold: 2 , RMSE: 1.1979158551571731

Site Id: 13 , CV RMSE: 1.193433797017492 

2 fold CV for site_id: 14



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 1.22286	valid_1's rmse: 1.48521
[202]	training's rmse: 1.11915	valid_1's rmse: 1.4594
[303]	training's rmse: 1.0714	valid_1's rmse: 1.45231
[404]	training's rmse: 1.03407	valid_1's rmse: 1.4485
Early stopping, best iteration is:
[402]	training's rmse: 1.03551	valid_1's rmse: 1.44815
Site Id: 14 , Fold: 1 , RMSE: 1.4443991430506828



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 1.26678	valid_1's rmse: 1.53091
[202]	training's rmse: 1.18506	valid_1's rmse: 1.5091
[303]	training's rmse: 1.14805	valid_1's rmse: 1.50033
Early stopping, best iteration is:
[330]	training's rmse: 1.13743	valid_1's rmse: 1.49762
Site Id: 14 , Fold: 2 , RMSE: 1.4908319353005135

Site Id: 14 , CV RMSE: 1.4677991592444484 

2 fold CV for site_id: 15



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.514012	valid_1's rmse: 0.79727
[202]	training's rmse: 0.473464	valid_1's rmse: 0.778437
[303]	training's rmse: 0.456319	valid_1's rmse: 0.772179
[404]	training's rmse: 0.4432	valid_1's rmse: 0.769008
Early stopping, best iteration is:
[407]	training's rmse: 0.442807	valid_1's rmse: 0.768982
Site Id: 15 , Fold: 1 , RMSE: 0.7615222536286377



Using categorical_feature in Dataset.



Training until validation scores don't improve for 21 rounds
[101]	training's rmse: 0.616884	valid_1's rmse: 0.67931
Early stopping, best iteration is:
[137]	training's rmse: 0.591666	valid_1's rmse: 0.66474
Site Id: 15 , Fold: 2 , RMSE: 0.6566975159884169

Site Id: 15 , CV RMSE: 0.71104422164764 




In [16]:
output = pd.DataFrame.from_dict(cv_scores)

### Scoring and Average of 3 models


In [17]:
weather_test = create_lag_features(weather_test, 18)
weather_test.drop(["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr"], axis=1, inplace=True)

In [24]:
df_test_sites = []

for site_id in tqdm(range(16), desc="site_id"):
    print("Preparing test data for site_id", site_id)

    X_test_site = df_test[df_test.site_id==site_id]
    weather_test_site = weather_test[weather_test.site_id==site_id]
    
    X_test_site = X_test_site.merge(weather_test_site, on=["site_id", "timestamp"], how="left")
    
    row_ids_site = X_test_site.row_id

    X_test_site = X_test_site[all_features]
    y_pred_test_site = np.zeros(X_test_site.shape[0])
    
    print("Scoring for site_id", site_id)    
    
    for fold in range(cv):
        model_lgb = models[site_id][fold]
        y_pred_test_site += model_lgb.predict(X_test_site, num_iteration=model_lgb.best_iteration) / cv
        gc.collect()
        
    df_test_site = pd.DataFrame({"row_id": row_ids_site, "meter_reading": y_pred_test_site})
    df_test_sites.append(df_test_site)
    
    print("Scoring for site_id", site_id, "completed\n")
    gc.collect()

HBox(children=(IntProgress(value=0, description='site_id', max=16, style=ProgressStyle(description_width='init…

Preparing test data for site_id 0
Scoring for site_id 0
Scoring for site_id 0 completed

Preparing test data for site_id 1
Scoring for site_id 1
Scoring for site_id 1 completed

Preparing test data for site_id 2
Scoring for site_id 2


MemoryError: Unable to allocate array with shape (5063280, 40) and data type float64

# Model 

## Data Preparation 

In [None]:
data = pd.merge(energy,building_info,on='building_id',how='left')
data = pd.merge(data,weather,on=['site_id','timestamp', 'year', 'month', 'day', 'hour', 'minute', 
                                  'second'],how='left')
data.tail()

In [None]:
data['primary_use'] = data['primary_use'].astype('category').cat.codes
data['meter'] = data['meter'].astype('category').cat.codes
#data = dt_parts(data,'timestamp')
#data.fillna(0,inplace=True)
cols = data.columns
for col in cols:
    data[col].fillna(data[col].mean(),inplace=True)
data = df_type_optimize(data)
data.head()

In [None]:
# As per the discussion in the following thread, https://www.kaggle.com/c/ashrae-energy-prediction/discussion/117083, there is some discrepancy in the meter_readings for different ste_id's and buildings. It makes sense to delete them
idx_to_drop = list((data[(data['site_id'] == 0) & (data['timestamp'] < "2016-05-21 00:00:00")]).index)
data.drop(idx_to_drop,axis='rows',inplace=True)

In [None]:
%%time
number_unique_meter_per_building = data.groupby('building_id')['meter'].nunique()
data['number_unique_meter_per_building'] = data['building_id'].map(number_unique_meter_per_building)

mean_meter_reading_per_building = data.groupby('building_id')['meter_reading'].mean()
data['mean_meter_reading_per_building'] = data['building_id'].map(mean_meter_reading_per_building)
median_meter_reading_per_building = data.groupby('building_id')['meter_reading'].median()
data['median_meter_reading_per_building'] = data['building_id'].map(median_meter_reading_per_building)
std_meter_reading_per_building = data.groupby('building_id')['meter_reading'].std()
data['std_meter_reading_per_building'] = data['building_id'].map(std_meter_reading_per_building)

In [None]:
mean_meter_reading_per_meter = data.groupby('meter')['meter_reading'].mean()
data['mean_meter_reading_per_meter'] = data['meter'].map(mean_meter_reading_per_meter)
median_meter_reading_per_meter = data.groupby('meter')['meter_reading'].median()
data['median_meter_reading_per_meter'] = data['meter'].map(median_meter_reading_per_meter)
std_meter_reading_per_meter = data.groupby('meter')['meter_reading'].std()
data['std_meter_reading_per_meter'] = data['meter'].map(std_meter_reading_per_meter)

In [None]:
mean_meter_reading_per_primary_usage = data.groupby('primary_use')['meter_reading'].mean()
data['mean_meter_reading_per_primary_usage'] = data['primary_use'].map(mean_meter_reading_per_primary_usage)
median_meter_reading_per_primary_usage = data.groupby('primary_use')['meter_reading'].median()
data['median_meter_reading_per_primary_usage'] = data['primary_use'].map(median_meter_reading_per_primary_usage)
std_meter_reading_per_primary_usage = data.groupby('primary_use')['meter_reading'].std()
data['std_meter_reading_per_primary_usage'] = data['primary_use'].map(std_meter_reading_per_primary_usage)

In [None]:
mean_meter_reading_per_site_id = data.groupby('site_id')['meter_reading'].mean()
data['mean_meter_reading_per_site_id'] = data['site_id'].map(mean_meter_reading_per_site_id)
median_meter_reading_per_site_id = data.groupby('site_id')['meter_reading'].median()
data['median_meter_reading_per_site_id'] = data['site_id'].map(median_meter_reading_per_site_id)
std_meter_reading_per_site_id = data.groupby('site_id')['meter_reading'].std()
data['std_meter_reading_per_site_id'] = data['site_id'].map(std_meter_reading_per_site_id)

In [None]:
data.shape

In [None]:
data.dtypes

### Correlation Check

In [None]:
%%time
# Let's check the correlation between the variables and eliminate the one's that have high correlation
# Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix
corr_matrix = data.corr().abs()
corr_matrix.head()

In [None]:
# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()

In [None]:
# Select columns with upperelations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d columns to remove.' % (len(to_drop)))
print ("Following columns can be dropped {}".format(to_drop))

In [None]:
data.drop(to_drop,axis=1,inplace=True)

## Data Split

In [None]:
categorical_cols = ['building_id','Month','meter','Hour','primary_use','DayOfWeek','DayOfMonth']

In [None]:
target_col = 'meter_reading'
y = np.log1p(data[target_col])
# Converting the dependent variable to logarithmic scale
data.drop('timestamp',axis=1,inplace=True)
Xs = data.drop(target_col,axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(Xs, y, test_size=0.2, random_state=0)
X_train.shape,X_valid.shape

## Training Model

In [None]:
#code reference above
from sklearn.ensemble import forest
def set_rf_samples(n):
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))
#set_rf_samples(130000)

In [None]:
%%time
model = RandomForestRegressor(n_estimators=60,
                              random_state=0,n_jobs=-1)
model.fit(X_train,y_train)

In [None]:
X_train

## Evaluation 

In [None]:
def RMSE(actual,preds):
    return np.sqrt(mean_squared_error(actual,preds))

def get_evaluations(model):
    preds = model.predict(X_train)
    plt.hist(np.log1p(preds),bins=100)
    plt.show();
    print('train_rmse: ',RMSE(y_train,preds))
                    
    preds = model.predict(X_valid)
    plt.hist(np.log1p(preds),bins=100)
    plt.show()
    print('valid_rmse: ',RMSE(y_valid,preds))
    
get_evaluations(model)

## Enhancements 

In [None]:
eli5.show_weights(model,feature_names=list(X_train.columns))

In [None]:
#out sample out validation set
test_row = X_valid.loc[15256244,:]
test_row

In [None]:
eli5.show_prediction(model,test_row,feature_names=list(X_train.columns))
#bias seems to be high

## Test Prediction 

In [None]:
energy_test = pd.read_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\test.csv')
weather_test = pd.read_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\weather_test.csv')
test = pd.merge(energy_test,building_info,on='building_id',how='left')
test = pd.merge(test,weather_test,on=['site_id','timestamp'],how='left')
test.tail()

In [None]:
test['primary_use'] = test['primary_use'].astype('category').cat.codes
test = dt_parts(test,'timestamp')
test.fillna(0,inplace=True)
test=df_type_optimize(test)
ids = test['row_id']
test.drop('row_id',axis=1,inplace=True)
test.head()

In [None]:
%%time
preds = model.predict(test)

sub_df = pd.DataFrame()
sub_df['row_id'] = ids
sub_df['meter_reading'] = preds
sub_df.to_csv(r'C:\Users\Solomonzhs\Desktop\Learn\ashrae-energy-prediction\the-sub-mission.csv',index=False)
sub_df.head()

In [None]:
plt.hist(np.log1p(sub_df['meter_reading']),bins=100)
plt.show()