In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline

import random
random.seed(0)

import gc # garbage collection

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# Any results you write to the current directory are saved as output.

## Import Datasets
We import datasets and perform preliminary optimization.

In [None]:
# Load each competition CSV and tag the resulting frame with a display name
# (the name is read back later when the frames are previewed).
train = pd.read_csv('/kaggle/input/ashrae-energy-prediction/train.csv')
train.name = 'train'

test = pd.read_csv('/kaggle/input/ashrae-energy-prediction/test.csv')
test.name = 'test'

weather_train = pd.read_csv('/kaggle/input/ashrae-energy-prediction/weather_train.csv')
weather_train.name = 'weather_train'

weather_test = pd.read_csv('/kaggle/input/ashrae-energy-prediction/weather_test.csv')
weather_test.name = 'weather_test'

building_meta = pd.read_csv('/kaggle/input/ashrae-energy-prediction/building_metadata.csv')
building_meta.name = 'building_meta'

Take a quick look at the data.

In [None]:
# Show the top 5 entries and summary of each dataframe
dataframes = [train, test, weather_train, weather_test, building_meta]
for df in dataframes:
    print(df.name)
    print(df.head())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" after every summary.
    df.info()

The timestamps in the dataframes are represented in strings. Here we convert them to pandas datetime objects.

In [None]:
# Parse the string 'timestamp' column of every time-indexed frame into pandas datetimes
for frame in (train, test, weather_train, weather_test):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

The original dataframes are too large to be manipulated directly in RAM, and they use unnecessarily large datatypes. Here we reduce the size of the dataframes by converting each numeric column to the smallest datatype that can hold its values.

In [None]:
# Function reducing dataframe size to fit into memory
def reduce_memory_usage(dataframe, verbose=True):
    """Downcast the numeric columns of ``dataframe`` in place to narrower dtypes.

    For every column whose dtype is one of the signed-integer or float types
    listed in ``numeric_types``, the column's minimum and maximum are compared
    with the representable range of each narrower dtype (``np.iinfo`` for
    integers, ``np.finfo`` for floats). The column is cast to the narrowest
    dtype whose range contains both values. Columns of any other dtype are
    left untouched.

    The range check is inclusive: a value exactly equal to a dtype's limit
    (e.g. 127 for int8) fits in that dtype.

    Only the value *range* is checked for float columns, not their precision,
    so casting to float16/float32 may round the stored values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage (in MB) and the percentage
        decrease relative to the starting usage.

    Returns
    -------
    None
    """
    starting_memory = dataframe.memory_usage().sum() / 1024**2
    numeric_types = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in dataframe:
        data_type = dataframe[col].dtype
        if data_type not in numeric_types:
            continue
        min_val = dataframe[col].min()
        max_val = dataframe[col].max()
        # Candidate dtypes from narrowest to widest, plus the widest fallback
        # used when no narrower dtype can hold the column's range.
        if str(data_type)[:3] == 'int':
            candidates = [(name, np.iinfo(name)) for name in ('int8', 'int16', 'int32')]
            target_type = 'int64'
        else:
            candidates = [(name, np.finfo(name)) for name in ('float16', 'float32')]
            target_type = 'float64'
        for name, limits in candidates:
            # Inclusive bounds: values sitting exactly on a limit still fit.
            # When min/max are NaN (e.g. an all-NaN column) every comparison is
            # False and the widest fallback dtype is kept.
            if min_val >= limits.min and max_val <= limits.max:
                target_type = name
                break
        dataframe[col] = dataframe[col].astype(target_type)
    end_memory = dataframe.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero baseline so the report can never divide by zero
        if starting_memory:
            decrease_pct = 100 * (starting_memory - end_memory) / starting_memory
        else:
            decrease_pct = 0.0
        print('Memory usage decreased to {:.2f} mb ({:.2f}% decrease)'.format(end_memory, decrease_pct))

In [None]:
# Shrink every loaded frame in place, in the same order they were loaded
for frame in (train, test, weather_train, weather_test, building_meta):
    reduce_memory_usage(frame)

## Initial Data Exploration

First let's explore the building_meta dataset.

In [None]:
building_meta.head()

There are a lot of NaN values in the floor_count column. Let's check.

In [None]:
# Fraction of non-missing (non-NaN) values in each column:
# 1.0 means the column is complete, lower values mean more NaNs
1 - building_meta.isna().sum() / len(building_meta) 

Almost half of the year_built values and about 75% of the floor_count values are missing. We need to pay extra attention to these columns when performing data wrangling.

Let's visualize the columns to gain some initial insight. 

In [None]:
# Bar chart of how many buildings belong to each site
buildings_per_site = building_meta['site_id'].value_counts()
ax = buildings_per_site.plot.bar(figsize=(10,5), title='Number of Buildings in Each Site')
ax.set_xlabel('Site ID')
ax.set_ylabel('Number of Buildings')

In [None]:
# Bar chart of how many buildings fall under each primary_use category
buildings_per_use = building_meta['primary_use'].value_counts()
ax = buildings_per_use.plot.bar(figsize=(10,5), title='Number of Buildings for Each Primary_Use')
ax.set_xlabel('Primary Use')
ax.set_ylabel('Number of Buildings')

In [None]:
# Histogram of building sizes (square_feet), 100 bins
building_sizes = building_meta['square_feet']
ax = building_sizes.hist(bins=100, figsize=(10,5))
ax.set_ylabel('Number of Buildings')
ax.set_xlabel('Size of Building-Square Feet')
ax.set_title('Histogram of Size of Buildings')

In [None]:
# Number of buildings built in each year
# Count once and reuse the result. Sorting the values before value_counts()
# has no effect on the counts, so the tally is ordered by year afterwards.
year_counts = building_meta['year_built'].value_counts().sort_index()
index = year_counts.index
value = year_counts.values
plt.figure(figsize=(10,5))
plt.bar(index, value)
plt.xlabel('Year')
plt.ylabel('Number of Buildings')
plt.title('Number of Buildings Built in Each Year')

In [None]:
# Bar chart of how many buildings have each floor_count value
buildings_per_floor_count = building_meta['floor_count'].value_counts()
ax = buildings_per_floor_count.plot.bar(figsize=(10,5), title='Number of Buildings with Each Floor_Count')
ax.set_xlabel('Floor Count')
ax.set_ylabel('Number of Buildings')

Now we take a look at the weather_train dataframe.

In [None]:
weather_train.head()

Let's check for NaN value.

In [None]:
# Fraction of non-missing (non-NaN) values in each weather_train column
1 - weather_train.isna().sum() / len(weather_train)

Most columns have more than 90% of their values present. The exceptions are cloud_coverage, with only about 50% of its values, and precip_depth_1_hr, with about 64%. We might leave these two columns out of consideration when building the model.

In [None]:
# Per site: non-null count of each column divided by the number of distinct timestamps
distinct_timestamps = len(weather_train['timestamp'].unique())
weather_train.groupby('site_id').count() / distinct_timestamps

Here we can see that some sites don't have any entry for certain weather data. Some sites don't have data for all the times, and some entries are missing for some times. Both should be taken into consideration when performing data wrangling. 

Let's plot the weather measurement vs. time, and see the change within a year.

In [None]:
columns = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']

# One figure per weather measurement, with one line per site.
for col in columns:
    plt.figure(figsize=(20,10))
    plt.title(col + ' vs Time')
    # groupby hands over each site's rows directly, so the frame is not
    # re-filtered with boolean masks for every site, and the loop variable no
    # longer shadows the builtin `id`. Groups come back sorted by site_id, so
    # line order (and legend order) matches iterating over 0..max(site_id).
    for site_id, site_weather in weather_train.groupby('site_id'):
        plt.plot(site_weather['timestamp'], site_weather[col], label=site_id)
    # Every line carries its own label, so the legend needs no explicit handles
    plt.legend()

* Air_temperature is behaving as expected, we can see that some sites are warmer than others. 
* Dew_temperature is similar to air_temperature. One interesting thing is how site0's dew temperature is quite lower than other sites' in the summer months.
* Site 8 has the largest number of heavy rainfall readings.
* Sea_level_pressure changes more radically during the colder half of the year.
* It's hard to gain insight from cloud_coverage, wind_direction and wind_speed by using this kind of plot.

Weather readings sometimes can have correlations with each other. Let's check that out.

In [None]:
plt.figure(figsize=(10,10))
# Correlate only the numeric columns. weather_train also holds the datetime
# 'timestamp' column, and newer pandas versions no longer drop non-numeric
# columns from corr() silently.
numeric_weather = weather_train.select_dtypes(include='number')
sns.heatmap(numeric_weather.corr(), annot=True, linewidths=.5)

There are a few weak correlations, and one strong correlation between air_temperature and dew_temperature.

Now let's take a look at the train dataset.

In [None]:
train.head()

Let's check meter column first.
* Meter 0 => electricity
* Meter 1 => chilled water
* Meter 2 => steam
* Meter 3 => hot water

In [None]:
# Average number of buildings reporting per timestamp, for each meter type.
# The groupby runs over the whole training set, so compute it once and reuse it.
avg_buildings_per_meter = train.groupby(['meter', 'timestamp'])['building_id'].count().groupby(level=0).mean()
print(avg_buildings_per_meter)
# Same figure expressed as a share of all buildings listed in building_meta
print(avg_buildings_per_meter / len(building_meta['building_id']))

We can see that most of the meters are electricity meters, followed by chilled water, steam, and hot water meters. Almost 95% of the buildings have electricity meters, while the other three meter types aren't as widely adopted.

In [None]:
train.groupby('meter')['meter_reading'].count()

Since there are far more electricity meters, it makes sense that there are far more electricity meter readings as well.

Now, let's look at how the target variable **meter_reading** of different meters changes with respect to time.


In [None]:
# Mean meter_reading per timestamp, one column (and one subplot) per meter type,
# covering 1/1/16 to 12/31/16
mean_reading_by_meter = train.groupby(['timestamp', 'meter'])['meter_reading'].mean().unstack()
mean_reading_by_meter.plot(
    subplots=True,
    figsize=(20,20),
    title='Meter reading from 1/1/16 to 12/31/16',
    fontsize='14',
    sharex=False,
)

From the graphs above, we can see some trends: some make sense, and some do not (possible anomalies).
* For electricity, consumption is high on weekdays and low on weekends, and high during the day and low at night.
* Electricity and chilled water consumption are higher during summer and lower during winter, while hot water consumption behaves the other way around.
* Steam consumption data is confusing. Why is the highest usage from March to mid-June? Why is there a sudden spike in November (possible anomaly)?
* Sudden spikes in other meter readings as well. Possible anomaly (meter malfunction, wrong data entry, etc.)

 **Now we combine the dataset together, and perform data wrangling.**

In [None]:
# Attach building metadata to every reading, then the matching site/time weather row.
# Both joins are left joins, so every training row is kept.
train_df = (
    train
    .merge(building_meta, on='building_id', how='left')
    .merge(weather_train, on=['site_id', 'timestamp'], how='left')
)

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

## Modeling

Import needed libraries


In [None]:
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

Create time features from timestamp, and drop unneeded features

In [None]:
# Derive calendar features from the timestamp via the .dt accessor
for time_part in ('hour', 'weekday', 'month'):
    train_df[time_part] = getattr(train_df['timestamp'].dt, time_part)

# The raw timestamp and these three weather columns are not used as model inputs
train_df.drop(columns=['timestamp', 'sea_level_pressure', 'wind_direction', 'wind_speed'], inplace=True)

Split train_df into X_train and y_train

In [None]:
# Features are every column except the target; the target is meter_reading
X_train = train_df.drop(columns='meter_reading')
y_train = train_df['meter_reading']

Encode primary_use

In [None]:
# Replace the primary_use strings with the integer codes assigned by the encoder
le = LabelEncoder()
X_train['primary_use'] = le.fit_transform(X_train['primary_use'])

Log-transform (log1p) floor_count, square_feet, and the target variable

In [None]:
# Apply log(1 + x) to the two feature columns and to the target
for feature in ('floor_count', 'square_feet'):
    X_train[feature] = np.log1p(X_train[feature])
y_train = np.log1p(y_train)

Take a look at the processed dataset now

In [None]:
X_train.head()

In [None]:
y_train

The idea here is to split the dataset into two, train two models, and validate on the other half of the data 

In [None]:
# Row position that divides the training rows into a first and a second half
midpoint = int(X_train.shape[0] / 2)

X_half_1, X_half_2 = X_train[:midpoint], X_train[midpoint:]
y_half_1, y_half_2 = y_train[:midpoint], y_train[midpoint:]

Create a list of categorical features to feed into the model

In [None]:
# Columns LightGBM is told to treat as categorical rather than numeric
categorical_features = [
    "building_id",
    "site_id",
    "meter",
    "primary_use",
    "hour",
    "weekday",
    "month",
    "year_built",
]

Create two lightGBM datasets

In [None]:
# Options shared by both LightGBM datasets
dataset_options = dict(categorical_feature=categorical_features, free_raw_data=False)

d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, **dataset_options)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, **dataset_options)

Create watchlists and set hyperparameters

In [None]:
# Each watchlist pairs the two halves; the second is the first in reverse order
watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = watchlist_1[::-1]

# LightGBM hyperparameters passed to both training runs
params = dict(
    objective="regression",
    boosting="gbdt",
    num_leaves=40,
    learning_rate=0.01,
    feature_fraction=0.85,
    reg_lambda=2,
    metric="rmse",
)

Create models

In [None]:
# Settings common to both training runs
train_options = dict(num_boost_round=1000, verbose_eval=200, early_stopping_rounds=200)

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, valid_sets=watchlist_1, **train_options)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, valid_sets=watchlist_2, **train_options)

### Feature Importance

Plot feature importance using built-in lightGBM function

In [None]:
# One importance table per model, tagged with the half it was trained on
feature_names = X_train.columns.values

df_fimp_1 = pd.DataFrame({
    "feature": feature_names,
    "importance": model_half_1.feature_importance(),
    "half": 1,
})

df_fimp_2 = pd.DataFrame({
    "feature": feature_names,
    "importance": model_half_2.feature_importance(),
    "half": 2,
})

# Stack both tables so each feature appears once per model
df_fimp = pd.concat([df_fimp_1, df_fimp_2], axis=0)

plt.figure(figsize=(14, 7))
ranked_importance = df_fimp.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=ranked_importance)
plt.title("LightGBM Feature Importance")
plt.tight_layout()