In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This EDA uses three data files, 'train', 'weather_train', and 'building_metadata', which are imported below:

In [None]:
# Load the three competition tables used in this EDA:
# meter readings, per-building metadata, and weather observations.
train_df = pd.read_csv(
    '/kaggle/input/ashrae-energy-prediction/train.csv'
)
building_meta_df = pd.read_csv(
    '/kaggle/input/ashrae-energy-prediction/building_metadata.csv'
)
weather_train_df = pd.read_csv(
    '/kaggle/input/ashrae-energy-prediction/weather_train.csv'
)

Check the size of each dataframe to make sure the data was imported properly, and count the NA values in each dataset.

In [None]:
# Print the (rows, columns) shape of each table to confirm the import worked.
for frame in (train_df, building_meta_df, weather_train_df):
    print(frame.shape)

# Print the number of missing values in every column of each table.
for frame in (train_df, weather_train_df, building_meta_df):
    print(frame.isna().sum())

In [None]:
# Preview the first rows of the meter-reading table.
train_df.head()

In [None]:
# Preview the first rows of the weather table.
weather_train_df.head()

In [None]:
# Preview the first rows of the building metadata table.
building_meta_df.head()

In 'train_df', each building is identified by a building_id, and each building_id has one or more types of meter attached to it. To explore the time series data, meter_reading, the readings from the meters available for building_id=0 are plotted with hourly, daily and monthly sampling frequency. (Note: the building_id is randomly chosen and can be replaced by another building_id.)

Electricity meter reading for building_id=0

In [None]:
# Parse the timestamp strings so the readings can be indexed and resampled by time.
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
train_electricity = train_df[train_df['meter'] == 0]  # meter code 0 = electricity

# Time-indexed electricity readings of building 0, built once and reused below.
building0_electricity = (
    train_electricity[train_electricity['building_id'] == 0][['timestamp', 'meter_reading']]
    .set_index('timestamp')
)

fig, axes = plt.subplots(1, 1, figsize=(14, 6))
# Overlay the series as stored with its daily ('D') and monthly ('M') means.
building0_electricity.plot(ax=axes)
building0_electricity.resample('D').mean().plot(ax=axes)
building0_electricity.resample('M').mean().plot(ax=axes)
axes.set_ylabel('building 0 electricity/kWh')
axes.set_title('building 0 electricity hourly, daily, and monthly meter reading')
axes.legend(['hourly', 'daily', 'monthly'])

building_id=0 doesn't have chilledwater or steam meters.
Hotwater meter reading (the code below plots building_id=106):

In [None]:
train_hotwater = train_df[train_df['meter'] == 3]  # meter code 3 = hotwater

# Building whose hotwater meter is plotted. The axis label and title are built
# from this id so they always describe the data actually shown (they used to
# say "building 0" while the filter selected building 106).
hotwater_building_id = 106

# Time-indexed hotwater readings of the selected building, built once and reused.
building_hotwater = (
    train_hotwater[train_hotwater['building_id'] == hotwater_building_id][['timestamp', 'meter_reading']]
    .set_index('timestamp')
)

fig, axes = plt.subplots(1, 1, figsize=(14, 6))
# Overlay the series as stored with its daily ('D') and monthly ('M') means.
building_hotwater.plot(ax=axes)
building_hotwater.resample('D').mean().plot(ax=axes)
building_hotwater.resample('M').mean().plot(ax=axes)
axes.set_ylabel(f'building {hotwater_building_id} hotwater/kWh')
axes.set_title(f'building {hotwater_building_id} hotwater hourly, daily, and monthly meter reading')
axes.legend(['hourly', 'daily', 'monthly'])

Check the values of meter_reading by plotting the histograms for each meter:

In [None]:
# One histogram of meter_reading per meter type, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2)
# Panel titles listed in meter-code order (0, 1, 2, 3).
meter_titles = ['electricity', 'chilledwater', 'steam', 'hotwater']
for meter_code, (panel, panel_title) in enumerate(zip(axes.flat, meter_titles)):
    train_df[train_df['meter'] == meter_code].meter_reading.plot.hist(ax=panel)
    panel.set_title(panel_title)

As we can see from the histograms, the meter_reading of every meter type is significantly right-skewed. This might be due to very large outliers, which push all the relatively small values to the left of the histogram. Next, let's check whether there are outliers in the meter_reading variable.


In [None]:
# One box plot of meter_reading per meter type, side by side in a single row.
fig, axes = plt.subplots(1, 4, figsize=(10, 6))
# Panel titles listed in meter-code order (0, 1, 2, 3).
meter_titles = ['electricity', 'chilledwater', 'steam', 'hotwater']
for meter_code, panel_title in enumerate(meter_titles):
    panel = axes[meter_code]
    train_df[train_df['meter'] == meter_code].meter_reading.plot.box(ax=panel)
    panel.set_title(panel_title)

As shown in the boxplots, there are outliers in the meter_reading of every meter type. Next, the outliers of meter_reading are removed and a histogram of the remaining values is shown:

In [None]:
# First and third quartiles of meter_reading. They are requested by value via
# quantile() rather than by position in describe(): integer indexing of a
# label-indexed Series is deprecated in pandas, and describe() was also being
# computed twice over the full column.
lower_quantile_meter_reading, higher_quantile_meter_reading = (
    train_df['meter_reading'].quantile([0.25, 0.75])
)

# Tukey fences: a reading counts as an outlier when it lies more than 1.5 * IQR
# outside the quartiles. (Keeping only the values strictly between the two
# quartiles, as before, discarded about half of the data instead of just the
# outliers.)
iqr_meter_reading = higher_quantile_meter_reading - lower_quantile_meter_reading
lower_fence = lower_quantile_meter_reading - 1.5 * iqr_meter_reading
upper_fence = higher_quantile_meter_reading + 1.5 * iqr_meter_reading

plt.figure()
train_df[train_df['meter_reading'].between(lower_fence, upper_fence)].meter_reading.hist()

 From the histogram of the meter_reading data without outliers, we can see the data is still significantly right-skewed. Therefore, a transformation such as a log transformation of meter_reading might be needed for the data analysis.
 
 Log transformed meter_reading:

In [None]:
# Compute log(1 + x) of the readings once and reuse it for the plot and the column.
logged_readings = np.log1p(train_df['meter_reading'])

# log_meter_reading holds the Axes returned by the histogram call, not the data.
log_meter_reading = logged_readings.plot.hist()
log_meter_reading.set_title('log transformed meter reading')

# Overwrites the readings in place: re-running this cell without reloading
# train_df applies the transform a second time.
train_df['meter_reading'] = logged_readings

From now on, the log-transformed meter_reading is used.

Since meter_reading is time series data, it's helpful to figure out which time variables are helpful in predicting meter_reading. The meter_reading versus month and the meter_reading versus hour are explored in the following plots, and the trends shown in the two plots are analyzed:

In [None]:
# Monthly mean of meter_reading: one panel for all meters, then one per meter type.
fig, axes = plt.subplots(5, 1, figsize=(10, 10))

# (meter code, legend label) for each panel, top to bottom; None selects every meter.
monthly_panels = [
    (None, 'all meter data'),
    (0, 'electricity meter data'),
    (1, 'chilledwater meter data'),
    (2, 'steam meter data'),
    (3, 'hotwater meter data'),
]
for panel, (meter_code, legend_label) in zip(axes, monthly_panels):
    readings = train_df if meter_code is None else train_df[train_df['meter'] == meter_code]
    monthly_mean = readings[['timestamp', 'meter_reading']].set_index('timestamp').resample('M').mean()
    monthly_mean.plot(ax=panel)
    panel.legend([legend_label], loc='upper left')

axes[0].set_title('monthly meter reading trend')

From the plots of the monthly meter reading, we can see that different types of energy consumption hit their peak in different months of the year. For example, the chilledwater meter reading reaches its peak around September. This might be because the weather in September is hot and people tend to use more chilledwater. Thus, a time feature such as 'month' can be a predictor of meter_reading. Most of the meter readings go up in winter, while the chilledwater meter reading goes down in winter.

In [None]:
hour = train_df.timestamp.dt.hour  # extract 'hour' from datetime.
train_df['hour'] = hour

# Mean meter_reading per hour of day: one panel for all meters, then one per meter type.
fig, axes = plt.subplots(5, 1, figsize=(10, 10))

# (meter code, legend label) for each panel, top to bottom; None selects every meter.
hourly_panels = [
    (None, 'all meter data'),
    (0, 'electricity meter data'),
    (1, 'chilledwater meter data'),
    (2, 'steam meter data'),
    (3, 'hotwater meter data'),
]
for panel, (meter_code, legend_label) in zip(axes, hourly_panels):
    readings = train_df if meter_code is None else train_df[train_df['meter'] == meter_code]
    # Select meter_reading BEFORE aggregating: groupby(...).mean() on the whole
    # frame averages every other column (timestamp included) only to discard the
    # result, and depends on every column supporting a mean.
    readings.groupby('hour')['meter_reading'].mean().plot(ax=panel)
    panel.legend([legend_label], loc='upper left')

axes[0].set_title('hourly meter reading trend')

As we can see from the hourly meter_reading plot, different types of meter reach their maximum value at different times of the day. Thus, 'time of day' can be a predictor of the meter reading.

Above, the response variable 'meter_reading' is explored. Now,  let's look at some potential feature variables. The variable 'primary use' is investigated first.

In [None]:
# Number of buildings recorded for each primary_use category.
print(building_meta_df.primary_use.value_counts())

Merge train_df and building_meta_df into one dataframe:

In [None]:
# Left join on building_id: every meter reading is kept and gains the metadata
# columns of its building.
train_whole = pd.merge(train_df, building_meta_df, how='left', on='building_id')
train_whole.head()

To explore whether meter_reading is related to primary_use, I plot the meter_reading time series for each primary use as follows:

In [None]:
# Make sure timestamp has a datetime dtype before extracting the month.
train_whole['timestamp'] = pd.to_datetime(train_whole['timestamp'])
train_whole['month'] = train_whole['timestamp'].dt.month

# meter_reading is deliberately NOT log-transformed here. train_whole was merged
# from train_df, whose meter_reading column already holds np.log1p of the raw
# readings; applying np.log1p again would plot log1p(log1p(x)).

# Sum of meter_reading per (month, primary_use), reshaped to one column per primary_use.
monthly_by_use = (
    train_whole[['meter_reading', 'primary_use', 'month']]
    .groupby(['month', 'primary_use'])
    .sum()
    .meter_reading
    .unstack()
)

# One subplot per primary_use column; the grid passed as `ax` has to supply one
# Axes for each of those columns.
fig, axes = plt.subplots(8, 2, figsize=(14, 16))
plt.subplots_adjust(hspace=0.8)
monthly_by_use.plot(subplots=True, ax=axes)
plt.legend(loc='upper right')

From the meter_reading plots for each primary use, we can see that the primary use impacts the trend of energy consumption significantly. Thus, primary use should be a predictor. As a categorical variable, 'primary_use' should be encoded before use. For most of the building uses, the meter reading goes up in the winter.

Another way to show the relationship between a numerical variable and a categorical variable is the boxplot:

In [None]:
# Horizontal box plots of meter_reading, one box per primary_use category.
sns.boxplot(x='meter_reading',y='primary_use',data=train_whole, orient='h')

Explore the variables 'square_feet' and 'floor_count'.

In [None]:
# Histogram of log(1 + square_feet) across buildings.
(np.log1p(building_meta_df.square_feet)).hist()

In [None]:
# Number of buildings for each distinct floor_count value.
building_meta_df.floor_count.value_counts()

variable 'year_built':

In [None]:
# Number of buildings for each distinct year_built value.
building_meta_df.year_built.value_counts()

The next sections are simple investigations of the variables in weather_train_df.

In [None]:
# Histogram of air_temperature with missing values dropped first.
weather_train_df.air_temperature.dropna().hist() 

From the above plot, air_temperature is roughly normally distributed.

In [None]:
# Histogram of dew_temperature.
weather_train_df.dew_temperature.hist() 

dew_temperature is roughly normal