In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from IPython.display import Image
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
print(os.getcwd())
# Walk the Kaggle input tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Begin by taking an overview of the household data stored in `informations_households.csv`.

In [None]:

# Household metadata table; it is joined to the meter readings on 'LCLid' further down.
household_data = pd.read_csv('../input/smart-meters-in-london/informations_households.csv')

In [None]:
# Preview the first rows of the household metadata.
household_data.head()

In [None]:
# Display the Acorn reference image shipped in the acornimage input dataset.
Image('../input/acornimage/Acorn.png')

In [None]:
# Print the distinct values of the two categorical household columns.
for column in ('Acorn_grouped', 'stdorToU'):
    print(household_data[column].unique())

The Acorn user group classifies UK households into demographic types ranging from Adversity to Affluent. The second column (`stdorToU`) details which payment scheme the household was put on: `Std` is a flat rate for energy cost, and `ToU` is a Time of Use scheme, where customers are told at what time they will pay what price.

In [None]:
# Load the daily readings of a single block (block_24) for a first look at the data.
homedf = pd.read_csv('/kaggle/input/smart-meters-in-london/daily_dataset/daily_dataset/block_24.csv')


In [None]:
# Size, summary statistics and a preview of the single-block frame.
block_shape = homedf.shape
print(block_shape)
block_stats = homedf.describe()
print(block_stats)
homedf.head()


The energy units are given in kWh. We review basic trends:


In [None]:
# Report the covered date range from the 'day' values as read from the CSV.
first_day = homedf.day.min()
print('First day = {}'.format(first_day))
last_day = homedf.day.max()
print('Last day = {}'.format(last_day))

# Convert 'day' to datetimes, then collect the sorted distinct dates as 'YYYY-MM-DD' strings.
homedf['day'] = pd.to_datetime(homedf['day'])
day_strings = homedf['day'].dt.strftime('%Y-%m-%d')
x_dates = day_strings.sort_values().unique()

In [None]:
freq = 100  # step between sampled rows
# Dates of every `freq`-th row of the block frame.
homedf['day'].iloc[::freq].dt.date

In [None]:

fig, ax = plt.subplots(figsize=(12, 12))
sns.lineplot(ax=ax, x='day', y='energy_median', data=homedf)
# 'day' is a datetime column, so matplotlib already places and labels date ticks.
# Overwriting only the labels (without also fixing the tick positions) attaches
# unrelated dates to those ticks, so keep the automatic labels and just rotate them.
fig.autofmt_xdate(rotation=45, ha='right')
ax.set_title('energy_median per day');


There are clearly seasonal variations: winter consumption is higher than summer consumption. It is also interesting to note the rapid decrease in consumption at the beginning of the program.
Let's look at the variation in energy consumption depending on the income of the household.


All the people in a single block of units were on the standard rate and are from the same Acorn group, so let's merge several blocks together to see whether we can analyse the variation between groups.

In [None]:
n = 111  # number of block files to load: block_0 ... block_110
block_path = '/kaggle/input/smart-meters-in-london/daily_dataset/daily_dataset/block_{}.csv'

# Read every block exactly once. range(n) already includes block_24, so seeding
# the result with `homedf` (block_24) would duplicate that block's rows.
# A single concat over all frames also avoids re-copying the growing frame
# on every loop iteration.
houseblock = pd.concat(
    (pd.read_csv(block_path.format(block)) for block in range(n)),
    axis=0,
)

In [None]:
# Number of distinct household ids (LCLid) across the loaded blocks.
houseblock.LCLid.nunique() #5550 unique homes

In [None]:
# Attach the household metadata to every daily reading (left join keeps all readings).
home_joined = houseblock.merge(household_data, on='LCLid', how='left')
home_joined['day'] = pd.to_datetime(home_joined['day'])
home_joined.head()


In [None]:
# Show the tariff column ('Std' / 'ToU') of the joined frame.
home_joined['stdorToU']

We review different trends in energy consumption depending on whether one uses the Standard plan or the Time of Use plan

In [None]:
sns.set(font_scale=1.2)  # scale seaborn's fonts up by 1.2 for the charts that follow
sns.set_style('whitegrid')  # use seaborn's 'whitegrid' style

def demographic_trends_facet_plot(trend1 = 'energy_mean', trend2 = 'energy_max'):
    """Draw a 2x2 grid of bar charts of mean consumption per Acorn group.

    Each row corresponds to one column of the module-level ``home_joined``
    frame (``trend1`` on top, ``trend2`` below); within a row the left panel
    shows households on the 'ToU' tariff and the right panel those on 'Std'.
    Every bar is the mean of that column over all rows of the Acorn group.

    Parameters
    ----------
    trend1, trend2 : str
        Names of numeric columns of ``home_joined`` to average.
    """
    fig, grid = plt.subplots(2, 2, figsize=(15, 15))
    fig.subplots_adjust(hspace=.5, wspace=0.2)

    # Panel order matches the flattened grid: (trend1, ToU), (trend1, Std), (trend2, ToU), (trend2, Std).
    panels = [(column, plan) for column in (trend1, trend2) for plan in ('ToU', 'Std')]
    for panel, (column, plan) in zip(grid.ravel(), panels):
        plan_rows = home_joined[home_joined['stdorToU'] == plan]
        acorn_means = plan_rows.groupby('Acorn')[column].mean().reset_index()
        chart = sns.barplot(ax=panel, x='Acorn', y=column, data=acorn_means)
        chart.set_xticklabels(chart.get_xticklabels(),
                              rotation=45,
                              horizontalalignment='right')
        panel.set_title(plan)
        panel.set_xlabel('Acorn Group')


# Render the grid with the default columns ('energy_mean' and 'energy_max').
demographic_trends_facet_plot()

Energy consumption certainly increases for those using the Time of Use plan; there is also a trend of energy consumption increasing with Acorn group. Next, we look at mean consumption on a single chart.


In [None]:
# Mean of 'energy_mean' for every (Acorn group, tariff) pair, drawn as grouped bars.
aggdata = (
    home_joined
    .groupby(['Acorn', 'stdorToU'])['energy_mean']
    .mean()
    .reset_index()
)
fig, ax = plt.subplots(figsize=(12, 12))
sns.set(font_scale=1.5)  # larger fonts from here on

bp = sns.barplot(data=aggdata, x='Acorn', y='energy_mean', hue='stdorToU', ax=ax)
bp.set_xticklabels(bp.get_xticklabels(), rotation=45, horizontalalignment='right')

For which groups does changing the plan have the most and the least effect?

In [None]:
# Drop the 'ACORN-' rows, then split the aggregate by tariff.
aggdata = aggdata.loc[aggdata['Acorn'] != 'ACORN-']
Std, ToU = (aggdata.loc[aggdata['stdorToU'] == plan] for plan in ('Std', 'ToU'))
print(aggdata)

# One row per Acorn group with both tariff means side by side, and their difference (ToU - Std).
df = Std.merge(ToU, on='Acorn', how='outer', suffixes=('_Std', '_ToU'))
df['consump_diff'] = df['energy_mean_ToU'] - df['energy_mean_Std']

fig, ax = plt.subplots(figsize=(12, 12))
bp = sns.barplot(data=df, x='Acorn', y='consump_diff', ax=ax)
bp.set_xticklabels(bp.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_title('Change in Consuption after Change in Subscription from Std to ToU')

For most demographics, the consumption drops when switching subscription, presumably as users become more conscious of their energy consumption. There are noticeable outliers, however, such as those with executive wealth.

In [None]:
# Show the Acorn reference image again, this time at 700x700.
Image('../input/acornimage/Acorn.png',width= 700, height=700)

# Forecasting

# Checking distribution of time data

In [None]:
tempforecast = pd.read_csv('../input/smart-meters-in-london/weather_hourly_darksky.csv')
# Parse the timestamps first so the preview shows the frame as later cells use it.
tempforecast['time'] = pd.to_datetime(tempforecast['time'])
print(tempforecast.shape)
# A cell only displays its last expression, so the preview has to come last
# (previously head() sat mid-cell and its result was discarded).
tempforecast.head()

First analyse the distribution of time data to see how well distributed the time data is. Since our home_joined dataset uses daily data, we want to know how complete the weather distribution is for daily data

In [None]:
# Number of non-null 'visibility' readings per calendar year.
readings_per_year = tempforecast.groupby(tempforecast['time'].dt.year)['visibility'].count()
readings_per_year.plot(kind="bar")


2011 and 2014 are lacking in sufficient daily data, so they will be excluded from this analysis

In [None]:
years = [2012, 2013]
# Keep only the weather rows whose timestamp falls in the selected years.
in_selected_years = tempforecast['time'].dt.year.isin(years)
tempforecast = tempforecast.loc[in_selected_years]

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
# One panel per year, both as bar charts so the per-hour counts can be compared
# directly (a KDE of the 24 count values would show the distribution of the
# counts, not how many readings each hour has).
for panel, year in zip(ax, (2012, 2013)):
    year_rows = tempforecast.loc[tempforecast['time'].dt.year == year]
    readings_per_hour = year_rows.groupby(year_rows['time'].dt.hour)['visibility'].count()
    readings_per_hour.plot(ax=panel, kind='bar')
    panel.set_title(str(year))
    panel.set_xlabel('Hour of day')
    panel.set_ylabel('Non-null visibility readings')



In [None]:
# Preview the joined readings + household metadata frame.
home_joined.head()

In [None]:
# Derive the household count from the data instead of hard-coding it, so the
# average stays correct if a different set of blocks is loaded.
n_homes = home_joined['LCLid'].nunique()

# Average number of daily readings per household in each calendar year.
readings_per_year = home_joined.groupby(home_joined['day'].dt.year)['energy_mean'].count()
(readings_per_year / n_homes).plot(kind="bar")


We remove 2011, 2012 and 2014 from our dataset, as they have an insufficient number of days on average.

In [None]:
# Restrict both the weather data and the meter readings to calendar year 2013.
weather_in_2013 = tempforecast['time'].dt.year == 2013
readings_in_2013 = home_joined['day'].dt.year == 2013
tempforecast = tempforecast.loc[weather_in_2013]
home_joined = home_joined.loc[readings_in_2013]
tempforecast.head()

# Analysing trends in forecast variables throughout the year

Now we have to explore the forecast variables and see how best we can aggregate them to predict consumption. The forecast variables are provided hourly, so they have to be aggregated over a 24-hour period. For some variables, aggregating by the mean daily value loses a significant amount of information and makes the variable useless in the analysis.

In [None]:
# Split the weather columns into categorical ones and the remaining (numerical) ones.
categorical = ['icon','summary','precipType']
numerical = tempforecast.columns.drop(['time','icon','precipType','summary'])

fig, ax = plt.subplots(2, 4, figsize=(25, 18))
axs = ax.ravel()

# One panel per numerical column, plotted against the hourly timestamp.
for i in range(len(numerical)):
    feature = numerical[i]
    panel = axs[i]
    tempforecast.plot(x='time', y=feature, ax=panel)
    panel.set_ylabel(feature)

In [None]:
fig, ax = plt.subplots(2, 4, figsize=(25, 18))
axs = ax.ravel()

# Average the numerical columns per day of year once, outside the loop.
# Selecting the numerical columns explicitly keeps the text columns (and the
# 'time' column itself) out of mean(), which recent pandas versions refuse to
# average; reset_index() then exposes the day-of-year key as the 'time' column.
daily_mean = (
    tempforecast
    .groupby(tempforecast['time'].dt.dayofyear)[list(numerical)]
    .mean()
    .reset_index()
)

for i, feature in enumerate(numerical):
    daily_mean.plot('time', feature, ax=axs[i])
    axs[i].set_ylabel(feature)
    axs[i].set_xlabel('Day of Year')
# suptitle labels the whole figure; plt.title would only title the last panel.
fig.suptitle('Average Hourly Variability of Predicted Variables');

Some predictors have more daily variability than seasonal variability such as windbearing and visibility 

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(20, 9))
# Bar chart of how often each category occurs, one panel per categorical column.
for i in range(len(categorical)):
    feature = categorical[i]
    category_counts = tempforecast.groupby(tempforecast[feature])[feature].count()
    category_counts.plot(kind='bar', ax=ax[i])

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))
# Correlate the numeric columns only: the frame also holds text columns
# ('icon', 'summary', 'precipType'), which recent pandas versions no longer
# silently drop inside corr().
weather_corr = tempforecast.select_dtypes(include='number').corr()
sns.heatmap(weather_corr, annot=True, ax=ax)

We can get rid of apparentTemperature, as it is highly correlated with temperature and provides no new information.

In [None]:
# Remove the 'apparentTemperature' column from the weather frame.
tempforecast = tempforecast.drop(columns=['apparentTemperature'])

Next we join the temperature forecast and the house data. When aggregating, the categorical variables are lost; to include them in the daily values we will take the most common category in each day.

To test how well this aggregate function applied on the categorical data performs, we can compare the barplots with the previous barplots and see if the proportions remain the same

In [None]:
# For every day of the year, keep the most frequent value of each selected column
# (first entry of value_counts(), which sorts by descending frequency).
day_of_year = tempforecast['time'].dt.dayofyear
tempforecast_daily_categorical = (
    tempforecast[categorical + ['time']]
    .groupby(day_of_year)
    .agg(lambda values: values.value_counts().index[0])
)

fig, ax = plt.subplots(1, 3, figsize=(20, 9))
# Same count-per-category bars as before, now on the daily aggregate.
for i in range(len(categorical)):
    feature = categorical[i]
    daily_counts = tempforecast_daily_categorical.groupby(tempforecast_daily_categorical[feature])[feature].count()
    daily_counts.plot(kind='bar', ax=ax[i])

They are pretty similar, with the exception of the removal of some categories which never featured as the most common in any day

# Joining data sets and forecasting consumption

In [None]:
# Daily mean of the numeric weather columns. Non-numeric columns are excluded
# up front because recent pandas versions raise instead of silently dropping
# them inside mean().
tempforecast_daily = (
    tempforecast
    .select_dtypes(include='number')
    .groupby(tempforecast['time'].dt.dayofyear)
    .mean()
)
# Add the per-day most common categories; the aggregated 'time' column that
# comes with them carries no information, so drop it if present.
tempforecast_daily = (
    tempforecast_daily
    .join(tempforecast_daily_categorical)
    .drop(columns='time', errors='ignore')
)
# The index holds the day of year (named 'time' after the grouping key);
# expose it as the 'dayno' column used for the join with the meter readings.
tempforecast_daily = tempforecast_daily.reset_index().rename({'time': 'dayno'}, axis=1)


In [None]:
pd.set_option('display.max_columns', None)
print('Shape before Join = {}'.format(home_joined.shape))
# home_joined is a filtered slice of the original frame; assign() returns a new
# frame with the extra column instead of writing into that slice, which avoids
# pandas' SettingWithCopyWarning and keeps the cell re-runnable.
home_joined = home_joined.assign(dayno=home_joined['day'].dt.dayofyear)
# Every reading should match exactly one weather row; validate makes pandas raise
# if 'dayno' is not unique on the weather side rather than silently duplicating rows.
df = home_joined.merge(tempforecast_daily, how='left', on='dayno', validate='many_to_one')
print('Shape after Join = {}'.format(df.shape))
df.head()

Primary analysis of consumption vs. temperature



What does each of these Acorn groups represent in terms of Acorn_i? What is ACORN-U? Can we remove it?


In [None]:
fig, ax = plt.subplots(figsize=(12, 12))
# Number of rows (non-null LCLid) for every (Acorn_grouped, Acorn) combination.
rows_per_group = df.groupby(['Acorn_grouped', 'Acorn'])['LCLid'].count()
bp = rows_per_group.plot(ax=ax, kind='bar')
bp.set_xticklabels(bp.get_xticklabels(), rotation=45, horizontalalignment='right')

In [None]:
# Keep only rows whose Acorn value is neither 'ACORN-' nor 'ACORN-U'.
df = df[~df['Acorn'].isin(['ACORN-','ACORN-U'])]

In [None]:
# Mean energy against temperature, one line per tariff.
fig, ax = plt.subplots(figsize=(14, 14))
sns.lineplot(data=df, x='temperature', y='energy_mean', hue='stdorToU', ax=ax)

Those on the standard subscription always consume more than those on the ToU subscription, and energy consumption levels drop significantly with temperature. There is also a clear cut-off, past which the energy consumption tails off.
How does this trend vary with Acorn group?

In [None]:
# Mean energy against temperature, one line per grouped Acorn category.
fig, ax = plt.subplots(figsize=(14, 14))
sns.lineplot(data=df, x='temperature', y='energy_mean', hue='Acorn_grouped', ax=ax)

What other factors may affect the consumption?

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
# The joined frame mixes numeric columns with ids, dates and category labels;
# correlate the numeric columns only, since recent pandas versions raise on
# non-numeric input instead of dropping it.
joined_corr = df.select_dtypes(include='number').corr()
sns.heatmap(joined_corr, ax=ax, annot=True)

# Forecasting Model Iteration

Import useful sklearn modules for feature preprocessing and selection

In [None]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn import set_config


Import sklearn regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

Data cleaning

In [None]:
# Rows with at least one missing value, and the total (energy_max - energy_min) over them.
has_missing = df.isna().any(axis=1)
df1 = df.loc[has_missing]
df1['energy_max'].sub(df1['energy_min']).sum()


In [None]:
# Replace missing 'energy_std' values with 0.
df.loc[df['energy_std'].isna(),'energy_std']=0

We can drop the remaining NaN values of which there are only 7

In [None]:
# Drop every row that still contains a missing value in any column.
df= df.dropna()

The standard deviation column contains a lot of NaN values. These occur when the minimum energy equals the maximum energy, so we can replace them with 0.

In [None]:
# Preview the cleaned frame.
df.head()

In [None]:
# Baseline design: predict 'energy_mean' from 'temperature' alone.
predictor_cols = ['temperature']
X = df[predictor_cols]
y = df['energy_mean']


Start with linear regression from statsmodels api, for our benchmark model we will predict energy consumption based solely on temperature

In [None]:
# Ordinary least squares of y on X plus an intercept column.
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
# mse_model is the explained sum of squares divided by the model degrees of
# freedom, i.e. not an error measure. The mean squared error of the residuals
# is mse_resid.
print('Mean Squared Error = {:.2f}'.format(results.mse_resid))

This base model doesn't perform very well, as can be seen above. We will need to introduce the other variables and perform categorical encoding on the weather features, as well as ordinal encoding on the Acorn group.

# Pipeline & Feature Selection

The goal is to select the features most strongly related to energy consumption and to build a predictive model. We first use RFE to determine the most important features and reduce our model.

In [None]:
# Work on a 1% sample to keep RFE with random forests tractable. The sample is
# seeded (same seed as the train/test split further down) so that re-running
# the notebook selects the same rows and reproduces the same results.
dfsmall = df.sample(frac=0.01, random_state=42)
predictor_cols = ['dayno','visibility','windBearing','temperature','dewPoint','pressure','windSpeed','humidity','stdorToU','Acorn','summary','precipType']

X = dfsmall[predictor_cols]
y = dfsmall['energy_mean']


# Column groups, each handled by its own transformer in the preprocessing step.
numerical_features = ['dayno','visibility','windBearing','temperature','dewPoint','pressure','windSpeed','humidity']
categorical_features = ['stdorToU','summary','precipType']
ordinal_features = ['Acorn'] 


In [None]:
def build_pipeline(numerical_features, categorical_features, ordinal_features,
                   n_features_to_select=4, random_state=None):
    """Build an unfitted preprocessing -> RFE -> random-forest regression pipeline.

    Parameters
    ----------
    numerical_features : list of str
        Columns passed through a ``StandardScaler``.
    categorical_features : list of str
        Columns one-hot encoded (categories unseen at fit time are ignored).
    ordinal_features : list of str
        Columns encoded with an ``OrdinalEncoder``.
    n_features_to_select : int, default 4
        Number of transformed features the RFE step keeps.
    random_state : int or None, default None
        Seed handed to both random forests (the RFE estimator and the final
        regressor). ``None`` keeps the previous unseeded behaviour; pass an
        int for reproducible rankings and scores.

    Returns
    -------
    rfe : RFE
        The feature-selection step. It is the same object that sits inside the
        pipeline, so it can be inspected once the pipeline has been fitted.
    regressorpipe : Pipeline
        The complete pipeline: 'preprocessor', 'feature_selection', 'regressor'.
    """
    # Column-wise preprocessing; the output columns follow this order: num, cat, ord.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[('scaler', StandardScaler())]), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('ord', OrdinalEncoder(), ordinal_features),
        ])

    rfe = RFE(estimator=RandomForestRegressor(random_state=random_state),
              n_features_to_select=n_features_to_select)

    regressorpipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', rfe),
        ('regressor', RandomForestRegressor(random_state=random_state)),
    ])
    return rfe, regressorpipe

# Build the pipeline for the current feature groups and fit it on the whole sample.
rfe, regressor = build_pipeline(
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    ordinal_features=ordinal_features,
)

# fit() returns the fitted pipeline itself, so X_new refers to the same object as `regressor`.
X_new = regressor.fit(X, y)



In [None]:
def visualization_plot(RFE, pipeline=None):
    """Plot the RFE rank of every transformed feature as a horizontal bar chart.

    Parameters
    ----------
    RFE : fitted RFE
        Feature selector whose ``ranking_`` is plotted (rank 1 = selected).
    pipeline : fitted Pipeline, optional
        Pipeline that contains ``RFE`` and a fitted 'preprocessor' step with
        'num', 'cat' and 'ord' transformers. Defaults to the module-level
        ``regressor``.

    Raises
    ------
    ValueError
        If the number of reconstructed feature names does not match the length
        of ``RFE.ranking_`` (e.g. ``RFE`` belongs to a different pipeline).
    """
    if pipeline is None:
        pipeline = regressor

    # Read the column layout back from the fitted preprocessor rather than from
    # the raw data: the one-hot encoder emits its categories in the order of
    # `categories_` (sorted), not in order of first appearance, so labels built
    # from `unique()` would be attached to the wrong bars.
    fitted = {name: (transformer, cols)
              for name, transformer, cols in pipeline.named_steps['preprocessor'].transformers_}
    onehot, onehot_inputs = fitted['cat']
    onehot_columns = ['{} = {}'.format(feature, level)
                      for feature, levels in zip(onehot_inputs, onehot.categories_)
                      for level in levels]
    columns = list(fitted['num'][1]) + onehot_columns + list(fitted['ord'][1])

    ranking = np.asarray(RFE.ranking_)
    if len(columns) != len(ranking):
        raise ValueError('Got {} feature names for {} RFE rankings; was this RFE fitted inside the given pipeline?'
                         .format(len(columns), len(ranking)))

    fig, ax = plt.subplots(figsize = (12,12))
    x = np.arange(len(columns))  # the label locations
    ax.barh(x, ranking)
    ax.set_yticks(x)
    ax.set_yticklabels(columns)
    # The worst rank depends on how many features were eliminated, so read it from the data.
    ax.set_title('Ranking of Feature Importance (1 = highest, {} = lowest)'.format(ranking.max()))

# Plot the feature ranking of the RFE step returned by build_pipeline above.
visualization_plot(rfe)

In [None]:
# X_test / y_test used to be created only in a later cell, so this cell raised
# NameError on a fresh kernel. `regressor` was also fitted on all of X, so
# scoring it on a subset of X would be optimistic. Create the same split the
# next cell uses and score a pipeline fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
holdout_regressor = build_pipeline(numerical_features, categorical_features, ordinal_features)[1]
holdout_regressor.fit(X_train, y_train)
holdout_regressor.score(X_test, y_test)

The RFE selects the temperature, and the ACORN group as the most important features in determining the mean_consumption

In [None]:
# Reselect features and train model
numerical_features = ['dayno','visibility','temperature','pressure']
categorical_features = ['stdorToU','precipType']
ordinal_features = ['Acorn'] 
rfe, pipeline = build_pipeline(numerical_features, categorical_features, ordinal_features)
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.3, random_state=42)

pipeline.fit(X_train, y_train)
print('Model Score After Feature Removal = {}'.format(pipeline.score(X_test,y_test)))