# Multi-variable linear regression

The multivariable linear regression analysis is used to create a model of a single variable, typically energy consumption. We call this the *dependent* variable.  The model is constructed as a linear combination of *explanatory* variables, like weather measurements or occupancy. More information can be found on <a href="https://en.wikipedia.org/wiki/Linear_regression" target="_blank">Wikipedia</a>.


The model is static.  This means that the data set should not contain dynamic effects.  For buildings, dynamic effects are mostly negligible on a weekly basis unless the building has a very high thermal inertia.

A typical use of this analysis is to create a model of, e.g., the gas consumption of a building, and then use this model to detect and quantify changes in the gas consumption.  For example, the savings resulting from a new gas boiler can be computed as the difference between the consumption predicted by the model and the actual consumption.


### Imports and loading data

In [None]:
import opengrid as og
from opengrid.library import plotting as og_plot
import pandas as pd
from joule import meta, filter_meta, remove_outliers

In [None]:
# The object returned by og.plot_style() is bound to `plt` and used by the
# cells below for plt.figure(), plt.plot(), plt.title() and plt.show().
# NOTE(review): presumably this is matplotlib.pyplot with the opengrid
# styling applied — confirm against the opengrid documentation.
plt = og.plot_style()

In [None]:
# Load the weather data (the file name suggests daily values for Leuven).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a trusted source.
dfw = pd.read_pickle('weather_leuven_daily.pkl')

# Keep only the weather variables that are candidates for the regression.
# DataFrame.filter silently ignores listed items that are not present.
dfw = dfw.filter(items=['cloudCover', 'dewPoint', 'humidity', 'precipIntensity', 'windSpeed', 'temperature'])

# Convert every remaining column to float; a column holding a value that
# cannot be converted is dropped entirely.  Iterate over a snapshot of the
# column names because columns may be removed from dfw inside the loop.
for col in list(dfw.columns):
    try:
        dfw[col] = dfw[col].apply(float)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are caught here, so that e.g. a
        # KeyboardInterrupt is no longer swallowed.
        dfw.drop(columns=[col], inplace=True)

### Compute degree-days

First we compute heating degree-days for different base temperatures. More information on the computation of degree-days can be found in [this demo](https://opengridcc.github.io/analysis/degree-days).

In [None]:
# Bring the weather data to one row per calendar day (mean of each variable).
dfw = dfw.resample('D').mean()

# Derive degree-days from the daily temperature: heating base temperatures
# 4, 6, ..., 16 and cooling base temperatures 12, 14, ..., 24.
dfw_HDD = og.library.weather.compute_degree_days(
    ts=dfw['temperature'],
    heating_base_temperatures=range(4, 18, 2),
    cooling_base_temperatures=range(12, 26, 2),
)

# Fill missing values in the result with the next valid observation.
dfw_HDD = dfw_HDD.bfill()

# Create monthly and weekly models for the gas, electricity and water consumption

In [None]:
def _fit_and_plot_model(df_train, df_all, y):
    """Fit a multivariable linear regression and plot its predictions.

    The model is fitted on ``df_train`` with ``og.MultiVarLinReg`` and then
    used to predict over ``df_all``.  The fit summary is printed, the
    prediction is plotted with ``mvlr.plot`` and a second figure shows the
    cumulative difference between the measured column ``y`` and the
    prediction.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Data used to fit the model; must contain the column ``y``.
    df_all : pandas.DataFrame
        Data the fitted model is applied to; must contain the column ``y``.
    y : str
        Name of the dependent-variable column.

    Returns
    -------
    tuple
        ``(mvlr, df_predicted)``: the analysis object and the frame returned
        by the prediction, extended with the columns ``'diff'`` (measured
        minus predicted) and ``'cumdiff'`` (running sum of ``'diff'``).
    """
    mvlr = og.MultiVarLinReg(df_train, y=y)
    mvlr.do_analysis()

    print(mvlr.fit.summary())
    # NOTE(review): _predict is a private method of MultiVarLinReg; it is kept
    # because the original analysis relied on it.
    df_predicted = mvlr._predict(mvlr.fit, df_all)
    mvlr.plot(df=df_predicted)
    plt.show()

    df_predicted['diff'] = df_predicted[y] - df_predicted['predicted']
    df_predicted['cumdiff'] = df_predicted['diff'].cumsum()
    plt.figure()
    plt.plot(df_predicted['cumdiff'].index, df_predicted['cumdiff'], marker='o')
    plt.title('Cumulative difference between model and measurement (negative=savings)')
    plt.show()
    return mvlr, df_predicted


# For every building and every utility: load the consumption data, clean it,
# combine it with the weather data and fit a monthly and a weekly model on
# the 2017 data.  Both models are then applied to all data from 2017 onwards.
for building in meta['RecordNumber'].unique():
    record_name = filter_meta(RecordNumber=building)['RecordName'].iloc[0]
    print('\nResults for {}'.format(record_name))
    print(100*'*')
    for utility in ['Electricity', 'NaturalGas', 'Water']:
        analysis = '{} - {}'.format(utility, record_name)

        # Loading is best effort: a missing or unreadable file skips this
        # utility.  Exception (rather than a bare except) keeps
        # KeyboardInterrupt/SystemExit working, and the reason is reported.
        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickle files from a trusted source.
        try:
            # NOTE(review): the factor 4 and the division by 1000 below are
            # unit conversions whose source units are not visible here.
            ts = pd.read_pickle('data/{}_{}.pkl'.format(utility, building)).sum(axis=1)*4
        except Exception as exc:
            print('Cannot load {}: {}'.format(building, exc))
            continue

        ts.name = analysis
        if ts.empty:
            print('No Data for {}'.format(analysis))
            continue

        ts_day = ts.resample('D').sum()/1000. # kWh/day
        # Intended to remove days with incomplete data.
        # NOTE(review): this keeps only the days whose value exceeds the
        # previous day's value by more than mean/1e6, so every day with a
        # lower value than the day before (and the first day, whose diff is
        # NaN) is dropped — confirm this is the intended filter.
        ts_day = ts_day[ts_day.diff() > ts_day.mean()/1e6]
        ts_day = remove_outliers(ts_day)

        # Join consumption, weather and degree-days on the date index and
        # keep only complete rows from 2017 onwards.
        df_day = pd.concat([ts_day, dfw, dfw_HDD], axis=1)
        df_day = df_day.dropna().loc['2017-01-01':]
        if df_day.empty:
            # Checked before any resampling so that no work is done (and no
            # model is attempted) on an empty frame.
            print('No data after cleaning for {}'.format(analysis))
            continue

        # Monthly and weekly totals; the 2017 slices are the training data.
        # The 2018 slices are not used in this cell but are kept because
        # later cells may rely on them.
        df_month = df_day.resample('MS').sum()
        df_month = df_month.reindex(df_day.resample('MS').mean().index)
        df_month_2017 = df_month.loc['2017-01-01':'2017-12-31']
        df_month_2018 = df_month.loc['2018-01-01':'2018-12-31']
        df_week = df_day.resample('W').sum()
        df_week = df_week.reindex(df_day.resample('W').mean().index)
        df_week_2017 = df_week.loc['2017-01-01':'2017-12-31']
        df_week_2018 = df_week.loc['2018-01-01':'2018-12-31']

        # Daily consumption after cleaning.
        plt.figure()
        plt.plot(df_day[analysis].index, df_day[analysis], marker='o')
        plt.show()

        # create the MONTHLY model
        mvlr, df_predicted_month = _fit_and_plot_model(df_month_2017, df_month, analysis)

        # create the WEEKLY model
        mvlr, df_predicted_week = _fit_and_plot_model(df_week_2017, df_week, analysis)


        