# Master dataframe en functie om de forecast 2021-2023 van het verbruik te maken



In [35]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

# Show all columns when displaying a dataframe.
# The fully qualified option name is used on purpose: the short pattern
# 'max_columns' matches more than one registered option in recent pandas
# releases (e.g. 'styler.render.max_columns'), which makes set_option raise
# an OptionError instead of changing the display setting.
pd.set_option('display.max_columns', None)

In [36]:
# extra imports 

#!pip install altair
import altair as alt
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

# Define Predict Function 

In [37]:
# Define function that returns a low, mid or high prediction of SJV_TOTAAL for each PC6 for the years 2021-2023 on the basis
# of the electricity consumption in period 2010-2020. Default prediction type is 'mid'. 

import pystan
import fbprophet
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_components_plotly
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def predict_verbruik_prophet(df_input):
    """Forecast SJV_TOTAAL per PC4 for 2021-2023 with a Prophet model.

    For every PC4 in ``df_input`` a Prophet model is fitted on the yearly
    ``SJV_TOTAAL`` values and asked to predict 2021, 2022 and 2023. The first
    2020 row of that PC4 serves as a template for the three forecast rows:
    all columns are copied (keeping their dtypes), ``JAAR`` is set to the
    forecast year and ``SJV_TOTAAL`` is replaced by the prediction.

    A PC4 is skipped when it has fewer than 9 rows or no row for 2020.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Needs the columns ``PC4``, ``JAAR`` and ``SJV_TOTAAL``. It is only
        read, never modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(low, mid, high)`` frames filled from Prophet's ``yhat_lower``,
        ``yhat`` and ``yhat_upper`` columns respectively. Every forecasted
        PC4 contributes three rows (index 0..2) to each frame. Empty
        DataFrames are returned when no PC4 could be forecasted.
    """
    forecast_years = [2021, 2022, 2023]
    min_rows = 9

    list_of_pc4 = df_input['PC4'].unique()

    print(f"\nForecasting low, mid, high cases for 2021-2023 for {len(list_of_pc4)} pc4's")

    # Future frame handed to Prophet.
    # NOTE(review): 'ds' holds plain integer years, both here and in the
    # training frame. Prophet converts 'ds' to datetimes, and integers are
    # presumably interpreted as offsets from the epoch rather than calendar
    # years -- confirm, and consider passing real dates (and disabling yearly
    # seasonality for one-observation-per-year data).
    df_forecast = pd.DataFrame({'ds': forecast_years})

    # Forecast rows collected per Prophet output column; concatenated once at
    # the end (DataFrame.append no longer exists in pandas 2.x).
    collected = {'yhat_lower': [], 'yhat': [], 'yhat_upper': []}

    for pc4 in list_of_pc4:
        df_pc4 = df_input[df_input['PC4'] == pc4]
        df_2020 = df_pc4[df_pc4['JAAR'] == 2020]

        # Decide whether to skip *before* fitting, so no model is trained for
        # a PC4 whose forecast would be thrown away anyway.
        if len(df_pc4) < min_rows or df_2020.empty:
            print(f"Skipping forecasting low-mid-high case for 2021-2023 with prophet model for pc4 {pc4}")
            continue

        print(f"Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 {pc4}")

        df_predict = pd.DataFrame({'ds': df_pc4['JAAR'], 'y': df_pc4['SJV_TOTAAL']})

        # All forecasting settings are passed to the constructor; fit() then
        # trains the model on the historical frame.
        m = Prophet(growth='linear',
                    changepoints=None,
                    n_changepoints=2, #25,
                    changepoint_range=0.4, #0.8,
                    yearly_seasonality=True, #'auto',
                    weekly_seasonality=False, #'auto',
                    daily_seasonality=False, #'auto',
                    holidays=None , #holidays,
                    seasonality_mode='additive',
                    seasonality_prior_scale=10.0,
                    holidays_prior_scale=10.0,
                    changepoint_prior_scale=0.05,
                    mcmc_samples=0,
                    interval_width=0.8,
                    uncertainty_samples=1000,
                    stan_backend=None)
        m.fit(df_predict)
        forecast = m.predict(df_forecast)

        # Template rows: the first 2020 row repeated once per forecast year.
        # Concatenating frames (instead of repeating .values) keeps each
        # column's dtype, so e.g. PC4 stays an integer.
        df_temp = pd.concat([df_2020.iloc[[0]]] * len(forecast_years), ignore_index=True)
        df_temp['JAAR'] = forecast_years

        for yhat_column, frames in collected.items():
            df_scenario = df_temp.copy()
            # Whole-column assignment avoids chained .iloc assignment, which
            # may silently write to a temporary copy.
            df_scenario['SJV_TOTAAL'] = forecast[yhat_column].to_numpy()
            frames.append(df_scenario)

    def _combine(frames):
        # pd.concat raises on an empty list; keep the original behaviour of
        # returning an empty frame when nothing was forecasted.
        return pd.concat(frames) if frames else pd.DataFrame()

    df_output_low = _combine(collected['yhat_lower'])
    df_output_mid = _combine(collected['yhat'])
    df_output_high = _combine(collected['yhat_upper'])

    return df_output_low, df_output_mid, df_output_high



def predict_verbruik(df_input, predict_type='mid'):
    """Forecast SJV_TOTAAL per PC4 for 2021-2023 with a linear regression.

    For every PC4 a straight line is fitted through ``SJV_TOTAAL`` as a
    function of ``JAAR`` and extrapolated to 2021, 2022 and 2023. The first
    2020 row of the PC4 is used as a template for the forecast rows: all
    columns are copied (keeping their dtypes), ``JAAR`` is set to the
    forecast year and ``SJV_TOTAAL`` is replaced by the prediction.

    The scenario scales the extrapolated value with a yearly factor that
    compounds per forecast year: ``low`` uses 0.99, ``mid`` 1.0 and ``high``
    1.01 (so 2023 in the low case is ``prediction * 0.99 ** 3``).

    A PC4 is skipped when it has fewer than 9 rows or no row for 2020.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Needs the columns ``PC4``, ``JAAR`` and ``SJV_TOTAAL``. It is only
        read, never modified.
    predict_type : str, default 'mid'
        One of ``'low'``, ``'mid'`` or ``'high'``.

    Returns
    -------
    pandas.DataFrame
        Three rows (index 0..2) per forecasted PC4; an empty DataFrame when
        no PC4 could be forecasted.

    Raises
    ------
    ValueError
        If ``predict_type`` is not a known scenario. Previously this only
        printed a message and returned the 2020 values relabelled as
        2021-2023, which is indistinguishable from a real forecast.
    """
    yearly_factor = {'low': 0.99, 'mid': 1.0, 'high': 1.01}
    if predict_type not in yearly_factor:
        raise ValueError(f"Unexpected prediction type {predict_type}")

    forecast_years = np.array([2021, 2022, 2023])
    min_rows = 9

    # Factor for the 1st, 2nd and 3rd forecast year: f, f**2, f**3.
    compounded = yearly_factor[predict_type] ** np.arange(1, len(forecast_years) + 1)

    list_of_pc4 = df_input['PC4'].unique()

    print(f"\nForecasting {predict_type} case for 2021-2023 for {len(list_of_pc4)} pc4's")

    # Forecast rows per PC4; concatenated once at the end (DataFrame.append
    # no longer exists in pandas 2.x).
    frames = []

    for pc4 in list_of_pc4:
        df_pc4 = df_input[df_input['PC4'] == pc4]
        df_2020 = df_pc4[df_pc4['JAAR'] == 2020]

        # Decide whether to skip *before* fitting, so no model is trained for
        # a PC4 whose forecast would be thrown away anyway.
        if len(df_pc4) < min_rows or df_2020.empty:
            print(f"Skipping forecasting {predict_type} case for 2021-2023 with linear regression model for pc4 {pc4}")
            continue

        print(f"Forecasting {predict_type} case for 2021-2023 with linear regression model for pc4 {pc4}")

        # All available years are used for training (there is no test split).
        # The feature matrix must be 2-D; the target is kept 1-D so that
        # predict() returns one scalar per forecast year.
        years = df_pc4['JAAR'].to_numpy().reshape(-1, 1)
        consumption = df_pc4['SJV_TOTAAL'].to_numpy()

        regr = linear_model.LinearRegression()
        regr.fit(years, consumption)

        # Predict on a plain array with the same layout as the training data.
        forecast = regr.predict(forecast_years.reshape(-1, 1))

        # Template rows: the first 2020 row repeated once per forecast year.
        # Concatenating frames (instead of repeating .values) keeps each
        # column's dtype, so e.g. PC4 stays an integer.
        df_temp = pd.concat([df_2020.iloc[[0]]] * len(forecast_years), ignore_index=True)

        # Whole-column assignment avoids chained .iloc assignment, which may
        # silently write to a temporary copy.
        df_temp['JAAR'] = forecast_years
        df_temp['SJV_TOTAAL'] = forecast * compounded

        frames.append(df_temp)

    # pd.concat raises on an empty list; keep returning an empty frame then.
    df_output = pd.concat(frames) if frames else pd.DataFrame()

    return df_output

# Set directory

In [38]:
# Location of the processed data, relative to the notebook's start directory.
data_processed_location = '../data/processed'

# Change into the processed-data directory unless the current working
# directory path already contains 'processed' (e.g. when the cell is re-run).
current_dir = os.getcwd()
if current_dir.find('processed') == -1:
    os.chdir(data_processed_location)

# Read dataframes

In [39]:
# Load the small-consumer ("kleinverbruik") consumption data.
df = pd.read_hdf('kleinverbruikgegevens_data.h5')

# Drop the 2021 data by keeping only rows with JAAR < 2021.
is_before_2021 = df['JAAR'] < 2021
df = df[is_before_2021]

# Fix the postcodes: remove the spaces from both postcode range columns.
for postcode_kolom in ('POSTCODE_VAN', 'POSTCODE_TOT'):
    df[postcode_kolom] = df[postcode_kolom].str.replace(" ","")

# Create new dataframe and fill with aggregated data 

In [40]:
# Track 2. Add a profile to the consumption data. SOORT_AANSLUITING determines the profile.
# Because the E2 profiles in the NEDU set are not consistent (they differ between 2010-2017 and 2018-), we do not use them.
profiel_E1 = ['1X25','3X25', '1X20', '1x25', '3x25', '1x20'] # everything else is profile E2

# Determine, for a row of the consumption data, the profile that matches its connection.
def vervang_door_profiel(aansluiting, postcode, percentage):
    """Return the profile code ('E1A', 'E1B' or 'E1C') for one connection.

    aansluiting -- connection type; currently not used, because every
                   connection is mapped onto base profile E1 (re-enable the
                   E1/E2 distinction once a usable E2 profile is available).
    postcode    -- postcode string; only its first two characters are read,
                   and only when the low-tariff share is 50 or more.
    percentage  -- low-tariff share of the consumption, in percent.
    """
    basis = 'E1'

    # A low-tariff share below 50% always ends up in the A profile.
    if percentage < 50:
        return basis + 'A'

    # Otherwise the region decides: leading postcode digits below 65 get the
    # C profile (Noord-Brabant or Limburg according to the original note),
    # everything else the B profile.
    regio = int(postcode[:2])
    if regio < 65:
        return basis + 'C'
    return basis + 'B'

# Derive the profile code for every row from its connection type, postcode
# and low-tariff percentage.
profiel_per_rij = np.vectorize(vervang_door_profiel)
df["PROFIEL"] = profiel_per_rij(df.SOORT_AANSLUITING, df.POSTCODE_VAN, df.SJV_LAAG_TARIEF_PERC)

# PC4 is used as an integer key from here on.
df["PC4"] = df["PC4"].astype('int')

# Report how many rows ended up in each profile.
for profiel_code in ("E1A", "E1B", "E1C"):
    aantal = (df.PROFIEL == profiel_code).sum()
    print (f'#{profiel_code} = {aantal}')

#E1A = 956271
#E1B = 52567
#E1C = 257304


In [41]:
# Add feature columns per row: the yearly consumption split out per profile.
# Rows belonging to another profile get no value (NaN) in that column.
for profiel_code in ('E1A', 'E1B', 'E1C'):
    rijen_met_profiel = df[df.PROFIEL == profiel_code]
    df[profiel_code + "_TOTAAL"] = rijen_met_profiel.SJV_TOTAAL

# Weight the delivery-direction percentage by the number of connections, so
# it can be summed and normalised after aggregation.
df["WEIGHTED_LEVERINGSRICHTING_PERC"] = df.LEVERINGSRICHTING_PERC * df.AANSLUITINGEN_AANTAL

In [42]:
# Roll up to PC4: sum all consumption and connection columns per PC4 and year.
df_verbruik = df.groupby(['PC4','JAAR']).agg(dict.fromkeys(
    ['SJV_TOTAAL', 'E1A_TOTAAL', 'E1B_TOTAAL', 'E1C_TOTAAL',
     'AANSLUITINGEN_AANTAL', 'WEIGHTED_LEVERINGSRICHTING_PERC'],
    'sum'))

# Turn the summed weights back into a connection-weighted average percentage;
# pop() removes the helper column in the same step.
df_verbruik['LEVERINGSRICHTING_PERC'] = (
    df_verbruik.pop('WEIGHTED_LEVERINGSRICHTING_PERC') / df_verbruik['AANSLUITINGEN_AANTAL']
)
df_verbruik

Unnamed: 0_level_0,Unnamed: 1_level_0,SJV_TOTAAL,E1A_TOTAAL,E1B_TOTAAL,E1C_TOTAAL,AANSLUITINGEN_AANTAL,LEVERINGSRICHTING_PERC
PC4,JAAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4251,2010,21802.0,17114.0,0.0,4688.0,4325,99.949133
4251,2011,23084.0,18179.0,0.0,4905.0,4413,99.950147
4251,2012,24006.0,18172.0,0.0,5834.0,4486,99.848417
4251,2013,23728.0,18421.0,0.0,5307.0,4512,99.578901
4251,2014,23623.0,22610.0,0.0,1013.0,4544,98.932218
...,...,...,...,...,...,...,...
9999,2016,56.0,56.0,0.0,0.0,21,81.000000
9999,2017,53.0,53.0,0.0,0.0,21,71.000000
9999,2018,174.0,174.0,0.0,0.0,37,64.865676
9999,2019,163.0,163.0,0.0,0.0,37,62.164054


In [43]:
# Move both levels of the multi-level index into ordinary columns and replace
# the index by a simple running number.
for index_niveau in ('PC4', 'JAAR'):
    df_verbruik[index_niveau] = df_verbruik.index.get_level_values(index_niveau)
df_verbruik.index = range(len(df_verbruik))

# The summed consumption columns are stored as integers.
for totaal_kolom in ('SJV_TOTAAL', 'E1A_TOTAAL', 'E1B_TOTAAL', 'E1C_TOTAAL'):
    df_verbruik[totaal_kolom] = df_verbruik[totaal_kolom].astype('int')

In [44]:
df_verbruik

Unnamed: 0,SJV_TOTAAL,E1A_TOTAAL,E1B_TOTAAL,E1C_TOTAAL,AANSLUITINGEN_AANTAL,LEVERINGSRICHTING_PERC,PC4,JAAR
0,21802,17114,0,4688,4325,99.949133,4251,2010
1,23084,18179,0,4905,4413,99.950147,4251,2011
2,24006,18172,0,5834,4486,99.848417,4251,2012
3,23728,18421,0,5307,4512,99.578901,4251,2013
4,23623,22610,0,1013,4544,98.932218,4251,2014
...,...,...,...,...,...,...,...,...
17125,56,56,0,0,21,81.000000,9999,2016
17126,53,53,0,0,21,71.000000,9999,2017
17127,174,174,0,0,37,64.865676,9999,2018
17128,163,163,0,0,37,62.164054,9999,2019


# QC plot for 3 PC4's

In [45]:
# History of the three PC4 areas used for the quality-control plots.
df1, df2, df3 = [
    df_verbruik[df_verbruik.PC4 == pc4_code]
    for pc4_code in (4251, 6373, 9998)
]

In [46]:
# One line chart of SJV_TOTAAL against JAAR per selected PC4, all built with
# the same encoding and size.
pc4_1, pc4_2, pc4_3 = [
    alt.Chart(pc4_frame.reset_index())
    .mark_line(clip=True)
    .encode(
        alt.X('JAAR:T', axis=alt.Axis(title="Jaar")),
        alt.Y('SJV_TOTAAL:Q'),
    )
    .properties(width=250, height=250)
    for pc4_frame in (df1, df2, df3)
]

# Show the three charts side by side.
pc4_1 | pc4_2 | pc4_3

# Call simple linear regression predictie function voor low, mid, high forecast

In [47]:
# Restrict the input to the three PC4 areas used for quality control.
df_verbruik_input = df_verbruik[df_verbruik['PC4'].isin([4251, 6373, 9998])]

# Predict the low, mid and high case scenario for 2021-2023 per PC4, in that
# order, with the linear regression model.
df_verbruik_low, df_verbruik_mid, df_verbruik_high = [
    predict_verbruik(df_verbruik_input, scenario)
    for scenario in ('low', 'mid', 'high')
]
                     


Forecasting low case for 2021-2023 for 3 pc4's
Forecasting low case for 2021-2023 with linear regression model for pc4 4251
Forecasting low case for 2021-2023 with linear regression model for pc4 6373
Forecasting low case for 2021-2023 with linear regression model for pc4 9998

Forecasting mid case for 2021-2023 for 3 pc4's
Forecasting mid case for 2021-2023 with linear regression model for pc4 4251
Forecasting mid case for 2021-2023 with linear regression model for pc4 6373
Forecasting mid case for 2021-2023 with linear regression model for pc4 9998

Forecasting high case for 2021-2023 for 3 pc4's
Forecasting high case for 2021-2023 with linear regression model for pc4 4251
Forecasting high case for 2021-2023 with linear regression model for pc4 6373
Forecasting high case for 2021-2023 with linear regression model for pc4 9998


# QC plot for low, mid, high prediction for 3 PC4's

In [48]:
def _qc_selectie(frame):
    """Return the rows of `frame` for PC4 4251, 6373 and 9998, in that order."""
    return [frame[frame['PC4'] == pc4_code] for pc4_code in (4251, 6373, 9998)]


def _qc_lijn(frame, kleur=None):
    """Line chart of SJV_TOTAAL against JAAR; `kleur` fixes the line colour when given."""
    kleur_encoding = {} if kleur is None else {'color': alt.value(kleur)}
    return alt.Chart(frame.reset_index()).mark_line(clip=True).encode(
        alt.X('JAAR:T'),
        alt.Y('SJV_TOTAAL:Q'),
        **kleur_encoding
    ).properties(width=250, height=250)


# History and the three forecast scenarios for each of the three PC4 areas.
df1, df2, df3 = _qc_selectie(df_verbruik)
df1_low, df2_low, df3_low = _qc_selectie(df_verbruik_low)
df1_mid, df2_mid, df3_mid = _qc_selectie(df_verbruik_mid)
df1_high, df2_high, df3_high = _qc_selectie(df_verbruik_high)

# History in the default colour; low = red, mid = green, high = orange.
pc6_1, pc6_2, pc6_3 = [_qc_lijn(frame) for frame in (df1, df2, df3)]
pc6_1_low, pc6_2_low, pc6_3_low = [_qc_lijn(frame, 'red') for frame in (df1_low, df2_low, df3_low)]
pc6_1_mid, pc6_2_mid, pc6_3_mid = [_qc_lijn(frame, 'green') for frame in (df1_mid, df2_mid, df3_mid)]
pc6_1_high, pc6_2_high, pc6_3_high = [_qc_lijn(frame, 'orange') for frame in (df1_high, df2_high, df3_high)]

# Layer the four lines per PC4 and put the three panels next to each other.
(pc6_1 + pc6_1_low + pc6_1_mid + pc6_1_high) | (pc6_2 + pc6_2_low + pc6_2_mid + pc6_2_high) | (pc6_3 + pc6_3_low + pc6_3_mid + pc6_3_high)

# Call prophet predictie function voor low, mid, high forecast

In [49]:
# Restrict the input to the three PC4 areas used for quality control.
df_verbruik_input = df_verbruik[df_verbruik['PC4'].isin([4251, 6373, 9998])]

# Predict the low, mid and high case scenario for 2021-2023 per PC4 in a
# single run of the prophet model.
prophet_scenarios = predict_verbruik_prophet(df_verbruik_input)
df_verbruik_low, df_verbruik_mid, df_verbruik_high = prophet_scenarios



Forecasting low, mid, high cases for 2021-2023 for 3 pc4's
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 4251
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 6373
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9998


In [50]:
def _qc_selectie(frame):
    """Return the rows of `frame` for PC4 4251, 6373 and 9998, in that order."""
    return [frame[frame['PC4'] == pc4_code] for pc4_code in (4251, 6373, 9998)]


def _qc_lijn(frame, kleur=None):
    """Line chart of SJV_TOTAAL against JAAR; `kleur` fixes the line colour when given."""
    kleur_encoding = {} if kleur is None else {'color': alt.value(kleur)}
    return alt.Chart(frame.reset_index()).mark_line(clip=True).encode(
        alt.X('JAAR:T'),
        alt.Y('SJV_TOTAAL:Q'),
        **kleur_encoding
    ).properties(width=250, height=250)


# History and the three prophet forecast scenarios for each of the three PC4 areas.
df1, df2, df3 = _qc_selectie(df_verbruik)
df1_low, df2_low, df3_low = _qc_selectie(df_verbruik_low)
df1_mid, df2_mid, df3_mid = _qc_selectie(df_verbruik_mid)
df1_high, df2_high, df3_high = _qc_selectie(df_verbruik_high)

# History in the default colour; low = red, mid = green, high = orange.
pc6_1, pc6_2, pc6_3 = [_qc_lijn(frame) for frame in (df1, df2, df3)]
pc6_1_low, pc6_2_low, pc6_3_low = [_qc_lijn(frame, 'red') for frame in (df1_low, df2_low, df3_low)]
pc6_1_mid, pc6_2_mid, pc6_3_mid = [_qc_lijn(frame, 'green') for frame in (df1_mid, df2_mid, df3_mid)]
pc6_1_high, pc6_2_high, pc6_3_high = [_qc_lijn(frame, 'orange') for frame in (df1_high, df2_high, df3_high)]

# Layer the four lines per PC4 and put the three panels next to each other.
(pc6_1 + pc6_1_low + pc6_1_mid + pc6_1_high) | (pc6_2 + pc6_2_low + pc6_2_mid + pc6_2_high) | (pc6_3 + pc6_3_low + pc6_3_mid + pc6_3_high)

In [51]:
# Use every PC4 as input this time.
df_verbruik_input = df_verbruik

# Predict the low, mid and high case scenario for 2021-2023 per PC4, in that
# order, with the linear regression model.
df_verbruik_low_all, df_verbruik_mid_all, df_verbruik_high_all = [
    predict_verbruik(df_verbruik_input, scenario)
    for scenario in ('low', 'mid', 'high')
]
                     

r 2021-2023 with linear regression model for pc4 9531
Forecasting high case for 2021-2023 with linear regression model for pc4 9533
Forecasting high case for 2021-2023 with linear regression model for pc4 9534
Forecasting high case for 2021-2023 with linear regression model for pc4 9536
Forecasting high case for 2021-2023 with linear regression model for pc4 9537
Forecasting high case for 2021-2023 with linear regression model for pc4 9541
Forecasting high case for 2021-2023 with linear regression model for pc4 9545
Forecasting high case for 2021-2023 with linear regression model for pc4 9551
Forecasting high case for 2021-2023 with linear regression model for pc4 9561
Forecasting high case for 2021-2023 with linear regression model for pc4 9563
Forecasting high case for 2021-2023 with linear regression model for pc4 9564
Forecasting high case for 2021-2023 with linear regression model for pc4 9566
Forecasting high case for 2021-2023 with linear regression model for pc4 9571
Forecastin

In [52]:
# Overview of the low-case linear regression forecast: one line per PC4,
# coloured by PC4.
alt.Chart(df_verbruik_low_all.reset_index()).mark_line(clip=True).encode(
    alt.X('JAAR:T'),
    alt.Y('SJV_TOTAAL:Q'),
    alt.Color('PC4:N')).properties(
    width=400,height=400)

In [53]:
# Overview of the mid-case linear regression forecast: one line per PC4,
# coloured by PC4.
alt.Chart(df_verbruik_mid_all.reset_index()).mark_line(clip=True).encode(
    alt.X('JAAR:T'),
    alt.Y('SJV_TOTAAL:Q'),
    alt.Color('PC4:N')).properties(
    width=400,height=400)

In [54]:
# Overview of the high-case linear regression forecast: one line per PC4,
# coloured by PC4.
alt.Chart(df_verbruik_high_all.reset_index()).mark_line(clip=True).encode(
    alt.X('JAAR:T'),
    alt.Y('SJV_TOTAAL:Q'),
    alt.Color('PC4:N')).properties(
    width=400,height=400)

In [55]:
# Use every PC4 as input.
df_verbruik_input  = df_verbruik

# Predict the low, mid and high case scenario for 2021-2023 per PC4 in a single prophet run
df_verbruik_low_prophet_all, df_verbruik_mid_prophet_all, df_verbruik_high_prophet_all = predict_verbruik_prophet(df_verbruik_input)


el for pc4 9521
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9523
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9524
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9525
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9526
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9527
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9528
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9531
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9533
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9534
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9536
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9537
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9541
Forecasting low-mid-high case for 2021-2023 with prophet model for pc4 9

In [56]:
# Combine with the NEDU profiles. (Original note: this is where the forecast should really go.)
# Track 1. NEDU profiles
df_nedu_profielen = pd.read_hdf('nedu_files.h5')

# Keep an independent copy of the profiles as read from disk. A plain
# assignment would only create a second name for the same object, so the
# helper columns that are added to df_nedu_profielen afterwards would show up
# in the "original" as well.
df_nedu_profielen_origineel = df_nedu_profielen.copy()

In [57]:
# Split the timestamp into year, month and day so the profile values can be
# summed per calendar day.
tijdstempel = df_nedu_profielen.DatumTijd.dt
df_nedu_profielen['jaar'] = tijdstempel.year
df_nedu_profielen['maand'] = tijdstempel.month
df_nedu_profielen['dag'] = tijdstempel.day

# Daily total per profile.
df_nedu_profielen = df_nedu_profielen.groupby(['jaar','maand','dag']).agg(
    dict.fromkeys(['E1A', 'E1B', 'E1C', 'E2A', 'E2B'], 'sum'))

def maak_datum(jaar,maand,dag):
    """Return the tuple ('YYYY-MM-DD', jaar) for the given year, month and day.

    The date string is zero-padded (4 digits for the year, 2 for month and
    day); the year is passed through unchanged as the second element.
    """
    datum = f"{jaar:04d}-{maand:02d}-{dag:02d}"
    return datum, jaar
# Build a date string and the year for every (jaar, maand, dag) index entry.
dag_index = df_nedu_profielen.index
datum_teksten, jaren = np.vectorize(maak_datum)(
    dag_index.get_level_values('jaar'),
    dag_index.get_level_values('maand'),
    dag_index.get_level_values('dag'),
)
df_nedu_profielen.index = datum_teksten
df_nedu_profielen["JAAR"] = jaren
df_nedu_profielen.index = pd.to_datetime(df_nedu_profielen.index)

# Reorganise the columns: keep only the years before 2021 and the E1
# profiles, move the date from the index into a DATUM column and make the
# index a plain row counter.
voor_2021 = df_nedu_profielen.JAAR < 2021
df_nedu_profielen = df_nedu_profielen[voor_2021]
df_nedu_profielen = df_nedu_profielen.drop(columns=['E2A', 'E2B'])
df_nedu_profielen['DATUM'] = df_nedu_profielen.index
df_nedu_profielen.index = range(len(df_nedu_profielen))
# NOTE(review): DATUM was taken from an index that is already datetime, so
# this conversion (and its format string) presumably changes nothing -- confirm.
df_nedu_profielen.DATUM = pd.to_datetime(df_nedu_profielen.DATUM, format='%d-%m-%Y %H:%M')

In [58]:
df_nedu_profielen

Unnamed: 0,E1A,E1B,E1C,JAAR,DATUM
0,0.003231,0.003311,0.003430,2010,2010-01-01
1,0.003448,0.003514,0.003620,2010,2010-01-02
2,0.003314,0.003448,0.003394,2010,2010-01-03
3,0.003275,0.003303,0.003323,2010,2010-01-04
4,0.003232,0.003175,0.003221,2010,2010-01-05
...,...,...,...,...,...
4013,0.003476,0.003684,0.003629,2020,2020-12-27
4014,0.003287,0.003483,0.003427,2020,2020-12-28
4015,0.003544,0.003681,0.003836,2020,2020-12-29
4016,0.003464,0.003617,0.003735,2020,2020-12-30


In [59]:
# Left-join the yearly PC4 consumption onto the daily profiles via JAAR: every
# profile day is paired with each PC4 row of the same year.
df_combined = pd.merge(df_nedu_profielen, df_verbruik, on=['JAAR'], how='left')

In [60]:
df_combined

Unnamed: 0,E1A,E1B,E1C,JAAR,DATUM,SJV_TOTAAL,E1A_TOTAAL,E1B_TOTAAL,E1C_TOTAAL,AANSLUITINGEN_AANTAL,LEVERINGSRICHTING_PERC,PC4
0,0.003231,0.003311,0.003430,2010,2010-01-01,21802,17114,0,4688,4325,99.949133,4251
1,0.003231,0.003311,0.003430,2010,2010-01-01,10227,8134,0,2093,2093,99.954611,4254
2,0.003231,0.003311,0.003430,2010,2010-01-01,6732,5562,0,1170,1484,100.000000,4255
3,0.003231,0.003311,0.003430,2010,2010-01-01,13472,9484,0,3988,2285,100.000000,4261
4,0.003231,0.003311,0.003430,2010,2010-01-01,7234,5229,0,2005,1098,99.912568,4264
...,...,...,...,...,...,...,...,...,...,...,...,...
6257089,0.003249,0.003432,0.003416,2020,2020-12-31,1084,839,245,0,300,64.999267,9995
6257090,0.003249,0.003432,0.003416,2020,2020-12-31,191,191,0,0,58,81.035000,9996
6257091,0.003249,0.003432,0.003416,2020,2020-12-31,1008,878,130,0,271,70.110812,9997
6257092,0.003249,0.003432,0.003416,2020,2020-12-31,558,236,322,0,91,61.538571,9998


In [61]:
# Daily consumption: each profile's daily value times that profile's yearly
# total, summed over the three E1 profiles.
df_combined['VERBRUIK'] = df_combined.E1A * df_combined.E1A_TOTAAL + df_combined.E1B * df_combined.E1B_TOTAAL + df_combined.E1C * df_combined.E1C_TOTAAL

In [62]:
# The profile values, per-profile totals and JAAR were only needed to compute
# VERBRUIK; drop them and inspect the result for PC4 5501.
df_combined = df_combined.drop(columns=['E1A','E1B','E1C','E1A_TOTAAL','E1B_TOTAAL','E1C_TOTAAL','JAAR'])
df_combined[df_combined.PC4 == 5501]

Unnamed: 0,DATUM,SJV_TOTAAL,AANSLUITINGEN_AANTAL,LEVERINGSRICHTING_PERC,PC4,VERBRUIK
355,2010-01-01,11391,2340,99.908547,5501,37.397359
1905,2010-01-02,11391,2340,99.908547,5501,39.788411
3455,2010-01-03,11391,2340,99.908547,5501,37.986405
5005,2010-01-04,11391,2340,99.908547,5501,37.450164
6555,2010-01-05,11391,2340,99.908547,5501,36.780415
...,...,...,...,...,...,...
6249577,2020-12-27,8951,2405,91.683996,5501,31.668144
6251155,2020-12-28,8951,2405,91.683996,5501,29.928516
6252733,2020-12-29,8951,2405,91.683996,5501,32.781464
6254311,2020-12-30,8951,2405,91.683996,5501,31.987383


In [63]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6257094 entries, 0 to 6257093
Data columns (total 6 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   DATUM                   datetime64[ns]
 1   SJV_TOTAAL              int32         
 2   AANSLUITINGEN_AANTAL    int64         
 3   LEVERINGSRICHTING_PERC  float64       
 4   PC4                     int64         
 5   VERBRUIK                float64       
dtypes: datetime64[ns](1), float64(2), int32(1), int64(2)
memory usage: 310.3 MB


In [64]:
# Cross check. A profile should add up to 1 over a whole year, so the summed
# daily consumption of one PC4 in one year should equal its SJV_TOTAAL.
# Checked here for PC4 5501 and all dates before 2011-01-01.
controle_rijen = df_combined[(df_combined.DATUM < datetime(2011,1,1)) & (df_combined.PC4 == 5501)]
totaal_verbruik = controle_rijen.agg({'VERBRUIK':'sum'}).values[0]
sjv_totaal = controle_rijen.SJV_TOTAAL.values[0]
print (f"Totaal verbruik = {totaal_verbruik}")
print (f"SJV totaal = {sjv_totaal}")


Totaal verbruik = 11390.34956871
SJV totaal = 11391
