<a href="https://colab.research.google.com/github/shengy90/MSc-Project/blob/master/notebooks/sliding_window.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1️⃣ Setup Notebook 💻**


### **Authenticate with BigQuery ☁️**

In [None]:
# Install/upgrade the BigQuery client (with its bqstorage and pandas extras) and
# pandas-gbq, which the cells below import.
!pip install --upgrade google-cloud-bigquery[bqstorage,pandas]
!pip install --upgrade pandas-gbq

In [1]:
import google.auth
from google.colab import auth
from google.cloud import bigquery
from google.cloud import bigquery_storage_v1beta1

# Authenticate the current Colab user; the confirmation below is only printed
# once authenticate_user() has returned without raising.
auth.authenticate_user()
print('Authenticated')

Authenticated


In [2]:
# Project used by the BigQuery client below.
your_project_id = 'machine-learning-msc'
# google.auth.default() returns a (credentials, project) pair; only the
# credentials are kept because the project is pinned explicitly above.
credentials = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])[0]
# Make clients.
bqclient = bigquery.Client(
    credentials=credentials,
    project=your_project_id,
)
bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
    credentials=credentials,
)

In [3]:
# Smoke-test query: counts the rows of the daily-aggregated household
# consumption table and returns the count in a single column named `test`.
query_string = """
SELECT
COUNT(*) AS test
FROM `machine-learning-msc.low_carbon_london.household_consumption_daily_agg` 
"""

In [4]:
def download_query_results(query_string, bqclient=bqclient, bqstorateclient=bqstorageclient):
    """Run ``query_string`` on BigQuery and return the result as a DataFrame.

    Args:
        query_string: SQL text passed to ``bqclient.query``.
        bqclient: Client used to run the query. Defaults to the notebook-level
            ``bqclient``.
        bqstorateclient: Storage client handed to ``to_dataframe`` as
            ``bqstorage_client``. Defaults to the notebook-level
            ``bqstorageclient``. The parameter keeps its original spelling so
            that existing keyword callers continue to work.

    Returns:
        The DataFrame returned by ``to_dataframe`` for the query result.
    """
    rows = bqclient.query(query_string).result()
    # Use the storage client that was passed in. The argument used to be
    # ignored in favour of the module-level `bqstorageclient`.
    return rows.to_dataframe(bqstorage_client=bqstorateclient)

In [5]:
# Run the smoke-test query through the helper and display the resulting frame.
df = download_query_results(query_string, bqclient, bqstorageclient)
df

Unnamed: 0,test
0,14841792


In [6]:
import pandas_gbq
def output_to_bq(forecast, table_id, project_id='machine-learning-msc'):
    """Append the ``forecast`` frame to a BigQuery table through pandas_gbq.

    Args:
        forecast: Frame handed to ``pandas_gbq.to_gbq``.
        table_id: Destination table identifier passed to ``pandas_gbq.to_gbq``.
        project_id: Project passed to ``pandas_gbq.to_gbq``.
    """
    upload_options = {'project_id': project_id, 'if_exists': 'append'}
    pandas_gbq.to_gbq(forecast, table_id, **upload_options)

### **Importing Libraries ⏬**

##### Standard Libraries

In [7]:
# Install fbprophet and MiniSom; MiniSom is imported directly in the next cell.
!pip install fbprophet
!pip install MiniSom

Collecting MiniSom
  Downloading https://files.pythonhosted.org/packages/9d/10/a1c1621000d5ca00c41695689551c1a4d6d245d7bbf099d81e067da3e8f2/MiniSom-2.2.6.tar.gz
Building wheels for collected packages: MiniSom
  Building wheel for MiniSom (setup.py) ... [?25l[?25hdone
  Created wheel for MiniSom: filename=MiniSom-2.2.6-cp36-none-any.whl size=8525 sha256=ffbcf4ffb245744ccb447b39b0810513088b2662045047ff672b9642be1799a1
  Stored in directory: /root/.cache/pip/wheels/b8/c6/01/330066e36e1f7c826c96f656f9185822cfcdef0591315949ea
Successfully built MiniSom
Installing collected packages: MiniSom
Successfully installed MiniSom-2.2.6


In [8]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import random
import datetime as dt

from minisom import MiniSom
from tqdm import tqdm
from datetime import date
from datetime import datetime
from dateutil.relativedelta import relativedelta
from matplotlib.gridspec import GridSpec
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
 
# Apply seaborn's default styling and render matplotlib figures inline.
sns.set()
%matplotlib inline

  import pandas.util.testing as tm


In [9]:
import pandas_gbq
# NOTE(review): this re-defines output_to_bq with the same signature and
# behaviour as the definition in the setup section; one of the two is redundant.
def output_to_bq(forecast, table_id, project_id='machine-learning-msc'):
    """Append ``forecast`` to BigQuery table ``table_id`` in ``project_id`` via pandas_gbq."""
    pandas_gbq.to_gbq(forecast, table_id, project_id=project_id, if_exists='append')

##### Import Github Repository

In [10]:
%cd /content
!rm -rf mscproj
!git clone https://github.com/shengy90/MSc-Project mscproj
!git pull
%cd /content/mscproj/
!ls

/content
Cloning into 'mscproj'...
remote: Enumerating objects: 155, done.[K
remote: Counting objects: 100% (155/155), done.[K
remote: Compressing objects: 100% (102/102), done.[K
remote: Total 516 (delta 101), reused 100 (delta 53), pack-reused 361[K
Receiving objects: 100% (516/516), 12.72 MiB | 29.01 MiB/s, done.
Resolving deltas: 100% (299/299), done.
fatal: not a git repository (or any of the parent directories): .git
/content/mscproj
bin	     __init__.py  notebooks  requirements.txt  sql
definitions  Makefile	  README.md  run.py	       src


In [11]:
%reload_ext autoreload 
%autoreload 2 
from src.train_prophet import TrainProphet
from src.train_clusters import TrainClusters
from src.train_clusters import Normaliser
from src.train_sliding_window import train_som, train_baseline, train_som_forecasts, evaluate_results, generate_query_strings

# **2️⃣ Sliding Window Protocol**

## **Create Sliding Window Protocol**

### Functions

In [None]:
def save_results(df, train_test_split, model_name, num_clusters, start_date, end_date, eval_date, model_date):
    """Tag a results frame with run metadata and append it to BigQuery.

    A copy of ``df`` receives the metadata columns listed below and is uploaded
    with ``output_to_bq`` to ``sliding_window.results``; ``df`` itself is left
    untouched.

    Args:
        df: Results frame to upload.
        train_test_split: Value written to the ``train_test_split`` column.
        model_name: Value written to the ``model`` column.
        num_clusters: Currently unused (see note below).
        start_date: Value written to the ``start_date`` column.
        end_date: Value written to the ``end_date`` column.
        eval_date: Value written to the ``eval_date`` column.
        model_date: Value written to the ``model_date`` column.

    NOTE(review): the ``num_clusters`` column is always written as 5, whatever
    is passed in. The results query in this notebook filters on
    ``num_clusters = 5`` for baseline rows as well, so changing this needs a
    matching change to that query and to the rows already stored.
    """
    tagged = df.assign(
        model_date=model_date,
        start_date=start_date,
        end_date=end_date,
        eval_date=eval_date,
        model=model_name,
        train_test_split=train_test_split,
        num_clusters=5,
    )
    output_to_bq(tagged, 'sliding_window.results')

In [None]:
def train_forecast(start_date, model_date='2020-08-09'):
    """Run one sliding-window iteration: download, train, evaluate and upload.

    The window runs from ``start_date`` to four months later. The date three
    months after ``start_date`` is passed to ``train_baseline`` and
    ``train_som_forecasts`` as ``eval_date``.

    Args:
        start_date: First day of the window as a ``"%Y-%m-%d"`` string.
        model_date: Value written to the ``model_date`` column of every
            uploaded row. The default, ``'2020-08-09'``, matches the
            ``model_date`` filter used by the results query in this notebook.

    Returns:
        Tuple ``(som_clusters, baseline_model, som_train_global,
        som_test_global)`` as returned by ``train_som``, ``train_baseline`` and
        ``train_som_forecasts``.

    Raises:
        ValueError: If ``start_date`` does not match ``"%Y-%m-%d"``.
    """
    date_format = "%Y-%m-%d"
    window_start = datetime.strptime(start_date, date_format).date()

    # Everything downstream works with date strings, so normalise all three here.
    end_date = (window_start + relativedelta(months=4)).strftime(date_format)
    eval_date = (window_start + relativedelta(months=3)).strftime(date_format)
    start_date = window_start.strftime(date_format)
    som_query_string, ts_query_string = generate_query_strings(start_date, end_date)

    print(f"\nDownloading data for periods: {start_date} -> {end_date}")
    som_df = download_query_results(som_query_string, bqclient, bqstorageclient)
    ts_df = download_query_results(ts_query_string, bqclient, bqstorageclient)
    # Drop the timezone from the timestamps before they reach the training helpers.
    ts_df['ds'] = ts_df['ds'].dt.tz_localize(None)

    print("Training SOM Clusters.....")
    som_clusters = train_som(som_df)
    print("Training Baseline Model.....")
    baseline_model = train_baseline(ts_df, som_clusters, eval_date)
    print("Training SOM Forecasts.....")
    som_model, som_train_global, som_test_global = train_som_forecasts(ts_df, som_clusters, eval_date)

    print("\nEvaluating Baseline...")
    evaluate_results(baseline_model.train_global, baseline_model.test_global)
    print("\nEvaluating SOM...")
    evaluate_results(som_train_global, som_test_global)

    print("Uploading results to BigQuery...")
    # (frame, train_test_split, model_name, num_clusters), uploaded in this order.
    uploads = [
        (baseline_model.train_global, "train", "baseline", 1),
        (baseline_model.test_global, "test", "baseline", 1),
        (som_train_global, "train", "som_clusters", 5),
        (som_test_global, "test", "som_clusters", 5),
    ]
    for frame, split, model_name, num_clusters in uploads:
        save_results(frame, split, model_name, num_clusters, start_date, end_date, eval_date, model_date)

    print(f"\nFinished Training and Evaluation for periods: {start_date} -> {end_date}!\n")
    print("--------------------------------------------------------------------------------")

    return som_clusters, baseline_model, som_train_global, som_test_global

### Forecasts

In [None]:
# Window start dates, one month apart; train_forecast derives each window's
# end date (+4 months) and evaluation date (+3 months) from the start date.
start_dates = ['2012-11-01', '2012-12-01', '2013-01-01', '2013-02-01']

# The four result names are rebound on every iteration, so after the loop they
# hold the outputs of the last window only.
for start_date in start_dates:
    som_clusters, baseline_model, som_train_global, som_test_global = train_forecast(start_date)


Downloading data for periods: 2012-11-01 -> 2013-03-01
Training SOM Clusters.....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 19.8415756779928
Training Baseline Model.....
Training SOM Forecasts.....

Evaluating Baseline...
Train global MAPE: 7.4399999999999995. Test global MAPE: 8.88.

Evaluating SOM...
Train global MAPE: 7.42. Test global MAPE: 7.55.
Uploading results to BigQuery...



1344 out of 1344 rows loaded.
1it [00:02,  2.28s/it]

1344 out of 1344 rows loaded.
1it [00:02,  2.95s/it]

6720 out of 6720 rows loaded.

1it [00:03,  3.56s/it]

6720 out of 6720 rows loaded.

1it [00:02,  2.33s/it]



Finished Training and Evaluation for periods: 2012-11-01 -> 2013-03-01!

--------------------------------------------------------------------------------

Downloading data for periods: 2012-12-01 -> 2013-04-01
Training SOM Clusters.....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 19.75592700883923
Training Baseline Model.....
Training SOM Forecasts.....



1488 out of 1488 rows loaded.



Evaluating Baseline...
Train global MAPE: 8.290000000000001. Test global MAPE: 9.120000000000001.

Evaluating SOM...
Train global MAPE: 8.44. Test global MAPE: 8.07.
Uploading results to BigQuery...


1it [00:02,  2.42s/it]

1488 out of 1488 rows loaded.
1it [00:02,  2.44s/it]

7440 out of 7440 rows loaded.

1it [00:02,  2.91s/it]

7440 out of 7440 rows loaded.

1it [00:04,  4.32s/it]



Finished Training and Evaluation for periods: 2012-12-01 -> 2013-04-01!

--------------------------------------------------------------------------------

Downloading data for periods: 2013-01-01 -> 2013-05-01
Training SOM Clusters.....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 20.26105510270428
Training Baseline Model.....
Training SOM Forecasts.....



1440 out of 1440 rows loaded.



Evaluating Baseline...
Train global MAPE: 13.669999999999998. Test global MAPE: 13.020000000000001.

Evaluating SOM...
Train global MAPE: 14.04. Test global MAPE: 12.75.
Uploading results to BigQuery...


1it [00:02,  2.94s/it]

1440 out of 1440 rows loaded.
1it [00:02,  2.74s/it]

7200 out of 7200 rows loaded.

1it [00:03,  3.70s/it]

7200 out of 7200 rows loaded.

1it [00:03,  3.92s/it]



Finished Training and Evaluation for periods: 2013-01-01 -> 2013-05-01!

--------------------------------------------------------------------------------

Downloading data for periods: 2013-02-01 -> 2013-06-01
Training SOM Clusters.....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 21.25172153350484
Training Baseline Model.....
Training SOM Forecasts.....



1488 out of 1488 rows loaded.



Evaluating Baseline...
Train global MAPE: 16.939999999999998. Test global MAPE: 17.65.

Evaluating SOM...
Train global MAPE: 19.259999999999998. Test global MAPE: 21.18.
Uploading results to BigQuery...


1it [00:01,  1.96s/it]

1488 out of 1488 rows loaded.
1it [00:05,  5.06s/it]

7440 out of 7440 rows loaded.

1it [00:04,  4.54s/it]

7440 out of 7440 rows loaded.

1it [00:02,  2.66s/it]


Finished Training and Evaluation for periods: 2013-02-01 -> 2013-06-01!

--------------------------------------------------------------------------------





# 3️⃣ **Evaluating Sliding Window Performance**

##### **Get Sliding Window Results from BQ**

In [78]:
%%bigquery --project machine-learning-msc df_results --use_bqstorage_api
-- Load the stored sliding-window results for model_date 2020-08-09 into df_results.
WITH stg1 AS (
SELECT 
-- Calendar month of each timestamp, exposed as a DATE column named `month`.
CAST(TIMESTAMP_TRUNC(ds, MONTH) AS DATE) AS month,
* EXCEPT(cluster),
-- Baseline rows are all labelled cluster '1'; other rows keep their own cluster value.
CAST(IF(model = 'baseline', 1, cluster) AS STRING) AS cluster
FROM `machine-learning-msc.sliding_window.results`
WHERE model_date = '2020-08-09' AND num_clusters = 5
)

SELECT 
*,
-- Position of each row within its (model, train_test_split, cluster) group, ordered by ds.
ROW_NUMBER() OVER (PARTITION BY model, train_test_split, cluster ORDER BY ds ASC) AS row_num
FROM stg1
ORDER BY model, train_test_split, cluster, ds ASC

In [79]:
# Strip the timezone from the timestamps (df_results is modified in place).
df_results['ds'] = df_results['ds'].dt.tz_localize(None)

# One independent copy per (model, train/test split) combination.
df_baseline_train = df_results.query("model=='baseline' and train_test_split=='train'").copy()
df_baseline_test = df_results.query("model=='baseline' and train_test_split=='test'").copy()
df_som_train = df_results.query("model=='som_clusters' and train_test_split=='train'").copy()
df_som_test = df_results.query("model=='som_clusters' and train_test_split=='test'").copy()

In [103]:
def _agg_results(df, monthly):
    if monthly==True:
        aggcol = 'month'
    else:
        aggcol = 'ds'

    agg = df.groupby(aggcol).sum()
    agg.reset_index(inplace=True)
    agg = agg[[aggcol, 'y_global', 'yhat_global']]
    agg['abs_perc_err'] = np.round(np.abs(agg['yhat_global']/agg['y_global']-1),6)*100
    return agg

def evaluate_forecast(df, monthly=None):
    """Aggregate forecast errors and, for the default call, print their mean.

    Args:
        df: Results frame forwarded to ``_agg_results``.
        monthly: Forwarded to ``_agg_results``. When left as ``None`` the mean
            of ``abs_perc_err`` over the aggregated rows is also printed.

    Returns:
        The aggregated frame produced by ``_agg_results``.
    """
    aggregated = _agg_results(df, monthly)
    if monthly is not None:
        return aggregated

    print(f"Global Mean Average Percentage Error: {np.mean(aggregated['abs_perc_err'])}")
    return aggregated

In [119]:
def combine_results(idx_name, df1, df2, df1_name, df2_name, first=None):
    """Join the ``abs_perc_err`` column of ``df2`` onto ``df1`` by ``idx_name``.

    Args:
        idx_name: Name of the key column present in both frames.
        df1: Left frame. Reduced to ``[idx_name, 'abs_perc_err']`` and renamed
            only when ``first`` equals ``True``; otherwise used unchanged.
        df2: Right frame; always reduced to ``[idx_name, 'abs_perc_err']``.
        df1_name: Prefix for the left error column (used only when ``first``
            equals ``True``).
        df2_name: Prefix for the right error column.
        first: Pass ``True`` on the first call of a chain so that ``df1`` is
            trimmed and renamed as well.

    Returns:
        Inner merge of the two frames on ``idx_name``, with the error columns
        named ``"<name>_APE"``.
    """
    def _ape_column(frame, label):
        # Keep only the key and the error column, renamed after `label`.
        trimmed = frame[[idx_name, 'abs_perc_err']].copy()
        return trimmed.rename(columns={'abs_perc_err': f"{label}_APE"})

    left = _ape_column(df1, df1_name) if first == True else df1
    right = _ape_column(df2, df2_name)
    return left.merge(right, on=idx_name)

In [104]:
# Called with the default monthly=None, evaluate_forecast aggregates by `ds`
# and prints the mean error. Only the printed value matters here; `x` is
# overwritten by each call.
print("\nBaseline Train:")
x = evaluate_forecast(df_baseline_train)
print("\nBaseline Test:")
x = evaluate_forecast(df_baseline_test)
print("\nSOM Train:")
x = evaluate_forecast(df_som_train)
print("\nSOM Test:")
x = evaluate_forecast(df_som_test)


Baseline Train:
Global Mean Average Percentage Error: 9.575817505447196

Baseline Test:
Global Mean Average Percentage Error: 9.952238428017255

SOM Train:
Global Mean Average Percentage Error: 9.840680544188736

SOM Test:
Global Mean Average Percentage Error: 9.936238656533986


In [106]:
# Per-month aggregates for each model/split; with monthly=True nothing is printed.
basetrain = evaluate_forecast(df_baseline_train, monthly=True)
basetest = evaluate_forecast(df_baseline_test, monthly=True)
somtrain = evaluate_forecast(df_som_train, monthly=True)
somtest = evaluate_forecast(df_som_test, monthly=True)

In [124]:
# Build one table with an APE column per model/split. Only the first call
# passes first=True, so that the left frame is trimmed and renamed too; the
# later calls join onto the accumulated table as-is.
monthly_ape = combine_results('month', basetrain, basetest, 'base_train', 'base_test', first=True)
monthly_ape = combine_results('month', monthly_ape, somtrain, None, 'som_train')
monthly_ape = combine_results('month', monthly_ape, somtest, None, 'som_test')

In [125]:
# Display the combined monthly absolute-percentage-error table.
monthly_ape

Unnamed: 0,month,base_train_APE,base_test_APE,som_train_APE,som_test_APE
0,2013-02-01,2.0544,1.5996,1.977,0.1182
1,2013-03-01,3.771,3.4763,4.2418,2.7302
2,2013-04-01,7.4952,5.9928,7.9079,5.5975
3,2013-05-01,15.7664,16.3137,17.4087,19.4474
4,2013-06-01,1.1297,0.792,0.6504,0.0097
5,2013-07-01,3.7439,4.1796,4.1148,6.0481
6,2013-08-01,0.6945,0.1291,0.3574,0.433
7,2013-09-01,5.7219,5.9296,5.7095,5.7272
8,2013-10-01,6.1696,5.7002,5.9388,5.5161
9,2013-11-01,8.4111,8.5694,8.112,8.7072
