<a href="https://colab.research.google.com/github/shengy90/MSc-Project/blob/master/notebooks/30th_July_Optimal_Clusters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1️⃣ Setup Notebook 💻**


### **Authenticate with BigQuery ☁️**

In [None]:
!pip install --upgrade google-cloud-bigquery[bqstorage,pandas]
!pip install --upgrade pandas-gbq

In [1]:
# Colab-specific step: authenticate the current user for this runtime. The
# BigQuery cells further down run after this one.
from google.colab import auth
auth.authenticate_user()
# Reached only if authenticate_user() returned without raising.
print('Authenticated')

Authenticated


In [2]:
%%bigquery --project machine-learning-msc df --use_bqstorage_api
-- Sanity check: count every row of the daily-aggregate consumption table.
-- The single-row result is stored in the DataFrame `df`.
SELECT 
  COUNT(*) as total_rows
FROM `machine-learning-msc.low_carbon_london.household_consumption_daily_agg` 

In [3]:
# Display the row-count result that the %%bigquery cell above stored in `df`.
df.head()

Unnamed: 0,total_rows
0,14841792


### **Importing Libraries⏬**

##### Standard Libraries

In [None]:
!pip install fbprophet
!pip install MiniSom

Collecting MiniSom
  Downloading https://files.pythonhosted.org/packages/9d/10/a1c1621000d5ca00c41695689551c1a4d6d245d7bbf099d81e067da3e8f2/MiniSom-2.2.6.tar.gz
Building wheels for collected packages: MiniSom
  Building wheel for MiniSom (setup.py) ... [?25l[?25hdone
  Created wheel for MiniSom: filename=MiniSom-2.2.6-cp36-none-any.whl size=8525 sha256=aa2697c7604ee1f1e8af2a10ed4aa676ed79b1d267a89eaf910197e08e7e8d5a
  Stored in directory: /root/.cache/pip/wheels/b8/c6/01/330066e36e1f7c826c96f656f9185822cfcdef0591315949ea
Successfully built MiniSom
Installing collected packages: MiniSom
Successfully installed MiniSom-2.2.6


In [4]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import random
import datetime as dt

from minisom import MiniSom
from tqdm import tqdm
from datetime import date
from matplotlib.gridspec import GridSpec
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
 
sns.set()
%matplotlib inline

  import pandas.util.testing as tm


In [5]:
import pandas_gbq
def output_to_bq(forecast, table_id, project_id='machine-learning-msc', if_exists='append'):
    """Upload a DataFrame to a BigQuery table through ``pandas_gbq.to_gbq``.

    Args:
        forecast: DataFrame to upload.
        table_id: Destination table, forwarded unchanged to
            ``pandas_gbq.to_gbq`` (this notebook passes ``'dataset.table'``).
        project_id: Google Cloud project that owns the destination table.
        if_exists: Forwarded to ``pandas_gbq.to_gbq``. Defaults to
            ``'append'``, the value that used to be hard-coded, so existing
            calls behave exactly as before.

    Returns:
        None. Whatever ``pandas_gbq.to_gbq`` raises propagates to the caller.
    """
    pandas_gbq.to_gbq(forecast, table_id, project_id=project_id, if_exists=if_exists)

##### Import GitHub Repository

In [6]:
# Switch to /content and list it. The clone cell that follows removes and
# re-creates the `mscproj` directory using a path relative to this directory.
%cd /content
!ls

/content
adc.json  mscproj  sample_data


In [7]:
!rm -rf mscproj
!git clone https://github.com/shengy90/MSc-Project mscproj
!git pull
%cd /content/mscproj/
!ls

Cloning into 'mscproj'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 373 (delta 4), reused 9 (delta 4), pack-reused 361[K
Receiving objects: 100% (373/373), 10.94 MiB | 5.83 MiB/s, done.
Resolving deltas: 100% (202/202), done.
fatal: not a git repository (or any of the parent directories): .git
/content/mscproj
bin	     __init__.py  notebooks  requirements.txt  sql
definitions  Makefile	  README.md  run.py	       src


In [8]:
%reload_ext autoreload 
%autoreload 2 
from src.train_prophet import TrainProphet
from src.train_clusters import TrainClusters
from src.train_clusters import Normaliser

# 2️⃣ **Downloading Data From BigQuery**

In [10]:
%%bigquery --project machine-learning-msc df_train --use_bqstorage_api
-- Builds the training profile table and stores it in the DataFrame `df_train`.
-- Rows are limited to train_test_split = 'train' and to timestamps from
-- 2012-11-01 (inclusive) up to 2013-03-01 (exclusive).
WITH stg1 AS (
SELECT 
lcl_id,
-- One 0/1 indicator column for each acorn_grouped value handled here.
IF(acorn_grouped = "Adversity", 1, 0) AS adversity,
IF(acorn_grouped = "Affluent", 1, 0) AS affluent,
IF(acorn_grouped = "Comfortable", 1, 0) AS comfortable,
-- Full month name (for example "December") taken from the timestamp.
FORMAT_DATETIME("%B", DATETIME(ts)) AS month_name,
dayofweek,
hhourly_rank,
-- Summary statistics of kwhh within each group, rounded to 4 decimal places.
ROUND(AVG(kwhh),4) AS hh_avg,
ROUND(MAX(kwhh),4) AS hh_max,
ROUND(MIN(kwhh),4) AS hh_min,
ROUND(STDDEV(kwhh),4) AS hh_stddev

FROM `machine-learning-msc.forecasting_20200719.train_set`
WHERE train_test_split = 'train'
AND ts >= '2012-11-01' AND ts < '2013-03-01'

-- Ordinals 1-7 are the seven non-aggregated columns above (lcl_id through hhourly_rank).
GROUP BY 1,2,3,4,5,6,7
)

SELECT 
*,
-- Sequential position of each row within one (lcl_id, month_name) partition,
-- ordered by dayofweek and then hhourly_rank.
ROW_NUMBER() OVER (PARTITION BY lcl_id, month_name ORDER BY dayofweek ASC, hhourly_rank ASC) AS weekly_rank
FROM stg1 
ORDER BY lcl_id, month_name, weekly_rank, hhourly_rank

In [11]:
# Preview the first rows of the training frame returned by the query above.
df_train.head()

Unnamed: 0,lcl_id,adversity,affluent,comfortable,month_name,dayofweek,hhourly_rank,hh_avg,hh_max,hh_min,hh_stddev,weekly_rank
0,MAC000003,1,0,0,December,1,0,1.3904,3.399,0.066,1.7475,1
1,MAC000003,1,0,0,December,1,1,3.1894,3.337,3.062,0.1026,2
2,MAC000003,1,0,0,December,1,2,2.971,3.344,2.559,0.3444,3
3,MAC000003,1,0,0,December,1,3,2.8374,3.234,2.17,0.4154,4
4,MAC000003,1,0,0,December,1,4,2.4668,2.804,2.154,0.2681,5


In [12]:
%%bigquery --project machine-learning-msc df_test --use_bqstorage_api
-- Builds the test profile table and stores it in the DataFrame `df_test`.
-- Same shape of query as the training one, but reading test_set with
-- train_test_split = 'test'; the timestamp window is 2012-11-01 (inclusive)
-- up to 2013-03-01 (exclusive).
WITH stg1 AS (
SELECT 
lcl_id,
-- One 0/1 indicator column for each acorn_grouped value handled here.
IF(acorn_grouped = "Adversity", 1, 0) AS adversity,
IF(acorn_grouped = "Affluent", 1, 0) AS affluent,
IF(acorn_grouped = "Comfortable", 1, 0) AS comfortable,
-- Full month name (for example "December") taken from the timestamp.
FORMAT_DATETIME("%B", DATETIME(ts)) AS month_name,
dayofweek,
hhourly_rank,
-- Summary statistics of kwhh within each group, rounded to 4 decimal places.
ROUND(AVG(kwhh),4) AS hh_avg,
ROUND(MAX(kwhh),4) AS hh_max,
ROUND(MIN(kwhh),4) AS hh_min,
ROUND(STDDEV(kwhh),4) AS hh_stddev

FROM `machine-learning-msc.forecasting_20200719.test_set`
WHERE train_test_split = 'test'
AND ts >= '2012-11-01' AND ts < '2013-03-01'

-- Ordinals 1-7 are the seven non-aggregated columns above (lcl_id through hhourly_rank).
GROUP BY 1,2,3,4,5,6,7
)

SELECT 
*,
-- Sequential position of each row within one (lcl_id, month_name) partition,
-- ordered by dayofweek and then hhourly_rank.
ROW_NUMBER() OVER (PARTITION BY lcl_id, month_name ORDER BY dayofweek ASC, hhourly_rank ASC) AS weekly_rank
FROM stg1 
ORDER BY lcl_id, month_name, weekly_rank, hhourly_rank

In [13]:
# Preview the first rows of the test frame returned by the query above.
df_test.head()

Unnamed: 0,lcl_id,adversity,affluent,comfortable,month_name,dayofweek,hhourly_rank,hh_avg,hh_max,hh_min,hh_stddev,weekly_rank
0,MAC000004,0,1,0,December,1,0,0.0286,0.137,0.0,0.0607,1
1,MAC000004,0,1,0,December,1,1,0.0846,0.163,0.0,0.0795,2
2,MAC000004,0,1,0,December,1,2,0.0096,0.046,0.0,0.0204,3
3,MAC000004,0,1,0,December,1,3,0.0,0.0,0.0,0.0,4
4,MAC000004,0,1,0,December,1,4,0.0768,0.248,0.0,0.1124,5


# 3️⃣ **Generate SOM clusters**

### **Normalise Dataset**

In [14]:
# Build the project's Normaliser from two column lists and apply it to both
# data splits.
# NOTE(review): the exact roles of value_list and column_list are defined in
# src/train_clusters.py, which is not shown here - confirm there.
value_list = ['hh_avg']
column_list = ['month_name', 'weekly_rank']
normaliser = Normaliser(value_list, column_list)
# fit() sees only the training frame; the test frame goes through predict()
# on the same Normaliser instance.
norm_df_train = normaliser.fit(df_train)
norm_df_test = normaliser.predict(df_test)

### **Train SOM**

In [15]:
# SOM sweep: train one model per candidate cluster count (1 to 9) and collect
# the cluster assigned to every household in the train and test frames.
# `som_results` is rebuilt on every iteration, so after the loop it holds only
# the 9-cluster run.

# The upload stays off by default, as it was when this cell was last run
# (the call was commented out). Set to True to append each run to BigQuery.
UPLOAD_SOM_RESULTS = False
SOM_RESULT_COLUMNS = ['lcl_id', 'cluster', 'train_test_split']

for cluster_num in range(1, 10):
    print(f"Training {cluster_num} clusters....")
    som_cluster = TrainClusters(cluster_type="som")
    som_cluster.fit(norm_df_train, cluster_num=cluster_num, sigma=0.1, learning_rate=0.1)

    train_pred = som_cluster.predict(norm_df_train)
    test_pred = som_cluster.predict(norm_df_test)

    # Tag each prediction frame with its split before stacking them.
    train_pred['train_test_split'] = "train"
    test_pred['train_test_split'] = "test"

    som_results = pd.concat([train_pred[SOM_RESULT_COLUMNS], test_pred[SOM_RESULT_COLUMNS]])
    som_results['num_clusters'] = cluster_num
    som_results['cluster_type'] = 'som'

    if UPLOAD_SOM_RESULTS:
        # NOTE(review): '20200739' is not a calendar date - confirm this is the
        # intended dataset name before enabling the upload.
        output_to_bq(som_results, 'clusters_20200739.clusters')
        print("Upload to BQ completed! 🎉")
    else:
        # Previously the success message was printed even though nothing was uploaded.
        print("Upload to BQ skipped (UPLOAD_SOM_RESULTS is False).")

Training 1 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 28.62229932330481



The topographic error is not defined for a 1-by-1 map.

2681it [00:01, 2186.75it/s]
1000it [00:00, 2354.37it/s]


Upload to BQ completed! 🎉
Training 2 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 24.530185802382075


2681it [00:01, 2167.18it/s]
1000it [00:00, 2260.22it/s]


Upload to BQ completed! 🎉
Training 3 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 21.59310210230596


2681it [00:01, 2116.74it/s]
1000it [00:00, 2286.66it/s]


Upload to BQ completed! 🎉
Training 4 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 21.062202682086788


2681it [00:01, 2089.48it/s]
1000it [00:00, 2179.91it/s]


Upload to BQ completed! 🎉
Training 5 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 

0it [00:00, ?it/s]


 quantization error: 19.841573891542318


2681it [00:01, 2130.59it/s]
1000it [00:00, 2213.93it/s]


Upload to BQ completed! 🎉
Training 6 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 19.49002343512572


2681it [00:01, 2100.44it/s]
1000it [00:00, 2031.20it/s]


Upload to BQ completed! 🎉
Training 7 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 

0it [00:00, ?it/s]


 quantization error: 18.918113461647287


2681it [00:01, 1656.47it/s]
1000it [00:00, 2094.01it/s]


Upload to BQ completed! 🎉
Training 8 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 

0it [00:00, ?it/s]


 quantization error: 18.75056029192445


2681it [00:01, 2049.31it/s]
1000it [00:00, 2136.91it/s]


Upload to BQ completed! 🎉
Training 9 clusters....
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 18.623579866826525


2681it [00:01, 2037.97it/s]
1000it [00:00, 2116.14it/s]


Upload to BQ completed! 🎉


### **Train Agglomerative Clusters**

In [28]:
# Agglomerative sweep: fit one model per candidate cluster count (1 to 9),
# label the train and test households, and append each run to BigQuery.
# NOTE(review): '20200739' in the destination is not a calendar date - confirm
# the dataset name is intended.
keep_cols = ['lcl_id', 'cluster', 'train_test_split']

for cluster_num in range(1, 10):
    print(f"Training {cluster_num} clusters....")
    agglo_cluster = TrainClusters(cluster_type="agglo")
    agglo_cluster.fit(norm_df_train, cluster_num=cluster_num)

    train_pred = agglo_cluster.predict(norm_df_train)
    test_pred = agglo_cluster.predict(norm_df_test)

    # Mark which split each prediction frame came from.
    train_pred['train_test_split'] = "train"
    test_pred['train_test_split'] = "test"

    # Stack both splits, then add the run metadata. The float cast presumably
    # keeps `cluster` the same type as the SOM results (float64) - confirm.
    agglo_results = (
        pd.concat([train_pred[keep_cols], test_pred[keep_cols]])
        .assign(
            cluster=lambda stacked: stacked['cluster'].astype(float),
            num_clusters=cluster_num,
            cluster_type='agglo',
        )
    )

    output_to_bq(agglo_results, 'clusters_20200739.clusters')
    print("Upload to BQ completed! 🎉")

Training 1 clusters....


1it [00:02,  2.84s/it]


Upload to BQ completed! 🎉
Training 2 clusters....


1it [00:03,  3.92s/it]


Upload to BQ completed! 🎉
Training 3 clusters....


1it [00:03,  3.44s/it]


Upload to BQ completed! 🎉
Training 4 clusters....


1it [00:04,  4.55s/it]


Upload to BQ completed! 🎉
Training 5 clusters....


1it [00:02,  2.76s/it]


Upload to BQ completed! 🎉
Training 6 clusters....


1it [00:03,  3.39s/it]


Upload to BQ completed! 🎉
Training 7 clusters....


1it [00:02,  2.81s/it]


Upload to BQ completed! 🎉
Training 8 clusters....


1it [00:06,  6.57s/it]


Upload to BQ completed! 🎉
Training 9 clusters....


1it [00:03,  3.48s/it]

Upload to BQ completed! 🎉





In [16]:
# Column dtypes of the SOM results frame. The training loop reassigns
# `som_results` on every iteration, so this is the frame from the final run.
som_results.dtypes

lcl_id               object
cluster             float64
train_test_split     object
num_clusters          int64
cluster_type         object
dtype: object

In [27]:
# Column dtypes of the agglomerative results frame, for comparison with the
# SOM frame's dtypes shown above.
agglo_results.dtypes

lcl_id               object
cluster             float64
train_test_split     object
num_clusters          int64
cluster_type         object
dtype: object