In [None]:
import pickle
import pandas as pd
import glob
import configparser
import os
import altair as alt
import numpy as np


### Build chart comparing edge weights and cutoff values observed for choices of percentile cutoff for embeddings

#### Pre-requisite:
1. Once embeddings have been generated for each choice of percentile cutoff (i.e. using `models/scripts/load_embeddings_from_model.py`), gather the log file to obtain the edge weight cutoff values and the number of edges remaining.
2. Using the data from the log, create a CSV file named `embedding_percentiles.csv` with three columns: `Percentile cutoff`, `Cutoff value` and `Edges remaining`. Save it in the same directory as this notebook (or change the path in the next cell as needed).

In [None]:
# Read the percentile sweep results and enforce the dtype of each column in a single pass
percentileColumnTypes = {
    'Percentile cutoff': int,
    'Cutoff value': float,
    'Edges remaining': int,
}
embeddingPercentilesDf = pd.read_csv('embedding_percentiles.csv').astype(percentileColumnTypes)
embeddingPercentilesDf.head()

In [None]:
# Layered chart: edge weight cutoff value and edges retained, both plotted against the percentile cutoff
axisFontSizes = dict(labelFontSize=12, titleFontSize=14)  # font sizing shared by every axis
cutoffColor = '#5276A7'
edgesColor = '#f57542'

# Base chart carries the x encoding that both lines share (percentile treated as ordinal)
percentileAxis = alt.Axis(title="Percentile cutoff (in %)", grid=True, **axisFontSizes)
base = alt.Chart(embeddingPercentilesDf).encode(
    alt.X('Percentile cutoff:O', axis=percentileAxis)
)

# Line for the observed edge weight cutoff; its axis title is colored to match the line
cutoffAxis = alt.Axis(title='Edge weight cutoff value observed', titleColor=cutoffColor, **axisFontSizes)
cutoffLine = base.mark_line(stroke=cutoffColor, interpolate='monotone').encode(
    alt.Y('Cutoff value', axis=cutoffAxis)
)

# Line for the number of edges retained; its axis title is colored to match the line
edgesAxis = alt.Axis(title='Total number of edges retained', titleColor=edgesColor, **axisFontSizes)
remainingEdgesLine = base.mark_line(stroke=edgesColor, interpolate='monotone').encode(
    alt.Y('Edges remaining', axis=edgesAxis)
)

# The two measures live on very different scales, so each layer gets its own y scale
percentileChartTitle = alt.TitleParams(
    text='Trending of number of edges and edge weight cutoff based on percentile choice',
    fontSize=20,
)
alt.layer(cutoffLine, remainingEdgesLine).resolve_scale(y='independent').properties(
    title=percentileChartTitle,
    width=500,
)

### Build chart comparing the choices of the `hidden_size` hyperparameter used when training the model.

#### Pre-requisite:
1. Once training has been completed on the GNN (i.e. using `models/scripts/graph_network_GCNConv_mse_loss.py`), gather the log file to get hold of the metrics gathered during training along with validation and test scores.
2. Using the data from the log, create a CSV file named `hyperparameter_tuning_hidden_layer.csv` with the columns `Epoch`, `train_score`, `validation_score`, `test_score` and `hidden layer size`. Save it in the same directory as this notebook (or change the path in the next cell as needed).

In [None]:
# Load the training metrics gathered for every hidden layer size that was tried
hyperParamsCsvPath = 'hyperparameter_tuning_hidden_layer.csv'
hyperParamsDf = pd.read_csv(hyperParamsCsvPath)
hyperParamsDf.head()

In [None]:
def generateChart(chartDf, testMseScore, hidden_size, yDomain=(0, 0.5)):
    """Build the train/validation loss chart for one hidden layer size.

    Parameters
    ----------
    chartDf : pandas.DataFrame
        Data to plot; the chart reads the columns `Epoch`, `train_score`
        and `validation_score`.
    testMseScore :
        Value rendered in the "Test MSE loss : ..." text annotation.
    hidden_size :
        Value rendered in the chart title ("Hidden size = ...").
    yDomain : sequence of two numbers, optional
        Lower and upper bound of the y-axis scale. Defaults to (0, 0.5),
        the range this function always used before the parameter existed.

    Returns
    -------
    The layered Altair chart (loss lines plus the test-score annotation).
    """
    epochAxis = alt.Axis(title="Epochs in training", grid=True, labelFontSize=12, titleFontSize=14)
    lossAxis = alt.Axis(title='MSE loss score', labelFontSize=8, titleFontSize=14)

    # Fold the two score columns into long form so each one is drawn as its own colored line
    folded_chart = alt.Chart(chartDf).mark_line().transform_fold(
        fold=['train_score', 'validation_score'],
        as_=['Type of loss', 'MSE loss score']
    ).encode(
        alt.X('Epoch:O', axis=epochAxis),
        y=alt.Y('MSE loss score:Q', scale=alt.Scale(domain=list(yDomain)), axis=lossAxis),
        color='Type of loss:N'
    )

    # Single-row frame holding the annotation text for the test score
    static_text = alt.Chart(pd.DataFrame({'text': ['Test MSE loss : {}'.format(testMseScore)]})).mark_text(
        fontSize=10,
        align='left',
        dx=3,  # Adjust the x-offset for positioning
        dy=-10,  # Adjust the y-offset for positioning
    ).encode(
        text='text:N'
    )

    # Combine the folded chart and the static text annotation
    layered_chart = alt.layer(folded_chart, static_text).properties(
        title=alt.TitleParams(text="Hidden size = {}".format(hidden_size), fontSize=14)
    )
    return layered_chart


In [None]:
chartsPerRow = 2  # Number of charts placed side by side in each row of the grid

# One chart per hidden layer size; groupby(sort=False) keeps the sizes in order of first appearance
charts = []  # Container for the charts that will be generated
for hidden_size, chartDf in hyperParamsDf.groupby('hidden layer size', sort=False):
    # Only the first test score of the group is shown — assumes it is constant per hidden layer size
    testMseScore = round(chartDf['test_score'].iloc[0], 6)
    charts.append(generateChart(chartDf, testMseScore, hidden_size))

if not charts:
    raise ValueError("hyperParamsDf contains no hidden layer sizes to plot")

# Lay the charts out in rows of `chartsPerRow` (2x2 for four hidden sizes) and display the grid;
# building the rows from the list avoids indexing a fixed number of charts
chartRows = [
    alt.hconcat(*charts[start:start + chartsPerRow])
    for start in range(0, len(charts), chartsPerRow)
]
alt.vconcat(*chartRows).properties(
    title=alt.TitleParams(text='Optimizing model based on hidden layer size in GNN', fontSize=20, anchor='middle', offset=30),
)