# Re-run best models from optimisation for final comparison

- Read in best model from fine-grained
- Rerun

In [3]:
import os
import pandas as pd
import numpy as np
import json
import pickle 
import torch

# Find best models

In [4]:
for CHR in [f"chr{i}" for i in range(6,9)]:
    PATH_results = f"logs/finalModels/{CHR}"
    os.makedirs(PATH_results, exist_ok=True)
    param_grid = dict()

    ### Step 1: read in old model
    ## a. original parameters
    with open(f"logs/optimisation/{CHR}/fine/param_grid_fine.json", "r") as f: dict_oldGrid = json.load(f)
    ## b. best HP model
    with open(f"logs/optimisation/{CHR}/fine/best_model_fineOptimization.json", "r") as f: dict_bestModel = json.load(f)

    ### Step 2: design hidden layers --> same as in coarse grid!
    param_grid["hidden_layer_encoder_topology"] = dict_oldGrid["hidden_layer_encoder_topology"]
    ### Step 3: design latSizes
    param_grid["latentSize"] = int(dict_bestModel["latSize"])
    ### Step 4: get lr and dropout
    param_grid["lr"] = dict_bestModel["lr"]
    param_grid["dropout"] = dict_bestModel["dropr"]

    ### Save parameter grid in file for later documentation
    with open(f"{PATH_results}/param_grid.json", "w") as f: f.write(json.dumps(param_grid, indent="\t"))

In [5]:
for CHR in [f"chr{i}" for i in range(6,9)]:
    PATH_results = f"logs/finalModels/{CHR}"
    ### Load parameter grid
    with open(f"{PATH_results}/param_grid.json", "r") as f: param_grid_comb=json.loads(f.read())

    ### Generate submit.sh with combinations
    os.makedirs(f"{PATH_results}/submit", exist_ok=True)
    with open("submit_template.sh", "r") as f: template=f.read()             
    latSize = str(param_grid_comb["latentSize"])
    lr = str(param_grid_comb["lr"])
    dropr = str(param_grid_comb["dropout"])

    ### Generate run name - combination of parameter settings
    fileName = f"latSize_{latSize}"
    ### Replace in template file
    template_updated = template.replace("$PATH", str(PATH_results+"/"+fileName)) \
                                   .replace("$CHR",  str(CHR)) \
                                   .replace("$HIDDEN_1", str(param_grid["hidden_layer_encoder_topology"][0])) \
                                   .replace("$HIDDEN_2", str(param_grid["hidden_layer_encoder_topology"][1])) \
                                   .replace("$LATSIZE", latSize) \
                                   .replace("$LR", lr) \
                                   .replace("$DROPR", dropr)
    with open(f"{PATH_results}/submit/{fileName}.sh", "w") as f: f.write(template_updated)
    print(f"Wrote file \t{fileName}")

Wrote file 	latSize_78
Wrote file 	latSize_70
Wrote file 	latSize_88


# Overview all final jobs

In [6]:
PATH_data = "/data/scratch/skatz/PROJECTS/methylnet/1_healthyVAE/data/GSE87571/train_val_test_sets/"

In [7]:
param_grid = ['hidden_layer_encoder_topology', 'latentSize', 'lr', 'dropout']
df_overview = pd.DataFrame(index=[f"chr{ele}" for ele in range(1,23)], columns=["num_cpgs"]+param_grid)

for CHR in [f"chr{ele}" for ele in range(1,23)]:
    ### Get number of input CpGs
    with open(os.path.join(PATH_data, f"{CHR}_train_methyl_array.pkl"), "rb") as f: train_dataset = pickle.load(f) #
    num_cpgs = train_dataset["beta"].shape[1]
    
    ### Parse parameter grid
    PATH_results = f"logs/finalModels/{CHR}"
    with open(f"{PATH_results}/param_grid.json", "r") as f: param = json.load(f)
        
    ### add to output dataframe
    df_overview.loc[CHR,"num_cpgs"] = num_cpgs
    df_overview.loc[CHR,"hidden_layer_encoder_topology"] = param["hidden_layer_encoder_topology"]
    df_overview.loc[CHR,"latentSize"] = param["latentSize"]
    df_overview.loc[CHR,"lr"] = param["lr"]
    df_overview.loc[CHR,"dropout"] = param["dropout"]
    
df_overview

Unnamed: 0,num_cpgs,hidden_layer_encoder_topology,latentSize,lr,dropout
chr1,29482,"[8840, 1770]",72,0.0001,0.1
chr2,21984,"[6600, 1320]",60,0.0001,0.1
chr3,15547,"[4660, 930]",45,0.0001,0.1
chr4,13100,"[3930, 790]",50,0.0001,0.1
chr5,15965,"[4790, 960]",38,0.0001,0.1
chr6,23070,"[6920, 1380]",78,0.0005,0.1
chr7,19599,"[5880, 1180]",70,0.0001,0.1
chr8,13704,"[4110, 820]",88,0.0005,0.3
chr9,6619,"[1990, 400]",88,0.0005,0.3
chr10,15736,"[4720, 940]",52,0.0001,0.1


In [8]:
'''
Reduction by factor of... 
'''
df_overview["num_cpgs"] / df_overview["latentSize"]

chr1     409.472222
chr2          366.4
chr3     345.488889
chr4          262.0
chr5     420.131579
chr6     295.769231
chr7     279.985714
chr8     155.727273
chr9      75.215909
chr10    302.615385
chr11         371.4
chr12    291.634615
chr13     86.688889
chr14      196.1875
chr15    202.395833
chr16        276.46
chr17    284.066667
chr18         50.75
chr19    257.916667
chr20        74.625
chr21     45.316667
chr22          74.9
dtype: object

In [14]:
df_overview["latentSize"].sum()

2009