# Design fine-grained model search

- Read in the best model (latent size) from the coarse-grained search
- Define latent sizes in its close vicinity (+ / -)
- (Same `lr` and `dropout` scan as in `coarse`)

In [1]:
import os
import pandas as pd
import numpy as np
import json
import pickle 
import torch
import itertools
import time

## Check `lr` and `dropout` of best models for all chromosomes
- Question: can we reduce the grid-search space for those parameters?

In [3]:
### Collect the `lr` and `dropr` of the best coarse-search model of chr1..chr22
dict_overviewBestModelParams = {"lr": [], "dropr": []}
for chrom_number in range(1, 23):
    coarse_dir = f"logs/optimisation/chr{chrom_number}/coarse"
    with open(f"{coarse_dir}/best_model_coarseOptimization.json", "r") as best_file:
        best_model = json.load(best_file)
    ## one entry per chromosome and hyperparameter (`lr` first, then `dropr`)
    for hp_name, collected_values in dict_overviewBestModelParams.items():
        collected_values.append(best_model[hp_name])

### Print how often each hyperparameter value was selected
for hp_name in ("lr", "dropr"):
    print(pd.Series(dict_overviewBestModelParams[hp_name]).value_counts())

0.0001    16
0.0005     6
dtype: int64
0.1    16
0.3     6
dtype: int64


# Design fine-search grid

In [2]:
def _fine_latent_sizes(coarse_sizes, best_size, n_points=5):
    """Return the latent sizes to scan in the fine search around `best_size`.

    The interval between `best_size` and each of its direct neighbors in
    `coarse_sizes` is split with `np.linspace(..., n_points)`; the neighbors
    themselves are excluded, `best_size` is included once.

    Parameters
    ----------
    coarse_sizes : list of int
        Latent sizes of the coarse grid, in the order they were stored.
        (The recorded fine grids suggest the list is stored in descending
        order, but this function does not rely on it.)
    best_size : int
        Latent size of the best coarse model; must be contained in `coarse_sizes`.
    n_points : int, default 5
        Number of linspace points per interval (end points included).

    Returns
    -------
    list of int
        Rounded latent sizes without duplicates, ordered from the neighbor that
        follows `best_size` in `coarse_sizes`, over `best_size`, towards the
        neighbor that precedes it.

    Raises
    ------
    ValueError
        If `best_size` is not part of `coarse_sizes`.
    """
    idx_best = coarse_sizes.index(best_size)
    fine_sizes = []

    ## Side 1: neighbor stored after the best size (end point = best size is kept).
    ## If the best size is the last entry there is no such neighbor; indexing
    ## idx_best+1 would raise an IndexError, so only the best size itself is kept.
    if idx_best + 1 < len(coarse_sizes):
        fine_sizes += np.linspace(coarse_sizes[idx_best + 1], best_size, n_points)[1:].round().tolist()
    else:
        fine_sizes.append(best_size)

    ## Side 2: neighbor stored before the best size (both end points dropped).
    ## If the best size is the first entry, idx_best-1 == -1 would silently wrap
    ## around to the LAST coarse size, so this side is skipped instead.
    if idx_best > 0:
        fine_sizes += np.linspace(best_size, coarse_sizes[idx_best - 1], n_points)[1:-1].round().tolist()

    ## Rounding can collapse close values onto the same integer; duplicates would
    ## later produce identical run names, so drop them while keeping the order.
    return list(dict.fromkeys(map(int, fine_sizes)))


### Chromosomes to design a fine grid for (alternative: [f"chr{i}" for i in range(1,15)])
for CHR in ["chr6", "chr8"]:
    PATH_results = f"logs/optimisation/{CHR}/fine"
    os.makedirs(PATH_results, exist_ok=True)
    param_grid = dict()

    ### Step 1: read in old model
    ## a. original parameters
    with open(f"logs/optimisation/{CHR}/coarse/param_grid_coarse.json", "r") as f:
        dict_oldGrid = json.load(f)
    ## b. best HP model
    with open(f"logs/optimisation/{CHR}/coarse/best_model_coarseOptimization.json", "r") as f:
        dict_bestModel = json.load(f)

    ### Step 2: design hidden layers --> same as in coarse grid!
    param_grid["hidden_layer_encoder_topology"] = dict_oldGrid["hidden_layer_encoder_topology"]

    ### Step 3: design latSizes
    ## Points between the best latent size and its neighboring coarse latSizes
    param_grid["latentSize_fine"] = _fine_latent_sizes(
        dict_oldGrid["latentSize_coarse"], int(dict_bestModel["latSize"])
    )

    ### Step 4: decide on lr and dropout rates to try
    ### reduce searchable space according to experiment above
    lr_scan = [1e-4, 5e-4]  # 1e-3 dropped from the scan
    param_grid["lr"] = lr_scan
    dropout_scan = [0.1, 0.3]
    param_grid["dropout"] = dropout_scan

    ### Save parameter grid in file for later documentation
    with open(f"{PATH_results}/param_grid_fine.json", "w") as f:
        json.dump(param_grid, f, indent="\t")

In [3]:
### Chromosomes to write submit scripts for (alternative: [f"chr{i}" for i in range(1,15)])
for CHR in ["chr6", "chr8"]:
    PATH_results = f"logs/optimisation/{CHR}/fine"
    ### Step 5: generate combinations of parameters and replace in file; standalone script?
    ### Load parameter grid
    with open(f"{PATH_results}/param_grid_fine.json", "r") as grid_file:
        param_grid = json.load(grid_file)

    ### Generate combinations; 'hidden_layer_encoder_topology' is fixed and therefore
    ### left out of the cartesian product
    scan_keys = [key for key in param_grid if key != "hidden_layer_encoder_topology"]
    scan_combinations = [
        dict(zip(scan_keys, values))
        for values in itertools.product(*(param_grid[key] for key in scan_keys))
    ]
    print(f"Number of combinations: {len(scan_combinations)}")

    ### Generate one submit script per combination from the template
    submit_dir = f"{PATH_results}/submit"
    os.makedirs(submit_dir, exist_ok=True)
    with open("submit_template.sh", "r") as template_file:
        template = template_file.read()

    for combination in scan_combinations:
        latSize, lr, dropr = (str(combination[key]) for key in ("latentSize_fine", "lr", "dropout"))

        ### Run name - combination of parameter settings
        fileName = f"latSize_{latSize}_lr_{lr}_dropr_{dropr}"

        ### Placeholders are substituted one after another, in this order.
        ### NOTE(review): plain substring replacement - any literal "$PATH" in the
        ### template (e.g. the shell variable) is replaced as well; confirm template.
        topology = param_grid["hidden_layer_encoder_topology"]
        substitutions = [
            ("$PATH", PATH_results + "/" + fileName),
            ("$CHR", str(CHR)),
            ("$HIDDEN_1", str(topology[0])),
            ("$HIDDEN_2", str(topology[1])),
            ("$LATSIZE", latSize),
            ("$LR", lr),
            ("$DROPR", dropr),
        ]
        script_text = template
        for placeholder, value in substitutions:
            script_text = script_text.replace(placeholder, value)

        with open(f"{submit_dir}/{fileName}.sh", "w") as script_file:
            script_file.write(script_text)
        print(f"Wrote file \t{fileName}")

Number of combinations: 28
Wrote file 	latSize_52_lr_0.0001_dropr_0.1
Wrote file 	latSize_52_lr_0.0001_dropr_0.3
Wrote file 	latSize_52_lr_0.0005_dropr_0.1
Wrote file 	latSize_52_lr_0.0005_dropr_0.3
Wrote file 	latSize_65_lr_0.0001_dropr_0.1
Wrote file 	latSize_65_lr_0.0001_dropr_0.3
Wrote file 	latSize_65_lr_0.0005_dropr_0.1
Wrote file 	latSize_65_lr_0.0005_dropr_0.3
Wrote file 	latSize_78_lr_0.0001_dropr_0.1
Wrote file 	latSize_78_lr_0.0001_dropr_0.3
Wrote file 	latSize_78_lr_0.0005_dropr_0.1
Wrote file 	latSize_78_lr_0.0005_dropr_0.3
Wrote file 	latSize_90_lr_0.0001_dropr_0.1
Wrote file 	latSize_90_lr_0.0001_dropr_0.3
Wrote file 	latSize_90_lr_0.0005_dropr_0.1
Wrote file 	latSize_90_lr_0.0005_dropr_0.3
Wrote file 	latSize_110_lr_0.0001_dropr_0.1
Wrote file 	latSize_110_lr_0.0001_dropr_0.3
Wrote file 	latSize_110_lr_0.0005_dropr_0.1
Wrote file 	latSize_110_lr_0.0005_dropr_0.3
Wrote file 	latSize_130_lr_0.0001_dropr_0.1
Wrote file 	latSize_130_lr_0.0001_dropr_0.3
Wrote file 	latSize_1

# Overview of all fine-grained jobs

In [4]:
# Directory containing the per-chromosome `{CHR}_train_methyl_array.pkl` files
# that are loaded in the overview cell below.
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
PATH_data = "/data/scratch/skatz/PROJECTS/methylnet/1_healthyVAE/data/GSE87571/train_val_test_sets/"

In [5]:
### Columns of the overview table. They are listed explicitly so that this cell
### does not depend on a `param_grid` variable left over from an earlier cell.
overview_columns = ["num_cpgs", "hidden_layer_encoder_topology", "latentSize_fine", "lr", "dropout"]
overview_chromosomes = [f"chr{ele}" for ele in range(1,23)]

### Collect one record per chromosome and build the table once at the end
overview_records = []
for CHR in overview_chromosomes:
    ### Get number of input CpGs (second dimension of the "beta" entry)
    ### NOTE: pickle.load can execute arbitrary code - only load trusted files
    with open(os.path.join(PATH_data, f"{CHR}_train_methyl_array.pkl"), "rb") as f:
        train_dataset = pickle.load(f)
    num_cpgs = train_dataset["beta"].shape[1]

    ### Parse parameter grid of the fine search
    PATH_results = f"logs/optimisation/{CHR}/fine"
    with open(f"{PATH_results}/param_grid_fine.json", "r") as f:
        param = json.load(f)

    ### Add to output records; the grid entries are stored as whole lists per cell
    record = {"num_cpgs": num_cpgs}
    record.update({column: param[column] for column in overview_columns[1:]})
    overview_records.append(record)

df_overview = pd.DataFrame(overview_records, index=overview_chromosomes, columns=overview_columns)

df_overview

Unnamed: 0,num_cpgs,hidden_layer_encoder_topology,latentSize_fine,lr,dropout
chr1,29482,"[8840, 1770]","[38, 45, 52, 60, 72, 85, 98]","[0.0001, 0.0005]","[0.1, 0.3]"
chr2,21984,"[6600, 1320]","[50, 60, 70, 80, 100, 120, 140]","[0.0001, 0.0005]","[0.1, 0.3]"
chr3,15547,"[4660, 930]","[38, 45, 52, 60, 75, 90, 105]","[0.0001, 0.0005]","[0.1, 0.3]"
chr4,13100,"[3930, 790]","[28, 35, 42, 50, 62, 75, 88]","[0.0001, 0.0005]","[0.1, 0.3]"
chr5,15965,"[4790, 960]","[22, 25, 28, 30, 38, 45, 52]","[0.0001, 0.0005]","[0.1, 0.3]"
chr6,23070,"[6920, 1380]","[52, 65, 78, 90, 110, 130, 150]","[0.0001, 0.0005]","[0.1, 0.3]"
chr7,19599,"[5880, 1180]","[48, 55, 62, 70, 90, 110, 130]","[0.0001, 0.0005]","[0.1, 0.3]"
chr8,13704,"[4110, 820]","[62, 75, 88, 100, 125, 150, 175]","[0.0001, 0.0005]","[0.1, 0.3]"
chr9,6619,"[1990, 400]","[62, 75, 88, 100, 125, 150, 175]","[0.0001, 0.0005]","[0.1, 0.3]"
chr10,15736,"[4720, 940]","[38, 45, 52, 60, 75, 90, 105]","[0.0001, 0.0005]","[0.1, 0.3]"
