Connected to che_env (Python 3.8.8)

# Experiment train analysis

Un notebook hecho para poder analizar los resultados de una búsqueda de hiperparámetros con Ray.

## Funciones auxiliares para el ploteo de entrenamientos y la extracción de resultados

In [2]:
import glob
import json
import os
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import signal
from ray.tune import ExperimentAnalysis


def training_data_experiment(experiment_dir:str,trials_id:list,metrics:list,windows_size:int,width=850,height=800):
    """Collect smoothed per-epoch metrics for a set of trials of one experiment.

    For each trial id, the trial directory is located through the first
    ``progress.csv`` matching ``<experiment_dir>/TorchTrainer_<trial_id>*``.
    The ``params.json`` file and the ``checkpoint*`` entries are then read
    from that same directory, so all data of a trial comes from one place.

    Args:
        experiment_dir: Directory that contains the ``TorchTrainer_*`` trial
            directories.
        trials_id: Iterable of trial ids to load.
        metrics: Names of the ``progress.csv`` columns to extract; each one is
            smoothed with ``smooth``.
        windows_size: Window passed to ``smooth``.
        width: Unused; kept so existing calls keep working.
        height: Unused; kept so existing calls keep working.

    Returns:
        A DataFrame with the columns ``Hyperparameters``, ``trial_id``,
        ``checkpoints``, ``epoch`` and one column per metric. Rows keep the
        index they had in each trial's ``progress.csv``. With no trials, an
        empty frame with those columns is returned.

    Raises:
        FileNotFoundError: If a trial has no ``progress.csv`` or no
            ``params.json``.
    """
    metrics = list(metrics)
    output_columns = ["Hyperparameters", "trial_id", "checkpoints", "epoch"] + metrics
    trial_frames = []

    for trial_id in trials_id:
        trial_pattern = os.path.join(experiment_dir, f"TorchTrainer_{trial_id}*")
        progress_files = glob.glob(os.path.join(trial_pattern, "progress.csv"))
        if not progress_files:
            raise FileNotFoundError(
                f"No progress.csv found for trial {trial_id!r} under {experiment_dir!r}"
            )
        progress_file = progress_files[0]
        trial_dir = os.path.dirname(progress_file)
        progress_df = pd.read_csv(progress_file)

        # Entries are named like "checkpoint_<number>"; sorting makes the
        # displayed list independent of the order glob happens to return.
        checkpoint_paths = glob.glob(os.path.join(glob.escape(trial_dir), "checkpoint*"))
        checkpoints_int = sorted(
            int(os.path.basename(path).split("_")[1]) for path in checkpoint_paths
        )

        with open(os.path.join(trial_dir, "params.json"), encoding="utf-8") as hpams_json:
            hpams = json.load(hpams_json)["train_loop_config"]

        hpams_str = f'FL1:{hpams["fully_layer_1"]} FL2:{hpams["fully_layer_2"]} DR:{hpams["drop_rate"]} BS:{hpams["batch_size"]} LR:{hpams["learning_rate"]}'

        # Work on an explicit copy so the assignments below never write into a
        # slice of progress_df (the chained-assignment pattern pandas warns on).
        data_from_trial = progress_df[["epoch"] + metrics].copy()

        for metric in metrics:
            data_from_trial[metric] = smooth(data_from_trial[metric].to_numpy(), windows_size)

        data_from_trial["Hyperparameters"] = hpams_str
        data_from_trial["trial_id"] = trial_id
        data_from_trial["checkpoints"] = str(checkpoints_int)

        trial_frames.append(data_from_trial)

    if not trial_frames:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once instead of growing a frame inside the loop, which
    # re-copied every previous row per trial and started from an empty frame.
    return pd.concat(trial_frames, axis=0)[output_columns]

def smooth(data, window):
    """Exponentially smooth a 1-D series.

    Computes ``out[0] = data[0]`` and, for ``i > 0``,
    ``out[i] = alpha * data[i] + (1 - alpha) * out[i - 1]`` with
    ``alpha = 2 / (window + 1)``.

    The recursion is evaluated directly as a first-order IIR filter. The
    earlier closed form multiplied by ``(1 / (1 - alpha)) ** i``, which
    overflows on long series (producing NaN) and divides by zero for
    ``window == 1``.

    Args:
        data: 1-D array-like of numbers.
        window: Smoothing span, must be >= 1. ``window == 1`` gives
            ``alpha == 1`` and therefore returns the data unsmoothed.

    Returns:
        A new float ``numpy.ndarray`` with the same length as ``data``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window!r}")

    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values.copy()

    alpha = 2 / (window + 1.0)
    decay = 1 - alpha

    # The initial filter state encodes out[-1] == data[0], so the first output
    # equals data[0] exactly, as in the original formulation.
    smoothed, _ = signal.lfilter([alpha], [1.0, -decay], values, zi=[decay * values[0]])
    return smoothed

def plot_training_curve_single_model(data_to_plot,width=850,height=800):
    """Plot metric curves against epoch for several trials of one model.

    Only meant to be used with multiple trials of the same model.

    Every column other than ``Hyperparameters``, ``checkpoints``,
    ``trial_id`` and ``epoch`` is treated as a metric and plotted on the y
    axis. With more than one metric, trials are told apart by line dash;
    with a single metric they are told apart by color.

    Args:
        data_to_plot: DataFrame holding the columns listed above plus the
            metric columns.
        width: Figure width passed to ``px.line``.
        height: Figure height passed to ``px.line``.

    Returns:
        The figure created by ``px.line``.
    """
    reserved_columns = ("Hyperparameters", "checkpoints", "trial_id", "epoch")
    metrics = [column for column in data_to_plot.columns if column not in reserved_columns]

    # The only difference between the two cases is which visual channel
    # carries the trial id.
    trial_channel = "line_dash" if len(metrics) > 1 else "color"
    channel_option = {trial_channel: "trial_id"}

    return px.line(
        data_to_plot,
        x="epoch",
        y=metrics,
        width=width,
        height=height,
        hover_data=["Hyperparameters", "checkpoints"],
        **channel_option,
    )

def plot_training_curve_multiple_model(data_to_plot:dict,width=850,height=800):
    """Plot metric curves against epoch for several models on one figure.

    Only meant to be used with single trials: many different models, but
    only one trial for each of them.

    Each frame is labelled with its dictionary key in a ``model`` column and
    all frames are stacked. Every column other than ``Hyperparameters``,
    ``checkpoints``, ``trial_id``, ``epoch`` and ``model`` is treated as a
    metric. With more than one metric, models are told apart by line dash;
    with a single metric they are told apart by color.

    Args:
        data_to_plot: Mapping from a model label to that model's DataFrame.
            The frames passed in are not modified.
        width: Figure width passed to ``px.line``.
        height: Figure height passed to ``px.line``.

    Returns:
        The figure created by ``px.line``.
    """
    # "model" is a label column, not a metric. Leaving it out of this set sent
    # it to px.line as a y series and made the single-metric branch
    # unreachable.
    non_metric_columns = {"Hyperparameters", "checkpoints", "trial_id", "epoch", "model"}

    # assign() returns labelled copies, so the caller's frames stay untouched.
    labelled_frames = [df.assign(model=key) for key, df in data_to_plot.items()]
    if labelled_frames:
        data_to_plot_compiled = pd.concat(labelled_frames, axis=0)
    else:
        data_to_plot_compiled = pd.DataFrame()

    metrics = [column for column in data_to_plot_compiled.columns if column not in non_metric_columns]

    model_channel = "line_dash" if len(metrics) > 1 else "color"
    channel_option = {model_channel: "model"}

    return px.line(
        data_to_plot_compiled,
        x="epoch",
        y=metrics,
        width=width,
        height=height,
        hover_data=["Hyperparameters", "checkpoints"],
        **channel_option,
    )
        
def get_result_df(experiment_path):
    """Return the ``results_df`` table of the experiment at ``experiment_path``.

    Args:
        experiment_path: Path handed to ``ExperimentAnalysis`` as
            ``experiment_checkpoint_path``.

    Returns:
        The ``results_df`` attribute of the loaded ``ExperimentAnalysis``.
    """
    analysis = ExperimentAnalysis(experiment_checkpoint_path=experiment_path)
    results = analysis.results_df
    return results

  from .autonotebook import tqdm as notebook_tqdm
2024-08-06 11:27:24,865	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-08-06 11:27:25,042	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Extracción del df de resultados

In [3]:
# Disabled loading of a second experiment, kept for reference.
# NOTE(review): the second disabled line uses `chembl262_normal_path`, while
# the line above it defines `normal_path` — fix the name before re-enabling.
#normal_path = "/home/sjinich/disco/TrypanoDEEPscreen/.experiments/chembl4567_poster"
#normal = get_result_df(chembl262_normal_path)

# Load the results table of the experiment stored at `featurization_path`.
featurization_path = "/home/sjinich/disco/TrypanoDEEPscreen/.experiments/chembl2581_rdkitfeaturization"
featurization = get_result_df(featurization_path)

- TorchTrainer_802fd_00280: FileNotFoundError('Could not fetch metrics for TorchTrainer_802fd_00280: both result.json and progress.csv were not found at /home/sjinich/disco/TrypanoDEEPscreen/.experiments/chembl2581_rdkitfeaturization/TorchTrainer_802fd_00280_280_batch_size=64,drop_rate=0.5000,fully_layer_1=256,fully_layer_2=32,learning_rate=0.0050_2024-07-07_06-48-46')
- TorchTrainer_802fd_00281: FileNotFoundError('Could not fetch metrics for TorchTrainer_802fd_00281: both result.json and progress.csv were not found at /home/sjinich/disco/TrypanoDEEPscreen/.experiments/chembl2581_rdkitfeaturization/TorchTrainer_802fd_00281_281_batch_size=64,drop_rate=0.3000,fully_layer_1=16,fully_layer_2=128,learning_rate=0.0010_2024-07-07_06-55-50')
- TorchTrainer_802fd_00282: FileNotFoundError('Could not fetch metrics for TorchTrainer_802fd_00282: both result.json and progress.csv were not found at /home/sjinich/disco/TrypanoDEEPscreen/.experiments/chembl2581_rdkitfeaturization/TorchTrainer_802fd_002

## Ordenar y analizar los resultados
Acá se elige qué trials plotear: por ejemplo, los que tengan el mejor MCC, o los mejores según las distintas métricas, etc.

In [4]:
# Keep the ids of the best trials, ranked by validation MCC (`val_mcc`,
# highest first).
# NOTE(review): the variable name and the original comment say "5 best", but
# head(10) keeps 10 trials — confirm which number is intended.
# best_5_id_normal = normal.sort_values("val_mcc",ascending=False).head(5).index
best_5_id_features = featurization.sort_values("val_mcc",ascending=False).head(10).index


Primero quiero ver si los mejores trials se comportan igual.

In [8]:
# Gather the val_loss and train_loss curves of the selected trials, smoothed
# with a window of 5, and plot them as trials of a single model.
featurization_top5 = training_data_experiment(featurization_path,best_5_id_features,["val_loss","train_loss"],windows_size=5)
fig = plot_training_curve_single_model(featurization_top5)

In [9]:
# Bare expression: shows the figure as this cell's output.
fig

In [7]:
# Save the figure to an HTML file.
# NOTE(review): the file name says "cruve" (probably meant "curve") — confirm
# before renaming, since other tooling may expect this exact path.
fig.write_html("/home/sjinich/disco/TrypanoDEEPscreen/src/analysis/chembl2581_training_cruve.html")