In [1]:
import pandas as pd
import numpy as np

Load the `.csv` file produced by `stage-p2x`.

In [2]:
# Per-run table read from a local CSV; it is merged with the fine-tuning
# results on the `run_name` column further down.
df = pd.read_csv('runs_with_eval_loss_and_params.csv')

Load the MolNet fine-tuning results from S3.

In [3]:
import s3fs 

In [4]:
import os
import json

In [5]:
# S3 filesystem handle. `get_dataframes` below reads this module-level `fs`
# (it is not passed as an argument) to list directories and open files.
fs = s3fs.S3FileSystem()

In [1]:
# Bucket name interpolated into the `s3://` URI stored in `cloud_dir`.
# NOTE(review): empty here -- presumably redacted; set it to the real bucket
# before running the cells below.
model_bucket = ""

In [6]:
# Root directory of the fine-tuning results; `get_dataframes` treats each
# immediate child of this directory as one run.
cloud_dir = f"s3://{model_bucket}/chemberta/mlm_pretraining_5M_20210722/molnet_mlm_5M_ft_20210727/"

In [7]:
def get_dataframes(cloud_dir):
    """Collect MolNet fine-tuning metrics for every run found under ``cloud_dir``.

    The directory tree is read through the module-level ``fs`` filesystem and
    is expected to look like::

        <cloud_dir>/<run_name>/<molnet_task>/results/{valid,test}/metrics.json

    Each ``metrics.json`` is a mapping ``{seed: {metric_name: value}}``. The
    metric names are taken from the first seed entry of each file. For the
    ``"pearsonr"`` metric only the first element of the stored value is kept
    (presumably the correlation coefficient of an ``(r, p-value)`` pair --
    TODO confirm against the code that writes the file).

    Args:
        cloud_dir: Directory whose immediate children are the run directories.

    Returns:
        A ``(df_all, df_avg)`` tuple of DataFrames:

        * ``df_all`` -- one row per seed of every run, with one
          ``<task>_<subset>_<metric>`` column per metric and ``run_name`` as
          the last column. The index restarts at 0 for every run.
        * ``df_avg`` -- one row per run, with ``run_name`` first followed by
          ``<task>_<subset>_<metric>_mean`` / ``_std`` columns.

        Both frames are empty when ``cloud_dir`` contains no runs.

    Raises:
        IndexError: If a ``metrics.json`` file contains no seed entries.
        ValueError: If the tasks of a single run do not all have the same
            number of seeds (their columns cannot form one frame).
    """
    per_run_frames = []
    data_avg = []
    for rd in fs.ls(cloud_dir):
        run_name = os.path.basename(os.path.normpath(rd))
        molnet_task_data_avg = {}
        molnet_task_data_all = {}
        # Go one level down to get the MolNet task directories of this run.
        for molnet_task_dir in fs.ls(rd):
            molnet_task_name = os.path.basename(os.path.normpath(molnet_task_dir))
            results_dir = os.path.join(molnet_task_dir, "results/")
            for subset in ["valid", "test"]:
                with fs.open(os.path.join(results_dir, subset, "metrics.json")) as f:
                    metrics = json.load(f)
                # The first seed entry defines which metrics are reported.
                metric_names = list(list(metrics.values())[0].keys())
                metric_res = {mn: [] for mn in metric_names}
                for res in metrics.values():
                    for mn, mres in res.items():
                        # "pearsonr" is stored as a sequence; keep its first element only.
                        metric_res[mn].append(mres[0] if mn == "pearsonr" else mres)
                column_prefix = f"{molnet_task_name}_{subset}"
                for mn in metric_names:
                    molnet_task_data_all[f"{column_prefix}_{mn}"] = metric_res[mn]
                # Keep the original column order: all means first, then all stds.
                for mn in metric_names:
                    molnet_task_data_avg[f"{column_prefix}_{mn}_mean"] = np.mean(metric_res[mn])
                for mn in metric_names:
                    molnet_task_data_avg[f"{column_prefix}_{mn}_std"] = np.std(metric_res[mn])
        run_frame = pd.DataFrame(molnet_task_data_all)
        # Broadcast the run name over however many seed rows this run has,
        # instead of assuming exactly five seeds.
        run_frame["run_name"] = run_name
        per_run_frames.append(run_frame)
        data_avg.append({"run_name": run_name, **molnet_task_data_avg})

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas >= 2.0, and growing a frame inside the loop is quadratic.
    # pd.concat raises on an empty list, so fall back to an empty frame.
    df_all = pd.concat(per_run_frames) if per_run_frames else pd.DataFrame()
    df_avg = pd.DataFrame(data_avg)
    return df_all, df_avg

In [8]:
# df_all: per-seed metric values; df_avg: per-run mean/std of those values.
df_all, df_avg = get_dataframes(cloud_dir)

In [9]:
df_all

Unnamed: 0,bace_classification_valid_roc_auc_score,bace_classification_valid_average_precision_score,bace_classification_test_roc_auc_score,bace_classification_test_average_precision_score,bace_regression_valid_pearsonr,bace_regression_valid_rmse,bace_regression_test_pearsonr,bace_regression_test_rmse,bbbp_valid_roc_auc_score,bbbp_valid_average_precision_score,...,delaney_test_rmse,lipo_valid_pearsonr,lipo_valid_rmse,lipo_test_pearsonr,lipo_test_rmse,tox21_valid_roc_auc_score,tox21_valid_average_precision_score,tox21_test_roc_auc_score,tox21_test_average_precision_score,run_name
0,0.617626,0.642586,0.74221,0.805728,0.048413,0.486238,0.790035,1.062408,0.478746,0.56971,...,0.743731,0.634181,0.777338,0.530516,0.790485,0.765142,0.402088,0.725084,0.264528,run_11
1,0.639303,0.691699,0.763406,0.796037,0.088025,0.517607,0.760793,1.06476,0.478552,0.569611,...,0.756207,0.602142,0.803671,0.471359,0.820449,0.759737,0.406584,0.737022,0.278703,run_11
2,0.622601,0.650823,0.754348,0.816878,0.069123,0.50146,0.782642,1.073184,0.47894,0.569859,...,0.719029,0.589683,0.813896,0.511526,0.792144,0.732756,0.378155,0.753364,0.265832,run_11
3,0.651741,0.703773,0.752174,0.814921,0.053442,0.507471,0.775046,1.07186,0.478843,0.569748,...,0.804081,0.610482,0.796291,0.521493,0.790394,0.735415,0.358375,0.756299,0.250198,run_11
4,0.611763,0.667589,0.747101,0.807569,0.087243,0.502758,0.778306,1.066424,0.478406,0.569532,...,0.752333,0.581188,0.819264,0.466096,0.815085,0.734171,0.351607,0.74715,0.260902,run_11
0,0.530561,0.571195,0.459058,0.603086,0.191274,0.592941,0.651624,1.232585,0.621118,0.627761,...,0.566229,0.660891,0.759387,0.616087,0.720151,0.670084,0.390464,0.747052,0.339919,run_19
1,0.530561,0.5709,0.458877,0.602259,0.232305,0.587055,0.710005,1.186516,0.618401,0.631138,...,0.565597,0.669362,0.749947,0.611181,0.725322,0.680551,0.418526,0.781105,0.349839,run_19
2,0.529318,0.570548,0.459601,0.60324,0.211578,0.594727,0.687915,1.197694,0.605493,0.619957,...,0.567867,0.659139,0.760213,0.600058,0.731483,0.682953,0.417884,0.748006,0.360837,run_19
3,0.530028,0.570815,0.459239,0.602396,0.233774,0.580585,0.724925,1.185771,0.614227,0.62496,...,0.618144,0.647652,0.772542,0.581632,0.742866,0.676433,0.410727,0.781447,0.340522,run_19
4,0.529673,0.570175,0.458877,0.602331,0.311485,0.578315,0.708598,1.17177,0.598214,0.628301,...,0.616334,0.626287,0.787265,0.586526,0.739516,0.718514,0.435048,0.757571,0.353538,run_19


In [10]:
df_avg

Unnamed: 0,run_name,bace_classification_valid_roc_auc_score_mean,bace_classification_valid_average_precision_score_mean,bace_classification_valid_roc_auc_score_std,bace_classification_valid_average_precision_score_std,bace_classification_test_roc_auc_score_mean,bace_classification_test_average_precision_score_mean,bace_classification_test_roc_auc_score_std,bace_classification_test_average_precision_score_std,bace_regression_valid_pearsonr_mean,...,lipo_test_pearsonr_std,lipo_test_rmse_std,tox21_valid_roc_auc_score_mean,tox21_valid_average_precision_score_mean,tox21_valid_roc_auc_score_std,tox21_valid_average_precision_score_std,tox21_test_roc_auc_score_mean,tox21_test_average_precision_score_mean,tox21_test_roc_auc_score_std,tox21_test_average_precision_score_std
0,run_11,0.628607,0.671294,0.014765,0.023359,0.751848,0.808227,0.007146,0.007415,0.069249,...,0.026441,0.013233,0.745444,0.379362,0.014007,0.022225,0.743784,0.264033,0.011447,0.009164
1,run_19,0.530028,0.570727,0.00049,0.000344,0.45913,0.602662,0.000271,0.000414,0.236083,...,0.013406,0.008482,0.685707,0.41453,0.016974,0.014435,0.763036,0.348931,0.015341,0.007947
2,run_2,0.631379,0.621805,0.001342,0.003332,0.600362,0.713134,0.002174,0.001745,0.331871,...,0.018348,0.009652,0.704328,0.435489,0.003818,0.01462,0.757356,0.355721,0.00262,0.007957
3,run_34,0.663824,0.68213,0.001452,0.000616,0.686268,0.790764,0.005345,0.004102,0.194054,...,0.00355,0.004913,0.719964,0.445896,0.001454,0.009617,0.751495,0.322919,0.000856,0.009744
4,run_38,0.481663,0.574726,0.000174,7.9e-05,0.495725,0.600252,0.000145,0.000695,0.180218,...,0.006753,0.002934,0.802428,0.492139,0.004885,0.013011,0.737463,0.312435,0.009331,0.016
5,run_39,0.651919,0.708908,0.0072,0.006269,0.817754,0.857191,0.004307,0.010664,-0.024037,...,0.021805,0.014695,0.771354,0.408372,0.016536,0.026602,0.718362,0.30921,0.033522,0.020414
6,run_4,0.672459,0.71119,0.004787,0.005452,0.767101,0.836935,0.011848,0.005713,0.240182,...,0.022501,0.016704,0.774116,0.431274,0.02642,0.024656,0.743285,0.333644,0.013504,0.031936
7,run_43,0.562118,0.57957,0.000181,0.000376,0.557609,0.656718,0.002977,0.002176,0.168642,...,0.014675,0.010974,0.746096,0.433752,0.001319,0.001695,0.696629,0.337659,0.001548,0.005416
8,run_45,0.718728,0.764412,0.001508,0.001736,0.82029,0.844239,0.001255,0.000907,0.124947,...,0.006065,0.003496,0.768428,0.480094,0.006041,0.012952,0.763433,0.344838,0.00619,0.006467
9,run_9,0.60924,0.605404,0.000174,0.00035,0.619493,0.697078,8.9e-05,0.000109,0.150721,...,0.02079,0.021034,0.762869,0.419585,0.00221,0.005767,0.746808,0.370426,0.002211,0.020859


In [11]:
# Attach the per-run columns of `df` to the averaged fine-tuning metrics
# (default inner join on `run_name`).
# Disabled: combined_avg_df['run_name'].apply(lambda x: f"mlm_{x}")
combined_avg_df = df.merge(df_avg, on='run_name')

# Same join against the per-seed table, so each row of `df` is repeated once
# per matching seed row.
# Disabled: combined_all_df['run_name'].apply(lambda x: f"mlm_{x}")
combined_all_df = df.merge(df_all, on='run_name')

In [12]:
combined_avg_df

Unnamed: 0,run_name,min_eval_loss,hidden_size,attention_probs_dropout_prob,hidden_dropout_prob,intermediate_size,num_attention_heads,num_hidden_layers,learning_rate,pretraining_task,...,lipo_test_pearsonr_std,lipo_test_rmse_std,tox21_valid_roc_auc_score_mean,tox21_valid_average_precision_score_mean,tox21_valid_roc_auc_score_std,tox21_valid_average_precision_score_std,tox21_test_roc_auc_score_mean,tox21_test_average_precision_score_mean,tox21_test_roc_auc_score_std,tox21_test_average_precision_score_std
0,run_11,0.695174,112,0.118,0.183,4844,8,5,2e-06,5M-MLM,...,0.026441,0.013233,0.745444,0.379362,0.014007,0.022225,0.743784,0.264033,0.011447,0.009164
1,run_34,0.166413,696,0.148,0.226,8436,12,2,8.7e-05,5M-MLM,...,0.00355,0.004913,0.719964,0.445896,0.001454,0.009617,0.751495,0.322919,0.000856,0.009744
2,run_39,0.512588,209,0.176,0.128,3968,11,3,2e-06,5M-MLM,...,0.021805,0.014695,0.771354,0.408372,0.016536,0.026602,0.718362,0.30921,0.033522,0.020414
3,run_19,0.251964,57,0.129,0.139,10476,3,5,5.8e-05,5M-MLM,...,0.013406,0.008482,0.685707,0.41453,0.016974,0.014435,0.763036,0.348931,0.015341,0.007947
4,run_4,0.342888,344,0.235,0.139,1252,8,4,3e-06,5M-MLM,...,0.022501,0.016704,0.774116,0.431274,0.02642,0.024656,0.743285,0.333644,0.013504,0.031936
5,run_43,0.189039,324,0.201,0.126,5428,9,2,0.000262,5M-MLM,...,0.014675,0.010974,0.746096,0.433752,0.001319,0.001695,0.696629,0.337659,0.001548,0.005416
6,run_38,0.496019,126,0.109,0.279,456,3,2,2.1e-05,5M-MLM,...,0.006753,0.002934,0.802428,0.492139,0.004885,0.013011,0.737463,0.312435,0.009331,0.016
7,run_9,0.179905,580,0.249,0.121,5712,10,3,0.000279,5M-MLM,...,0.02079,0.021034,0.762869,0.419585,0.00221,0.005767,0.746808,0.370426,0.002211,0.020859
8,run_2,0.217681,82,0.232,0.16,11024,2,6,0.000144,5M-MLM,...,0.018348,0.009652,0.704328,0.435489,0.003818,0.01462,0.757356,0.355721,0.00262,0.007957
9,run_45,0.180579,384,0.109,0.144,464,12,3,0.000141,5M-MLM,...,0.006065,0.003496,0.768428,0.480094,0.006041,0.012952,0.763433,0.344838,0.00619,0.006467


In [13]:
combined_all_df

Unnamed: 0,run_name,min_eval_loss,hidden_size,attention_probs_dropout_prob,hidden_dropout_prob,intermediate_size,num_attention_heads,num_hidden_layers,learning_rate,pretraining_task,...,delaney_test_pearsonr,delaney_test_rmse,lipo_valid_pearsonr,lipo_valid_rmse,lipo_test_pearsonr,lipo_test_rmse,tox21_valid_roc_auc_score,tox21_valid_average_precision_score,tox21_test_roc_auc_score,tox21_test_average_precision_score
0,run_11,0.695174,112,0.118,0.183,4844,8,5,2e-06,5M-MLM,...,0.719212,0.743731,0.634181,0.777338,0.530516,0.790485,0.765142,0.402088,0.725084,0.264528
1,run_11,0.695174,112,0.118,0.183,4844,8,5,2e-06,5M-MLM,...,0.728683,0.756207,0.602142,0.803671,0.471359,0.820449,0.759737,0.406584,0.737022,0.278703
2,run_11,0.695174,112,0.118,0.183,4844,8,5,2e-06,5M-MLM,...,0.748754,0.719029,0.589683,0.813896,0.511526,0.792144,0.732756,0.378155,0.753364,0.265832
3,run_11,0.695174,112,0.118,0.183,4844,8,5,2e-06,5M-MLM,...,0.682729,0.804081,0.610482,0.796291,0.521493,0.790394,0.735415,0.358375,0.756299,0.250198
4,run_11,0.695174,112,0.118,0.183,4844,8,5,2e-06,5M-MLM,...,0.725576,0.752333,0.581188,0.819264,0.466096,0.815085,0.734171,0.351607,0.74715,0.260902
5,run_34,0.166413,696,0.148,0.226,8436,12,2,8.7e-05,5M-MLM,...,0.865496,0.533229,0.731352,0.696401,0.683632,0.678472,0.717999,0.452095,0.752777,0.326847
6,run_34,0.166413,696,0.148,0.226,8436,12,2,8.7e-05,5M-MLM,...,0.872975,0.514787,0.733611,0.691946,0.686884,0.679239,0.720402,0.44133,0.750575,0.336646
7,run_34,0.166413,696,0.148,0.226,8436,12,2,8.7e-05,5M-MLM,...,0.872234,0.52052,0.746214,0.683402,0.693174,0.675669,0.718771,0.461728,0.752238,0.326955
8,run_34,0.166413,696,0.148,0.226,8436,12,2,8.7e-05,5M-MLM,...,0.880929,0.498402,0.736411,0.695391,0.683477,0.68909,0.72216,0.43709,0.750868,0.308967
9,run_34,0.166413,696,0.148,0.226,8436,12,2,8.7e-05,5M-MLM,...,0.879001,0.507435,0.749287,0.686962,0.685513,0.685347,0.720487,0.437239,0.751015,0.315179


In [15]:
# Write both merged tables to CSV in the working directory; index=False
# leaves the positional row index out of the files.
combined_avg_df.to_csv('ft_results_combined.csv', index=False)
combined_all_df.to_csv('ft_results_all_seeds.csv', index=False)

In [11]:
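# NOTE: legacy inline version of `get_dataframes`, kept for reference only.
# It builds just the mean/std table and averages the raw "pearsonr" entries
# without taking their first element, which is why the pearsonr columns in
# the output recorded below differ from `df_avg`.
# run_dirs = fs.ls(cloud_dir)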
# data = []
# for rd in run_dirs:
#     run_name = os.path.basename(os.path.normpath(rd))
#     # go one level down to get the molnet task
#     molnet_task_data = {}
#     for molnet_task_dir in fs.ls(rd):
#         molnet_task_name = os.path.basename(os.path.normpath(molnet_task_dir))
#         results_dir = os.path.join(molnet_task_dir, "results/")
#         for subset in ["valid", "test"]:
#             with fs.open(os.path.join(results_dir, subset, "metrics.json")) as f:
#                 metrics = json.load(f)
#             # pick first item to get the keys
#             metric_names = list(list(metrics.items())[0][1].keys())
#             metric_res = {mn: [] for mn in metric_names}
#             for seed, res in metrics.items():
#                 for mn, mres in res.items():
#                     metric_res[mn].append(mres)
#             average_metrics = {f"{molnet_task_name}_{subset}_{mn}_mean": np.mean(metric_res[mn]) for mn in metric_names}
#             std_metrics = {f"{molnet_task_name}_{subset}_{mn}_std": np.std(metric_res[mn]) for mn in metric_names}
#             molnet_task_data.update({**average_metrics, **std_metrics})
#     data.append({"run_name": run_name, **molnet_task_data})
                
        

In [12]:
# ft_df = pd.DataFrame(data)

In [13]:
# ft_df

Unnamed: 0,run_name,bace_classification_valid_roc_auc_score_mean,bace_classification_valid_average_precision_score_mean,bace_classification_valid_roc_auc_score_std,bace_classification_valid_average_precision_score_std,bace_classification_test_roc_auc_score_mean,bace_classification_test_average_precision_score_mean,bace_classification_test_roc_auc_score_std,bace_classification_test_average_precision_score_std,bace_regression_valid_pearsonr_mean,...,lipo_test_pearsonr_std,lipo_test_rmse_std,tox21_valid_roc_auc_score_mean,tox21_valid_average_precision_score_mean,tox21_valid_roc_auc_score_std,tox21_valid_average_precision_score_std,tox21_test_roc_auc_score_mean,tox21_test_average_precision_score_mean,tox21_test_roc_auc_score_std,tox21_test_average_precision_score_std
0,run_11,0.628607,0.671294,0.014765,0.023359,0.751848,0.808227,0.007146,0.007415,0.238411,...,0.250797,0.013233,0.745444,0.379362,0.014007,0.022225,0.743784,0.264033,0.011447,0.009164
1,run_19,0.530028,0.570727,0.00049,0.000344,0.45913,0.602662,0.000271,0.000414,0.121623,...,0.299698,0.008482,0.685707,0.41453,0.016974,0.014435,0.763036,0.348931,0.015341,0.007947
2,run_2,0.631379,0.621805,0.001342,0.003332,0.600362,0.713134,0.002174,0.001745,0.165959,...,0.3148,0.009652,0.704328,0.435489,0.003818,0.01462,0.757356,0.355721,0.00262,0.007957
3,run_34,0.663824,0.68213,0.001452,0.000616,0.686268,0.790764,0.005345,0.004102,0.106078,...,0.343277,0.004913,0.719964,0.445896,0.001454,0.009617,0.751495,0.322919,0.000856,0.009744
4,run_38,0.481663,0.574726,0.000174,7.9e-05,0.495725,0.600252,0.000145,0.000695,0.115208,...,0.258276,0.002934,0.802428,0.492139,0.004885,0.013011,0.737463,0.312435,0.009331,0.016
5,run_39,0.651919,0.708908,0.0072,0.006269,0.817754,0.857191,0.004307,0.010664,0.3734,...,0.294641,0.014695,0.771354,0.408372,0.016536,0.026602,0.718362,0.30921,0.033522,0.020414
6,run_4,0.672459,0.71119,0.004787,0.005452,0.767101,0.836935,0.011848,0.005713,0.121578,...,0.331609,0.016704,0.774116,0.431274,0.02642,0.024656,0.743285,0.333644,0.013504,0.031936
7,run_43,0.562118,0.57957,0.000181,0.000376,0.557609,0.656718,0.002977,0.002176,0.103561,...,0.319756,0.010974,0.746096,0.433752,0.001319,0.001695,0.696629,0.337659,0.001548,0.005416
8,run_45,0.718728,0.764412,0.001508,0.001736,0.82029,0.844239,0.001255,0.000907,0.125654,...,0.322734,0.003496,0.768428,0.480094,0.006041,0.012952,0.763433,0.344838,0.00619,0.006467
9,run_9,0.60924,0.605404,0.000174,0.00035,0.619493,0.697078,8.9e-05,0.000109,0.107715,...,0.331137,0.021034,0.762869,0.419585,0.00221,0.005767,0.746808,0.370426,0.002211,0.020859


In [14]:
# combined_df = pd.merge(left=df, right=ft_df, on='run_name')
# combined_df['run_name'] = combined_df['run_name'].apply(lambda x: f"mlm_{x}")

In [15]:
# combined_df

Unnamed: 0,run_name,min_eval_loss,hidden_size,attention_probs_dropout_prob,hidden_dropout_prob,intermediate_size,num_attention_heads,num_hidden_layers,learning_rate,pretraining_task,...,lipo_test_pearsonr_std,lipo_test_rmse_std,tox21_valid_roc_auc_score_mean,tox21_valid_average_precision_score_mean,tox21_valid_roc_auc_score_std,tox21_valid_average_precision_score_std,tox21_test_roc_auc_score_mean,tox21_test_average_precision_score_mean,tox21_test_roc_auc_score_std,tox21_test_average_precision_score_std
0,mlm_run_11,0.695174,112,0.118,0.183,4844,8,5,2e-06,5M-MLM,...,0.250797,0.013233,0.745444,0.379362,0.014007,0.022225,0.743784,0.264033,0.011447,0.009164
1,mlm_run_34,0.166413,696,0.148,0.226,8436,12,2,8.7e-05,5M-MLM,...,0.343277,0.004913,0.719964,0.445896,0.001454,0.009617,0.751495,0.322919,0.000856,0.009744
2,mlm_run_39,0.512588,209,0.176,0.128,3968,11,3,2e-06,5M-MLM,...,0.294641,0.014695,0.771354,0.408372,0.016536,0.026602,0.718362,0.30921,0.033522,0.020414
3,mlm_run_19,0.251964,57,0.129,0.139,10476,3,5,5.8e-05,5M-MLM,...,0.299698,0.008482,0.685707,0.41453,0.016974,0.014435,0.763036,0.348931,0.015341,0.007947
4,mlm_run_4,0.342888,344,0.235,0.139,1252,8,4,3e-06,5M-MLM,...,0.331609,0.016704,0.774116,0.431274,0.02642,0.024656,0.743285,0.333644,0.013504,0.031936
5,mlm_run_43,0.189039,324,0.201,0.126,5428,9,2,0.000262,5M-MLM,...,0.319756,0.010974,0.746096,0.433752,0.001319,0.001695,0.696629,0.337659,0.001548,0.005416
6,mlm_run_38,0.496019,126,0.109,0.279,456,3,2,2.1e-05,5M-MLM,...,0.258276,0.002934,0.802428,0.492139,0.004885,0.013011,0.737463,0.312435,0.009331,0.016
7,mlm_run_9,0.179905,580,0.249,0.121,5712,10,3,0.000279,5M-MLM,...,0.331137,0.021034,0.762869,0.419585,0.00221,0.005767,0.746808,0.370426,0.002211,0.020859
8,mlm_run_2,0.217681,82,0.232,0.16,11024,2,6,0.000144,5M-MLM,...,0.3148,0.009652,0.704328,0.435489,0.003818,0.01462,0.757356,0.355721,0.00262,0.007957
9,mlm_run_45,0.180579,384,0.109,0.144,464,12,3,0.000141,5M-MLM,...,0.322734,0.003496,0.768428,0.480094,0.006041,0.012952,0.763433,0.344838,0.00619,0.006467


In [16]:
# combined_df.to_csv('ft_results_combined.csv', index=False)