In [None]:
%matplotlib inline


import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# df = df.groupby(['explorer_name', 'num_batches_log']).max().reset_index()

In [None]:
# Directory expected to hold one results CSV per explorer.
fp = "path_to_adaptivity_runs"

# Raw result tables, loaded in a fixed order:
# CMAES, PPO, DynaPPO (plain run), DynaPPO ("_dens" run).
data = [
    pd.read_csv(os.path.join(fp, csv_name))
    for csv_name in (
        "CMAES.csv",
        "PPO_Agent.csv",
        "DynaPPO_Agent_0.5_10_20.csv",
        "DynaPPO_Agent_0.5_10_20_dens.csv",
    )
]

In [None]:
class AdaptivityData:
    """Results table for one explorer, read from a CSV file and tagged with a name.

    Attributes:
        data: DataFrame produced by ``pd.read_csv(filename)``; an
            ``explorer_name`` column is added to it on construction.
        explorer_name: Label written into every row of ``data``.
    """

    def __init__(self, filename, explorer_name):
        # The table is loaded first, then labelled with the explorer's name.
        self.load_data(filename)
        self.explorer_name = explorer_name
        self.add_explorer_name()

    def load_data(self, filename):
        """Read the CSV at ``filename`` into ``self.data``."""
        self.data = pd.read_csv(filename)

    def add_explorer_name(self):
        """Set the ``explorer_name`` column of ``self.data``; do nothing if ``self.data`` is None."""
        if self.data is not None:
            self.data['explorer_name'] = self.explorer_name
        
class AdaptivityPlot:
    """Concatenate several explorers' result tables and draw line plots from them.

    Attributes:
        adaptivity_data_list: The objects passed in; each must expose a
            DataFrame as ``.data``.
        data: Row-wise concatenation of every ``.data`` frame.
        total_seqs: Numerator used by ``add_num_batches_log``.
    """

    def __init__(self, adaptivity_data_list, total_seqs=1000):
        """
        Args:
            adaptivity_data_list: Iterable of objects with a ``.data`` DataFrame.
            total_seqs: Value divided by ``batch_size`` to obtain the number of
                batches (defaults to the previously hard-coded 1000).
        """
        self.adaptivity_data_list = adaptivity_data_list
        self.data = pd.concat([df.data for df in adaptivity_data_list])

        self.total_seqs = total_seqs

    def add_num_batches_log(self):
        """Add ``num_batches_log`` = log10(total_seqs / batch_size) to ``self.data``."""
        self.data['num_batches_log'] = np.log10(self.total_seqs / self.data['batch_size'])

    def _rows_for_start_id(self, start_id):
        """Return a freshly indexed copy of the rows to plot; ``self.data`` is left untouched."""
        if start_id is None:
            selected = self.data
        else:
            selected = self.data[self.data.start_id == start_id]
        # pd.concat keeps each source frame's index, so labels may repeat;
        # hand the plotting code a clean positional index.
        return selected.reset_index(drop=True)

    @staticmethod
    def _without_legend_title(handles, labels, hue):
        """Drop the leading legend entry only when it is the hue variable's title entry."""
        if labels and labels[0] == hue:
            return handles[1:], labels[1:]
        return handles, labels

    def plot_data(self, x_axis, y_axis, x_label=None, y_label=None, hue="explorer_name",
                  start_id="startRNAL14_2"):
        """Draw ``y_axis`` against ``x_axis`` as one line per ``hue`` value.

        Args:
            x_axis: Column name for the x axis.
            y_axis: Column name for the y axis.
            x_label: Optional x-axis label; the axis label is left as drawn when None.
            y_label: Optional y-axis label; the axis label is left as drawn when None.
            hue: Column whose values define the separate lines.
            start_id: Only rows whose ``start_id`` column equals this value are
                plotted. Pass None to plot every row.

        The filter is applied to a local copy, so ``self.data`` still holds all
        rows afterwards and the method can be called repeatedly with different
        ``start_id`` values.
        """
        fig, ax = plt.subplots(figsize=(5,3), dpi=300)
        plot_df = self._rows_for_start_id(start_id)

        sns.lineplot(x=x_axis, y=y_axis, hue=hue, data=plot_df, ax=ax)
        handles, labels = ax.get_legend_handles_labels()
        # Depending on the seaborn version, the first legend entry may be a
        # pseudo-entry carrying the hue column's name. Remove it only in that
        # case so a real line is never dropped from the legend.
        handles, labels = self._without_legend_title(handles, labels, hue)
        ax.legend(handles=handles, labels=labels)
        if x_label is not None:
            ax.set_xlabel(x_label)
        if y_label is not None:
            ax.set_ylabel(y_label)
        plt.show()
        
# Load each explorer's results table and tag its rows with a display name.
# Every file is read from the same directory, `fp`.
CMAES = AdaptivityData(filename=os.path.join(fp, "CMAES.csv"), explorer_name="CMAES")
PPO = AdaptivityData(filename=os.path.join(fp, "PPO_Agent.csv"), explorer_name="PPO")
DynaPPO = AdaptivityData(filename=os.path.join(fp, "DynaPPO_Agent_0.5_10_20_dens.csv"), explorer_name="DynaPPO")
DynaPPONoDens = AdaptivityData(filename=os.path.join(fp, "DynaPPO_Agent_0.5_10_20.csv"), explorer_name="DynaPPONoDens")

AdaLead = AdaptivityData(filename=os.path.join(fp, "Greedy_mu1_tr0.05_r0.2_rho1.csv"), explorer_name="AdaLead")
CbAS = AdaptivityData(filename=os.path.join(fp, "CbAS_Q0.8_generatorVAE_adaptivity.csv"), explorer_name="CbAS")
BO = AdaptivityData(filename=os.path.join(fp, "BO_adaptivity.csv"), explorer_name="BayesOpt")

# Only this subset is compared in the plots. To compare every explorer, pass
# [CMAES, PPO, DynaPPO, DynaPPONoDens, AdaLead, CbAS, BO] instead.
adaptivity_plot = AdaptivityPlot([DynaPPO, AdaLead, CbAS, BO])

In [None]:
# Add the log10(number of batches) column, then plot true_score against it,
# one line per explorer.
adaptivity_plot.add_num_batches_log()
adaptivity_plot.plot_data('num_batches_log',
                          'true_score',
                          x_label='number of batches (log$_{10}$)',
                          # Raw string: "\p" is not a valid escape sequence in a normal literal.
                          y_label=r'average $\phi$',
                          hue='explorer_name')

In [None]:
# Distinct start_id values currently held in the plot object's table.
adaptivity_plot.data["start_id"].unique()

In [None]:
# Same plot as before with a differently worded x-axis label.
adaptivity_plot.plot_data('num_batches_log',
                          'true_score',
                          x_label='log$_{10}$ (number of batches)',
                          # Raw string: "\p" is not a valid escape sequence in a normal literal.
                          y_label=r'average $\phi$',
                          hue='explorer_name')

In [None]:
import itertools
from sklearn.metrics import r2_score
def add_column_with_r2(df):
    """
    Add an 'r2_score' column holding the per-batch R^2 of model_score against true_score.

    Rows are grouped by (model_type, explorer_type, landscape_id, start_id, batch).
    r2_score(true_score, model_score) is evaluated once per group and the result
    is written to every row belonging to that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'model_type', 'explorer_type', 'landscape_id',
        'start_id', 'batch', 'true_score' and 'model_score'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the 'r2_score' column is added in place.
        Rows with a missing value in any grouping column receive NaN, because
        they belong to no group.
    """
    group_columns = ['model_type', 'explorer_type', 'landscape_id', 'start_id', 'batch']

    # One pass over the groups that actually occur, instead of masking the
    # whole frame for every possible combination of key values.
    r2_by_group = {
        key: r2_score(group['true_score'], group['model_score'])
        for key, group in df.groupby(group_columns, sort=False)
    }

    # Look values up positionally so a repeated index (e.g. after pd.concat)
    # cannot misalign the new column.
    row_keys = zip(*(df[column] for column in group_columns))
    df['r2_score'] = [r2_by_group.get(key, float('nan')) for key in row_keys]
    return df

def plot_consistency(list_of_dfs):
    """
    Plot the top true score as a function of the model's R^2 score, one line per explorer.

    Takes a list of data frames [df1, df2, df3, ...], concatenates them, adds the
    per-batch 'r2_score' column via add_column_with_r2, and keeps the maximum
    'true_score' for every (explorer_name, r2_score) pair before plotting.

    add_clean_explorer_names and explorer_names_dict are defined outside this
    block; the frame they return is assumed to carry an 'explorer_name'
    column -- TODO confirm.
    """
    #concatenate all dfs, add a column with r2 scores, add clean explorer names
    df = pd.concat(list_of_dfs)
    df = add_column_with_r2(df)
    df = add_clean_explorer_names(df, explorer_names_dict)
    #collapse by true_score using max(); only the plotted column is aggregated
    df = df.groupby(['explorer_name', 'r2_score'], as_index=False)['true_score'].max()
    #plotting: x and y match the axis labels below (R^2 on x, top score on y)
    fig, ax = plt.subplots(figsize=(5,3), dpi=300)
    sns.lineplot(x="r2_score", y="true_score", hue="explorer_name", data=df, ax=ax)
    handles, labels = ax.get_legend_handles_labels()
    # Depending on the seaborn version, the first legend entry may be a
    # pseudo-entry carrying the hue column's name; drop it only in that case
    # so a real explorer is never removed from the legend.
    if labels and labels[0] == "explorer_name":
        handles, labels = handles[1:], labels[1:]
    ax.legend(handles=handles, labels=labels, loc=2)
    ax.set_xlabel('R$^2$ score of the model prediction')
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r'Top $\phi$')
    return

In [None]:
# Load the DynaPPO ("_dens") results table again for the consistency plot.
DynaPPO = AdaptivityData(
    filename=os.path.join(fp, "DynaPPO_Agent_0.5_10_20_dens.csv"),
    explorer_name="DynaPPO",
)
# DynaPPONoDens = AdaptivityData(filename=os.path.join(fp, "DynaPPO_Agent_0.5_10_20.csv"), explorer_name="DynaPPONoDens")

In [None]:
# Draw the consistency plot from the DynaPPO table alone.
plot_consistency([DynaPPO.data])

In [None]:
# Display the first rows of the DynaPPO table.
DynaPPO.data.head()