# Contextual Bayesian Optimisation with Large Language Models via In-Context-Learning

This notebook runs Contextual Bayesian Optimisation in an attempt to determine whether LLMs can learn contextual information through in-context learning.

**NOTE:** Before running this notebook, you must fix the relative import issues and add the `src` directory to the `PYTHONPATH`. You may also need to supply parameters, such as the file paths used to save and load results.

<DIV STYLE="background-color:#000000; height:10px; width:100%;"></DIV>

# Import Libraries

In [None]:
# Standard Library
import os
import pickle
import random
# Third Party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Private
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)
from src.main import run_bo_vs_c_bo

In [None]:
# Default OpenAI API Key
# Prefer exporting OPENAI_API_KEY in the shell before starting Jupyter, and never
# commit a real key to the notebook. `setdefault` only installs the empty
# placeholder when the variable is missing, so a key that is already present in
# the environment is no longer overwritten with "".
os.environ.setdefault("OPENAI_API_KEY", "")

# Data Preparation

The dataset can be found here: [BigSolDB](https://zenodo.org/record/6809669). We use it for contextual BO because it provides varying context (temperature) for the same compound and solvent. In the original BO setup of BO-LIFT, points are removed from the pool once they are selected, because there is no context and the goal is to obtain the values of as many new points as possible in order to find the maximum. In the contextual BO setup the goal is different: we want to choose the appropriate point for the current context, and that point can be any point in the pool.

In [None]:
def create_dataset(data, num_occurrences_low, num_occurrences_high, temps, num_smiles):
    """Select SMILES that are measured often enough at every requested temperature.

    The input is cleaned (rows with missing values and exact duplicates are
    dropped, ``"T,K"`` is renamed to ``"Temperature"``), the number of rows per
    (SMILES, temperature) is counted, and only SMILES whose count at *every*
    temperature in ``temps`` is strictly between ``num_occurrences_low`` and
    ``num_occurrences_high`` (and at least 3) are kept.

    Args:
        data: DataFrame with at least the columns ``SMILES``, ``T,K``,
            ``SMILES_Solvent`` and ``Solubility``. It is not modified.
        num_occurrences_low: Exclusive lower bound on the per-temperature count.
        num_occurrences_high: Exclusive upper bound on the per-temperature count.
        temps: Temperatures that must all be present for a SMILES.
        num_smiles: Maximum number of SMILES to keep (``None`` keeps all).

    Returns:
        ``(combined_df, refined_data)`` where ``combined_df`` holds the columns
        ``SMILES``, ``Temperature``, ``SMILES Solvent`` and ``Solubility`` for
        the selected SMILES at the requested temperatures, and ``refined_data``
        is the per-SMILES count table (``SMILES`` plus one column per entry of
        ``temps``) of the selected SMILES.

    Raises:
        KeyError: If a temperature in ``temps`` never occurs in ``data``.
    """
    temps = list(temps)
    # Every step returns a new frame, so the caller's data is left untouched.
    data = (
        data.dropna()
        .drop_duplicates()
        .reset_index(drop=True)
        .rename(columns={"T,K": "Temperature"})
        .sort_values(by="SMILES")
    )
    # Shrink dataset: one row per SMILES, one column per temperature, cells are
    # row counts. A single crosstab replaces the per-SMILES concat loop.
    counts = pd.crosstab(data["SMILES"], data["Temperature"])
    # The original loop prepended each SMILES, i.e. the table was in descending
    # SMILES order; keep that order so `num_smiles` selects the same SMILES.
    sub_data = counts.iloc[::-1][temps].reset_index()
    sub_data.columns.name = None
    temp_counts = sub_data[temps]
    within_bounds = (temp_counts > num_occurrences_low) & (
        temp_counts < num_occurrences_high
    )
    # Hard floor of 3 measurements per temperature, independent of the bounds.
    keep = within_bounds.all(axis=1) & temp_counts.ge(3).all(axis=1)
    refined_data = sub_data[keep]
    # Only short SMILES strings (fewer than 30 characters) are retained.
    refined_data = refined_data[refined_data["SMILES"].str.len() < 30]
    refined_data = refined_data.iloc[:num_smiles].reset_index(drop=True)
    combined_data = data.merge(refined_data[["SMILES"]], on="SMILES")
    combined_data = combined_data[combined_data["Temperature"].isin(temps)]
    # Final dataframe
    combined_data = combined_data.rename(columns={"SMILES_Solvent": "SMILES Solvent"})
    combined_df = combined_data[
        ["SMILES", "Temperature", "SMILES Solvent", "Solubility"]
    ].reset_index(drop=True)
    return combined_df, refined_data

In [None]:
# Find the duplicate pairs of SMILES and SMILES Solvents
# The path is built with os.path.join: in a normal string literal the "\b" of
# "src\data\bigsoldb.csv" is read as a backspace character, so the original
# literal did not name the intended file.
df_new = pd.read_csv(os.path.join("src", "data", "bigsoldb.csv"))
# keep=False marks every member of a duplicated (SMILES, solvent) pair, i.e.
# only pairs that appear more than once are kept.
duplicates = df_new[df_new.duplicated(subset=['SMILES', 'SMILES_Solvent'], keep=False)]
bigsoldb_df, bigsoldmask_df = create_dataset(
    data=duplicates,
    num_occurrences_low=5,
    num_occurrences_high=20,
    temps=[313.15, 308.15, 303.15, 298.15],
    num_smiles=15,
)

In [None]:
# Load dataframe
df = pd.DataFrame(bigsoldb_df)
pair_cols = ['SMILES', 'SMILES Solvent']
# Group by SMILES and SMILES Solvent, and count the number of occurrences
grouped = df.groupby(pair_cols).size().reset_index(name='count')
# Keep only the pairs that occur exactly 4 times - presumably one measurement
# for each of the 4 temperatures requested above (TODO confirm).
even_rows = grouped[grouped['count'] == 4]
# Merge the filtered rows back to the original dataframe
filtered_df = pd.merge(df, even_rows[pair_cols], on=pair_cols)
while True:
    # Without this guard the loop would never terminate once every pair has
    # been removed, because an empty frame can never yield two distinct pairs.
    if filtered_df.empty:
        raise ValueError(
            "No (SMILES, SMILES Solvent) pairs left: the best pair is the same at every temperature."
        )
    # Find the pairs that maximize solubility at each temperature
    best_row_labels = filtered_df.groupby('Temperature')['Solubility'].idxmax()
    max_solubility_pairs = filtered_df.loc[best_row_labels, pair_cols]
    # Check if at least two of those pairs differ
    if len(max_solubility_pairs.drop_duplicates()) > 1:
        break
    # Remove the instances of the pairs that maximize solubility at each temperature
    for _, row in max_solubility_pairs.iterrows():
        filtered_df = filtered_df[(filtered_df['SMILES'] != row['SMILES']) | (filtered_df['SMILES Solvent'] != row['SMILES Solvent'])]

In [None]:
# Artificial modification
final_df = filtered_df.reset_index(drop=True)
pair_cols = ['SMILES', 'SMILES Solvent']
unique_pairs = final_df.groupby(pair_cols).size().reset_index(name='count').drop("count", axis=1)
# Shuffle every pair with a fixed seed and keep positions 75-99 (25 pairs).
# The original `sample(169)` was unseeded, so each run produced a different
# dataset, and it raised when fewer than 169 pairs were available.
selected_pairs = unique_pairs.sample(frac=1, random_state=0).iloc[75:100]
# Merge on the pair columns explicitly instead of relying on the shared column names.
final_df = pd.merge(final_df, selected_pairs, on=pair_cols)
final_df.to_csv("bo_vs_cbo_multi_context_100_4_temp.csv", index=False)

In [None]:
def find_df(df):
    """Drop rows until the best pair at 308.15 K differs from the best pair at 313.15 K.

    While the (SMILES, SMILES Solvent) pair with the highest solubility is the
    same at both temperatures, the two maximising rows are dropped, pairs that
    are left with a single row are discarded and the index is rebuilt. At most
    three such rounds are performed; the resulting frame is returned.
    """
    pair_cols = ['SMILES', 'SMILES Solvent']

    def best_row(frame, temperature):
        # Row of `frame` with the highest solubility at the given temperature.
        at_temperature = frame[(frame['Temperature'] == temperature)]
        return frame.loc[at_temperature['Solubility'].idxmax()]

    top_308 = best_row(df, 308.15)
    top_313 = best_row(df, 313.15)
    pair_308 = pd.Series(top_308[pair_cols])
    pair_313 = pd.Series(top_313[pair_cols])
    # Three rounds at most, mirroring the counter-based cut-off.
    for _ in range(3):
        same_smiles = pair_308['SMILES'] == pair_313['SMILES']
        same_solvent = pair_308['SMILES Solvent'] == pair_313['SMILES Solvent']
        if not (same_smiles and same_solvent):
            break
        df = df.drop(top_313.name)
        df = df.drop(top_308.name)
        # Keep only pairs that still have more than one row, then renumber.
        still_paired = df.duplicated(subset=pair_cols, keep=False)
        df = df[still_paired].reset_index(drop=True)
        # Recompute the maximising rows on the reduced frame.
        top_308 = best_row(df, 308.15)
        top_313 = best_row(df, 313.15)
        pair_308 = pd.Series(top_308[pair_cols])
        pair_313 = pd.Series(top_313[pair_cols])
        print(pair_308, pair_313)
    return df

In [None]:
# Find the row with the maximum solubility for each unique temperature
def data_max(data):
    """Return, for every distinct temperature, the row with the highest solubility.

    Rows appear in the order in which the temperatures first occur in ``data``
    and are labelled 0, 1, 2, ...; the columns are those of ``data``.
    """
    best_rows = pd.DataFrame(columns=list(data.columns))
    for position, temperature in enumerate(data["Temperature"].unique()):
        # Renumber the subset so idxmax yields a label valid for `.loc` below.
        at_temperature = data.loc[data["Temperature"] == temperature].reset_index(drop=True)
        best_label = at_temperature["Solubility"].idxmax()
        best_rows.loc[position] = at_temperature.loc[best_label]
    return best_rows

# ICL Contextual Bayesian Optimisation Experiment

# Results

In [None]:
# NOTE - This call sometimes runs into API request issues - if so, run it from PyCharm through the test file in debug mode.
# Experiment settings forwarded unchanged to run_bo_vs_c_bo as keyword arguments.
kwargs = dict(
    data=final_df,
    N=50,
    M=5,
    num_train=[5, 15],
    models_list=["gpt-3.5-turbo"],
    _lambda=[1, 5, 10],
)
bo_vs_cbo_results = run_bo_vs_c_bo(**kwargs)

Now choose a file path to save the results and load this in the next section.

In [None]:
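# NOTE: Use pickle to dump the results to a file of your choice - this code is not provided, e.g.:
# with open("<results path>", "wb") as file:
#     pickle.dump(bo_vs_cbo_results, file)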


In [None]:
# Path of the pickled results written in the previous step (placeholder - fill in).
file_path = "..."
# NOTE: unpickling can execute arbitrary code - only load result files you created yourself.
with open(file_path, mode="rb") as results_file:
    loaded_data = pickle.load(results_file)

In [None]:
def process_results(results, selector, component):
    """Flatten one nested results dictionary into a table of per-run value lists.

    ``results`` is walked three levels deep (strategy -> method -> selection
    entry); each innermost value is a sequence of points, and ``component`` is
    read from every point. Entries without points are skipped.

    Args:
        results: Nested mapping ``{strategy: {method: {selection: [point, ...]}}}``.
        selector: Labels for the ``Selection`` column, indexed by output row number.
        component: Key read from every point; also the name of the value column.

    Returns:
        DataFrame with the columns ``Strategy``, ``Method``, ``Selection`` and
        ``component``, one row per non-empty entry.
    """
    value_column = f"{component}"
    table = pd.DataFrame(columns=["Strategy", "Method", "Selection", value_column])
    next_row = 0
    for strategy_name, per_method in results.items():
        for method_name, per_selection in per_method.items():
            for points in per_selection.values():
                values = [point[component] for point in points]
                if not values:
                    continue
                # NOTE(review): the label is looked up by the running row number,
                # not by the position of the entry inside `per_selection` -
                # confirm this is the intended indexing of `selector`.
                table.loc[next_row] = {
                    "Strategy": strategy_name,
                    "Method": method_name,
                    "Selection": selector[next_row],
                    value_column: values,
                }
                next_row += 1
    return table

In [None]:
# Obtain simplified results table
results = loaded_data["upper_confidence_bound"]
# NOTE(review): `strategies` and `methods` are not referenced by any visible cell.
strategies = ["BO", "C-BO"]
methods = ["BO-LLM", "CBO-LLM"]
selector = ["with MMR", "with MMR"]
component = "Regret"
# Build one table per run and concatenate once, instead of re-concatenating a
# growing frame (initially empty) on every iteration.
frames = []
for result in results:
    df = process_results(results=result, selector=selector, component=component)
    frames.append(df)
if frames:
    # The original prepended each new table, so the newest run comes first.
    full_df = pd.concat(frames[::-1])
else:
    full_df = pd.DataFrame(columns=["Strategy", "Method", "Selection", f"{component}"])
full_df = full_df.reset_index(drop=True)
# One row per (Strategy, Method, Selection) holding the list of per-run value lists.
full_grouped = (
    full_df.groupby(["Strategy", "Method", "Selection"])[f"{component}"]
    .apply(list)
    .reset_index()
)

In [None]:
# Function to plot the lists and their average
def plot_regret_lists(ax, regret_lists, label, avg_label):
    """Draw cumulative regret for every run plus the cumulative mean regret.

    Args:
        ax: Matplotlib axes (or any object with the same plotting methods).
        regret_lists: Sequence of per-run regret sequences; all runs must have
            the same length because they are averaged element-wise.
        label: Legend label shared by the individual runs.
        avg_label: Legend label of the average curve.
    """
    # Plot individual regret lists (slightly faded). Only the first run carries
    # the legend label; the others use "_nolegend_" so the legend does not
    # repeat the same entry once per run.
    for run_index, regret_list in enumerate(regret_lists):
        run_label = f"{label}" if run_index == 0 else "_nolegend_"
        ax.plot(np.cumsum(regret_list), alpha=0.5, label=run_label)
    # Calculate the cumulative sum of the element-wise mean regret
    avg_cum_regret = np.array(regret_lists).mean(axis=0).cumsum()
    # Plot the average regret list (bold)
    ax.plot(
        avg_cum_regret, label=avg_label, linewidth=2.5, linestyle="--", color="black"
    )
    # Add labels and legend
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Cumulative Regret")
    ax.set_title("Regret Over Iterations")
    ax.legend()

In [None]:
# Create a figure with dynamic number of subplots based on data
num_rows = len(full_grouped)  # number of panels to draw (one per grouped row)
num_cols = 2  # You can change the number of columns as needed
# At least one grid row, so an empty table does not make plt.subplots fail.
grid_rows = max(1, (num_rows + num_cols - 1) // num_cols)
# squeeze=False always returns a 2-D array of axes; flattening it lets panel i
# be addressed directly, whereas `axs[i]` on a multi-row grid is a row of axes.
fig, axs = plt.subplots(
    grid_rows, num_cols, figsize=(16, 6 * grid_rows), squeeze=False
)
flat_axes = axs.ravel()
# Iterate through the rows in your data
for i, (_, row) in enumerate(full_grouped.iterrows()):
    strategy = row["Strategy"]
    method = row["Method"]
    selection = row["Selection"]
    regret_lists = row["Regret"]
    label = f"{strategy}-{method}-{selection}"
    avg_label = f"Average - {label}"
    # Plot in the corresponding subplot
    plot_regret_lists(flat_axes[i], regret_lists, label, avg_label)
# Hide the panels of the grid that received no data
for unused_ax in flat_axes[num_rows:]:
    unused_ax.set_visible(False)
# Adjust spacing between subplots
plt.tight_layout()
# Show the plot
plt.show()


# Visualisations

In [None]:
# Plot the BO and C-BO cumulative-regret curves on a single axis
def plot(ax, df, lambda_val, num_train):
    """Draw per-run and mean cumulative regret for the BO and C-BO methods.

    Args:
        ax: Matplotlib axes (or any object with the same plotting methods).
        df: DataFrame with a ``Method`` column (``'BO'`` / ``'C-BO'``) and a
            ``Cumulative Regret`` column holding one equal-length sequence per
            run. Any index is accepted.
        lambda_val: Value shown for lambda in the title.
        num_train: Value shown for ``num_train`` in the title.

    A method without any run is skipped instead of raising.
    """
    series_specs = (
        ("BO", "C10", "BO (without context)"),
        ("C-BO", "C11", "CBO (with context)"),
    )
    curves = []
    for method, colour, legend_label in series_specs:
        # A boolean mask selects the rows, so the frame does not need a 0..n-1 index.
        runs = df.loc[df["Method"] == method, "Cumulative Regret"].tolist()
        if not runs:
            continue
        mean_curve = np.mean(runs, axis=0)
        x_values = np.arange(1, len(mean_curve) + 1)
        curves.append((runs, mean_curve, x_values, colour, legend_label))
    # Individual runs first (dashed, faded), so the mean curves are drawn on top
    for runs, _, x_values, colour, _ in curves:
        for run in runs:
            ax.plot(x_values, run, linestyle='--', color=colour, alpha=0.2)
    # Mean cumulative regret per method; only these lines carry a legend label
    for _, mean_curve, x_values, colour, legend_label in curves:
        ax.plot(x_values, mean_curve, label=legend_label, color=colour)
    n_iterations = max((len(x_values) for _, _, x_values, _, _ in curves), default=0)
    # Add labels
    ax.set_xlabel('Iteration')
    ax.set_xticks(np.arange(5, n_iterations + 1, 5))
    # The plotted quantity is the cumulative regret, not the solubility.
    ax.set_ylabel('Cumulative Regret')
    # Only lambda is typeset as math: inside $...$ the underscore of num_train
    # would be rendered as a subscript.
    ax.set_title(rf"$\lambda = {lambda_val}$ | num_train = {num_train}")
    ax.set_facecolor('lightyellow')

In [None]:
def bo_data_collection(ax, sub_results, lambda_val, num_train):
    """Summarise the 'with MMR' runs of one experiment setting and plot them.

    Every entry of ``sub_results`` is a nested mapping
    ``{method: {strategy: {'Optimal Point with MMR': [point, ...]}}}`` whose
    points provide ``'Regret'`` and ``'Acquisition Value'``. Entries without
    points are left out.

    Args:
        ax: Axes handed to ``plot``.
        sub_results: Iterable of nested result mappings (one per repetition).
        lambda_val: Forwarded to ``plot`` for the title.
        num_train: Forwarded to ``plot`` for the title.

    Returns:
        DataFrame with the columns ``Method``, ``Strategy``,
        ``Cumulative Regret`` (running sum of the regrets as an array) and
        ``Acquisition Confidence Interval`` (``"mean ± std"`` of the acquisition
        values, formatted to two decimals).
    """
    columns = ['Method', 'Strategy', 'Cumulative Regret', 'Acquisition Confidence Interval']
    records = []
    for val in sub_results:
        for method, strategies in val.items():
            for strategy, data_dict in strategies.items():
                points = data_dict['Optimal Point with MMR']
                regrets = [item['Regret'] for item in points]
                acquisition_values = [item['Acquisition Value'] for item in points]
                # Runs without any recorded regret are not reported.
                if not regrets:
                    continue
                records.append({
                    'Method': method,
                    'Strategy': strategy,
                    'Cumulative Regret': np.cumsum(regrets),
                    # Mean ± standard deviation of the acquisition values
                    'Acquisition Confidence Interval': f"{np.mean(acquisition_values):.2f} ± {np.std(acquisition_values):.2f}",
                })
    # Building the frame once from finished records avoids assigning columns on
    # a filtered slice and works for an empty result as well.
    df = pd.DataFrame(records, columns=columns)
    plot(ax, df, lambda_val, num_train)
    return df

In [None]:
# NOTE: `loaded_data` must contain the results of the main experiment, stored under the keys listed below.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), constrained_layout=True)
# One panel per lambda value: (lambda, key of the matching results in loaded_data)
panels = [
    (1, "gpt-3.5-turbo/15/1"),
    (5, "gpt-3.5-turbo/15/5"),
    (10, "gpt-3.5-turbo/15/10"),
]
panel_frames = [
    bo_data_collection(ax, sub_results=loaded_data[key], lambda_val=lam, num_train=15)
    for ax, (lam, key) in zip(axs, panels)
]
df_4, df_5, df_6 = panel_frames
fig.patch.set_facecolor('white')
fig.suptitle("Contextual Bayesian Optimisation (GPT-3.5-Turbo)", fontsize=20, fontweight='bold', y=1.1)
# A single legend below the panels, built from the labelled lines of the first one
handles, labels = axs[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0), fancybox=True, shadow=True, ncol=2)
plt.savefig("extra_ones", dpi=1200, bbox_inches='tight')
plt.show()

<DIV STYLE="background-color:#000000; height:10px; width:100%;"></DIV>