# Contextual Bayesian Optimisation via Large Language Models

This notebook will incorporate Part 1 and compare variations of BO-LIFT via the tell-predict phase.


**NOTE:** Before running this notebook, you must fix the relative import issues: simply remove the leading full stop (`.`) from the imports. Until the directory is cleaned up, this has to be done in order to run the code in the notebook. Alternatively, you may run the test files instead.

<DIV STYLE="background-color:#000000; height:10px; width:100%;"></DIV>

# Import Libraries

In [None]:
# Standard Library
import os
import itertools
import pickle

# Third Party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Private
from model import CEBO
from bo_lift import AskTellFewShotTopk

In [None]:
# Default OpenAI API Key: paste your key between the quotes below.
# When the placeholder is left empty, a key that is already exported in the
# environment is kept instead of being overwritten with an empty string.
_openai_api_key = ""
if _openai_api_key:
    os.environ["OPENAI_API_KEY"] = _openai_api_key
else:
    # No key given here: only define the (empty) variable if it is missing entirely
    os.environ.setdefault("OPENAI_API_KEY", "")

# Data Preparation

The original paper used the [ESOL](https://www.researchgate.net/publication/8551133_ESOL_Estimating_Aqueous_Solubility_Directly_from_Molecular_Structure) dataset, which contains only 927 examples with 7 columns, of which only 3 are important. This is not enough information for us to compare alternative techniques, so we instead use the larger [AqSolDB](https://www.kaggle.com/datasets/sorkun/aqsoldb-a-curated-aqueous-solubility-dataset?resource=download) dataset from Kaggle, which provides more information about each molecule.

In [None]:
# Read the AqSolDB table from the CSV file (path is relative to the working directory)
aqsoldb_csv_path = "data/aqsoldb.csv"
aqsoldb_df = pd.read_csv(aqsoldb_csv_path)

In [None]:
# Clean data: drop rows with missing values, drop duplicate rows, renumber the index,
# expose the "Name" column as "Compound ID" and finally discard the "ID" column
aqsoldb_df = (
    aqsoldb_df
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'Name': 'Compound ID'})
    .drop(columns=["ID"])
)

Given the token limit of the OpenAI language models, we will only work with chemical compounds whose names are shorter than 15 characters.

In [None]:
# Keep only the compounds whose name is shorter than 15 characters, then renumber the rows
has_short_name = aqsoldb_df["Compound ID"].str.len() < 15
aqsoldb_df = aqsoldb_df.loc[has_short_name].reset_index(drop=True)
aqsoldb_df

In [None]:
# Work on a fixed-seed random subset of 1000 compounds (random_state=42 keeps it repeatable)
sampled_rows = aqsoldb_df.sample(n=1000, random_state=42)
mini_df = sampled_rows.reset_index(drop=True)
mini_df

# Tell-Predict Experimentation

In [None]:
# Input columns handed to the CEBO models; "Solubility" is appended when telling a point.
_FEATURE_SET_1 = ["Compound ID", "MolLogP", "MolMR"]
_FEATURE_SET_2 = ["Compound ID", "Ocurrences", "SD"]
# Number of models compared per repetition (must match the list built in _build_variants)
_VARIANT_COUNT = 6


def _build_variants(model, k, T):
    """Create the six models compared in one repetition.

    Returns a list of ``(model_object, feature_columns)`` pairs, in the order in
    which their results are stored and pickled. ``feature_columns`` is ``None``
    for the models that are told and queried with the compound ID only.
    """
    bo_lift = AskTellFewShotTopk(x_formatter=lambda x: f"compound id {x}",
                                 y_name="solubility",
                                 y_formatter=lambda y: f"{y:.6f}",
                                 model=model,
                                 selector_k=k,
                                 # NOTE(review): the baseline temperature is fixed at 0.7 and
                                 # ignores T, unlike every other model - confirm this is intended.
                                 temperature=0.7)
    cebo_lift_1 = AskTellFewShotTopk(x_formatter=lambda x: f"compound id {x}",
                                     y_name="solubility",
                                     y_formatter=lambda y: f"{y:.6f}",
                                     model=model,
                                     selector_k=k,
                                     temperature=T,
                                     prefix=("You are an expert chemist. "
                                             "The following are correctly answered questions. "
                                             "Each answer is numeric and ends with ###\n"))
    # Models 2/3 and 4/5 are configured identically; they differ only in the feature
    # columns they are given (feature set 1 vs feature set 2).
    cebo_models = [CEBO(y_name="solubility",
                        model=model,
                        selector_k=k,
                        temperature=T,
                        domain=domain,
                        features=True)
                   for domain in (None, "chemist", None, "chemist")]
    cebo_columns = [_FEATURE_SET_1, _FEATURE_SET_1, _FEATURE_SET_2, _FEATURE_SET_2]
    return [(bo_lift, None), (cebo_lift_1, None)] + list(zip(cebo_models, cebo_columns))


def _tell(model_obj, columns, row):
    """Give one training row (inputs and its solubility) to a model."""
    if columns is None:
        model_obj.tell(row["Compound ID"], row["Solubility"])
    else:
        model_obj.tell(row[columns + ["Solubility"]].to_dict())


def _predict(model_obj, columns, row):
    """Query a model for one test row and return its raw prediction object."""
    if columns is None:
        return model_obj.predict(row["Compound ID"])
    return model_obj.predict(row[columns].to_dict())


def ablation_study(T_list, k_list, train_num_list, test_num_list, models_list, data,
                   file_path="./results/tell-predict/ablation_study.pkl", n_repeats=10):
    """Run the tell-predict ablation study and pickle the raw predictions.

    For every combination of temperature, k, training size, test size and
    language model, the experiment is repeated ``n_repeats`` times on a fresh
    shuffle of ``data``. In each repetition six models (BO-LIFT, BO-LIFT with a
    chemist prefix, and four CEBO configurations) are told the training rows
    and asked to predict the test rows.

    Parameters
    ----------
    T_list, k_list, train_num_list, test_num_list, models_list : iterable
        Hyperparameter values; their Cartesian product defines the sub-experiments.
    data : pandas.DataFrame
        Must contain the columns "Compound ID", "Solubility", "MolLogP", "MolMR",
        "Ocurrences" and "SD".
    file_path : str, optional
        Where the pickled results are written. Missing parent directories are
        created. The default is a path relative to the working directory, so
        the run does not depend on a machine-specific absolute path.
    n_repeats : int, optional
        Number of repetitions per sub-experiment (default 10).

    Returns
    -------
    None
        The results are written to ``file_path`` as a tuple of six lists (one
        per model, in the order listed above). Each list holds one entry per
        sub-experiment, and each entry is a list of per-repetition dictionaries
        with the keys "Iteration", "T", "k", "Train", "Test", "Model", "True"
        and "Predictions".
    """
    # One result list per compared model
    all_results = [[] for _ in range(_VARIANT_COUNT)]
    # Loop
    print("Ablation study commenced!")
    for T, k, num_train, num_test, model in itertools.product(T_list, k_list, train_num_list, test_num_list,
                                                              models_list):
        print(f"T = {T} | k = {k} | num_train = {num_train} | num_test = {num_test} | model = {model}")
        sub_experiment_results = [[] for _ in range(_VARIANT_COUNT)]
        for i in range(n_repeats):
            # Create data: a fresh (unseeded) shuffle, split into disjoint train and test rows
            shuffled_df = data.sample(frac=1)
            train_df = shuffled_df.iloc[:num_train]
            test_df = shuffled_df.iloc[num_train:].head(num_test)
            # Create the model objects
            variants = _build_variants(model, k, T)
            # Tell some points to the models
            for _, row in train_df.iterrows():
                for model_obj, columns in variants:
                    _tell(model_obj, columns, row)
            # Predict remaining points and store them, one model at a time
            for store, (model_obj, columns) in zip(sub_experiment_results, variants):
                raw_predictions = [_predict(model_obj, columns, row) for _, row in test_df.iterrows()]
                # Collapse each prediction to its mean; an empty prediction becomes NaN
                predictions = [sol.mean() if len(sol) >= 1 else np.nan for sol in raw_predictions]
                store.append({"Iteration": i,
                              "T": T,
                              "k": k,
                              "Train": num_train,
                              "Test": num_test,
                              "Model": model,
                              "True": list(test_df["Solubility"]),
                              "Predictions": predictions
                              })
        # Add to final results
        for model_results, store in zip(all_results, sub_experiment_results):
            model_results.append(store)
        print("Sub-experiment complete!")
    # Combine the lists into a single data structure
    pickle_data = tuple(all_results)
    # Make sure the target directory exists so the finished run is not lost when saving
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Pickle and save the data
    with open(file_path, 'wb') as file:
        pickle.dump(pickle_data, file)
    print("Ablation study completed!")

# Run Ablation Study

In [None]:
# Hyperparameters: each list is one axis of the grid explored by the ablation study
kwargs = dict(
    T_list=[0.7],
    k_list=[5],
    train_num_list=[5, 25, 45],
    test_num_list=[15],
    models_list=["curie", "davinci"],
    data=mini_df,
)

In [None]:
# Run the ablation study over the hyperparameter grid; the function returns nothing
# and instead pickles its results to disk
ablation_study(**kwargs)

# Results

In [None]:
# Read the pickled ablation-study results back from disk
results_pickle_path = './results/tell-predict/ablation_study.pkl'
with open(results_pickle_path, mode='rb') as results_file:
    loaded_data = pickle.load(results_file)

In [None]:
# Results, labelled by method. The pickled tuple is ordered as:
#   0: BO-LIFT, 1: BO-LIFT with chemist prefix,
#   2: CEBO (no domain, feature set 1), 3: CEBO (chemist domain, feature set 1),
#   4: CEBO (no domain, feature set 2), 5: CEBO (chemist domain, feature set 2)
# The domain + feature set 2 model is therefore index 5; index 4 (no domain,
# feature set 2) is not part of this comparison.
full_results = {"BO-LIFT": loaded_data[0],
                "BO-LIFT+DOMAIN": loaded_data[1],
                "CEBO-LIFT+NO_DOMAIN": loaded_data[2],
                "CEBO-LIFT+DOMAIN+FEATURE_1": loaded_data[3],
                "CEBO-LIFT+DOMAIN+FEATURE_2": loaded_data[5],
                }

In [None]:
# Store MSE results
# Layout of the output: mse_results[method][sub_experiment_index] is a dictionary with the
# sub-experiment's hyperparameters and "MSE", a list holding one value per iteration.
# Predictions whose absolute error exceeds this are treated as outliers and excluded,
# as they would otherwise skew the MSE
outlier_threshold = 100000
mse_results = {}
for key, sub_result in full_results.items():
    # Store results for each sub-experiment
    sub_experiment_results = {}
    for i, sub_experiment in enumerate(sub_result):
        sub_experiment_mse = []
        # Loop through each iteration and calculate MSE
        for run in sub_experiment:
            # Work on array copies so the loaded predictions are not modified in place
            y_true = np.asarray(run["True"], dtype=float)
            y_pred = np.asarray(run["Predictions"], dtype=float)
            # Fill in empty (NaN) predictions with the true value.
            # NOTE(review): this counts a missing prediction as a perfect one and so lowers
            # the MSE - confirm this is the intended treatment.
            y_pred = np.where(np.isnan(y_pred), y_true, y_pred)
            # Exclude outliers with an explicit mask (rather than a 0 sentinel), so that
            # genuine predictions equal to 0 are kept
            is_outlier = np.abs(y_true - y_pred) > outlier_threshold
            y_true, y_pred = y_true[~is_outlier], y_pred[~is_outlier]
            # Calculate MSE (undefined when every prediction was excluded)
            if y_true.size:
                mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
            else:
                mse = np.nan
            sub_experiment_mse.append(mse)
        # Put everything into one dictionary - the hyperparameters are taken from the
        # last iteration of the sub-experiment
        last_run = sub_experiment[-1]
        sub_experiment_results[i] = {"T": last_run["T"], "k": last_run["k"],
                                     "Train": last_run["Train"], "Test": last_run["Test"],
                                     "Model": last_run["Model"],
                                     "MSE": sub_experiment_mse}
    # Store results
    mse_results[key] = sub_experiment_results

In [None]:
# Extract data for plotting: one list per method, holding one summary string per sub-experiment
full_results_thesis = []
for method_results in mse_results.values():
    # Each summary is labelled with the T and k of its own sub-experiment.
    # Use log to scale results.
    # NOTE(review): the value after "±" is the log of the standard deviation, not the
    # standard deviation of the log-MSE; it is negative when std < 1 and -inf when std == 0.
    full_results_thesis.append([
        f"T = {item['T']} | k = {item['k']} | Result = {np.round(np.log(np.mean(item['MSE'])), 3)}±{np.round(np.log(np.std(item['MSE'])), 3)}"
        for item in method_results.values()
    ])
full_results_thesis

<DIV STYLE="background-color:#000000; height:10px; width:100%;"></DIV>