In [1]:
import bolift
from bolift.llm_model import GaussDist, DiscreteDist
import numpy as np
import json
import pandas as pd
from langchain.prompts.prompt import PromptTemplate
import itertools
import os
import openai

np.random.seed(0)

In [3]:
import requests
data_path = "../paper/data/esol_iupac.csv"
raw_data = pd.read_csv(data_path)

def query2IUPAC(text):
  try:
    '''This function queries the one given molecule name and returns a SMILES string from the record'''
    #query the PubChem database
    r = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/' + text + '/property/IUPACName/JSON')
    data = r.json()
    smi = data["PropertyTable"]["Properties"][0]["IUPACName"]
    return smi
  except:
    return None

# raw_data["IUPAC"] = raw_data["SMILES"].map(lambda sml: query2IUPAC(sml))
raw_data = raw_data[["IUPAC", "measured log(solubility:mol/L)"]]
raw_data = raw_data.dropna()
raw_data[50:65]

Unnamed: 0,IUPAC,measured log(solubility:mol/L)
50,"1,4-dinitrobenzene",-3.39
51,"penta-1,4-diene",-2.09
52,"1,5-dimethylnaphthalene",-4.679
53,"hexa-1,5-diene",-2.68
54,"1,7-phenanthroline",-2.68
55,"1,3,3-trimethyl-2-oxabicyclo[2.2.2]octane",-1.74
56,"17-hydroxy-10,13,17-trimethyl-2,6,7,8,9,11,12,...",-3.999
57,1-bromo-2-methylpropane,-2.43
58,1-bromobutane,-2.37
59,1-bromoheptane,-4.43


In [4]:
asktell = bolift.AskTellFewShotTopk()

asktell.tell("3-chloroaniline", -1.37)
asktell.tell("nitromethane", 0.26)
asktell.tell("1-bromobutane", -2.43)
asktell.tell("3-chlorophenol", -0.7)

yhat = asktell.predict("penta-1,4-diene	")
print(yhat.mean(), yhat.std())

pool_list = [
  "1,5-dimethylnaphthalene",
  "2-aminophenol",
  "1hexa-1,5-diene",
  "1,1,2,3,4,4-hexachlorobuta-1,3-diene"
]
pool=bolift.Pool(pool_list)
print(asktell.ask(pool))

asktell.tell("phenol", -0.5)
yhat = asktell.predict("penta-1,4-diene	")
print(yhat.mean(), yhat.std())

-0.14400000000000002 0.0571314274283428
(['1hexa-1,5-diene'], [0.49021678877818947], [0.0])
-1.120687227601084 1.153644358442063


# Ablation experiments

In [5]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def run_ablation_experiment(asktell, train_data, test_data):
    for i in range(len(train_data)):
        asktell.tell(train_data.iloc[i, 0], float(train_data.iloc[i, 1]))
    x    = []
    y    = []
    yhat = []
    for j in range(len(test_data)):
        x.append(test_data.iloc[j, 0])
        y.append(float(test_data.iloc[j, 1]))
        yhat.append(asktell.predict(test_data.iloc[j, 0]))

    x_filter = [xi for xi, yhi in zip(x, yhat)]# if len(yhi.values) > 0]
    y_filter = [yi for yi, yhi in zip(y, yhat)]# if len(yhi.values) > 0]
    yhat_filter = [yhi for yi, yhi in zip(y, yhat)]# if len(yhi.values) > 0]
    return x_filter, y_filter, yhat_filter

def save_csv(filename, x, y, yhat, data, model, T, k, N, model_class, tokens):
    if not os.path.exists(filename):
        f = open(filename, "w")
        f.write("y;yhat;yprobs;data;model;Temperature;k_selected;N_train;model_class;n_tokens;x\n")
    else:
        f = open(filename, "a")
    for xi, yi, yhi in zip(x, y, yhat):
        if isinstance(yhi, DiscreteDist):
            if len(yhi.values) > 0:
                for v,p in zip(yhi.values, yhi.probs):
                    f.write(f"{yi};{v};{p:.4f};{data};{model};{T};{k};{N};{model_class};{tokens};{xi}\n")
        if isinstance(yhi, GaussDist):
            f.write(f"{yi};{yhi.mean()};{yhi.std():.4f};{data};{model};{T};{k};{N};{model_class};{tokens};{xi}\n")
    f.close()

T_list = [0.05, 0.5, 1.0, 1.5]
k_list = [0, 5, 10]
N_list = [1,2,3,4,5,10,25,50,100,200] # depends on each dataset
models_list = ["text-curie-001"]
out_csv_file = "ablation_results.csv"

## C2 yield

In [None]:
data_path = "paper/data/C2_yield_meth_oxy_short.csv"
# data_path = "paper/data/ada_embedd_c2_dataset.csv"
# data_path = "paper\data\orig_MatBert_regression_embedding_dataset_C2 (5).csv"
raw_data = pd.read_csv(data_path)[['prompt', 'completion']]

np.random.seed(0)

N = raw_data.shape[0]
train = np.random.choice(raw_data.shape[0], int(N * 0.8), replace=False)
test = np.setdiff1d(np.arange(raw_data.shape[0]), train)
np.random.shuffle(test)

train_data = raw_data.iloc[train, :].reset_index(drop=True)[:int(0.1*len(train))]
# train_data["prompt"] = train_data["prompt"].map(lambda x: x.replace(",", ";"))
test_data = raw_data.iloc[test, :].reset_index(drop=True)[:int(0.1*len(test))]
# test_data["prompt"] = test_data["prompt"].map(lambda x: x.replace(",", ";"))
print(N, len(train_data), len(test_data))

### multi

In [None]:
def run_C2_multi_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10):
    asktell = bolift.AskTellFewShotMulti(
        x_formatter=lambda x: f"experimental procedure: {x}",
        y_name="C2 yield",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        selector_k=k,
        temperature=T
    )
    exp_train_data = train_data[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="C2"
    model_class="multi"
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [1.0]
k_list = [5]
N_list = [1,5,10,25,50,100,250,500]
models_list = ["text-curie-001"]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running C2 multi ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_C2_multi_ablation(train_data, test_data, model=model, T=T, N=N, k=k)
  print(" --> done")

### topk

In [None]:
def run_C2_topk_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10):
    asktell = bolift.AskTellFewShotTopk(
        prefix="The following question should be answered with a number\n",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"experimental procedure: {x}",
        y_name="C2 yield quantile",
        y_formatter=lambda y: f"{int(y)}",
        model=model,
        selector_k=k,
        temperature=T,
    )
    exp_train_data = train_data[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="C2"
    model_class="topk"
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [1.0]
k_list = [5]
N_list = [1000]
models_list = ["gpt-4"]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running C2 topk ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_C2_topk_ablation(train_data[:N], test_data, model=model, T=T, N=N, k=k)
  print(" --> done")

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error

plt.plot(y,y)
lim=(min(y),max(y))
plt.xlim(lim)
plt.ylim(lim)
plt.scatter(y, [yhi.mean() for yhi in yhat])
plt.errorbar(y, 
            [yhi.mean() for yhi in yhat], 
            yerr=[yhi.std() for yhi in yhat],
            fmt='.', color='gray', alpha=0.4)
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, [yhi.mean() for yhi in yhat])[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_absolute_error(y, [yhi.mean() for yhi in yhat]):.3f}")

### GPR

In [None]:
def run_C2_GPR_train(train_data, model="text-ada-001", N=50, k=16, pool=None):
    asktell = bolift.AskTellGPR(
        prefix="The following question should be answered with a number\n",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"experimental procedure: {x}",
        y_name="C2 yield",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        pool=pool,
        n_components=k,
        # cache_path="GPR_ada_embed_cache.csv"
    )
    # Tell one example so the moduel build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]

    examples = []
    for i in range(len(exp_train_data)):
        examples.append(dict(
            x=asktell.format_x(exp_train_data.iloc[i, 0]),
            y=asktell.format_y(exp_train_data.iloc[i, 1]),
            y_name=asktell._y_name,
            )
        )
    asktell._train(
            [asktell.prompt.format(
                x=ex["x"],
                y_name=asktell._y_name,
                )
             for ex in examples
            ], 
            [ex["y"] for ex in examples]
        )
    return asktell

def run_C2_GPR_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10,pool=None):
    asktell = run_C2_GPR_train(train_data, model="text-ada-001", N=N, k=k, pool=pool)

    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="C2"
    model_class="GPR"
    # asktell.save_cache("GPR_ada_embed_cache.csv")
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [0.05]
k_list = [2]
N_list = [500]
models_list = ["text-ada-001"]
pool = bolift.Pool(train_data['prompt'].to_list(), formatter=lambda x: f"experimental procedure: {x}")
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running C2 GPT ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  pool.reset()
  y, yhat = run_C2_GPR_ablation(train_data[:N], test_data, model=model, T=T, N=N, k=k, pool=pool)
  print(" --> done")

In [None]:
print(y)
print(yhat)

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error

plt.plot(y,y)
lim=(min(y),max(y))
plt.xlim(lim)
plt.ylim(lim)
plt.scatter(y, [yhi.mean() for yhi in yhat])
plt.errorbar(y, 
            [yhi.mean() for yhi in yhat], 
            yerr=[yhi.std() for yhi in yhat],
            fmt='.', color='gray', alpha=0.4)
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, [yhi.mean() for yhi in yhat])[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_absolute_error(y, [yhi.mean() for yhi in yhat]):.3f}")

### Ridge Kernel

In [None]:
def run_C2_ridge_train(train_data, model="text-ada-001", N=50, k=16, pool=None):
    asktell = bolift.AskTellRidgeKernelRegression(
        prefix="The following question should be answered with a number\n",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"experimental procedure: {x}",
        y_name="C2 yield",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        alpha=0.5
    )
    # Tell one example so the module build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1], train=False)
    exp_train_data = train_data.iloc[:N]

    examples = []
    for i in range(len(exp_train_data)):
        examples.append(dict(
            x=asktell.format_x(exp_train_data.iloc[i, 0]),
            y=asktell.format_y(exp_train_data.iloc[i, 1]),
            y_name=asktell._y_name,
            )
        )
    asktell._train(
            [asktell.prompt.format(
                x=ex["x"],
                y_name=asktell._y_name,
                )
             for ex in examples
            ], 
            [ex["y"] for ex in examples]
        )
    return asktell

def run_C2_ridge_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10,pool=None):
    asktell = run_C2_ridge_train(train_data, model="text-ada-001", N=N, k=k, pool=pool)

    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="C2"
    model_class="KRR"
    # asktell.save_cache("GPR_ada_embed_cache.csv")
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [0.05]
k_list = [0]
N_list = [100]
models_list = ["text-ada-001"]
pool = bolift.Pool(train_data['prompt'].to_list(), formatter=lambda x: f"experimental procedure: {x}")
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running C2 krr ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  pool.reset()
  y, yhat = run_C2_ridge_ablation(train_data[:N], test_data, model=model, T=T, N=N, k=k, pool=pool)
  print(" --> done")

In [None]:
print(y)
print(yhat)

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error

plt.plot(y,y)
lim=(min(y),max(y))
# plt.xlim(lim)
# plt.ylim(lim)
plt.scatter(y, [yhi.mean() for yhi in yhat])
# plt.errorbar(y, 
#             [yhi.mean() for yhi in yhat], 
#             yerr=[yhi.std() for yhi in yhat],
#             fmt='.', color='gray', alpha=0.4)
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, [yhi.mean() for yhi in yhat])[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_absolute_error(y, [yhi.mean() for yhi in yhat]):.3f}")

### Finetune

In [None]:
def run_C2_finetune(train_data, model="text-ada-001", N=50):
    asktell = bolift.AskTellFinetuning(
        prefix="",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"experimental procedure: {x}",
        y_name="C2 yield",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        n_epochs=8,
        learning_rate_multiplier=0.05,
    )
    # Tell one example so the moduel build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]

    prompts=[]
    completions=[]
    for i in range(len(exp_train_data)):
        prompts.append(f"What is the yield strength of {exp_train_data.iloc[i, 0]}?@@@\\nA: ")
        completions.append(f"{float(exp_train_data.iloc[i, 1])}###")
    asktell.prepare_data(prompts, completions, f'./paper/out/data_C2_{N}.dat')
    asktell.fine_tune(prompts, completions, out_path='./paper/out', out_file=f'FT_C2_{N}')
    print(asktell.get_model_name())

def run_C2_FT_ablation(train_data, test_data, model="text-ada-001", T=0.7, N=10, k=0):
    with open(f'./paper/out/FT_C2_{N}.dat', 'r') as f:
        response = json.load(f)
    
    asktell = bolift.AskTellFinetuning(
        prefix="",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        y_name="C2 yield",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        id=response['id'],
        # selector_k=0,
        n_epochs=8,
        learning_rate_multiplier=0.02,
    )
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:1]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="C2"
    model_class="finetune"
    save_csv(out_csv_file, x, y, yhat, data, asktell.get_model_name(), T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
N_list=[50, 1000]
for N in N_list: 
  print(f"Running C2 FT with N={N}")
  run_C2_finetune(train_data, model="text-ada-001", N=N)
  print(" --> done")

In [None]:
T_list = [0.5, 0.7, 1.0]
k_list = [0]
N_list = [1000]
models_list = ["text-ada-001"]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running C2 finetune ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_C2_FT_ablation(train_data, test_data, model="text-ada-001", T=T, N=N, k=k)
  print(" --> done")

### k-NN

In [None]:
def run_C2_knn_train(train_data, model="text-ada-001", N=50, k=16, pool=None):
    asktell = bolift.AskTellNearestNeighbor(
        prefix="The following question should be answered with a number\n",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"experimental procedure: {x}",
        y_name="C2 yield",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        knn=5,
    )
    # Tell one example so the module build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]

    examples = []
    for i in range(len(exp_train_data)):
        asktell.tell(exp_train_data.iloc[i, 0], exp_train_data.iloc[i, 1])
    return asktell

def run_C2_knn_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10,pool=None):
    asktell = run_C2_knn_train(train_data, model="text-ada-001", N=N, k=k, pool=pool)

    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="C2"
    model_class="KNN"
    # asktell.save_cache("GPR_ada_embed_cache.csv")
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [0.05]
k_list = [5]
N_list = [1000]
models_list = ["text-ada-001"]
pool = bolift.Pool(train_data['prompt'].to_list(), formatter=lambda x: f"experimental procedure: {x}")
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running C2 knn ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  pool.reset()
  y, yhat = run_C2_knn_ablation(train_data[:N], test_data, model=model, T=T, N=N, k=k, pool=pool)
  print(" --> done") 

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error

plt.plot(y,y)
lim=(min(y),max(y))
# plt.xlim(lim)
# plt.ylim(lim)
plt.scatter(y, [yhi.mean() for yhi in yhat])
# plt.errorbar(y, 
#             [yhi.mean() for yhi in yhat], 
#             yerr=[yhi.std() for yhi in yhat],
#             fmt='.', color='gray', alpha=0.4)
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, [yhi.mean() for yhi in yhat])[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_absolute_error(y, [yhi.mean() for yhi in yhat]):.3f}")

## Iupac-solubility

In [7]:
import requests
data_path = "../paper/data/esol_iupac.csv"
raw_data = pd.read_csv(data_path)

def query2IUPAC(text):
  try:
    '''This function queries the one given molecule name and returns a SMILES string from the record'''
    #query the PubChem database
    r = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/' + text + '/property/IUPACName/JSON')
    data = r.json()
    smi = data["PropertyTable"]["Properties"][0]["IUPACName"]
    return smi
  except:
    return None

# raw_data["IUPAC"] = raw_data["SMILES"].map(lambda sml: query2IUPAC(sml))
raw_data = raw_data[["IUPAC", "measured log(solubility:mol/L)"]]
raw_data = raw_data.dropna()

In [8]:
np.random.seed(0)

N = raw_data.shape[0]
train = np.random.choice(raw_data.shape[0], int(N * 0.8), replace=False)
test = np.setdiff1d(np.arange(raw_data.shape[0]), train)
np.random.shuffle(test)

train_data = raw_data.iloc[train, :].reset_index(drop=True)
test_data = raw_data.iloc[test, :].reset_index(drop=True)
print(N, len(train_data), len(test_data))

882 705 177


### multi

In [11]:
def run_iupac_sol_multi_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10):
    asktell = bolift.AskTellFewShotMulti(
        x_formatter=lambda x: f"iupac name {x}",
        y_name="measured log solubility in mols per litre",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        selector_k=k,
        temperature=T
)
    exp_train_data = train_data[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="iupac-sol"
    model_class="multi"
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [14]:
T_list = [1.0]
k_list = [5]
N_list = [1,5,10,25,50,100,250,500]
models_list = ["text-curie-001"]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running iupac-solv multi ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_iupac_sol_multi_ablation(train_data, test_data, model=model, T=T, N=N, k=k)
  print(" --> done")

Running iupac-solv multi ablation with T=1.0, k=5, N=1, model=text-curie-001 

KeyboardInterrupt: 

### topk

In [15]:
def run_iupac_sol_topk_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10):
    asktell = bolift.AskTellFewShotTopk(
        x_formatter=lambda x: f"iupac name {x}",
        y_name="measured log solubility in mols per litre",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        selector_k=k,
        temperature=T
    )
    x, y, yhat =  run_ablation_experiment(asktell, train_data, test_data)

    data="iupac-sol"
    model_class="topk"
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [16]:
T_list = [0.7]
k_list = [5]
N_list = [700]
models_list = ["text-davinci-003"]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running iupac-sol topk ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_iupac_sol_topk_ablation(train_data, test_data, model=model, T=T, N=N, k=k)
  print(" --> done")

Running iupac-sol topk ablation with T=0.7, k=5, N=700, model=text-davinci-003 

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 5df6fb97810b514626b822a1ba8b3dea in your message.) {
  "error": {
    "message": "The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 5df6fb97810b514626b822a1ba8b3dea in your message.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 5df6fb97810b

KeyboardInterrupt: 

### GPR

In [13]:
def run_iupac_sol_GPR_train(train_data, model="text-ada-001", N=50, k=16, pool=None):
    asktell = bolift.AskTellGPR(
        prefix="The following question should be answered with a number\n",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"iupac name {x}",
        y_name="measured log solubility in mols per litre",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        pool=pool,
        n_components=k,
        # cache_path="GPR_ada_embed_cache.csv"
    )
    # Tell one example so the moduel build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]

    examples = []
    for i in range(len(exp_train_data)):
        examples.append(dict(
            x=asktell.format_x(exp_train_data.iloc[i, 0]),
            y=asktell.format_y(exp_train_data.iloc[i, 1]),
            y_name=asktell._y_name,
            )
        )
    asktell._train(
            [asktell.prompt.format(
                x=ex["x"],
                y_name=asktell._y_name,
                )
             for ex in examples
            ], 
            [ex["y"] for ex in examples]
        )
    return asktell

def run_iupac_sol_GPR_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=16, pool=None):
    asktell = run_iupac_sol_GPR_train(train_data, model="text-ada-001", N=N, k=k, pool=pool)

    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="iupac-sol"
    model_class="GPR"
    # asktell.save_cache("GPR_ada_embed_cache.csv")
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [0.05]
k_list = [2,4,8,16,32,64,128,256]
N_list = [1,5,10,25,50,100,250,500,700]
models_list = ["text-ada-001"]
pool = bolift.Pool(train_data['IUPAC'].to_list(), formatter=lambda x: f"iupac name {x}")
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running iupac-sol GPR ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  pool.reset()
  y, yhat = run_iupac_sol_GPR_ablation(train_data[:N], test_data, model=model, T=T, N=N, k=k, pool=pool)
  print(" --> done")

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error
print(yhat)
plt.plot(y,y)
lim=(min(y),max(y))
plt.xlim(lim)
plt.ylim(lim)
plt.scatter(y, [yhi.mean() for yhi in yhat])
plt.errorbar(y, 
            [yhi.mean() for yhi in yhat], 
            yerr=[yhi.std() for yhi in yhat],
            fmt='.', color='gray', alpha=0.4)
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, [yhi.mean() for yhi in yhat])[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_absolute_error(y, [yhi.mean() for yhi in yhat]):.3f}")

### Finetune

In [None]:
def run_iupac_sol_finetune(train_data, model="text-ada-001", N=50):
    asktell = bolift.AskTellFinetuning(
        prefix="",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        # x_formatter=lambda x: f"iupac name {x}",
        y_name="measured log solubility in mols per litre",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        n_epochs=8,
        learning_rate_multiplier=0.02,
    )
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]

    prompts=[]
    completions=[]
    for i in range(len(train_data[:N])):
        prompts.append(f"What is the measured log solubility in mols per litre of {exp_train_data.iloc[i, 0]}?@@@\\nA: ")
        completions.append(f"{float(exp_train_data.iloc[i, 1])}###")
        asktell.prepare_data(prompts, completions, f'./paper/out/data_solv_{N}.dat')
    asktell.fine_tune(prompts, completions, out_path='./paper/out', out_file=f'FT_solv_{N}')
    print(asktell.get_model_name())

def run_iupac_sol_FT_ablation(train_data, test_data, model="text-ada-001", T=0.05, N=50, k=0):
    with open(f'./paper/out/FT_solv_{N}.dat', 'r') as f:
        response = json.load(f)
    
    asktell = bolift.AskTellFinetuning(
        prefix="",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        # x_formatter=lambda x: f"iupac name {x}",
        y_name="measured log solubility in mols per litre",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        id=response['id'],
        selector_k=0,
        n_epochs=8,
        learning_rate_multiplier=0.02,
    )
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="iupac-sol"
    model_class="finetune"
    save_csv(out_csv_file, x, y, yhat, data, asktell.get_model_name(), T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
N_list=[50,100,250,500,700]
for N in N_list:
  print(f"Running iupac-sol FT with N={N}")
  run_iupac_sol_finetune(train_data, model="text-ada-001", N=N)
  print(" --> done")

In [None]:
T_list = [0.7]
k_list = [0]
N_list=[50,100,250,500,700]
models_list = ["text-ada-001"]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running iupac-sol finetune ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_iupac_sol_FT_ablation(train_data, test_data, model="text-ada-001", T=T, N=N, k=k)
  print(" --> done")


### Ridge Regression

In [None]:
def run_sol_ridge_train(train_data, model="text-ada-001", N=50, k=16, pool=None):
    asktell = bolift.AskTellRidgeKernelRegression(
        prefix="The following question should be answered with a number\n",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"iupac name {x}",
        y_name="measured log solubility in mols per litre",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        alpha=0.5
    )
    # Tell one example so the module build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1], train=False)
    exp_train_data = train_data.iloc[:N]

    examples = []
    for i in range(len(exp_train_data)):
        examples.append(dict(
            x=asktell.format_x(exp_train_data.iloc[i, 0]),
            y=asktell.format_y(exp_train_data.iloc[i, 1]),
            y_name=asktell._y_name,
            )
        )
    asktell._train(
            [asktell.prompt.format(
                x=ex["x"],
                y_name=asktell._y_name,
                )
             for ex in examples
            ], 
            [ex["y"] for ex in examples]
        )
    return asktell

def run_sol_ridge_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10,pool=None):
    asktell = run_sol_ridge_train(train_data, model="text-ada-001", N=N, k=k, pool=pool)

    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="iupac-sol"
    model_class="KRR"
    # asktell.save_cache("GPR_ada_embed_cache.csv")
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [0.05]
k_list = [0]
N_list = [1,5,10,25,50,100,250,500,700]
models_list = ["text-ada-001"]
pool = bolift.Pool(train_data['IUPAC'].to_list(), formatter=lambda x: f"iupac name {x}")
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running sol GPT ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  pool.reset()
  y, yhat = run_sol_ridge_ablation(train_data[:N], test_data, model=model, T=T, N=N, k=k, pool=pool)
  print(" --> done")

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error

plt.plot(y,y)
lim=(min(y),max(y))
# plt.xlim(lim)
# plt.ylim(lim)
plt.scatter(y, [yhi.mean() for yhi in yhat])
# plt.errorbar(y, 
#             [yhi.mean() for yhi in yhat], 
#             yerr=[yhi.std() for yhi in yhat],
#             fmt='.', color='gray', alpha=0.4)
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, [yhi.mean() for yhi in yhat])[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_absolute_error(y, [yhi.mean() for yhi in yhat]):.3f}")

### k-NN

In [None]:
def run_sol_knn_train(train_data, model="text-ada-001", N=50, k=16, pool=None):
    asktell = bolift.AskTellNearestNeighbor(
        prefix="The following question should be answered with a number\n",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        x_formatter=lambda x: f"iupac name {x}",
        y_name="measured log solubility in mols per litre",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        knn=1,
    )
    # Tell one example so the module build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]

    examples = []
    for i in range(len(exp_train_data)):
        asktell.tell(exp_train_data.iloc[i, 0], exp_train_data.iloc[i, 1])
    return asktell

def run_sol_knn_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10,pool=None):
    asktell = run_sol_knn_train(train_data, model="text-ada-001", N=N, k=k, pool=pool)

    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="iupac-sol"
    model_class="KNN"
    # asktell.save_cache("GPR_ada_embed_cache.csv")
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
T_list = [0.05]
k_list = [0]
N_list = [5,10,25,50,100,250,500,700]
models_list = ["text-ada-001"]
pool = bolift.Pool(train_data['IUPAC'].to_list(), formatter=lambda x: f"iupac name {x}")
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running iupac-sol knn ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  pool.reset()
  y, yhat = run_sol_knn_ablation(train_data[:N], test_data, model=model, T=T, N=N, k=k, pool=pool)
  print(" --> done")

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error

plt.plot(y,y)
lim=(min(y),max(y))
# plt.xlim(lim)
# plt.ylim(lim)
plt.scatter(y, [yhi.mean() for yhi in yhat])
# plt.errorbar(y, 
#             [yhi.mean() for yhi in yhat], 
#             yerr=[yhi.std() for yhi in yhat],
#             fmt='.', color='gray', alpha=0.4)
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, [yhi.mean() for yhi in yhat])[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_absolute_error(y, [yhi.mean() for yhi in yhat]):.3f}")

## Tungsten

In [None]:
np.random.seed(8)

data_path = "paper/data/Tungsten carbide data.csv"
raw_data = pd.read_csv(data_path)

N = raw_data.shape[0]
train = np.random.choice(raw_data.shape[0], int(N * 0.8), replace=False)
test = np.setdiff1d(np.arange(raw_data.shape[0]), train)
np.random.shuffle(test)

train_data = raw_data.iloc[train, :].reset_index(drop=True)
test_data = raw_data.iloc[test, :].reset_index(drop=True)
print(N, len(train_data), len(test_data))

In [None]:
print(train_data.iloc[0,1])

In [None]:
asktell = bolift.AskTellFewShotTopk(
  prefix="This model was created to predict CO yield from a given experimental procedure. It is a difficult task and the answer should be numeric.",
  prompt_template=PromptTemplate(
      input_variables=["x", "y", "y_name"],
      template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
  ),
  suffix="What is the {y_name} of {x}?@@@\nA:",
  x_formatter=lambda x: f" the experimental procedure {x}",
  y_name="CO yield",
  y_formatter=lambda y: f"{y:.2f}%.",
  model="text-curie-001",
  selector_k=5,
  temperature=0.05
)

asktell.tell(train_data.iloc[0, 0], float(train_data.iloc[0, 1]))
for i in range(len(train_data)):
    asktell.tell(train_data.iloc[i, 0], float(train_data.iloc[i, 1]))
for i in range(len(test_data)):
    yhat = asktell.predict(test_data.iloc[i, 0]) 
    y = float(test_data.iloc[i, 1])
    print(yhat, y, y-yhat.mean())


## Steel alloy

In [None]:
np.random.seed(8)

data_path = "paper/data/yield_strength.csv"
raw_data = pd.read_csv(data_path)

N = raw_data.shape[0]
train = np.random.choice(raw_data.shape[0], int(N * 0.8), replace=False)
test = np.setdiff1d(np.arange(raw_data.shape[0]), train)
np.random.shuffle(test)

train_data = raw_data.iloc[train, :].reset_index(drop=True)
test_data = raw_data.iloc[test, :].reset_index(drop=True)
print(N, len(train_data), len(test_data))

### multi

In [None]:
def run_alloy_multi_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10):
    asktell = bolift.AskTellFewShotMulti(
        x_formatter=lambda x: f"alloy composition of {x}",
        y_name="yield strength",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        selector_k=k,
        temperature=T
    )
    exp_train_data = train_data[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="alloy"
    model_class="multi"
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat


In [None]:

T_list = [0.05]
k_list = [1,2,3,4]
N_list = [249]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running alloy multi ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_alloy_multi_ablation(train_data, test_data, model=model, T=T, N=N, k=k)
  print(" --> done")

### topk

In [None]:
def run_alloy_topk_ablation(train_data, test_data, model="text-curie-001", T=0.05, N=50, k=10):
    from openai import PromptTemplate
    
    asktell = bolift.AskTellFewShotTopk(
        x_formatter=lambda x: f"alloy composition of {x}",
        y_name="yield strength",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        selector_k=k,
        temperature=T
    )
    exp_train_data = train_data[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)
    data="alloy"
    model_class="topk"
    save_csv(out_csv_file, x, y, yhat, data, model, T, k, N, model_class, asktell.tokens_used)

    return y, yhat


In [None]:
T_list = [0.05]
k_list = [5]
N_list = [50] #[3,4,5,10,25,50,100,200]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running alloy topk ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_alloy_topk_ablation(train_data, test_data, model=model, T=T, N=N, k=k)
  print(" --> done")

### Finetune

In [None]:
def run_alloy_finetune(train_data, model="text-ada-001", N=50):
    asktell = bolift.AskTellFinetuning(
        prefix="",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        # x_formatter=lambda x: f"alloy composition: {x}",
        y_name="yield strength",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        n_epochs=8,
        learning_rate_multiplier=0.02,
    )
    # Tell one example so the moduel build the prompt
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]

    prompts=[]
    completions=[]
    for i in range(len(exp_train_data)):
        prompts.append(f"What is the yield strength of {exp_train_data.iloc[i, 0]}?@@@\\nA: ")
        completions.append(f"{float(exp_train_data.iloc[i, 1])}###")
    asktell.prepare_data(prompts, completions, f'./paper/out/data_alloy_{N}.dat')
    asktell.fine_tune(prompts, completions, out_path='./paper/out', out_file=f'FT_alloy_{N}')
    print(asktell.get_model_name())

def run_alloy_FT_ablation(train_data, test_data, model="text-ada-001", T=0.05, N=50, k=0):
    with open(f'./paper/out/FT_alloy_{N}.dat', 'r') as f:
        response = json.load(f)
    
    asktell = bolift.AskTellFinetuning(
        prefix="",
        prompt_template=PromptTemplate(
            input_variables=["x", "y", "y_name"],
            template="Q: What is the {y_name} of {x}?@@@\nA: {y}###",
        ),
        suffix="What is the {y_name} of {x}?@@@\nA:",
        # x_formatter=lambda x: f"alloy composition: {x}",
        y_name="yield strength",
        y_formatter=lambda y: f"{y:.2f}",
        model=model,
        id=response['id'],
        selector_k=0,
        n_epochs=8,
        learning_rate_multiplier=0.02,
    )
    asktell.tell(train_data.iloc[0, 0], train_data.iloc[0, 1])
    exp_train_data = train_data.iloc[:N]
    x, y, yhat =  run_ablation_experiment(asktell, exp_train_data, test_data)

    data="alloy"
    model_class="finetune"
    save_csv(out_csv_file, x, y, yhat, data, asktell.get_model_name(), T, k, N, model_class, asktell.tokens_used)

    return y, yhat

In [None]:
N_list=[249]
for N in N_list:
  print(f"Running alloy FT with N={N}")
  run_alloy_finetune(train_data, model="text-ada-001", N=N)
  print(" --> done")

In [None]:
T_list = [0.05]
k_list = [0]
N_list = [249]
models_list = ["text-ada-001"]
for T, k, N, model in itertools.product(T_list, k_list, N_list, models_list):
  print(f"Running alloy finetune ablation with T={T}, k={k}, N={N}, model={model}", end=" ")
  y, yhat = run_alloy_FT_ablation(train_data, test_data, model="text-ada-001", T=T, N=N, k=k)
  print(" --> done")
