In [None]:
!pip install py2opsin
!pip install rdkit
!pip install chemspipy
!pip install pubchempy
!pip install molvs

In [None]:
# import required modules
import os
import ast
import re
from collections import defaultdict
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from py2opsin import py2opsin
import pubchempy as pcp
from chemspipy import ChemSpider
from tqdm import tqdm
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import Draw
from molvs import standardize_smiles

RDLogger.DisableLog("rdApp.*")  # ignore the rdkit warnings

In [None]:
# Mount Google Drive at /content/drive; the data paths used in the cells below
# all live under that mount point.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
def load_data(data_dir):
    """Load every ``.xlsx`` file in ``data_dir`` into a single DataFrame.

    For each workbook only the reaction ID, reaction SMILES, temperature,
    yield, reagent, solvent and catalyst columns are kept, and the last two
    rows (disclaimer and copyright) are discarded. Exact duplicate rows are
    removed from the combined frame.

    Parameters
    ----------
    data_dir : str
        Directory that contains the ``.xlsx`` exports. Files with any other
        extension are ignored.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all workbooks with a fresh integer index (gaps are
        left where duplicates were dropped). An empty DataFrame is returned
        when the directory holds no ``.xlsx`` file.
    """
    columns = [
        "Reaction ID",
        "Reaction",
        "Temperature (Reaction Details) [C]",
        "Yield (numerical)",
        "Reagent",
        "Solvent (Reaction Details)",
        "Catalyst",
    ]

    frames = []
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore which of several duplicates is kept) reproducible.
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".xlsx"):
            continue
        sheet = pd.read_excel(os.path.join(data_dir, file_name), engine="openpyxl")
        # Remove disclaimer and copyright (the last two rows of each export).
        frames.append(sheet[columns][:-2])

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True).drop_duplicates()

In [None]:
def preprocess_data(df):
    """Clean raw reaction records and derive per-record condition columns.

    Steps, in order:

    1. Drop records without a yield, solvent, reaction SMILES or temperature.
       Records without reagents or catalysts are retained, as there are known
       examples where, e.g., heating is sufficient for reactivity.
    2. Split the ``"; "``-separated yield, reagent, catalyst and solvent cells
       into lists (yields become tuples).
    3. Split ``Reaction`` at ``">>"`` into reactant/product SMILES strings and
       into ``"."``-separated component lists.
    4. Drop half reactions (nothing before or after ``">>"``) and records with
       a component RDKit cannot parse.
    5. Merge ``Reagent`` and ``Catalyst`` into ``Reagents``; reagents and
       solvents are stripped, de-duplicated and sorted into tuples.
    6. Extract the highest temperature of each record as ``Max Temperature``.
    7. Drop records that repeat the same reaction ID, maximum temperature,
       solvents and reagents.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns produced by ``load_data``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned records with the columns ``Reaction ID``, ``Reaction``,
        ``Temperature (Reaction Details) [C]``, ``Max Temperature``,
        ``Solvent (Reaction Details)``, ``Reagents``, ``Reactants``,
        ``Products``, ``Reactants Smiles``, ``Products Smiles`` and
        ``Yield (numerical)``.
    """
    # remove reactions without reported yields, solvent, reaction SMILES, or temperature
    required = [
        "Yield (numerical)",
        "Solvent (Reaction Details)",
        "Reaction",
        "Temperature (Reaction Details) [C]",
    ]
    # Work on an explicit copy so the column assignments below never write
    # into a slice of the caller's frame.
    df = df.dropna(subset=required).copy()

    # Separate multiple components of each reaction condition type
    col_splits = [
        "Yield (numerical)",
        "Reagent",
        "Catalyst",
        "Solvent (Reaction Details)",
    ]
    for col in col_splits:
        df[col] = df[col].str.split("; ")
    df["Yield (numerical)"] = df["Yield (numerical)"].apply(tuple)

    # split 'Reaction' into reactant and product sides at '>>'; the sides are
    # kept both as SMILES strings and as lists of '.'-separated components
    sides = df["Reaction"].str.split(">>", expand=True)
    df[["Reactants Smiles", "Products Smiles"]] = sides
    df[["Reactants", "Products"]] = sides
    for col in ["Reactants", "Products"]:
        df[col] = df[col].str.split(".")

    # remove reactions without recorded structures (half reactions)
    df = df[~df["Reaction"].str.contains("^>>|>>$")]

    # remove reactions with a reactant or product that RDKit cannot parse
    def _all_parsable(smiles_list):
        return all(Chem.MolFromSmiles(smiles) is not None for smiles in smiles_list)

    parsable = pd.Series(
        [
            _all_parsable(reactants + products)
            for reactants, products in zip(df["Reactants"], df["Products"])
        ],
        dtype=bool,
    ).to_numpy()
    df = df[parsable].copy()

    # concatenate 'Reagent' and 'Catalyst' columns to single 'Reagents' lists;
    # cells that are not lists (missing values) count as "no reagent/catalyst"
    for col in ["Reagent", "Catalyst"]:
        df[col] = df[col].apply(lambda names: names if isinstance(names, list) else [])
    df["Reagents"] = df["Reagent"] + df["Catalyst"]
    df = df.drop(["Reagent", "Catalyst"], axis=1)

    # remove duplicated reagents and solvents
    # duplicate compounds and reagents often just refer to multiple equivalents
    # duplicate solvents may refer to materials being added together as solutions
    for col in ["Reagents", "Solvent (Reaction Details)"]:
        df[col] = df[col].apply(lambda x: tuple(sorted(set(map(str.strip, x)))))

    # keep only the maximum temperature, as the reaction should occur at high temperatures
    def find_max_temperature(x):
        # Match whole decimal numbers so that "-78.5" is read as -78.5 and not
        # as the two integers -78 and 5. A hyphen directly attached to a number
        # is read as a minus sign.
        values = [float(num) for num in re.findall(r"-?\d+(?:\.\d+)?", str(x))]
        if not values:
            return None
        highest = max(values)
        # Integral temperatures stay ints so all-integer data keeps an int column.
        return int(highest) if highest.is_integer() else highest

    df["Max Temperature"] = df["Temperature (Reaction Details) [C]"].apply(
        find_max_temperature
    )

    # Remove duplicated reaction condition records
    df = df.drop_duplicates(
        subset=[
            "Reaction ID",
            "Max Temperature",
            "Solvent (Reaction Details)",
            "Reagents",
        ]
    )

    return df[
        [
            "Reaction ID",
            "Reaction",
            "Temperature (Reaction Details) [C]",
            "Max Temperature",
            "Solvent (Reaction Details)",
            "Reagents",
            "Reactants",
            "Products",
            "Reactants Smiles",
            "Products Smiles",
            "Yield (numerical)",
        ]
    ]

In [None]:
def plot_dist(df):
    """Plot, per column, how many records hold each number of elements.

    One bar chart is drawn for each of the reactant, product, solvent,
    reagent and yield columns: the x axis is the number of elements in a
    record's cell (``len`` of the cell), the y axis the number of records.

    Columns that are missing, whose cells have no length (for example yields
    already converted to floats), or that contain no rows are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the list-valued columns to summarize.
    """
    for variable in [
        "Reactants",
        "Products",
        "Solvent (Reaction Details)",
        "Reagents",
        "Yield (numerical)",
    ]:
        try:
            value_counts = df[variable].map(len).value_counts()
        except (KeyError, TypeError):
            # Best effort: this column cannot be summarized by element count.
            continue
        if value_counts.empty:
            continue

        tallest = value_counts.max()
        plt.figure(figsize=(4, 4))
        bars = plt.bar(value_counts.index, value_counts.values, width=0.6)
        plt.xticks(value_counts.index)
        # Enlarge space from the top of the bar to the top edge of the box
        plt.ylim(top=tallest * 1.1)
        # Add count values as labels on the bars. The gap scales with the
        # tallest bar so the labels stay inside the axes for any data size.
        for bar, count in zip(bars, value_counts.values):
            plt.text(
                bar.get_x() + bar.get_width() / 2,
                count + tallest * 0.02,
                str(count),
                ha="center",
                fontsize=10,
                color="black",
            )
        plt.xlabel("Number of elements")
        plt.ylabel("Count")
        plt.title(variable)

In [None]:
# Standardize the labels for reagents and solvents
# Check nomenclature for reagents and solvents

# Take the ChemSpider key from the CHEMSPIDER_API_KEY environment variable so
# it does not have to be written into the notebook; the placeholder is used
# when the variable is not set.
ChemSpider_API_Key = os.environ.get("CHEMSPIDER_API_KEY", "Your_API_Key")
cs = ChemSpider(ChemSpider_API_Key)


def PCPconvert(name):
    """Resolve a chemical name to SMILES through PubChem.

    Parameters
    ----------
    name : str
        Chemical name passed to ``pcp.get_compounds`` as a ``"name"`` query.

    Returns
    -------
    str or None
        ``canonical_smiles`` of the first compound returned, or ``None`` when
        the query returns no compound (or the first hit lacks that attribute).
    """
    compounds = pcp.get_compounds(name, "name")
    if not compounds:
        return None
    return getattr(compounds[0], "canonical_smiles", None)


def ChemSpiderConvert(name):
    """Resolve a chemical name to SMILES with the module-level client ``cs``.

    Returns the ``smiles`` attribute of the first search hit, or ``None`` when
    the search comes back empty.
    """
    hits = cs.search(name)
    return hits[0].smiles if len(hits) != 0 else None


def OpsinConvert(name):
    """Resolve a chemical name to SMILES with OPSIN (via ``py2opsin``).

    Parameters
    ----------
    name : str
        Chemical name to convert.

    Returns
    -------
    str or None
        The string returned by ``py2opsin``, or ``None`` when the conversion
        produced nothing.
    """
    result = py2opsin(name)
    # Treat every falsy result as "not resolved": an empty string, and also
    # False, which py2opsin is documented to return on errors (TODO confirm for
    # the installed version). Returning None lets callers fall back to other
    # resolvers.
    if not result:
        return None
    return result

In [None]:
def analyze_nomenclature(df, property_name):
    """Tabulate how often each chemical name occurs in a list-valued column.

    The cells of ``df[property_name]`` are exploded into individual names.
    The result has one row per distinct name, most frequent first, with the
    columns ``name``, ``count``, ``frequency`` (share of all occurrences) and
    ``cumulative`` (running sum of ``frequency``).
    """
    occurrences = df[property_name].explode()
    counts = occurrences.value_counts()
    shares = occurrences.value_counts(normalize=True)

    rows = list(zip(counts.index, counts, shares))
    analyzed_df = pd.DataFrame(rows, columns=["name", "count", "frequency"])
    analyzed_df["cumulative"] = analyzed_df["frequency"].cumsum()
    return analyzed_df

In [None]:
def get_name_map_dict(df, name):
    """Map every resolvable chemical name to the main name of its compound.

    Names in column ``name`` that occur at least 10 times are converted to
    SMILES (OPSIN first, then PubChem, then ChemSpider) and standardized.
    Names that end up with the same SMILES are treated as synonyms and are
    all mapped to the most frequent one.

    Names that occur fewer than 10 times, or for which no SMILES could be
    obtained or standardized, are left out of the mapping. In particular,
    unresolved names are NOT grouped together: without a structure there is
    no evidence that two of them denote the same chemical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``name`` holds sequences of chemical names.
    name : str
        Column to analyze, e.g. ``"Reagents"``.

    Returns
    -------
    dict
        ``{chemical name: main name}``; each main name maps to itself.
    """
    name_df = analyze_nomenclature(df, name)
    # remove the entries with rare chemicals whose count in the dataset is less than 10
    name_df = name_df[name_df["count"] >= 10]
    chemical_names = list(name_df["name"])

    # Standardize the labels: OPSIN, PubChem and ChemSpider are queried in
    # turn until one of them returns a SMILES for the name.
    smiles_list = []
    for chemical_name in tqdm(chemical_names):
        smiles = OpsinConvert(chemical_name)
        if smiles is None:
            smiles = PCPconvert(chemical_name)
        if smiles is None:
            smiles = ChemSpiderConvert(chemical_name)
        if smiles is not None:
            try:
                smiles = standardize_smiles(smiles)
            except Exception:
                # A SMILES that cannot be standardized counts as unresolved.
                smiles = None
        smiles_list.append(smiles)

    # Group names by SMILES. Names are ordered by decreasing count, so the
    # first name in every group is the most frequent one.
    name_dict = defaultdict(list)
    for smiles, chemical_name in zip(smiles_list, chemical_names):
        if smiles is None:
            continue
        name_dict[smiles].append(chemical_name)

    # Map each chemical name to the most frequently appearing name for the
    # same chemical in the dataset
    name_map_dict = {}
    for synonyms in name_dict.values():
        main_name = synonyms[0]
        for synonym in synonyms:
            name_map_dict[synonym] = main_name

    return name_map_dict

In [None]:
# Replace the labels
def get_names(names, map_dic):
    """Translate each entry of ``names`` through ``map_dic``.

    Returns a list of the same length as ``names``; entries that are not a key
    of ``map_dic`` become ``None``.
    """

    def _lookup(label):
        try:
            return map_dic[label]
        except KeyError:
            return None

    return [_lookup(label) for label in names]


def replace_label(df, reagent_name_map_dict, solvent_name_map_dict):
    """Replace reagent and solvent names by their main names.

    Every name in ``Reagents`` and ``Solvent (Reaction Details)`` is looked up
    in the corresponding map. Records that contain a name without a mapping
    (the labels without a corresponding SMILES) are dropped. The mapped names
    of each kept record are de-duplicated and sorted, so that records which
    only differed by synonyms or by name order become identical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the two label columns. It is not modified.
    reagent_name_map_dict, solvent_name_map_dict : dict
        ``{name: main name}`` mappings as built by ``get_name_map_dict``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the kept records; both label columns hold sorted
        lists of main names.
    """
    result = df.copy()
    for column, name_map in (
        ("Reagents", reagent_name_map_dict),
        ("Solvent (Reaction Details)", solvent_name_map_dict),
    ):
        mapped = result[column].apply(lambda names: get_names(names, name_map))
        # get_names yields None for unmapped names, and None is not a key.
        keep = mapped.apply(
            lambda labels: all(label in name_map for label in labels)
        ).to_numpy(dtype=bool)
        result = result[keep].copy()
        # Sort only after filtering: None cannot be ordered against strings.
        result[column] = mapped[keep].apply(lambda labels: sorted(set(labels)))

    return result

# Running preprocessing and saving preprocessed data

In [None]:
def overview_df(df):
    """Print a short summary of a reaction frame.

    Prints the number of distinct ``Reaction ID`` values, the number of rows
    (reaction contexts), and then the ``DataFrame.info()`` report with the
    non-null counts and dtypes of every column.
    """
    reaction_ids = set(df["Reaction ID"])
    print(f"Total reactions: {len(reaction_ids)}")
    print(f"Total contexts: {len(df)}")

    # overview of missing values and dtypes
    df.info()

In [None]:
def visualize(df, index):
    """Draw the two sides of the reaction stored at row position ``index``.

    The ``Reaction`` string of that row is split at ``">>"``; each side is
    parsed with RDKit and the two molecules are returned as a grid image with
    the legends "Reactant" and "Product".
    """
    reaction = df.iloc[index]["Reaction"]
    reac_smiles, prod_smiles = reaction.split(">>")
    mols = [Chem.MolFromSmiles(side) for side in (reac_smiles, prod_smiles)]
    return Draw.MolsToGridImage(mols, legends=["Reactant", "Product"])

## Negishi reactions

In [None]:
# Load data: read every .xlsx export in the Negishi raw-data folder into one
# frame, then print the reaction/record counts and the column overview.
data_dir = "/content/drive/MyDrive/SCL_Space/data/Reaxys/Raw_data/negishi/"
negishi_df = load_data(data_dir)
overview_df(negishi_df)
negishi_df

In [None]:
# Preprocess data: drop incomplete or unparsable records, split the condition
# columns into lists and extract the maximum temperature (see preprocess_data).
negishi_df = preprocess_data(negishi_df)
overview_df(negishi_df)
negishi_df

In [None]:
# Visualize the distribution for each variable: number of records per number
# of reactants, products, solvents, reagents and yields (after preprocessing).
plot_dist(negishi_df)

In [None]:
# Keep only reaction conditions with exactly 2 reactants, exactly 1 product,
# at most 2 solvents, between 1 and 4 reagents, and a single recorded yield.
reagent_counts = negishi_df["Reagents"].map(len)
keep = (
    (negishi_df["Reactants"].map(len) == 2)
    & (negishi_df["Products"].map(len) == 1)
    & (negishi_df["Solvent (Reaction Details)"].map(len) < 3)
    & (reagent_counts < 5)
    & (reagent_counts > 0)
    & (negishi_df["Yield (numerical)"].map(len) == 1)
)
# Copy the kept rows so the yield conversion below writes to an independent
# frame rather than to a slice of the unfiltered one.
negishi_df = negishi_df[keep].copy()

# Every remaining record holds exactly one yield string; store it as a float.
negishi_df["Yield (numerical)"] = negishi_df["Yield (numerical)"].apply(
    lambda x: float(x[0])
)
overview_df(negishi_df)
negishi_df

In [None]:
# Visualize the distribution for each variable again, now that the records
# have been filtered by component counts. The yield column holds floats at
# this point, so plot_dist skips it.
plot_dist(negishi_df)

In [None]:
# Standardize the labels for reagents and solvents: build one name -> main-name
# map per column (names seen fewer than 10 times get no entry), then relabel
# the records and drop those that contain a name without an entry.
reagent_name_map_dict = get_name_map_dict(negishi_df, "Reagents")
solvent_name_map_dict = get_name_map_dict(negishi_df, "Solvent (Reaction Details)")
negishi_df = replace_label(negishi_df, reagent_name_map_dict, solvent_name_map_dict)
overview_df(negishi_df)
negishi_df

In [None]:
# Relabelling can turn previously distinct records into duplicates, so
# de-duplicate once more. Lists are unhashable, hence the temporary tuples.
label_columns = ["Solvent (Reaction Details)", "Reagents"]
negishi_df = negishi_df.copy()  # own the data before assigning columns
for key in label_columns:
    negishi_df[key] = negishi_df[key].apply(tuple)
negishi_df = negishi_df.drop_duplicates(
    subset=["Reaction ID", "Max Temperature", "Solvent (Reaction Details)", "Reagents"]
).copy()
for key in label_columns:
    negishi_df[key] = negishi_df[key].apply(list)
negishi_df

In [None]:
# Save result
save_path = (
    "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/negishi/negishi.csv"
)
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
negishi_df.to_csv(save_path, index=False)

In [None]:
# Visualize the distribution for each variable in the records that were saved.
plot_dist(negishi_df)

## Buchwald reactions

In [None]:
# Load data: read every .xlsx export in the Buchwald raw-data folder into one
# frame, then print the reaction/record counts and the column overview.
data_dir = "/content/drive/MyDrive/SCL_Space/data/Reaxys/Raw_data/buchwald/"
buchwald_df = load_data(data_dir)
overview_df(buchwald_df)
buchwald_df

In [None]:
# Preprocess data: drop incomplete or unparsable records, split the condition
# columns into lists and extract the maximum temperature (see preprocess_data).
buchwald_df = preprocess_data(buchwald_df)
overview_df(buchwald_df)
buchwald_df

In [None]:
# Visualize the distribution for each variable: number of records per number
# of reactants, products, solvents, reagents and yields (after preprocessing).
plot_dist(buchwald_df)

In [None]:
# Keep only reaction conditions with exactly 2 reactants, exactly 1 product,
# at most 2 solvents, between 1 and 4 reagents, and a single recorded yield.
reagent_counts = buchwald_df["Reagents"].map(len)
keep = (
    (buchwald_df["Reactants"].map(len) == 2)
    & (buchwald_df["Products"].map(len) == 1)
    & (buchwald_df["Solvent (Reaction Details)"].map(len) < 3)
    & (reagent_counts < 5)
    & (reagent_counts > 0)
    & (buchwald_df["Yield (numerical)"].map(len) == 1)
)
# Copy the kept rows so the yield conversion below writes to an independent
# frame rather than to a slice of the unfiltered one.
buchwald_df = buchwald_df[keep].copy()

# Every remaining record holds exactly one yield string; store it as a float.
buchwald_df["Yield (numerical)"] = buchwald_df["Yield (numerical)"].apply(
    lambda x: float(x[0])
)
overview_df(buchwald_df)
buchwald_df

In [None]:
# Visualize the distribution for each variable again, now that the records
# have been filtered by component counts. The yield column holds floats at
# this point, so plot_dist skips it.
plot_dist(buchwald_df)

In [None]:
# Standardize the labels for reagents and solvents: build one name -> main-name
# map per column (names seen fewer than 10 times get no entry), then relabel
# the records and drop those that contain a name without an entry.
reagent_name_map_dict = get_name_map_dict(buchwald_df, "Reagents")
solvent_name_map_dict = get_name_map_dict(buchwald_df, "Solvent (Reaction Details)")
buchwald_df = replace_label(buchwald_df, reagent_name_map_dict, solvent_name_map_dict)
overview_df(buchwald_df)
buchwald_df

In [None]:
# Relabelling can turn previously distinct records into duplicates, so
# de-duplicate once more. Lists are unhashable, hence the temporary tuples.
label_columns = ["Solvent (Reaction Details)", "Reagents"]
buchwald_df = buchwald_df.copy()  # own the data before assigning columns
for key in label_columns:
    buchwald_df[key] = buchwald_df[key].apply(tuple)
buchwald_df = buchwald_df.drop_duplicates(
    subset=["Reaction ID", "Max Temperature", "Solvent (Reaction Details)", "Reagents"]
).copy()
for key in label_columns:
    buchwald_df[key] = buchwald_df[key].apply(list)
buchwald_df

In [None]:
# Save result
save_path = "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/buchwald/buchwald.csv"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
buchwald_df.to_csv(save_path, index=False)

In [None]:
# Visualize the distribution for each variable in the records that were saved.
plot_dist(buchwald_df)

## Suzuki reactions

In [None]:
# Load data: read every .xlsx export in the Suzuki raw-data folder into one
# frame, then print the reaction/record counts and the column overview.
data_dir = "/content/drive/MyDrive/SCL_Space/data/Reaxys/Raw_data/suzuki/"
suzuki_df = load_data(data_dir)
overview_df(suzuki_df)
suzuki_df

In [None]:
# Preprocess data: drop incomplete or unparsable records, split the condition
# columns into lists and extract the maximum temperature (see preprocess_data).
suzuki_df = preprocess_data(suzuki_df)
overview_df(suzuki_df)
suzuki_df

In [None]:
# Visualize the distribution for each variable: number of records per number
# of reactants, products, solvents, reagents and yields (after preprocessing).
plot_dist(suzuki_df)

In [None]:
# Keep only reaction conditions with exactly 2 reactants, exactly 1 product,
# at most 2 solvents, between 1 and 4 reagents, and a single recorded yield.
reagent_counts = suzuki_df["Reagents"].map(len)
keep = (
    (suzuki_df["Reactants"].map(len) == 2)
    & (suzuki_df["Products"].map(len) == 1)
    & (suzuki_df["Solvent (Reaction Details)"].map(len) < 3)
    & (reagent_counts < 5)
    & (reagent_counts > 0)
    & (suzuki_df["Yield (numerical)"].map(len) == 1)
)
# Copy the kept rows so the yield conversion below writes to an independent
# frame rather than to a slice of the unfiltered one.
suzuki_df = suzuki_df[keep].copy()

# Every remaining record holds exactly one yield string; store it as a float.
suzuki_df["Yield (numerical)"] = suzuki_df["Yield (numerical)"].apply(
    lambda x: float(x[0])
)
overview_df(suzuki_df)
suzuki_df

In [None]:
# Visualize the distribution for each variable again, now that the records
# have been filtered by component counts. The yield column holds floats at
# this point, so plot_dist skips it.
plot_dist(suzuki_df)

In [None]:
# Standardize the labels for reagents and solvents: build one name -> main-name
# map per column (names seen fewer than 10 times get no entry), then relabel
# the records and drop those that contain a name without an entry.
reagent_name_map_dict = get_name_map_dict(suzuki_df, "Reagents")
solvent_name_map_dict = get_name_map_dict(suzuki_df, "Solvent (Reaction Details)")
suzuki_df = replace_label(suzuki_df, reagent_name_map_dict, solvent_name_map_dict)
overview_df(suzuki_df)
suzuki_df

In [None]:
# Relabelling can turn previously distinct records into duplicates, so
# de-duplicate once more. Lists are unhashable, hence the temporary tuples.
label_columns = ["Solvent (Reaction Details)", "Reagents"]
suzuki_df = suzuki_df.copy()  # own the data before assigning columns
for key in label_columns:
    suzuki_df[key] = suzuki_df[key].apply(tuple)
suzuki_df = suzuki_df.drop_duplicates(
    subset=["Reaction ID", "Max Temperature", "Solvent (Reaction Details)", "Reagents"]
).copy()
for key in label_columns:
    suzuki_df[key] = suzuki_df[key].apply(list)
suzuki_df

In [None]:
# Save result
save_path = (
    "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/suzuki/suzuki.csv"
)
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
suzuki_df.to_csv(save_path, index=False)

In [None]:
# Visualize the distribution for each variable in the records that were saved.
plot_dist(suzuki_df)

# Create features file

In [None]:
def int_to_binary(integer_value, num_bits=8):
    """Encode a non-negative integer as a fixed-width list of bits.

    Parameters
    ----------
    integer_value : int or value accepted by ``int()``
        Value to encode; floats are truncated by ``int()``.
    num_bits : int, default 8
        Width of the encoding.

    Returns
    -------
    list of int
        ``num_bits`` zeros and ones, most significant bit first.

    Raises
    ------
    ValueError
        If the value cannot be converted to an integer, is negative, or does
        not fit in ``num_bits`` bits. Rejecting these keeps every encoding the
        same length, which callers rely on when stacking rows into columns.
    """
    try:
        value = int(integer_value)
    except (TypeError, ValueError, OverflowError) as exc:
        raise ValueError(f"cannot encode {integer_value!r} as an integer") from exc

    if value < 0 or value >= 2**num_bits:
        raise ValueError(
            f"{integer_value!r} does not fit in {num_bits} unsigned bits"
        )

    # The 'b' format specifier renders the zero-padded binary representation.
    return [int(bit) for bit in format(value, f"0{num_bits}b")]

## Negishi reactions

In [None]:
# Reload the preprocessed Negishi records and keep those whose maximum
# temperature lies between 20 and 150 (both inclusive).
data_path = (
    "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/negishi/negishi.csv"
)
negishi_df = pd.read_csv(data_path)
# The CSV stores each label list as its text representation; parse it back.
for list_column in ["Reagents", "Solvent (Reaction Details)"]:
    negishi_df[list_column] = negishi_df[list_column].apply(ast.literal_eval)
in_range = negishi_df["Max Temperature"].between(20, 150)
negishi_df = negishi_df[in_range].reset_index(drop=True)
negishi_df

In [None]:
temperature_all = negishi_df["Max Temperature"]
# Print the lowest and highest remaining temperature as a sanity check
for label, extreme in (("min:", temperature_all.min()), ("max:", temperature_all.max())):
    print(label, extreme)
# Histogram of the temperatures that passed the range filter
plot = temperature_all.hist(grid=False)

In [None]:
# Temperature quantiles: bin the temperatures into up to 10 quantile intervals
# (duplicate bin edges are dropped) and one-hot encode the interval labels.
temperature_tiles = pd.qcut(temperature_all, 10, duplicates="drop")
negishi_df["Max Temperature Quantiles"] = temperature_tiles.apply(str)
temps = [[str(i)] for i in temperature_tiles]
temp_mlb = MultiLabelBinarizer()
temp_encoded = pd.DataFrame(
    temp_mlb.fit_transform(temps), columns=temp_mlb.classes_, index=negishi_df.index
)
temp_encoded

In [None]:
# 8-bit binary encoding of the temperature, most significant bit first; the
# columns are named after their place value ("128" ... "1").
temperature_data = negishi_df["Max Temperature"].apply(int_to_binary)
columns = [str(2 ** (i - 1)) for i in range(8, 0, -1)]
# Share the records' index, like the other encodings, so that a later
# label-based row selection picks the matching rows.
temp_binary_encoded = pd.DataFrame(
    temperature_data.tolist(), columns=columns, index=negishi_df.index
)
temp_binary_encoded

In [None]:
# Reagent labels: one indicator column per distinct reagent name.
reagent_mlb = MultiLabelBinarizer()
reagent_matrix = reagent_mlb.fit_transform(negishi_df["Reagents"])
reagent_encoded = pd.DataFrame(
    reagent_matrix, columns=reagent_mlb.classes_, index=negishi_df.index
)
reagent_encoded

In [None]:
# Solvent labels: one indicator column per distinct solvent name.
sol_mlb = MultiLabelBinarizer()
sol_matrix = sol_mlb.fit_transform(negishi_df["Solvent (Reaction Details)"])
sol_encoded = pd.DataFrame(
    sol_matrix, columns=sol_mlb.classes_, index=negishi_df.index
)
sol_encoded

In [None]:
# Drop duplicate condition records, then save the records together with the
# matching rows of every encoding.
label_columns = ["Solvent (Reaction Details)", "Reagents"]
# Lists are unhashable, so compare the label columns as tuples.
for key in label_columns:
    negishi_df[key] = negishi_df[key].apply(tuple)
filtered_negishi_df = negishi_df.drop_duplicates(
    subset=[
        "Reaction ID",
        "Max Temperature",
        "Solvent (Reaction Details)",
        "Reagents",
        "Max Temperature Quantiles",
    ]
).copy()  # independent frame: the list conversion below must not touch a slice
filtered_index = filtered_negishi_df.index
for key in label_columns:
    filtered_negishi_df[key] = filtered_negishi_df[key].apply(list)

output_dir = "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/negishi"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
filtered_negishi_df.to_csv(
    os.path.join(output_dir, "filtered_negishi.csv"), index=False
)
# Each encoding is written with the rows of the kept records only.
for suffix, encoded in [
    ("temp", temp_encoded),
    ("binary_temp", temp_binary_encoded),
    ("reagent", reagent_encoded),
    ("sol", sol_encoded),
]:
    encoded.loc[filtered_index].to_csv(
        os.path.join(output_dir, f"filtered_negishi_{suffix}.csv"), index=False
    )

## Buchwald reactions

In [None]:
# Reload the preprocessed Buchwald records and keep those whose maximum
# temperature lies between 20 and 150 (both inclusive).
data_path = "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/buchwald/buchwald.csv"
buchwald_df = pd.read_csv(data_path)
# The CSV stores each label list as its text representation; parse it back.
for list_column in ["Reagents", "Solvent (Reaction Details)"]:
    buchwald_df[list_column] = buchwald_df[list_column].apply(ast.literal_eval)
in_range = buchwald_df["Max Temperature"].between(20, 150)
buchwald_df = buchwald_df[in_range].reset_index(drop=True)
buchwald_df

In [None]:
temperature_all = buchwald_df["Max Temperature"]
# Print the lowest and highest remaining temperature as a sanity check
for label, extreme in (("min:", temperature_all.min()), ("max:", temperature_all.max())):
    print(label, extreme)
# Histogram of the temperatures that passed the range filter
plot = temperature_all.hist(grid=False)

In [None]:
# Temperature quantiles: bin the temperatures into up to 10 quantile intervals
# (duplicate bin edges are dropped) and one-hot encode the interval labels.
temperature_tiles = pd.qcut(temperature_all, 10, duplicates="drop")
buchwald_df["Max Temperature Quantiles"] = temperature_tiles.apply(str)
temps = [[str(i)] for i in temperature_tiles]
temp_mlb = MultiLabelBinarizer()
temp_encoded = pd.DataFrame(
    temp_mlb.fit_transform(temps), columns=temp_mlb.classes_, index=buchwald_df.index
)
temp_encoded

In [None]:
# 8-bit binary encoding of the temperature, most significant bit first; the
# columns are named after their place value ("128" ... "1").
temperature_data = buchwald_df["Max Temperature"].apply(int_to_binary)
columns = [str(2 ** (i - 1)) for i in range(8, 0, -1)]
# Share the records' index, like the other encodings, so that a later
# label-based row selection picks the matching rows.
temp_binary_encoded = pd.DataFrame(
    temperature_data.tolist(), columns=columns, index=buchwald_df.index
)
temp_binary_encoded

In [None]:
# Reagent labels: one indicator column per distinct reagent name.
reagent_mlb = MultiLabelBinarizer()
reagent_matrix = reagent_mlb.fit_transform(buchwald_df["Reagents"])
reagent_encoded = pd.DataFrame(
    reagent_matrix, columns=reagent_mlb.classes_, index=buchwald_df.index
)
reagent_encoded

In [None]:
# Solvent labels: one indicator column per distinct solvent name.
sol_mlb = MultiLabelBinarizer()
sol_matrix = sol_mlb.fit_transform(buchwald_df["Solvent (Reaction Details)"])
sol_encoded = pd.DataFrame(
    sol_matrix, columns=sol_mlb.classes_, index=buchwald_df.index
)
sol_encoded

In [None]:
# Drop duplicate condition records, then save the records together with the
# matching rows of every encoding.
label_columns = ["Solvent (Reaction Details)", "Reagents"]
# Lists are unhashable, so compare the label columns as tuples.
for key in label_columns:
    buchwald_df[key] = buchwald_df[key].apply(tuple)
filtered_buchwald_df = buchwald_df.drop_duplicates(
    subset=[
        "Reaction ID",
        "Max Temperature",
        "Solvent (Reaction Details)",
        "Reagents",
        "Max Temperature Quantiles",
    ]
).copy()  # independent frame: the list conversion below must not touch a slice
filtered_index = filtered_buchwald_df.index
for key in label_columns:
    filtered_buchwald_df[key] = filtered_buchwald_df[key].apply(list)

output_dir = "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/buchwald"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
filtered_buchwald_df.to_csv(
    os.path.join(output_dir, "filtered_buchwald.csv"), index=False
)
# Each encoding is written with the rows of the kept records only.
for suffix, encoded in [
    ("temp", temp_encoded),
    ("binary_temp", temp_binary_encoded),
    ("reagent", reagent_encoded),
    ("sol", sol_encoded),
]:
    encoded.loc[filtered_index].to_csv(
        os.path.join(output_dir, f"filtered_buchwald_{suffix}.csv"), index=False
    )

## Suzuki reactions

In [None]:
# Reload the preprocessed Suzuki records and keep those whose maximum
# temperature lies between 20 and 150 (both inclusive).
data_path = (
    "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/suzuki/suzuki.csv"
)
suzuki_df = pd.read_csv(data_path)
# The CSV stores each label list as its text representation; parse it back.
for list_column in ["Reagents", "Solvent (Reaction Details)"]:
    suzuki_df[list_column] = suzuki_df[list_column].apply(ast.literal_eval)
in_range = suzuki_df["Max Temperature"].between(20, 150)
suzuki_df = suzuki_df[in_range].reset_index(drop=True)
suzuki_df

In [None]:
temperature_all = suzuki_df["Max Temperature"]
# Print the lowest and highest remaining temperature as a sanity check
for label, extreme in (("min:", temperature_all.min()), ("max:", temperature_all.max())):
    print(label, extreme)
# Histogram of the temperatures that passed the range filter
plot = temperature_all.hist(grid=False)

In [None]:
# Temperature quantiles: bin the temperatures into up to 10 quantile intervals
# (duplicate bin edges are dropped) and one-hot encode the interval labels.
temperature_tiles = pd.qcut(temperature_all, 10, duplicates="drop")
suzuki_df["Max Temperature Quantiles"] = temperature_tiles.apply(str)
temps = [[str(i)] for i in temperature_tiles]
temp_mlb = MultiLabelBinarizer()
temp_encoded = pd.DataFrame(
    temp_mlb.fit_transform(temps), columns=temp_mlb.classes_, index=suzuki_df.index
)
temp_encoded

In [None]:
# 8-bit binary encoding of the temperature, most significant bit first; the
# columns are named after their place value ("128" ... "1").
temperature_data = suzuki_df["Max Temperature"].apply(int_to_binary)
columns = [str(2 ** (i - 1)) for i in range(8, 0, -1)]
# Share the records' index, like the other encodings, so that a later
# label-based row selection picks the matching rows.
temp_binary_encoded = pd.DataFrame(
    temperature_data.tolist(), columns=columns, index=suzuki_df.index
)
temp_binary_encoded

In [None]:
# Reagent labels: one indicator column per distinct reagent name.
reagent_mlb = MultiLabelBinarizer()
reagent_matrix = reagent_mlb.fit_transform(suzuki_df["Reagents"])
reagent_encoded = pd.DataFrame(
    reagent_matrix, columns=reagent_mlb.classes_, index=suzuki_df.index
)
reagent_encoded

In [None]:
# Solvent labels: one indicator column per distinct solvent name.
sol_mlb = MultiLabelBinarizer()
sol_matrix = sol_mlb.fit_transform(suzuki_df["Solvent (Reaction Details)"])
sol_encoded = pd.DataFrame(
    sol_matrix, columns=sol_mlb.classes_, index=suzuki_df.index
)
sol_encoded

In [None]:
# Drop duplicate condition records, then save the records together with the
# matching rows of every encoding.
label_columns = ["Solvent (Reaction Details)", "Reagents"]
# Lists are unhashable, so compare the label columns as tuples.
for key in label_columns:
    suzuki_df[key] = suzuki_df[key].apply(tuple)
filtered_suzuki_df = suzuki_df.drop_duplicates(
    subset=[
        "Reaction ID",
        "Max Temperature",
        "Solvent (Reaction Details)",
        "Reagents",
        "Max Temperature Quantiles",
    ]
).copy()  # independent frame: the list conversion below must not touch a slice
filtered_index = filtered_suzuki_df.index
for key in label_columns:
    filtered_suzuki_df[key] = filtered_suzuki_df[key].apply(list)

output_dir = "/content/drive/MyDrive/SCL_Space/data/Reaxys/preprocessed_data/suzuki"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
filtered_suzuki_df.to_csv(
    os.path.join(output_dir, "filtered_suzuki.csv"), index=False
)
# Each encoding is written with the rows of the kept records only.
for suffix, encoded in [
    ("temp", temp_encoded),
    ("binary_temp", temp_binary_encoded),
    ("reagent", reagent_encoded),
    ("sol", sol_encoded),
]:
    encoded.loc[filtered_index].to_csv(
        os.path.join(output_dir, f"filtered_suzuki_{suffix}.csv"), index=False
    )