## Install necessary packages

In [None]:
# Install descriptastorus directly from its GitHub repository.
%pip install git+https://github.com/bp-kelley/descriptastorus

In [None]:
# Install RDKit (rdkit-pypi wheel), chemfunc and SyntheMol into the notebook's environment.
%pip install rdkit-pypi chemfunc SyntheMol

In [None]:
import pandas as pd
from rdkit import Chem

## Training models

In [None]:
# Chemprop classification
# Trains on the "fluor" column of data/binarized_chemprop_dataset.csv and saves
# the result under models/chemprop. --num_models 5 requests five models
# (presumably an ensemble; confirm against the SyntheMol docs).
!python -m SyntheMol.models.train --data_path data/binarized_chemprop_dataset.csv --save_dir models/chemprop --model_type chemprop --dataset_type classification --property_column fluor --num_models 5

In [None]:
# Chemprop + RDKit classification
# Same dataset and "fluor" target as the plain Chemprop run, with
# --fingerprint_type rdkit added; output goes to models/chemprop_rdkit.
!python -m SyntheMol.models.train --data_path data/binarized_chemprop_dataset.csv --save_dir models/chemprop_rdkit --model_type chemprop --dataset_type classification --fingerprint_type rdkit --property_column fluor --num_models 5

In [None]:
# Chemprop + Morgan fingerprint classification
# Same dataset and "fluor" target as the plain Chemprop run, with
# --fingerprint_type morgan added; output goes to models/chemprop_morgan.
!python -m SyntheMol.models.train --data_path data/binarized_chemprop_dataset.csv --save_dir models/chemprop_morgan --model_type chemprop --dataset_type classification --fingerprint_type morgan --property_column fluor --num_models 5

In [None]:
# Random forest classification
# Random forest on Morgan fingerprints, trained on the "fluor" column of the
# same dataset; output goes to models/random_forest.
!python -m SyntheMol.models.train --data_path data/binarized_chemprop_dataset.csv --save_dir models/random_forest --model_type random_forest --dataset_type classification --fingerprint_type morgan --property_column fluor --num_models 5

## Precompute building block scores

In [None]:
# Chemprop
!python -m SyntheMol.models.predict --data_path files/building_blocks.csv --preds_path models/chemprop/building_block_scores.csv --model_path models/chemprop --model_type chemprop --average_preds

# Rename predictions column to "score" to use with Synthemol.generate
building_blocks_df = pd.read_csv("models/chemprop/building_blocks.csv")

building_blocks_modified_df = building_blocks_df.copy()
building_blocks_modified_df = building_blocks_modified_df.rename(columns={"chemprop_ensemble_preds": "score"})
building_blocks_modified_df.to_csv("models/chemprop/building_blocks.csv", index=False)

In [None]:
# Chemprop + RDKit
!python -m SyntheMol.models.predict --data_path files/building_blocks.csv --preds_path models/chemprop_rdkit/building_block_scores.csv --model_path models/chemprop_rdkit --model_type chemprop --average_preds --fingerprint_type rdkit

# Rename predictions column to "score" to use with Synthemol.generate
building_blocks_df = pd.read_csv("models/chemprop_rdkit/building_blocks.csv")

building_blocks_modified_df = building_blocks_df.copy()
building_blocks_modified_df = building_blocks_modified_df.rename(columns={"chemprop_rdkit_ensemble_preds": "score"})
building_blocks_modified_df.to_csv("models/chemprop_rdkit/building_blocks.csv", index=False)

In [None]:
# Chemprop + Morgan fingerprint
!python -m SyntheMol.models.predict --data_path files/building_blocks.csv --preds_path models/chemprop_morgan/building_block_scores.csv --model_path models/chemprop_morgan --model_type chemprop --average_preds --fingerprint_type morgan

# Rename predictions column to "score" to use with Synthemol.generate
building_blocks_df = pd.read_csv("models/chemprop_morgan/building_blocks.csv")

building_blocks_modified_df = building_blocks_df.copy()
building_blocks_modified_df = building_blocks_modified_df.rename(columns={"chemprop_morgan_ensemble_preds": "score"})
building_blocks_modified_df.to_csv("models/chemprop_morgan/building_blocks.csv", index=False)

In [None]:
# Random forest
!python -m SyntheMol.models.predict --data_path files/building_blocks.csv --preds_path models/random_forest/building_block_scores.csv --model_path models/random_forest --model_type random_forest --average_preds --fingerprint_type morgan

# Rename predictions column to "score" to use with Synthemol.generate
building_blocks_df = pd.read_csv("models/random_forest/building_blocks.csv")

building_blocks_modified_df = building_blocks_df.copy()
building_blocks_modified_df = building_blocks_modified_df.rename(columns={"random_forest_morgan_ensemble_preds": "score"})
building_blocks_modified_df.to_csv("models/random_forest/building_blocks.csv", index=False)

## Generate molecules

In [None]:
# Chemprop
# Run SyntheMol generation with the Chemprop models, using the scored building
# blocks in models/chemprop/building_blocks.csv; results are saved under
# models/chemprop/generate_chemprop.
# NOTE(review): --max_reactions 1 / --n_rollout 20000 presumably cap the number
# of reactions per molecule and the number of search rollouts; confirm against
# the SyntheMol docs.
!python -m SyntheMol.generate --model_path models/chemprop --model_type chemprop --building_blocks_path models/chemprop/building_blocks.csv --save_dir models/chemprop/generate_chemprop --reaction_to_building_blocks_path files/reaction_to_building_blocks.pkl --max_reactions 1 --n_rollout 20000

In [None]:
# Chemprop + RDKit
# Run SyntheMol generation with the Chemprop + RDKit models, using the scored
# building blocks in models/chemprop_rdkit/building_blocks.csv; results are
# saved under models/chemprop_rdkit/generate_chemprop_rdkit.
!python -m SyntheMol.generate --model_path models/chemprop_rdkit --model_type chemprop --fingerprint_type rdkit --building_blocks_path models/chemprop_rdkit/building_blocks.csv --save_dir models/chemprop_rdkit/generate_chemprop_rdkit --reaction_to_building_blocks_path files/reaction_to_building_blocks.pkl --max_reactions 1 --n_rollout 20000

In [None]:
# Chemprop + Morgan fingerprint
# Run SyntheMol generation with the Chemprop + Morgan models, using the scored
# building blocks in models/chemprop_morgan/building_blocks.csv; results are
# saved under models/chemprop_morgan/generate_chemprop_morgan.
!python -m SyntheMol.generate --model_path models/chemprop_morgan --model_type chemprop --fingerprint_type morgan --building_blocks_path models/chemprop_morgan/building_blocks.csv --save_dir models/chemprop_morgan/generate_chemprop_morgan --reaction_to_building_blocks_path files/reaction_to_building_blocks.pkl --max_reactions 1 --n_rollout 20000

In [None]:
# Random forest
!python -m SyntheMol.generate --model_path models/random_forest --model_type random_forest --fingerprint_type morgan --building_blocks_path models/random_forest_morgan/building_blocks.csv --save_dir models/random_forest/generate_random_forest --reaction_to_building_blocks_path files/reaction_to_building_blocks.pkl --max_reactions 1 --n_rollout 20000

## Filter molecules

In [None]:
# Filter Chemprop molecules based on level of conjugation
# Read the generated molecules from the --save_dir used by the Chemprop
# generation command (models/chemprop/generate_chemprop).
df = pd.read_csv("models/chemprop/generate_chemprop/molecules.csv")
mols = [Chem.MolFromSmiles(x) for x in df["smiles"].values]

# Keep molecules with at least 24 aromatic atoms. MolFromSmiles returns None
# for SMILES it cannot parse, so those rows are skipped rather than raising
# AttributeError on None.
indices = [
    index
    for index, genmol in enumerate(mols)
    if genmol is not None and len(genmol.GetAromaticAtoms()) >= 24
]
aromatic_dyes = [mols[index] for index in indices]

# `indices` holds row positions from enumerate, so select by position; copy so
# that adding the "model" column does not trigger SettingWithCopyWarning.
chemprop_molecules_select_df = df.iloc[indices].copy()
chemprop_molecules_select_df["model"] = "chemprop"

In [None]:
# Filter Chemprop + RDKit molecules based on level of conjugation
# Read the generated molecules from the --save_dir used by the Chemprop + RDKit
# generation command (models/chemprop_rdkit/generate_chemprop_rdkit).
df = pd.read_csv("models/chemprop_rdkit/generate_chemprop_rdkit/molecules.csv")
mols = [Chem.MolFromSmiles(x) for x in df["smiles"].values]

# Keep molecules with at least 24 aromatic atoms. MolFromSmiles returns None
# for SMILES it cannot parse, so those rows are skipped rather than raising
# AttributeError on None.
indices = [
    index
    for index, genmol in enumerate(mols)
    if genmol is not None and len(genmol.GetAromaticAtoms()) >= 24
]
aromatic_dyes = [mols[index] for index in indices]

# `indices` holds row positions from enumerate, so select by position; copy so
# that adding the "model" column does not trigger SettingWithCopyWarning.
chemprop_rdkit_molecules_select_df = df.iloc[indices].copy()
chemprop_rdkit_molecules_select_df["model"] = "chemprop_rdkit"

In [None]:
# Filter Chemprop + Morgan molecules based on level of conjugation
# Read the generated molecules from the --save_dir used by the Chemprop + Morgan
# generation command (models/chemprop_morgan/generate_chemprop_morgan).
df = pd.read_csv("models/chemprop_morgan/generate_chemprop_morgan/molecules.csv")
mols = [Chem.MolFromSmiles(x) for x in df["smiles"].values]

# Keep molecules with at least 24 aromatic atoms. MolFromSmiles returns None
# for SMILES it cannot parse, so those rows are skipped rather than raising
# AttributeError on None.
indices = [
    index
    for index, genmol in enumerate(mols)
    if genmol is not None and len(genmol.GetAromaticAtoms()) >= 24
]
aromatic_dyes = [mols[index] for index in indices]

# `indices` holds row positions from enumerate, so select by position; copy so
# that adding the "model" column does not trigger SettingWithCopyWarning.
chemprop_morgan_molecules_select_df = df.iloc[indices].copy()
chemprop_morgan_molecules_select_df["model"] = "chemprop_morgan"

In [None]:
# Filter Random forest molecules based on level of conjugation
# Read the generated molecules from the --save_dir used by the random forest
# generation command (models/random_forest/generate_random_forest).
df = pd.read_csv("models/random_forest/generate_random_forest/molecules.csv")
mols = [Chem.MolFromSmiles(x) for x in df["smiles"].values]

# Keep molecules with at least 24 aromatic atoms. MolFromSmiles returns None
# for SMILES it cannot parse, so those rows are skipped rather than raising
# AttributeError on None.
indices = [
    index
    for index, genmol in enumerate(mols)
    if genmol is not None and len(genmol.GetAromaticAtoms()) >= 24
]
aromatic_dyes = [mols[index] for index in indices]

# `indices` holds row positions from enumerate, so select by position; copy so
# that adding the "model" column does not trigger SettingWithCopyWarning.
random_forest_molecules_select_df = df.iloc[indices].copy()
random_forest_molecules_select_df["model"] = "random_forest"

In [None]:
# Save conjugated molecules
# Stack the four per-model selections into one table (rows renumbered from 0)
# and write it out without the index column.
selected_frames = [
    chemprop_molecules_select_df,
    chemprop_rdkit_molecules_select_df,
    chemprop_morgan_molecules_select_df,
    random_forest_molecules_select_df,
]
df_conjugated_mols = pd.concat(selected_frames, ignore_index=True)
df_conjugated_mols.to_csv("data/conjugated_molecules.csv", index=False)

In [None]:
# Train solubility model
# Chemprop + RDKit classifier trained on the "sol" column of
# data/binarized_aq_sol_data.csv; output goes to models/sol_chemprop_rdkit.
!python -m SyntheMol.models.train --data_path data/binarized_aq_sol_data.csv --save_dir models/sol_chemprop_rdkit --model_type chemprop --dataset_type classification --fingerprint_type rdkit --property_column sol --num_models 5

In [None]:
# Score molecules based on solubility
!python -m SyntheMol.models.predict --data_path data/conjugated_molecules.csv --model_path models/sol_chemprop_rdkit --model_type chemprop --average_preds --fingerprint_type rdkit

df = pd.read_csv("data/conjugated_molecules.csv")

modified_df = df.copy()
modified_df = modified_df.rename(columns={"chemprop_rdkit_ensemble_preds": "sol_score"})

modified_df.to_csv("data/conjugated_molecules.csv", index=False)