In [None]:
import pandas as pd
import numpy as np
import os
import sys

from rdkit import Chem
from rdkit.Chem import AllChem

from chembl_structure_pipeline import standardizer

In [None]:
path = "entry_dataset"

# Files this notebook itself writes into `path` in later cells. They must not
# be read back as inputs, otherwise a second run merges the previous run's
# outputs into the raw data.
generated_outputs = {
    "accumulators_smiles.csv",
    "non_accumulators_smiles.csv",
    "merged_cleaned_dataset.csv",
}

# Collect the per-file frames and concatenate once, instead of re-copying the
# growing frame on every iteration. Sorting makes the row order (and therefore
# which duplicate survives later de-duplication) independent of the order in
# which the filesystem happens to list the directory.
frames = []
for filename in sorted(os.listdir(path)):
    file_path = os.path.join(path, filename)
    # Skip our own outputs, hidden files and sub-directories
    # (e.g. .ipynb_checkpoints), none of which are input datasets.
    if (
        filename in generated_outputs
        or filename.startswith(".")
        or not os.path.isfile(file_path)
    ):
        continue
    print(filename)
    frames.append(pd.read_csv(file_path))

# pd.concat raises on an empty list, so fall back to an empty frame, which is
# what the loop produced when the directory contained no files.
merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Keep only the rows that carry an accumulation label.
merged_df = merged_df.dropna(subset=["Accum_class"])

In [None]:
# Lower-case the labels so that later comparisons against "high"/"low" are
# case-insensitive.
merged_df = merged_df.assign(Accum_class=merged_df["Accum_class"].str.lower())

In [None]:
# Collapse repeated (label, structure) pairs, then discard rows that have no
# structure at all.
merged_df = (
    merged_df
    .drop_duplicates(subset=["Accum_class", "smiles"])
    .dropna(subset=["smiles"])
)

In [None]:
# Split the dataset by accumulation class. Explicit copies are taken so that
# pandas does not track these frames as selections of merged_df; modifying
# such a selection in place in a later cell raises SettingWithCopyWarning.
accumulators = merged_df.loc[merged_df["Accum_class"] == "high"].copy()
non_accumulators = merged_df.loc[merged_df["Accum_class"] == "low"].copy()

In [None]:
# Rename the "smiles" column to "Smiles" on both class subsets. The result is
# rebound instead of using inplace=True: the subsets were obtained by filtering
# merged_df, and renaming such a frame in place raises SettingWithCopyWarning.
accumulators = accumulators.rename(columns={"smiles": "Smiles"})
non_accumulators = non_accumulators.rename(columns={"smiles": "Smiles"})

In [None]:
# Write the SMILES column of each class to its own CSV file, without the index.
accumulators["Smiles"].to_csv(
    path_or_buf="entry_dataset/accumulators_smiles.csv",
    index=False,
)
non_accumulators["Smiles"].to_csv(
    path_or_buf="entry_dataset/non_accumulators_smiles.csv",
    index=False,
)

In [None]:
# Memo of raw SMILES string -> standardized parent SMILES
# (None when the structure could not be standardized).
SMILES_CACHE = {}


def get_clean_smiles(smiles):
    """Return the standardized parent SMILES for ``smiles``, or None on failure.

    The input is parsed with RDKit, converted to a molblock, passed through
    ``standardizer.standardize_molblock`` and ``standardizer.get_parent_molblock``,
    and the resulting parent molblock is written back out as SMILES.
    Results (including failures) are memoized in ``SMILES_CACHE`` so each
    distinct input string is processed only once.

    Args:
        smiles: Raw SMILES string. Non-string values (e.g. NaN or None from a
            DataFrame column) are treated as missing.

    Returns:
        The cleaned SMILES string, or None if the input is not a string, cannot
        be parsed, or any standardization step fails.
    """
    # Missing values and other non-strings can never be parsed. Checking this
    # first also keeps unhashable inputs away from the dict lookup below.
    if not isinstance(smiles, str):
        return None
    if smiles in SMILES_CACHE:
        return SMILES_CACHE[smiles]

    clean_smiles = None
    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals an unparseable structure by returning None.
        if mol is not None:
            molblock = Chem.MolToMolBlock(mol)
            std_molblock = standardizer.standardize_molblock(molblock)
            parent_molblock, _ = standardizer.get_parent_molblock(std_molblock)
            parent_mol = Chem.MolFromMolBlock(parent_molblock)
            if parent_mol is not None:
                clean_smiles = Chem.MolToSmiles(parent_mol)
    except Exception:
        # Best effort: a structure that cannot be standardized is reported as
        # None. Only Exception is caught so that KeyboardInterrupt/SystemExit
        # still stop a long-running apply().
        clean_smiles = None

    SMILES_CACHE[smiles] = clean_smiles
    return clean_smiles
    
# merged_df still carries the raw structures in the lowercase "smiles" column
# (the cells above filter and de-duplicate on it); only the per-class subsets
# were renamed. Reading merged_df["Smiles"] therefore raised KeyError on a
# fresh kernel, so standardize from "smiles" and fall back to "Smiles" only
# when the lowercase column is absent.
smiles_source = "smiles" if "smiles" in merged_df.columns else "Smiles"
merged_df["Smiles"] = merged_df[smiles_source].apply(get_clean_smiles)

# Drop the structures that could not be standardized.
merged_df = merged_df.dropna(subset=["Smiles"])

In [None]:
# Persist the merged dataset as CSV, without the index.
merged_df.to_csv(
    path_or_buf="entry_dataset/merged_cleaned_dataset.csv",
    index=False,
)