In [6]:
import pandas as pd
from numpy import argmax
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit import Chem
import pickle
import numpy as np
from mordred import Calculator, descriptors
from standardise_smiles_local_implementation import standardize_jumpcp
from IPython.display import display
from PIL import Image

# Mordred descriptor calculator; 3D descriptors are skipped (ignore_3D=True).
calc = Calculator(descriptors, ignore_3D=True)

# Query compound(s) to score, given as SMILES strings.
smiles = ["CCO"]

# Alert patterns reported under the "DICTrank 1" heading further down.
DICT_rank_substructures_ppv1 = [
    "C=CCCC(=O)",
    "CCOCCC[NH+](C)C",
    "C(CCCCCC(O)CCC)C(=O)",
    "CC[NH+](CC(=O)[O-])CCNC",
    "CCC[NH+](C)C(C(=O)[O-])",
    "CC(C[N+](C)(C))OC",
]

# Alert patterns reported under the "DICT Most-concern category" heading.
DICT_Concern_substructures = [
    "CCOCCC[NH+](C)C",
    "CN(C)c1ccccn1",
]


# Build the working table: one row per input SMILES in column "SMILES", plus
# the output of standardize_jumpcp for each row in "Standardized_SMILES".
df = pd.DataFrame({'SMILES': smiles})
df['Standardized_SMILES'] = df['SMILES'].apply(standardize_jumpcp)

# Load the descriptor column selection that is applied to the Mordred table
# further down.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load data_columns.pkl from a trusted source.
with open('data_columns.pkl', 'rb') as file:
    data_columns = pickle.load(file)

# Parse each standardized SMILES into an RDKit molecule, then compute the
# Mordred descriptors for every molecule and coerce them to floats.
Ser_Mol = df['Standardized_SMILES'].apply(Chem.MolFromSmiles)
Mordred_table = calc.pandas(Ser_Mol).astype('float')

# Keep only the descriptor columns listed in data_columns.
Mordred_table = Mordred_table[data_columns]

# Feature matrix handed to the classifier. np.array makes a copy, so zeroing
# the non-finite entries (NaN, +inf, -inf) leaves Mordred_table untouched.
X = np.array(Mordred_table)
X[~np.isfinite(X)] = 0

# Decision threshold applied to the predicted probability to obtain the binary
# DICTrank label; defined once so the label and the printed value cannot drift.
DICT_RANK_THRESHOLD = 0.641338

# Load the trained classifier inside a context manager so the file handle is
# closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open("FINAL_Physicochemical_model.sav", 'rb') as model_file:
    classifier = pickle.load(model_file)

# Second column of predict_proba — presumably the positive (cardiotoxic)
# class; confirm against classifier.classes_.
prob_test = classifier.predict_proba(X)[:, 1]

df["Probability"] = prob_test
df["Prediction"] = (prob_test >= DICT_RANK_THRESHOLD).astype(int)

# Report the first row only; .iloc makes the positional access explicit
# instead of relying on the index label happening to be 0.
print("Predicted DICTrank:", df["Prediction"].iloc[0])
print("Predicted DICTrank probability:", np.round(df["Probability"].iloc[0], 2))
print("Threshold for Cardiotoxicity: ", np.round(DICT_RANK_THRESHOLD, 2))

# Parse every alert pattern string into an RDKit Mol so it can be used as a
# substructure query.
DICT_Concern_mols = list(map(Chem.MolFromSmiles, DICT_Concern_substructures))
DICT_rank_mols = list(map(Chem.MolFromSmiles, DICT_rank_substructures_ppv1))


def check_substructure_presence(smiles_str, sub_mol):
    """Return whether the molecule parsed from *smiles_str* contains *sub_mol*.

    Args:
        smiles_str: SMILES string of the molecule to search. Any value that
            is not a ``str`` yields ``False``.
        sub_mol: RDKit molecule used as the substructure query.

    Returns:
        The result of ``HasSubstructMatch`` on the parsed molecule, or
        ``False`` when the input is not a string or the parsed molecule is
        falsy.
    """
    if not isinstance(smiles_str, str):
        return False
    parsed = Chem.MolFromSmiles(smiles_str)
    return parsed.HasSubstructMatch(sub_mol) if parsed else False

# Add one boolean "Contains_<pattern>" column per alert substructure, the
# Most-concern patterns first and the DICTrank patterns after them. A pattern
# present in both lists simply has its column written twice with the same
# values.
#
# Plain Series.apply is used on purpose: Series.parallel_apply only exists
# after pandarallel has been imported and initialized, which nothing visible
# here does, and the per-row result is the same either way.
_alert_patterns = zip(
    DICT_Concern_substructures + DICT_rank_substructures_ppv1,
    DICT_Concern_mols + DICT_rank_mols,
)
for pattern, sub_mol in _alert_patterns:
    col_name = f"Contains_{pattern}"
    # Bind the current query as a default argument so the lambda does not
    # depend on the loop variable's later values.
    df[col_name] = df['Standardized_SMILES'].apply(
        lambda x, query=sub_mol: check_substructure_presence(x, query)
    )
from rdkit.Chem import Draw



# Rendering settings passed to the drawing calls below.
molSize = (300, 300)
dpi = 300

# Molecule that is checked for alerts and drawn: the first standardized SMILES.
mol = Chem.MolFromSmiles(df['Standardized_SMILES'].iloc[0])

        
# Define a function to check and display substructures
def check_and_display_substructure(sub_mol):
    """Draw the module-level ``mol`` with *sub_mol* highlighted, plus *sub_mol* itself.

    Despite the name, nothing is displayed here; the images are returned so
    the caller can show them.

    Args:
        sub_mol: RDKit molecule used as the substructure query.

    Returns:
        ``(main_img, sub_img)``: ``mol`` drawn at ``molSize`` with the atoms
        of the match highlighted, and a drawing of ``sub_mol``. Returns
        ``(None, None)`` when ``mol`` does not contain ``sub_mol``.
    """
    if not mol.HasSubstructMatch(sub_mol):
        return None, None

    matched_atoms = mol.GetSubstructMatch(sub_mol)
    highlighted = Draw.MolToImage(mol, size=molSize, highlightAtoms=matched_atoms, dpi=dpi)
    pattern_img = Draw.MolToImage(sub_mol, dpi=dpi)
    return highlighted, pattern_img
        
# Show the structural alerts group by group: DICTrank first, then the DICT
# Most-concern category. A group's heading is printed only when at least one
# of its substructures occurs in mol; each hit is then shown as the molecule
# with the match highlighted, followed by the substructure drawing.
for heading, group_mols in (
    ("Structural Alerts for DICTrank 1", DICT_rank_mols),
    ("Structural Alerts for DICT Most-concern category", DICT_Concern_mols),
):
    if not any(mol.HasSubstructMatch(sub_mol) for sub_mol in group_mols):
        continue
    print(heading)
    for sub_mol in group_mols:
        main_img, sub_img = check_and_display_substructure(sub_mol)
        if main_img is None:
            continue
        display(main_img)
        display(sub_img)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  6.56it/s]


Predicted DICTrank: 0
Predicted DICTrank probability: 0.5
Threshold for Cardiotoxicity:  0.64
