In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
import pandas as pd

In [21]:
# 1. Load the dataset and pull out the columns used by the later steps.

# SMILES string of the reference structure every dataset compound is compared to.
ref_compound = 'C1=CC(=C(C=C1CC(C(=O)O)N)O)O'

# The input file is semicolon-separated; it must provide a 'smiles' and a
# 'name' column.
input_file = "compounds.csv"
df = pd.read_csv(input_file, sep=";")
smiles, names = df["smiles"], df["name"]

# 2. compute molecular fingerprints for each compound

# Fingerprint settings shared by the dataset compounds and the reference, kept
# in one place so the two FingerprintMol calls cannot drift apart.
fp_params = dict(minPath=1, maxPath=7, fpSize=2048, bitsPerHash=2,
                 useHs=True, tgtDensity=0.0, minSize=128)

# Chem.MolFromSmiles returns None for a SMILES it cannot parse, and a missing
# CSV value is not a string at all.  Such rows are skipped (and reported)
# instead of crashing the fingerprint step; valid_names stays aligned with ms
# so every similarity score is paired with the right compound name.
ms = []
valid_names = []
for name, smile in zip(names, smiles):
    mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
    if mol is None:
        print(f"Skipping compound with unparsable SMILES: {name!r} ({smile!r})")
        continue
    ms.append(mol)
    valid_names.append(name)

fps = [FingerprintMols.FingerprintMol(mol, **fp_params) for mol in ms]

# Without a valid reference molecule nothing can be scored, so fail loudly.
ref_m = Chem.MolFromSmiles(ref_compound)
if ref_m is None:
    raise ValueError(f"Reference SMILES could not be parsed: {ref_compound!r}")
ref_fp = FingerprintMols.FingerprintMol(ref_m, **fp_params)

# 3. compute Tanimoto similarity measure for each pair of compound x reference compound

# One score per fingerprint in fps, in the same order.
s = DataStructs.BulkTanimotoSimilarity(ref_fp, fps)
# NOTE: keyed by compound name, so rows sharing a name collapse to one entry.
similarities = dict(zip(valid_names, s))

# 4. provide sorting, report top 3 hits

TOP_N = 3  # number of best-scoring compounds to report

# Rank compounds from the highest to the lowest similarity score.
sorted_sims = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True))

# Slicing (instead of indexing positions 0..2) avoids an IndexError when the
# dataset holds fewer than TOP_N compounds, and the ranking is turned into a
# list only once rather than twice per printed line.
for name, score in list(sorted_sims.items())[:TOP_N]:
    print(f"Score: {score} | Compound: {name}")

# and discuss your results (with reference to the first lecture and limitations)
#       For the Tanimoto coefficient, structures are commonly considered similar when they reach a value of at least 0.85,
#       so we could say that L-DOPA, (S)-Methyl 2-amino-3-(3,4-dihydroxyphenyl)propanoate hydrochloride, Melevodopa and
#       Tyrosine are similar to the reference, with L-DOPA being identical according to the coefficient (which is consistent
#       with their SMILES representations being identical).
#       However, this value only expresses the degree of structural similarity, not chemotypic similarity. We therefore
#       cannot conclude that these compounds will have the same effect as the reference.



Score: 1.0 | Compound: L-DOPA (sodium)
Score: 0.8958333333333334 | Compound: (S)-Methyl 2-amino-3-(3,4-dihydroxyphenyl)propanoate hydrochloride
Score: 0.8958333333333334 | Compound: Melevodopa
