In [1]:
import pathlib, os
import pandas as pd
import numpy as np
from scipy import spatial
from rdkit import Chem, DataStructs

parent_data_path = pathlib.Path("__file__").parent.resolve()
data_path = parent_data_path.joinpath("SMILES_METAL_1988_NoPLD.csv")
os.makedirs(parent_data_path.joinpath("similarity"), exist_ok=True)

data = pd.read_csv(data_path)

data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,mof_id_old,mof_id_new,csd_code,linker_smiles,class
0,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,0,0,ABAVIJ,OC(=O)c1ccncc1,1
1,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,1,1,ABAVOP,OC(=O)c1ccncc1,1
2,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,2,2,ABAVUV,OC(=O)c1ccncc1,1
3,0.494382,0.436748,0.261111,0.838488,0.046125,0.278014,3,3,ABAYIM,OC(=O)c1nccnc1C(=O)O,0
4,0.247191,0.2077,0.238889,0.591065,0.073801,0.028369,4,4,ABAYIO,OC(=O)c1cc(cc(c1)C(=O)O)C(=O)O,1


### Normalizing the features

In [2]:
data.iloc[:,0:6] = data.iloc[:,0:6].apply(lambda x: (x-x.min())/ (x.max()-x.min()), axis=0)

data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,mof_id_old,mof_id_new,csd_code,linker_smiles,class
0,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,0,0,ABAVIJ,OC(=O)c1ccncc1,1
1,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,1,1,ABAVOP,OC(=O)c1ccncc1,1
2,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,2,2,ABAVUV,OC(=O)c1ccncc1,1
3,0.494382,0.436748,0.261111,0.838488,0.046125,0.278014,3,3,ABAYIM,OC(=O)c1nccnc1C(=O)O,0
4,0.247191,0.2077,0.238889,0.591065,0.073801,0.028369,4,4,ABAYIO,OC(=O)c1cc(cc(c1)C(=O)O)C(=O)O,1


We notice that the features are already normalized!

### Computing Tanimoto Similarity (using rdkit) based on Linker Smiles

In [3]:
%%time

linker_sim = np.zeros((len(data), len(data)))

for i in range(len(data)):
    if i % 200 == 0:
        print(f"Computing row {i}")
    smile_i = data.iloc[i, -2]
    mol_i = Chem.MolFromSmiles(smile_i)
    fp_i = Chem.RDKFingerprint(mol_i)
    for j in range(i, len(data)):
        smile_j = data.iloc[j, -2]
        mol_j = Chem.MolFromSmiles(smile_j)
        fp_j = Chem.RDKFingerprint(mol_j)
        linker_sim[i, j] = np.round(DataStructs.TanimotoSimilarity(fp_i, fp_j), 4)
        linker_sim[j, i] = linker_sim[i, j]   

np.save(parent_data_path.joinpath("similarity/linkers_similarity.npy"), linker_sim)
print("\n\n\n")

Computing row 0
Computing row 200
Computing row 400
Computing row 600
Computing row 800
Computing row 1000
Computing row 1200
Computing row 1400
Computing row 1600
Computing row 1800




CPU times: user 23min 21s, sys: 246 ms, total: 23min 22s
Wall time: 23min 22s


### Computing Cosine Similarity based on MOF features

In [4]:
%%time

mof_sim = np.zeros((len(data), len(data)))

for i in range(len(data)):
    if i % 200 == 0:
        print(f"Computing row {i}")
    vec_i = data.iloc[i, 0:6].to_numpy()
    for j in range(i, len(data)):
        vec_j = data.iloc[j, 0:6].to_numpy()
        mof_sim[i, j] = np.round(1 - spatial.distance.cosine(vec_i, vec_j), 4)
        mof_sim[j, i] = mof_sim[i, j] 

np.save(parent_data_path.joinpath("similarity/mof_features_similarity.npy"), mof_sim)
print("\n\n\n")

Computing row 0
Computing row 200
Computing row 400
Computing row 600
Computing row 800
Computing row 1000
Computing row 1200
Computing row 1400
Computing row 1600
Computing row 1800




CPU times: user 4min 21s, sys: 90 ms, total: 4min 21s
Wall time: 4min 21s
