In [1]:
import pathlib
from rdkit import Chem, DataStructs
from rdkit import Chem
import pandas as pd
import numpy as np

# Notebooks do not define `__file__`. The earlier `pathlib.Path("__file__")` was a
# plain string literal whose parent is ".", i.e. it silently meant "the current
# working directory". Say so explicitly so nobody mistakes it for the notebook's
# own location.
parent_data_path = pathlib.Path.cwd().resolve()
paper_data_path = parent_data_path / "SMILES_METAL_2000_NoPLD.csv"

# Maps a descriptor name to the generic column label used for it in the frame.
features = {
    "atomic_number": "feature_1",
    "atomic_weight": "feature_2",
    "atomic_radius": "feature_3",
    "electronegativity": "feature_4",
    "polarizability": "feature_5",
    "electron_affinity": "feature_6",
}

# Column labels applied to the CSV, in order: the six feature columns (dict
# insertion order) followed by the identifier / SMILES / label columns.
paper_data_columns = [
    *features.values(),
    "mof_id_old",
    "csd_code",
    "linker_smiles",
    "class",
]

# The file is read as header-less (header=None), so the names are supplied here.
paper_data = pd.read_csv(paper_data_path, header=None, names=paper_data_columns)

paper_data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,mof_id_old,csd_code,linker_smiles,class
0,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,0,ABAVIJ,OC(=O)c1ccncc1,1
1,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,1,ABAVOP,OC(=O)c1ccncc1,1
2,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071,2,ABAVUV,OC(=O)c1ccncc1,1
3,0.494382,0.436748,0.261111,0.838488,0.046125,0.278014,3,ABAYIM,OC(=O)c1nccnc1C(=O)O,0
4,0.247191,0.2077,0.238889,0.591065,0.073801,0.028369,4,ABAYIO,OC(=O)c1cc(cc(c1)C(=O)O)C(=O)O,1


In [2]:
# Positional (0-based) row numbers of `paper_data` whose linker SMILES cannot be
# turned into an RDKit molecule and fingerprint.
paper_error_id = []

for row_pos, smi in enumerate(paper_data["linker_smiles"]):
    try:
        # MolFromSmiles signals a parse failure by returning None (it logs the
        # "SMILES Parse Error" message instead of raising), so test for that
        # directly rather than relying on the fingerprint call to blow up.
        mol = Chem.MolFromSmiles(smi)
        is_valid = mol is not None
        if is_valid:
            # Fingerprinting is kept as a second sanity check on the molecule.
            Chem.RDKFingerprint(mol)
    except Exception:
        # Non-string cells (e.g. NaN) or RDKit failures count as invalid rows.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts through.
        is_valid = False

    if not is_valid:
        paper_error_id.append(row_pos)

[03:18:43] SMILES Parse Error: extra open parentheses for input: 'C=CCOc1cc(OCC(=O)O)c2cc1[C@H](c1ccccc1)c1cc(c(cc1OCC'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1cc(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1cc(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1cc(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1cc(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'ONC(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'ON=C(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1cc(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1cc(C'
[03:18:43] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1cc(C'
[03:18:44] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1ccc(cc1)c1cc(C'
[03:18:44] SMILES Parse Error: extra open parentheses for input: 'OC(=O)c1ccc(cc1)c

In [3]:
# Report how many rows were rejected by the SMILES validation loop.
error_count = len(paper_error_id)
print(f"Total {error_count} Errors")

Total 16 Errors


#### Writing the entries whose SMILES parsed without error to a new CSV

In [4]:
# Build the cleaned table from the frame that is already in memory instead of
# re-parsing the CSV with `skiprows`: `paper_error_id` holds positional row
# numbers of `paper_data`, so filtering by position cannot drift out of sync
# with file line numbers, and the column schema is not repeated a second time.
keep_mask = ~np.isin(np.arange(len(paper_data)), paper_error_id)
clean_data = paper_data.loc[keep_mask].reset_index(drop=True)

# Fresh consecutive ids for the surviving rows, placed right after "mof_id_old".
clean_data.insert(7, "mof_id_new", np.arange(0, len(clean_data)))

# The output file name encodes the number of rows that survived the filter.
clean_data_path = parent_data_path.joinpath(f"SMILES_METAL_{len(clean_data)}_NoPLD.csv")

# If no row was rejected the computed name can coincide with the input file;
# writing would then replace the raw, header-less data with a different schema.
if clean_data_path == paper_data_path:
    raise FileExistsError(
        f"Refusing to overwrite the input file {paper_data_path} with the cleaned data"
    )

clean_data.to_csv(clean_data_path, index=False)