In [1]:
import os
# Set the CSDHOME environment variable to the local CSD data directory.
# NOTE(review): this is a machine-specific absolute path - adjust it for your
# own installation. Presumably the ccdc package reads CSDHOME to locate the
# database, so this has to run before ccdc is imported - TODO confirm.
CSD_DATA_DIR = "/Users/siddhant/CCDC2025/ccdc-data/csd"
os.environ["CSDHOME"] = CSD_DATA_DIR

In [2]:
from ccdc import io 
# Open the CSD as an entry reader and report how many entries it holds.
csd_reader = io.EntryReader("CSD")
print(f"Number of entries: {len(csd_reader)}")

Number of entries: 1371757


In [3]:
# Generate a SMILES string for a CSD structure looked up by its refcode.
# components[1] picks the second component of the ABEHUK molecule; the bare
# expression on the last line is what the cell displays.
abehuk_molecule = io.MoleculeReader("csd").molecule("ABEHUK")
abehuk_molecule.components[1].to_string('smiles')

'c1cc:[B-](:cc1)/C=C/c1ccc(cc1)/C=C/[B-]1:ccccc:1'

In [29]:
# Element symbols handed to the search as "must not have" elements.
# H, C and N are deliberately absent, so structures made of those stay eligible.
# NOTE(review): Nh and Mc are also absent from this list.
excluded_elements = (
    "He Li Be B O F Ne Na Mg Al Si P S Cl Ar "
    "K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr "
    "Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe "
    "Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu "
    "Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn "
    "Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr "
    "Rf Db Sg Bh Hs Mt Ds Rg Cn Fl Lv Ts Og"
).split()

In [30]:
# Substructure search of the CSD driven by a SMARTS pattern.
from ccdc import search

# Secondary aromatic amine: a three-connected N carrying one H, bonded to an
# aromatic carbon and to one further carbon (sp3 or aromatic).
smarts = "[NX3;H1]([CX4,c])[c]"
smarts_sub = search.SMARTSSubstructure(smarts)

csd_search = search.SubstructureSearch()
# The same pattern is registered twice; this is intended to require a second
# amino group in every hit.
for _repeat in range(2):
    csd_search.add_substructure(smarts_sub)

# Organic structures only, and none of the elements in excluded_elements.
csd_search.settings.must_not_have_elements = excluded_elements
csd_search.settings.only_organic = True

hits = csd_search.search()
print(f"Found {len(hits)} hits.")

# Unused draft: combining the substructure query with a text/numeric search.
# tns = TextNumericSearch()
# # tns.add_heat_capacity_notes()
# # tns.add_solubility_notes()
# sub_search = ccdc.search.SubstructureSearch()
# sub_search.add_substructure(ccdc.search.SMARTSSubstructure("[NX3;H1]([CX4,c])[c]")) # secondary aromatic amine
# combi_search = ccdc.search.CombinedSearch(tns & sub_search)
# hits = combi_search.search()
# print(len(hits))
# # hits[0].identifier

Found 2585 hits.


### AMINO SEARCH

In [93]:
import ccdc
from ccdc import search 
from rdkit import Chem
from ccdc import io 
from rdkit.Chem import AllChem
import re 
import numpy as np
from ccdc import io 
import pandas as pd
from tqdm import tqdm

# Entry reader over the CSD; used further down to read each hit's `temperature` field.
csd_reader = io.EntryReader("CSD")

##################################
# USER INPUT
##################################
# How many amino groups of each order a structure must contain.
num_pri_amino = 1  # primary
num_sec_amino = 0  # secondary
num_ter_amino = 0  # tertiary

# How many aromatic rings a structure must have to be kept.
num_aromatic_rings = 1

# Elements a hit must not contain: everything listed here is rejected, which
# leaves H, C and N as the allowed elements.
excluded_elements = (
    "He Li Be B O F Ne Na Mg Al Si P S Cl Ar "
    "K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr "
    "Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe "
    "Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu "
    "Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn "
    "Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr "
    "Rf Db Sg Bh Hs Mt Ds Rg Cn Fl Lv Ts Og"
).split()

# TODO: constraints on crystal mass, volume, PLD and maximum pore diameter could be added here as further filters

###########################
# SEARCH SETTINGS
###########################
# SMARTS patterns in the order primary, secondary, tertiary amine. Every
# substituent on N is either an sp3 carbon or an aromatic carbon.
patterns = [
    "[NX3;H2][CX4,cX3]",
    "[NX3;H1]([CX4,cX3])[CX4,cX3]",
    "[NX3;H0]([CX4,cX3])([CX4,cX3])[CX4,cX3]",
]
# Requested group counts, index-aligned with `patterns`.
groups = [num_pri_amino, num_sec_amino, num_ter_amino]

# Fresh query: organic-only, no excluded elements, capped at 5000 hit structures.
csd_search = search.SubstructureSearch()
csd_search.settings.max_hit_structures = 5000
csd_search.settings.must_not_have_elements = excluded_elements
csd_search.settings.only_organic = True

############################
# BUILDING QUERY
############################
# Register each amine pattern once per requested group of that order; an
# order whose count is 0 contributes nothing to the query.
for smarts, group_count in zip(patterns, groups):
    smarts_sub = search.SMARTSSubstructure(smarts)
    for _copy in range(group_count):
        csd_search.add_substructure(smarts_sub)

#############################
# COMPILE SEARCH RESULTS
#############################
# Run the query and collect the identifier of every hit. A comprehension is
# used so that the builtin `id` is not rebound at notebook scope.
hits_id = [hit.identifier for hit in csd_search.search()]

# Keep each identifier once, preserving first-seen order (the search can
# return the same identifier more than once).
hits = list(dict.fromkeys(hits_id))

print(f"Found {len(hits)} hits.")

###########################
# PARSE SEARCH RESULTS 
###########################
def extract_temperature(text):
    """Parse a temperature in kelvin out of a note such as ``"at 100 K"``.

    Parameters
    ----------
    text : str or None
        Free-text temperature note.

    Returns
    -------
    float or None
        The number found between ``"at"`` and ``"K"``. ``None`` when ``text``
        is empty or ``None``; ``np.nan`` when ``text`` holds no parseable
        ``"at <number> K"`` value.
    """
    if not text:
        return None
    match = re.search(r'at\s+([\d.]+)\s*K', text)
    if match is None:
        return np.nan
    try:
        return float(match.group(1))
    except ValueError:
        # The character class also accepts runs like "." or "1.2.3", which
        # float() rejects; treat those as "no usable value" rather than raising.
        return np.nan

# Result columns: position i in every list describes the same retained hit.
REFcode = []
SMILES = []
sys_vol = []
sys_dens = []
PLD = []
MPD = []
temp = []

# Open the molecule and crystal readers once, not once per hit.
mol_reader = io.MoleculeReader('csd')
crystal_reader = io.CrystalReader('csd')

for hit in tqdm(hits, desc = "Processing hits"):
    mol = mol_reader.molecule(hit)
    # Only the first component of each structure is inspected.
    try:
        component = mol.components[0]
        smiles = component.to_string('smiles')
    except RuntimeError:
        continue  # Skip if SMILES can't be generated

    # Keep only structures with exactly the requested number of aromatic rings.
    aromatic_ring_count = sum(ring.is_aromatic for ring in component.rings)
    if aromatic_ring_count != num_aromatic_rings:
        continue

    # Compute every value for this hit first, then append them together, so
    # an exception part-way through cannot leave the lists with unequal lengths.
    pore_analyser = ccdc.descriptors.CrystalDescriptors.PoreAnalyser(crystal_reader.crystal(hit))
    volume = pore_analyser.system_volume  # crystal cell volume
    density = pore_analyser.system_density  # presumably the crystal cell density - TODO confirm
    pore_limiting = pore_analyser.pore_limiting_diameter
    max_pore = pore_analyser.max_pore_diameter
    kelvin = extract_temperature(csd_reader.entry(hit).temperature)  # studied temperature
    # add ORCA/polarizability calculation and temperature?

    REFcode.append(hit)
    SMILES.append(smiles)
    sys_vol.append(volume)
    sys_dens.append(density)
    PLD.append(pore_limiting)
    MPD.append(max_pore)
    temp.append(kelvin)

# Assemble the per-hit lists into one table, one row per retained structure.
column_names = [
    'REFcode',
    'SMILES',
    'System Volume (Å³)',
    'System Density (g/cm³)',
    'PLD (Å)',
    'MPD (Å)',
    'Temperature (K)',
]
column_values = [REFcode, SMILES, sys_vol, sys_dens, PLD, MPD, temp]
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Bare expression: shown as the cell's output.
df


Found 960 hits.


Processing hits:   3%|▎         | 27/960 [01:44<1:16:15,  4.90s/it]