To install, use `conda install rdkit -c rdkit`.

In [1]:
from rdkit import Chem
import pandas as pd
from rdkit.Chem import AllChem
from collections import Counter

1. Get the molecular structure from the given `SMILES_str`.
2. Read functional groups.

In [2]:
# Load the dataset into a DataFrame; the relative path is resolved against the current working directory.
HCEPDB_100K = pd.read_csv('HCEPD_100K.csv')

In [3]:
# Preview the first rows to confirm the columns (SMILES strings, PCE, etc.) were read as expected.
HCEPDB_100K.head()

Unnamed: 0,id,SMILES_str,stoich_str,mass,pce,voc,jsc,e_homo_alpha,e_gap_alpha,e_lumo_alpha,tmp_smiles_str
0,655365,C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1,C18H9N3OSSe,394.3151,5.161953,0.867601,91.567575,-5.467601,2.022944,-3.444656,C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
1,1245190,C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]...,C22H15NSeSi,400.4135,5.261398,0.504824,160.401549,-5.104824,1.63075,-3.474074,C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
2,21847,C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1...,C24H17NOSi,363.4903,0.0,0.0,197.47478,-4.539526,1.462158,-3.077368,C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
3,65553,[SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1,C12H12SeSi3,319.4448,6.138294,0.630274,149.887545,-5.230274,1.68225,-3.548025,C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
4,720918,C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1,C20H12OSSe,379.3398,1.991366,0.242119,126.581347,-4.842119,1.809439,-3.03268,C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1


In [4]:
# Practise on a single molecule first: take the SMILES string stored in row 0.
first_smiles = HCEPDB_100K.loc[0, 'SMILES_str']
molecule = Chem.MolFromSmiles(first_smiles)
molecule  # A Mol object repr like the one below indicates the SMILES was loaded successfully.

<rdkit.Chem.rdchem.Mol at 0x11b6cde90>

In [5]:
# Alternative check: MolFromSmiles returns None when parsing fails, so False here means the molecule loaded properly.
molecule is None

False

In [6]:
# GetAtoms() returns a sequence object (see the repr below), not a plain list; iterate over it to inspect each atom.
molecule.GetAtoms()

<rdkit.Chem.rdchem._ROAtomSeq at 0x11b6e51c0>

In [7]:
# Start with the atoms: the presence of certain elements might have something to do with PCE.
# Atomic numbers are unique per element, so this list is a convenient way to
# find out which elements occur in the molecule.
atomlist = [atom.GetAtomicNum() for atom in molecule.GetAtoms()]
atomlist

[6, 6, 6, 6, 6, 6, 6, 6, 34, 6, 6, 8, 6, 6, 6, 6, 7, 16, 7, 6, 6, 6, 6, 7]

In [8]:
# Use something like this to classify molecules with/without atom x.
molecular_type_a = []
molecular_type_b = []
if 34 in atomlist:
    molecular_type_a.append(HCEPDB_100K.loc[0])