To install, use `conda install rdkit -c rdkit`.

In [5]:
from rdkit import Chem
import pandas as pd
from rdkit.Chem import AllChem
from collections import Counter

1. Get molecular structure with given SMILE_str.
2. Read functional groups.

In [14]:
data = pd.read_csv(r'C:\Users\alex0\Desktop\command\Project\Repository\PV-Cell\PV-Cell\Data\HCEPD_100K.csv') # Try using 100K for practice

In [15]:
data.head()

Unnamed: 0,id,SMILES_str,stoich_str,mass,pce,voc,jsc,e_homo_alpha,e_gap_alpha,e_lumo_alpha,tmp_smiles_str
0,655365,C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1,C18H9N3OSSe,394.3151,5.161953,0.867601,91.567575,-5.467601,2.022944,-3.444656,C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
1,1245190,C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]...,C22H15NSeSi,400.4135,5.261398,0.504824,160.401549,-5.104824,1.63075,-3.474074,C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
2,21847,C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1...,C24H17NOSi,363.4903,0.0,0.0,197.47478,-4.539526,1.462158,-3.077368,C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
3,65553,[SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1,C12H12SeSi3,319.4448,6.138294,0.630274,149.887545,-5.230274,1.68225,-3.548025,C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
4,720918,C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1,C20H12OSSe,379.3398,1.991366,0.242119,126.581347,-4.842119,1.809439,-3.03268,C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1


In [16]:
molecule = Chem.MolFromSmiles(data['SMILES_str'][0]) # Practise on a single molecule.
molecule # Returning arguments like below indicates loaded successfully.

<rdkit.Chem.rdchem.Mol at 0x1e89368f9e0>

In [17]:
molecule is None # Alternatively by this way to check if the molecule is loaded properly.

False

In [18]:
molecule.GetAtoms()

<rdkit.Chem.rdchem._ROAtomSeq at 0x1e893b36260>

In [19]:
# First we care atoms as the existence of some specific atoms might has something to do with PCE.
atomlist = []
for atom in molecule.GetAtoms():
    atomlist.append(atom.GetAtomicNum())
atomlist
# It should be useful when we try to find what atoms exist in a molecule as the atomic number is unique for each element.

[6, 6, 6, 6, 6, 6, 6, 6, 34, 6, 6, 8, 6, 6, 6, 6, 7, 16, 7, 6, 6, 6, 6, 7]

In [20]:
# Use something like this to classify molecules with/without atom x.
molecular_type_a = []
molecular_type_b = []
if 34 in atomlist:
    molecular_type_a.append(data.loc[0])

In [21]:
data['stoich_str'][0]

'C18H9N3OSSe'

In [22]:
'Se' in data['stoich_str'][0] # Alernatively we can just directly do it like this to classify contained elements.

True

In [23]:
# What else other than elements/atoms we want to classify.
# Maybe number of C-C double bonds?

In [24]:
double_count = 0
for i in range(len(atomlist)):
    if str(molecule.GetBonds()[i].GetBondType()) == 'DOUBLE':
        double_count += 1
double_count

2

In [25]:
for i in range(len(atomlist)):
    print(molecule.GetBonds()[i].GetBondType())

SINGLE
DOUBLE
SINGLE
DOUBLE
SINGLE
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
SINGLE


In [26]:
# Maybe also the number of rings?
ring_count = 0
for i in range(len(atomlist)):
    if molecule.GetAtomWithIdx(0).IsInRing() == True:
        ring_count += 1
ring_count

24

In [39]:
# sort dataframe by descending pce value
# extract molecules that have pce more than 10
data_hipce = data.sort_values('pce', ascending=False)
data_pce10 = data_hipce.loc[data_hipce['pce'] > 10]
print('there are', len(data_pce10), 'molecules that have pce more than 10')
data_pce10.head()

there are 637 molecules that have pce more than 10


Unnamed: 0,id,SMILES_str,stoich_str,mass,pce,voc,jsc,e_homo_alpha,e_gap_alpha,e_lumo_alpha,tmp_smiles_str
89026,267487,C1C=Cc2[se]c3c4[SiH2]C(=Cc4c4nsnc4c3c12)c1cncc...,C18H9N5S2SeSi,466.4821,11.097939,0.835571,204.411996,-5.435571,1.435758,-3.999813,C1=Cc2[se]c3c4[SiH2]C(=Cc4c4nsnc4c3c2C1)c1cncc...
48713,1852537,[SiH2]1C=c2c3cc(C4=CC=C[SiH2]4)c4nsnc4c3c3c4ns...,C24H14N4S2Si2,478.7066,11.089069,0.828104,206.090379,-5.428104,1.428751,-3.999353,C1=CC=C([SiH2]1)c1cc2c(c3nsnc13)c1c3nsnc3c3ccc...
87897,1541661,[SiH2]1C=Cc2[se]c3c(sc4cc([se]c34)-c3cncc4nsnc...,C15H7N3S2Se2Si,479.3793,11.084254,0.854591,199.616078,-5.454591,1.455117,-3.999474,C1=Cc2[se]c3c(sc4cc([se]c34)-c3cncc4nsnc34)c2[...
20301,2930157,C1C=CC=C1c1cc2c3nsnc3c3c4c5nsnc5ccc4c4=C[SiH2]...,C24H13N5S2Si,463.6197,11.064108,0.851306,200.022349,-5.451306,1.452847,-3.998458,C1=CC=C(C1)c1cc2c3nsnc3c3c4c5nsnc5ccc4c4=C[SiH...
67356,1751105,[SiH2]1C=CC=C1c1cc2c3nsnc3c3c(sc4ccc5cscc5c34)...,C22H10N4S4Si,486.699,11.063341,0.783588,217.29323,-5.383588,1.387194,-3.996394,C1=CC=C([SiH2]1)c1cc2c3nsnc3c3c(sc4ccc5cscc5c3...


In [56]:
data_pce10['SMILES_str'].values[0]

'C1C=Cc2[se]c3c4[SiH2]C(=Cc4c4nsnc4c3c12)c1cncc2nsnc12'

In [58]:
from rdkit.Chem import rdFMCS
mol1 = Chem.MolFromSmiles(data_pce10['SMILES_str'].values[0])
mol2 = Chem.MolFromSmiles(data_pce10['SMILES_str'].values[1])
mol3 = Chem.MolFromSmiles(data_pce10['SMILES_str'].values[2])
mols = [mol1,mol2,mol3]
res=rdFMCS.FindMCS(mols)
res
res.numAtoms

13

In [59]:
res.smartsString

'[#6](:,-[#6]):,-[#6]:[#6]:[#6]:[#6]-,:[#6](:,-[#6]):[#6]1:[#7]:[#16]:[#7]:[#6]:1'

In [69]:
mcs = Chem.MolToSmiles(Chem.MolFromSmarts(res.smartsString))
mcs

'C:C:C:C:C:CC(:C):C1:C:N:[SH]:N:1'

In [66]:
m = Chem.MolFromSmiles(mcs)
AllChem.Compute2DCoords(m)

ArgumentError: Python argument types in
    rdkit.Chem.rdDepictor.Compute2DCoords(NoneType)
did not match C++ signature:
    Compute2DCoords(class RDKit::ROMol {lvalue} mol, bool canonOrient=True, bool clearConfs=True, class boost::python::dict {lvalue} coordMap={}, unsigned int nFlipsPerSample=0, unsigned int nSample=0, int sampleSeed=0, bool permuteDeg4Nodes=False, double bondLength=-1.0, bool forceRDKit=False)

In [1]:
import tensorflow

ModuleNotFoundError: No module named 'tensorflow'