In [1]:
import os
import string
from rdkit import Chem
import pandas as pd
from molvs import standardize_smiles

In [2]:
def _ReadPatts(fileName):
    """Parse a tab-separated SMARTS definition file into compiled patterns.

    A line is used only if it does not start with ``#``, has at least four
    tab-separated fields, and has a non-blank first field. The first field
    is taken (whitespace-stripped) as the atom-type identifier and the
    second field as the SMARTS string. A row whose second field is the
    literal ``SMARTS`` is treated as a column header and skipped.

    Args:
        fileName: Path of the definition file to read.

    Returns:
        A tuple ``(order, patts)``. ``order`` is the list of identifiers in
        order of first appearance. ``patts`` maps each identifier to a list
        of ``(smarts, pattern)`` tuples in file order, where ``pattern`` is
        the object returned by ``Chem.MolFromSmarts`` for that SMARTS.
    """
    patts = {}
    order = []
    with open(fileName, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            splitLine = line.split("\t")
            if len(splitLine) < 4:
                continue
            cha = splitLine[0].strip()
            if not cha:
                continue

            # str.replace returns a new string; the result has to be kept,
            # otherwise quoted SMARTS are passed to the parser unchanged.
            sma = splitLine[1].replace('"', "")
            if sma == "SMARTS":
                # Column-header row, not a pattern.
                continue

            p = Chem.MolFromSmarts(sma)
            if p is None:
                # Report the SMARTS that actually failed to parse instead
                # of dropping it silently.
                print("Problems parsing smarts: %s" % (sma))
                continue

            if cha not in patts:
                order.append(cha)
                patts[cha] = []
            patts[cha].append((sma, p))
    return order, patts

In [3]:
# Parsed pattern tables keyed by file name, so that the definition file is
# read and its SMARTS compiled once rather than once per molecule.
# Re-running this cell resets the cache (needed after editing the file).
_GC_PATTERN_CACHE = {}


def GhoseCrippenFingerprint(mol, count=False, pattFile="Crippen.txt"):
    """Build a per-atom-type fingerprint from the patterns in ``pattFile``.

    For every atom type returned by ``_ReadPatts``, all SMARTS patterns
    listed for that type are matched against ``mol`` and the distinct
    first-atom indices of the matches are collected. Each type is evaluated
    independently of the others.

    Args:
        mol: Molecule object exposing ``GetSubstructMatches(pattern,
            uniquify, useChirality)``.
        count: If truthy, each value is the number of distinct atoms that
            matched; otherwise each value is 1 when at least one atom
            matched and 0 when none did.
        pattFile: Path of the pattern definition file handed to
            ``_ReadPatts``. Defaults to ``"Crippen.txt"``.

    Returns:
        A dict mapping each atom-type identifier to its count or 0/1 flag,
        in the order the identifiers first appear in the pattern file.

    Raises:
        ValueError: If ``mol`` is None.
    """
    if mol is None:
        raise ValueError("GhoseCrippenFingerprint requires a molecule, got None")

    if pattFile not in _GC_PATTERN_CACHE:
        _GC_PATTERN_CACHE[pattFile] = _ReadPatts(pattFile)
    order, patts = _GC_PATTERN_CACHE[pattFile]

    res = {}
    for atomType in order:
        matchedAtoms = set()
        # An atom type may be defined by several SMARTS; all of them count,
        # not only the first entry.
        for entry in patts[atomType]:
            # uniquify=False keeps every permutation of a match so each
            # atom can show up in the first position; the set then removes
            # the repeats this produces for the same atom.
            for match in mol.GetSubstructMatches(entry[1], False, False):
                matchedAtoms.add(match[0])
        nAtoms = len(matchedAtoms)
        res[atomType] = nAtoms if count else int(nAtoms > 0)

    return res



In [None]:

# Input CSV must provide a 'smiles' column; the fingerprint table is written
# to output_path with the standardized SMILES as its first column.
filename = './data/temp.csv'
output_path = './output/output_Ghose_Crippen_FP.csv'

input_df = pd.read_csv(filename)
smiles = [standardize_smiles(i) for i in input_df['smiles'].values]

# Collect one fingerprint dict per molecule and build the frame once;
# concatenating inside the loop re-copies the whole frame on every row.
records = [
    GhoseCrippenFingerprint(Chem.MolFromSmiles(smile), count=True)
    for smile in smiles
]
df = pd.DataFrame(records)

df.insert(loc=0,
          column="smiles",
          value=smiles)

# to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)