In [1]:
from openbabel import openbabel

In [2]:
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [4]:
def split(word):
    """Break ``word`` into a list holding each of its items, in order.

    For a string this yields one single-character string per position.
    """
    return list(word)

In [5]:
def inchi(array):
    """Return the leading block of the InChIKey for the SMILES in ``array[0]``.

    The SMILES string is parsed with RDKit, converted to an InChIKey, and the
    text before the first '-' is returned. The original author's note says
    this first layer contains everything but stereochemical information.

    Parameters
    ----------
    array : sequence
        Row-like object whose first element is a SMILES string.

    Returns
    -------
    str
        Characters of the InChIKey preceding its first '-'. An empty string
        is returned when the key has no '-' or begins with one.
    """
    molecule = Chem.MolFromSmiles(array[0])
    full_key = Chem.inchi.MolToInchiKey(molecule)
    # str.find returns -1 when no hyphen exists; clamping to 0 gives ''.
    hyphen_at = max(full_key.find('-'), 0)
    return full_key[:hyphen_at]

In [6]:
def inchi_generations(array):
    """Return the leading block of the InChIKey for the SMILES in ``array[1]``.

    Identical in logic to taking the InChIKey prefix of a SMILES string, but
    the SMILES is read from the second position of ``array``. The original
    author's note says this first layer contains everything but
    stereochemical information.

    Parameters
    ----------
    array : sequence
        Row-like object whose second element is a SMILES string.

    Returns
    -------
    str
        Characters of the InChIKey preceding its first '-'. An empty string
        is returned when the key has no '-' or begins with one.
    """
    parsed = Chem.MolFromSmiles(array[1])
    key = Chem.inchi.MolToInchiKey(parsed)
    prefix, hyphen, _rest = key.partition('-')
    # partition leaves `hyphen` empty when no '-' is present; in that case
    # the result is an empty string rather than the whole key.
    if not hyphen:
        return ''
    return prefix

In [7]:
def remove_degeneracy_generations(dataset):
    """Load a tab-separated molecule table and keep one row per InChIKey block.

    Each row's InChIKey prefix is computed with ``inchi_generations`` (which
    reads the SMILES from the second positional column of the row) and stored
    in a new 'Inchi' column. Rows whose 'Inchi' value has already been seen
    are dropped, so only the first occurrence of each value is kept and the
    original row order is preserved.

    Parameters
    ----------
    dataset : str or path-like or file-like
        Passed straight to ``pandas.read_csv`` with ``sep='\\t'``. The table
        must provide 'Generation' and 'Smiles' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Generation', 'Smiles' and 'Inchi', one row per distinct
        'Inchi' value, with a fresh 0..n-1 index.
    """
    df = pd.read_csv(dataset, sep='\t')
    df['Inchi'] = df.apply(inchi_generations, axis=1, raw=True, result_type='expand')

    # drop_duplicates is hash-based and keeps the first occurrence, which
    # matches the previous list-membership loop without its quadratic cost
    # and without indexing row 0 to seed the result lists.
    output = (
        df.drop_duplicates(subset='Inchi', keep='first')
          .loc[:, ['Generation', 'Smiles', 'Inchi']]
          .reset_index(drop=True)
    )

    return output

In [8]:
def remove_degeneracy(dataset):
    """Load a tab-separated SMILES table and keep one row per InChIKey block.

    Each row's InChIKey prefix is computed with ``inchi`` (which reads the
    SMILES from the first positional column of the row) and stored in a new
    'Inchi' column. Rows whose 'Inchi' value has already been seen are
    dropped, so only the first occurrence of each value is kept and the
    original row order is preserved.

    Parameters
    ----------
    dataset : str or path-like or file-like
        Passed straight to ``pandas.read_csv`` with ``sep='\\t'``. The table
        must provide a 'Smiles' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Smiles' and 'Inchi', one row per distinct 'Inchi' value,
        with a fresh 0..n-1 index.
    """
    df = pd.read_csv(dataset, sep='\t')
    df['Inchi'] = df.apply(inchi, axis=1, raw=True, result_type='expand')

    # drop_duplicates is hash-based and keeps the first occurrence, which
    # matches the previous list-membership loop without its quadratic cost
    # and without indexing row 0 to seed the result lists.
    output = (
        df.drop_duplicates(subset='Inchi', keep='first')
          .loc[:, ['Smiles', 'Inchi']]
          .reset_index(drop=True)
    )

    return output

In [25]:
%%time
# De-duplicate the prepared CHNO SMILES table and time the whole cell.
# The recorded output for this cell shows roughly 35 minutes of wall time.
a = remove_degeneracy('./CHNO_Data/Prepped_CHNO_Smiles.tsv')

1
CPU times: user 35min 13s, sys: 13.3 s, total: 35min 26s
Wall time: 35min 45s


In [26]:
# Show the de-duplicated table; the recorded output is a truncated view.
a

Unnamed: 0,Smiles,Inchi
0,C1[C@H]([C@@H]([C@@H]([C@H]1OCOC)O)O)O[H],DTBXEQYAJDNCDO
1,C1[C@H]([C@@H]([C@@H]([C@H]1OCOC)O)O)N([H])[H],UXDGYUVYXFGMTI
2,[C@@H]1([C@H]([C@@H](O[C@@H]1CCOC)O[H])O)O,KLTFHVDRRCTBBL
3,[C@@H]1([C@H]([C@@H](O[C@@H]1CCOC)N([H])[H])O)O,KHNGSMSGCRDGCV
4,[C@H]1([C@@H]([C@@](C)(CCO)O[C@H]1O[H])O)O,AXXZNGSNFMYFEE
...,...,...
134021,c1cc(c(cc1OCN([H])[H])O)O,CJTDGDHSLZZTRL
134022,c1c(cc(cc1O)OCO[H])O,HGBCGHVOAPELOV
134023,c1c(cc(cc1O)OCN([H])[H])O,JUBOLHUMZGUACG
134024,c1c(cc(c(c1O)O)O)OCO[H],AILYZZQFKLJSPC


In [27]:
# Write the table as tab-separated text with no header row and no index column.
# NOTE(review): mode='a' appends to CHNO_Final.tsv, so re-running this cell
# adds the same rows again — confirm that accumulating into this file is intended.
a.to_csv(r'CHNO_Final.tsv', header=None, index=None, sep='\t', mode='a') 