In [1]:
from openbabel import openbabel

In [2]:
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [4]:
def split(word):
    return[char for char in word]

In [5]:
def inchi(array):
    Smiles = array[0] 
    mol = Chem.MolFromSmiles(Smiles)
    dummy = Chem.inchi.MolToInchiKey(mol)#split at first layer (contains everything but stereochemical info)
    characters = split(dummy)
    dummy = ''
    position = 0
    for i in range(len(characters)):
        if characters[i] == '-':
            position = i
            break
    for i in range(position):
        dummy += characters[i]
    return(dummy)

In [6]:
def inchi_generations(array):
    Smiles = array[1]
    mol = Chem.MolFromSmiles(Smiles)
    dummy = Chem.inchi.MolToInchiKey(mol)#split at first layer (contains everything but stereochemical info)
    characters = split(dummy)
    dummy = ''
    position = 0
    for i in range(len(characters)):
        if characters[i] == '-':
            position = i
            break
    for i in range(position):
        dummy += characters[i]
    return(dummy)

## Nucleoside matching

In [7]:
def prep_data(library_set, test_set): #input filepaths for the 2 sets as tsv files
    library_data = pd.read_csv(library_set, sep='\t')
    test_data = pd.read_csv(test_set, sep='\t')
    print(1)
    test_data['Inchi'] = test_data.apply(inchi_generations, axis=1, raw=True, result_type='expand')
    print(2)
    #library_data['Inchi'] = library_data.apply(inchi, axis=1, raw=True, result_type='expand')
    #Using degeneracy removed datasets with Inchikey already found 
    return(library_data, test_data)

In [8]:
def find_matches(library_set, test_set):
    library_data, test_data = prep_data(library_set, test_set)
    print(3)
    test_codes = []
    for i in range(len(test_data['Inchi'])):
        test_codes.append(test_data['Inchi'][i])
    library_codes = []
    for i in range(len(library_data['Inchi'])):
        library_codes.append(library_data['Inchi'][i])
    print(4)
    matches = []
    for i in range(len(test_codes)):
        if test_codes[i] in library_codes:
            matches.append(test_codes[i])
    print(5)
    test_smiles = []
    #library_smiles = []
    matches_generations = []
    for i in range(len(matches)):
        for j in range(len(test_data['Inchi'])):
            if matches[i] == test_data['Inchi'][j]:
                test_smiles.append(test_data['Smiles'][j])
                matches_generations.append(test_data['Generation'][j])
        #for k in range(len(library_data['Inchi'])):
            #if matches[i] == library_data['Inchi'][k]:
                #library_smiles.append(library_data['Smiles'][k])
    #matches_data = {'Generation': matches_generations, 'Test Smiles': test_smiles, 'Library Smiles': library_smiles, 'Inchi': matches}
    matches_data = {'Generation': matches_generations, 'Smiles': test_smiles, 'Inchi': matches}
    library_test_matches = pd.DataFrame(matches_data)
    return(library_test_matches)

In [9]:
%%time
#a = find_matches('./CHO_Data/CHO_Final.tsv', './PyruvicAcidData/pyruvic_output.tsv')

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 4.05 µs


In [10]:
#a

In [11]:
#a.to_csv(r'NoStereoPyruvicAcid_CHNO.tsv', header=None, index=None, sep='\t', mode='a') 

## Testing dask

In [12]:
#pip install dask

In [13]:
import dask.dataframe as dd

In [24]:
def inchi_apply(df):
    return(df.apply(inchi, axis=1, raw=True, result_type='expand'))

In [25]:
def inchi_generations_apply(df):
    return(df.apply(inchi_generations, axis=1, raw=True, result_type='expand'))

In [54]:
def prep_data_dask(library_set, test_set): #input filepaths for the 2 sets as tsv files
    num_cores = 2
    library_data = pd.read_csv(library_set, sep='\t')
    test_data = pd.read_csv(test_set, sep='\t')
    test_data_ddf = dd.from_pandas(test_data, npartitions=num_cores)
    library_data_ddf = dd.from_pandas(library_data, npartitions=num_cores)
    print(1)
    test_data['Inchi'] = test_data_ddf.map_partitions(inchi_generations_apply, meta='float').compute(scheduler='processes')
    print(2)
    library_data['Inchi'] = library_data_ddf.map_partitions(inchi_apply, meta='float').compute(scheduler='processes')
    return(library_data, test_data)

In [55]:
def matches_search(df, library_codes):
    test_codes = []
    for i in range(len(df['Inchi'])):
        if df['Inchi'][i] in library_codes:
            test_codes.append(df['Inchi'][i])
    return(test_codes)

In [64]:
def find_matches_dask(library_set, test_set):
    num_cores = 2
    library_data, test_data = prep_data_dask(library_set, test_set)
    print(3)
    library_codes = []
    for i in range(len(library_data['Inchi'])):
        library_codes.append(library_data['Inchi'][i])
    print(4)
    test_data_ddf = dd.from_pandas(test_data, npartitions=num_cores)
    matches = []
    matches_data = {'Matches': matches}
    df = pd.DataFrame(matches_data)
    df['Matches'] = test_data_ddf.map_partitions(matches_search, library_codes, meta='float'). compute(scheduler='processes')
    print(5)
    test_smiles = []
    #library_smiles = []
    matches_generations = []
    for i in range(len(matches)):
        for j in range(len(test_data['Inchi'])):
            if matches[i] == test_data['Inchi'][j]:
                test_smiles.append(test_data['Smiles'][j])
                matches_generations.append(test_data['Generation'][j])
        #for k in range(len(library_data['Inchi'])):
            #if matches[i] == library_data['Inchi'][k]:
                #library_smiles.append(library_data['Smiles'][k])
    #matches_data = {'Generation': matches_generations, 'Test Smiles': test_smiles, 'Library Smiles': library_smiles, 'Inchi': matches}
    matches = []
    for i in range(len(df['Matches'])):
        matches.append(df['Matches'][i])
    matches_data = {'Generation': matches_generations, 'Smiles': test_smiles, 'Inchi': matches}
    library_test_matches = pd.DataFrame(matches_data)
    print(library_test_matches)

In [65]:
%%time
find_matches_dask('./CHNO_Data/Prepped_CHNO_Smiles.tsv', './PyruvicAcidData/pyruvic_output.tsv')

1
2
3
4


KeyError: 0

Traceback
---------
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/dask/local.py", line 223, in execute_task
    result = _execute_task(task, data)
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/dask/core.py", line 121, in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/dask/optimization.py", line 969, in __call__
    return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/dask/core.py", line 151, in get
    result = _execute_task(task, cache)
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/dask/core.py", line 121, in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/dask/utils.py", line 35, in apply
    return func(*args, **kwargs)
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/dask/dataframe/core.py", line 5775, in apply_and_enforce
    df = func(*args, **kwargs)
  File "<ipython-input-55-d1aa50b42497>", line 4, in matches_search
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/pandas/core/series.py", line 853, in __getitem__
    return self._get_value(key)
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/pandas/core/series.py", line 961, in _get_value
    loc = self.index.get_loc(label)
  File "/opt/anaconda3/envs/py4chemoinformatics/lib/python3.9/site-packages/pandas/core/indexes/range.py", line 353, in get_loc
    raise KeyError(key) from err
