## Automated matching script

In [38]:
from openbabel import openbabel

In [39]:
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [41]:
def inchi(array):
    """Return the InChIKey of the SMILES string stored in ``array[0]``.

    Intended for rows that carry no generation column, i.e. where the
    SMILES string is the first element of the row.
    """
    smiles_text = array[0]
    molecule = Chem.MolFromSmiles(smiles_text)
    return Chem.inchi.MolToInchiKey(molecule)

In [42]:
def inchi_generations(array):
    """Return the InChIKey of the SMILES string stored in ``array[1]``.

    Intended for rows that carry a generation column, i.e. where the
    SMILES string is the second element of the row.
    """
    smiles_text = array[1]
    molecule = Chem.MolFromSmiles(smiles_text)
    return Chem.inchi.MolToInchiKey(molecule)

In [43]:
def prep_data(library_set, test_set):
    """Load both tables and attach an 'Inchi' column to each of them.

    Args:
        library_set: Path to the library table, a tab-separated file. Its
            rows are converted with ``inchi`` (SMILES read from element 0).
        test_set: Path to the test table, a tab-separated file. Its rows are
            converted with ``inchi_generations`` (SMILES read from element 1).

    Returns:
        A ``(library_data, test_data)`` tuple of DataFrames, each with the
        added 'Inchi' column.
    """
    # raw=True hands each row to the converter as a plain array, so the
    # converters index the row by position rather than by column name.
    row_wise = {'axis': 1, 'raw': True, 'result_type': 'expand'}

    library_data = pd.read_csv(library_set, sep='\t')
    test_data = pd.read_csv(test_set, sep='\t')
    print(1)  # progress marker: both files loaded

    test_keys = test_data.apply(inchi_generations, **row_wise)
    test_data['Inchi'] = test_keys
    print(2)  # progress marker: test keys computed

    library_keys = library_data.apply(inchi, **row_wise)
    library_data['Inchi'] = library_keys
    return library_data, test_data

In [44]:
def find_matches(library_set, test_set):
    """Find the test-set molecules whose InChIKey also occurs in the library.

    Args:
        library_set: Path to the library table (tab-separated file).
        test_set: Path to the test table (tab-separated file). After loading
            it must provide 'Generation' and 'Smiles' columns.

    Returns:
        A DataFrame with the columns 'Generation', 'Smiles' and 'Inchi',
        holding one row per test-set row whose InChIKey is present in the
        library, in test-set order.
    """
    library_data, test_data = prep_data(library_set, test_set)
    print(3)  # progress marker: InChIKeys available for both tables

    # A set gives constant-time membership tests; checking every test key
    # against a list made the original scan quadratic.
    library_codes = set(library_data['Inchi'])
    print(4)  # progress marker: lookup structure built

    # Select the matching rows in a single pass. Re-scanning the test table
    # once per match produced columns of unequal length (and a failing
    # DataFrame construction) whenever a test InChIKey occurred more than once.
    matched_rows = test_data[test_data['Inchi'].isin(library_codes)]

    matches_data = {
        'Generation': matched_rows['Generation'].tolist(),
        'Smiles': matched_rows['Smiles'].tolist(),
        'Inchi': matched_rows['Inchi'].tolist(),
    }
    return pd.DataFrame(matches_data)

In [48]:
%%time
# Run the matching: the first path is the library table, the second the test
# table; both are read as tab-separated files.
a = find_matches('./CHO_Data/Prepped_CHO_Smiles.tsv', './FormoseFinalData/formose_output.tsv')

1
2
3
4
CPU times: user 10min 1s, sys: 6.19 s, total: 10min 7s
Wall time: 10min 17s


In [49]:
# Display the table of matches returned by find_matches.
a

Unnamed: 0,Generation,Smiles,Inchi


In [50]:
# Write the matches as tab-separated text. mode='a' appends to the file, so
# re-running this cell adds the same rows again.
# NOTE(review): header=None / index=None are falsy, so presumably neither a
# header row nor the index is written - confirm against the pandas docs.
a.to_csv(r'InchiFormoseFinal_CHO.tsv', header=None, index=None, sep='\t', mode='a') 