In [None]:
from openbabel import openbabel
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [None]:
def split(word):
    """Break ``word`` into a list holding each of its characters in order."""
    return list(word)

def inchi(array):
    """Return the first InChIKey block for the SMILES stored at ``array[0]``.

    This is a thin adapter around ``inchi2`` for callers that hand over an
    indexable record instead of a bare SMILES string; keeping the key logic
    in one place stops the copies from drifting apart.

    Parameters
    ----------
    array : sequence
        Any indexable object whose element 0 is the SMILES string.

    Returns
    -------
    str
        Whatever ``inchi2`` returns for that SMILES: the characters of the
        InChIKey that precede its first '-'.

    Notes
    -----
    Errors raised by ``inchi2`` (for example on a SMILES RDKit cannot
    handle) propagate unchanged.
    """
    # Only the position of the SMILES differs from inchi2, so delegate.
    return inchi2(array[0])

def inchi2(Smiles):
    """Return the first block of the InChIKey for a SMILES string.

    The InChIKey generated by RDKit is cut at its first '-'. Per the original
    author's note, this leading block contains everything but the
    stereochemical information, so molecules that differ only in
    stereochemistry share the same value.

    Parameters
    ----------
    Smiles : str
        SMILES string of the molecule.

    Returns
    -------
    str
        The characters of the InChIKey before its first '-'. An empty string
        is returned when the key contains no '-' at all.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for ``Smiles``, i.e. RDKit
        could not parse it.
    """
    mol = Chem.MolFromSmiles(Smiles)
    if mol is None:
        # Without this guard the None is passed on to MolToInchiKey, which
        # fails with an error that does not mention the offending input.
        raise ValueError("Could not parse SMILES string: {!r}".format(Smiles))

    key = Chem.inchi.MolToInchiKey(mol)

    # partition() splits at the first '-' only; an empty separator means the
    # key had no hyphen, in which case the result is the empty string.
    first_block, hyphen, _ = key.partition('-')
    return first_block if hyphen else ''

def inchi_generations(array):
    """Return the first InChIKey block for the SMILES stored at ``array[1]``.

    ``prep_data`` applies this function row-wise with
    ``DataFrame.apply(..., axis=1, raw=True)``, so ``array`` is one row of the
    test table with the SMILES string in position 1. The key computation
    itself lives in ``inchi2``; this wrapper only selects the right element.

    Parameters
    ----------
    array : sequence
        Any indexable object whose element 1 is the SMILES string.

    Returns
    -------
    str
        Whatever ``inchi2`` returns for that SMILES: the characters of the
        InChIKey that precede its first '-'.

    Notes
    -----
    Errors raised by ``inchi2`` (for example on a SMILES RDKit cannot
    handle) propagate unchanged.
    """
    # Only the position of the SMILES differs from inchi2, so delegate.
    return inchi2(array[1])

In [None]:
def prep_data(library_set, test_set):
    """Load the library and test tables and add an 'Inchi' column to the test table.

    Both inputs are read as tab-separated files, library first. Every row of
    the test table is then passed (as a raw array) to ``inchi_generations``
    and the results are stored in a new 'Inchi' column. The library table is
    returned exactly as read: according to the original note it is a
    degeneracy-removed dataset whose InChIKey values were computed beforehand.

    Parameters
    ----------
    library_set : str
        Path of the tab-separated library file.
    test_set : str
        Path of the tab-separated test file.

    Returns
    -------
    tuple
        ``(library_frame, test_frame)`` as pandas DataFrames, the second one
        carrying the added 'Inchi' column.
    """
    library_frame, test_frame = (
        pd.read_csv(path, sep='\t') for path in (library_set, test_set)
    )
    print("Checkpoint 1")

    # Row-wise key computation; rows arrive as plain arrays because raw=True.
    inchi_column = test_frame.apply(
        inchi_generations, axis=1, raw=True, result_type='expand'
    )
    test_frame['Inchi'] = inchi_column
    print("Checkpoint 2")

    return library_frame, test_frame

In [None]:
def find_matches(library_set, test_set):
    """Collect the test-set rows whose 'Inchi' value also occurs in the library.

    The tables come from ``prep_data``, which reads both files and adds the
    'Inchi' column to the test table.

    Parameters
    ----------
    library_set : str
        Path of the tab-separated library file; it must provide an 'Inchi'
        column.
    test_set : str
        Path of the tab-separated test file; it must provide 'Generation'
        and 'Smiles' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Generation', 'Smiles' and 'Inchi' (in that order) with one
        row per test-set row whose 'Inchi' value is present in the library,
        kept in test-set order and re-indexed from 0.

    Notes
    -----
    Several test rows may share one 'Inchi' value (the value ignores
    stereochemistry, and the same structure can appear in more than one
    generation). Each such row is reported once, with the Generation and
    Smiles taken from that same row.
    """
    library_data, test_data = prep_data(library_set, test_set)
    print("Checkpoint 3")

    # One vectorised membership test instead of scanning the library list
    # once per test row.
    is_match = test_data['Inchi'].isin(library_data['Inchi'])
    print("Checkpoint 4")

    # Selecting whole rows keeps Generation/Smiles/Inchi aligned. Looking the
    # matched keys up again in the test table would yield k*k Smiles entries
    # for a key that occurs on k rows and make the columns unequal in length.
    library_test_matches = (
        test_data.loc[is_match, ['Generation', 'Smiles', 'Inchi']]
        .reset_index(drop=True)
    )
    print("Checkpoint 5")
    return library_test_matches

In [None]:
%%time
# Match the test file 'formose_output.tsv' against the library file
# 'Nucleoside_Structisomers.tsv'. The resulting DataFrame is kept in `a`,
# which the export cell further down writes to disk.
a = find_matches('Nucleoside_Structisomers.tsv', 'formose_output.tsv')

In [None]:
# Write the matches to 'FormoseMatchesG3.tsv' as tab-separated rows;
# header=None and index=None are both falsy, so neither a header row nor the
# index column should be emitted.
# NOTE(review): mode='a' appends to an existing file, so running this cell
# again adds the same rows a second time -- confirm that accumulation is
# intended, otherwise use mode='w'.
a.to_csv(r'FormoseMatchesG3.tsv', header=None, index=None, sep='\t', mode='a') 