In [1]:
import sys
import argparse
import io
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [2]:
import os

In [3]:
# Display the kernel's current working directory.
os.getcwd()

'/home/rgur/CS6250_project/scripts'

In [4]:
# NOTE(review): hard-coded absolute path; it looks like the results directory of
# one training run — confirm. The relative paths used elsewhere in this notebook
# ('results.test', '../../data/mols.txt', './fp_df_fixed.csv') resolve against it.
os.chdir('/home/rgur/CS6250_project/g2g/polymers_trial3/lr_0.001_bs_8_depthT_6_depthG_3/results')

In [5]:
# Handle over the results file; a later cell iterates it line by line.
# NOTE(review): the name `stdin` is used for a regular file, not standard input.
stdin = open('results.test', 'r')
# Not referenced in the visible cells — presumably the number of decodes per
# source; TODO confirm.
num_decode = 5
# Inclusive lower bound on the similarity field when filtering records.
sim_delta = .2
# Inclusive lower bound on the property field (`bg`) when filtering records.
prop_delta = 6
# Copied into `len_d` (and from there into `n_mols`) by later cells.
total_n = 899
# Text file read line by line into the `mols` list.
mols_path = '../../data/mols.txt'

In [6]:
# Tanimoto similarity function
def similarity(a, b):
    """Return the Tanimoto similarity of two SMILES strings.

    Both strings are parsed with RDKit and compared through Morgan
    fingerprints (radius 2, 2048 bits, chirality ignored).

    Returns 0.0 when either argument is None or when RDKit fails to parse
    either string.
    """
    if a is None or b is None:
        return 0.0

    parsed = [Chem.MolFromSmiles(smiles) for smiles in (a, b)]
    if any(mol is None for mol in parsed):
        return 0.0

    fp_a, fp_b = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False)
        for mol in parsed
    ]
    return DataStructs.TanimotoSimilarity(fp_a, fp_b)

def diversity(pairs):
    """Return the mean per-source diversity of the translated polymers.

    Parameters
    ----------
    pairs : iterable of sequences
        Each item holds the source SMILES at index 0 and the translated
        SMILES at index 1.

    Returns
    -------
    float
        Mean over the distinct sources of the per-source diversity, where a
        source's diversity is the mean of ``1 - similarity(source, t)`` over
        its translations ``t``. A source with a single translation
        contributes 0.0. NaN is returned when ``pairs`` is empty.

    Side effects
    ------------
    Prints the number of distinct source polymers.
    """
    # Map every source polymer to the list of polymers it was translated to.
    decoded = {}
    for pair in pairs:
        decoded.setdefault(pair[0], []).append(pair[1])

    diversity_values = []
    for source, translations in decoded.items():
        if len(translations) > 1:
            distances = [1 - similarity(source, test) for test in translations]
            diversity_values.append(sum(distances) / len(distances))
        else:
            # A lone translation has nothing to be diverse against.
            diversity_values.append(0.0)

    # Single-argument call form prints the same text on Python 2 and Python 3.
    print('Number of source polymers: ' + str(len(decoded)))

    if not diversity_values:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')
    return np.mean(diversity_values)

In [7]:
# Read the molecule list: one whitespace-stripped entry per line of `mols_path`.
# The `with` block closes the file once it has been read.
with open(mols_path, 'r') as mols_file:
    mols = [line.strip() for line in mols_file]
mols[0:5]

['CC(C1=CC=CC=C1)CC(C2=CC=CC=C2)CC(C3=CC=CC=C3)CC(C4=CC=CC=C4)CC(C1=CC=CC=C1)CC(C2=CC=CC=C2)CC(C3=CC=CC=C3)CC(C4=CC=CC=C4)CC(C1=CC=CC=C1)CC(C2=CC=CC=C2)CC(C3=CC=CC=C3)CC(C4=CC=CC=C4)',
 'CCCCCOC(=O)CCCCCOC(=O)CCCCCOC(=O)CCCCCOC(=O)CCCCCOC(=O)CCCCCOC(=O)',
 'CCC(C(F)(F)(F))(C(F)(F)(F))OCCC(C(F)(F)(F))(C(F)(F)(F))OCCC(C(F)(F)(F))(C(F)(F)(F))OCCC(C(F)(F)(F))(C(F)(F)(F))OCCC(C(F)(F)(F))(C(F)(F)(F))OCCC(C(F)(F)(F))(C(F)(F)(F))O',
 'CC(=O)OCC(=O)OCC(=O)OCC(=O)OCC(=O)OCC(=O)O',
 'C(C=C1)=CC=C1SC(C=C2)=CC=C2SC(C=C1)=CC=C1SC(C=C2)=CC=C2SC(C=C1)=CC=C1SC(C=C2)=CC=C2S']

In [8]:
# Collect the whitespace-split records that follow the first line containing
# 'Done'. Lines containing 'Done' are themselves never stored.
data = []
start_append = False

len_d = total_n
try:
    for line in stdin:
        if 'Done' in line:
            start_append = True
        elif start_append:
            data.append(line.split())
finally:
    # Close the results-file handle. Re-running this cell without reopening
    # the file now fails loudly instead of silently resetting `data` to [].
    stdin.close()
data

[['0',
  'C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1',
  'O=C(O)CCCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCCC(=O)O',
  '0.0405405405405',
  '6.7986934096'],
 ['2',
  'C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1',
  'O=C(O)CCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCC(=O)OCCCC(=O)O',
  '0.2405405405405',
  '6.7986934096'],
 ['17',
  'Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1',
  'O=C(O)CCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCC(=O)O',
  '0.2483870967742',
  '6.68117658508'],
 ['19',
  'Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(

In [9]:

# Cast every 5-field record to
# (int index, source, target, float similarity, float property).
typed_records = []
for idx_txt, src, dst, sim_txt, prop_txt in data:
    typed_records.append((int(idx_txt), src, dst, float(sim_txt), float(prop_txt)))
data = typed_records

In [11]:
# Counters: molecule total and a float tally of accepted translations.
n_succ = 0.0
n_mols = len_d

# Load the fingerprint table, then drop the 'ID' column and any column whose
# name contains 'Unnamed'.
fp_df = pd.read_csv('./fp_df_fixed.csv')
ignore_cols = []
for col in fp_df.keys():
    if col == 'ID' or 'Unnamed' in col:
        ignore_cols.append(col)
fp_df = fp_df.drop(ignore_cols, axis=1)

In [12]:
def build_dict(data):
    '''
    Group records by their source entry.

    Each record is read positionally as (ind, x, y, sim, bg). The result maps
    every x to the list of (ind, y, sim, bg) tuples that share it, in the
    order the records appear in `data`.
    '''
    grouped = {}
    for record in data:
        ind, x, y, sim, bg = record[0], record[1], record[2], record[3], record[4]
        grouped.setdefault(x, []).append((ind, y, sim, bg))
    return grouped

# Group the records in `data` by their second field (the source entry).
data_d = build_dict(data)

In [72]:
# Select the fingerprint rows located at each record's index field (positional).
row_positions = pd.Index([record[0] for record in data])
fps = fp_df.iloc[row_positions, :]
fps

Unnamed: 0,afp_C3_C4_C3,afp_C3_C4_C4,afp_C3_C4_H1,afp_C3_C4_N3,afp_C3_C4_O2,afp_C3_N3_C3,afp_C3_N3_C4,afp_C3_N3_H1,afp_C3_N3_O2,afp_C3_O2_C3,...,mfp_MQNs36,mfp_MQNs37,mfp_MQNs38,mfp_MQNs39,mfp_MQNs40,mfp_MQNs41,mfp_MQNs42,mfp_NumAliphaticRings,mfp_NumAromaticRings,mfp_tpsa
0,0.0,0.037037,0.074074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.243519
2,0.0,0.028571,0.057143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187857
17,0.0,0.029412,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193382
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89,0.0,0.0,0.333333,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.095833


In [53]:
# Accumulators for the filtering loop:
#   new_targets - fingerprint rows of the accepted targets
#   pairs       - (source, target) tuples of the accepted translations
#   caught      - targets rejected because their fingerprint was already accepted
new_targets, pairs, caught = [], [], []

In [55]:
# Accept a translation when it passes both thresholds, its target is absent
# from `mols`, and its fingerprint row differs from every fingerprint accepted
# so far.
# NOTE(review): this cell appends to `new_targets`, `pairs`, `caught` and
# increments `n_succ`, all of which are initialised in other cells; re-run
# those cells first to avoid double counting.
for source, records in data_d.items():
    good = [(ind, sim, bg, y) for ind, y, sim, bg in records
            if 1 > sim >= sim_delta and bg >= prop_delta]
    for ind, _sim, _bg, target in good:
        if target in mols:
            print("target in mols")
            continue

        fp = fp_df.iloc[ind, :].tolist()
        # Two fingerprints count as the same when every component differs by
        # less than .001; stop at the first match.
        is_same = any(
            (np.abs(np.subtract(other, fp)) < .001).all()
            for other in new_targets
        )
        if is_same:
            caught.append(target)
            continue

        new_targets.append(fp)
        pairs.append((source, target))
        n_succ += 1
        # Single-argument call form prints identically on Python 2 and 3.
        print('%s %s %s' % (ind, source, target))

2 C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1 O=C(O)CCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCC(=O)OCCCC(=O)O
89 c1ccc(N(c2ccc(C#N)cc2)c2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)NC6=O)cc4C3=O)cc2)cc1c1ccc(N(c2ccc(C#N)cc2)c2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)NC6=O)cc4C3=O)cc2)cc1c1ccc(N(c2ccc(C#N)cc2)c2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)NC6=O)cc4C3=O)cc2)cc1 CCCCCCOC(=O)NCNCNCNCNCNCNCNCNC(=O)OCCCC(=O)OCCCC(=O)O
17 Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1 O=C(O)CCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCC(=O)O
19 Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1 O=C(O)CCCCCCC(=O)OCCCCCC(=O)OCCCCCC(=O)OCCCCCC(=O)O


In [49]:
# Display the full list of parsed records.
data

[(0,
  'C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1',
  'O=C(O)CCCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCCC(=O)O',
  0.0405405405405,
  6.7986934096),
 (2,
  'C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4ccc(-c5nc6ccccc6[nH]5)cc4)nc3c2)c1',
  'O=C(O)CCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCC(=O)OCCCC(=O)O',
  0.2405405405405,
  6.7986934096),
 (17,
  'Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1',
  'O=C(O)CCCCC(=O)OCCCCC(=O)OCCCCC(=O)OCCCC(=O)O',
  0.2483870967742,
  6.68117658508),
 (19,
  'Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2)cc1Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc(C=O)cc3)cc2