In [1]:
# Change the kernel's working directory to the project folder (IPython `cd` magic);
# the relative path 'NetOTCDistance.xlsx' read in a later cell depends on this.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere as-is.
cd /projectnb/darpa/pyuning/Attribute graphs/NetOTC Application

/projectnb/darpa/pyuning/Attribute graphs/NetOTC Application


In [16]:
import sys
# Put the project folder on the module search path so the local modules below can be imported.
sys.path.append('/projectnb/darpa/pyuning/Attribute graphs/NetOTC Application')

# NOTE(review): later cells call `NetOTC.exact_otc_lp`, `NetOTC.exact_otc` and
# `NetOTC.exact_otc_ot`, but no name `NetOTC` is bound in the visible cells — only
# `NetOTC_torch` is imported here (and it is not referenced anywhere visible).
# Those cells would raise NameError on a fresh kernel; confirm the intended module.
import NetOTC_torch
import utils
import EntropicOTC as EOTC

In [17]:
import pandas as pd
import numpy as np
import time

# Read the molecule table (one row per molecule, including a SMILES column)
# from the workbook in the current working directory and preview the first rows.
excel_path = 'NetOTCDistance.xlsx'
df = pd.read_excel(excel_path)
df.head()

Unnamed: 0,COLOR,STRUCTURE,MOELCULE NAME ON OSSILA,SMILES,HOMO (eV),LUMO (eV),Lmax (emission) nm (Solvent)
0,GREEN,,TXO-PhCz,N#CC1=C(N2C3=CC=CC=C3C4=CC=CC=C42)C(N5C6=CC=CC...,-5.78,-3.58,522
1,GREEN,,Px-VPN,N#CC1=C(C#N)C=C(C2=CC=C(N3C4=C(C=CC=C4)OC5=CC=...,-5.7,-3.3,577
2,GREEN,,DPS-PXZ,O=S(C1=CC=C(N2C3=C(C=CC=C3)OC4=CC=CC=C24)C=C1)...,-5.59,-2.79,507
3,GREEN,,4CzPN-Me,N#CC1=C(N2C3=C(C4=C2C=CC(C)=C4)C=C(C)C=C3)C(N5...,-5.8,-3.4,552
4,GREEN,,PXZ-TRZ,C12=CC=CC=C1N(C3=CC=C(C4=NC(C5=CC=CC=C5)=NC(C6...,-5.5,-3.1,545


In [18]:
import rdkit
from rdkit.Chem import rdchem
from rdkit import Chem
from ogb.utils.features import (allowable_features, atom_to_feature_vector, bond_to_feature_vector, atom_feature_vector_to_dict, bond_feature_vector_to_dict) 

In [19]:
def rdkit_feat(atom):
    """Collect a fixed-order list of descriptors for a single atom.

    Parameters
    ----------
    atom : object
        An atom object exposing the accessor methods called below
        (``GetNeighbors``, ``GetAtomicNum``, ``GetHybridization``, ...);
        each neighbor must expose ``GetSymbol``.

    Returns
    -------
    list
        ``[atomic number, hybridization as str, in-ring flag, total degree,
        formal charge, number of radical electrons, aromatic flag,
        number of 'O' neighbors, number of 'C' neighbors, number of 'N' neighbors]``
    """
    # Element symbols of the directly bonded neighbors.
    nb = [a.GetSymbol() for a in atom.GetNeighbors()]

    # The previous version also counted hydrogen and "other" neighbors but never
    # used those counts; only O / C / N counts are part of the feature list.
    return [
        atom.GetAtomicNum(),
        str(atom.GetHybridization()),  # stringified so the value is a plain label
        atom.IsInRing(),
        atom.GetTotalDegree(),
        atom.GetFormalCharge(),
        atom.GetNumRadicalElectrons(),
        atom.GetIsAromatic(),
        nb.count('O'),
        nb.count('C'),
        nb.count('N'),
    ]

def feat_vec(smiles_string):
    """Build a per-atom feature table for the molecule given as a SMILES string.

    Parameters
    ----------
    smiles_string : str
        SMILES representation of the molecule.

    Returns
    -------
    pandas.DataFrame
        One row per atom, with columns 'AtomicNum', 'Hybridization', 'ring',
        'TotalDegree', 'FormalCharge', 'NumRadicalElectrons', 'Aromatic',
        'NumO', 'NumC', 'NumN' (the order produced by ``rdkit_feat``).

    Raises
    ------
    ValueError
        If the SMILES string cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # clear message instead of an AttributeError on mol.GetAtoms().
        raise ValueError('Could not parse SMILES string: %r' % (smiles_string,))

    atom_features_list = [rdkit_feat(atom) for atom in mol.GetAtoms()]

    # NOTE(review): each row mixes a str (hybridization) with other values, so
    # np.array presumably coerces everything to strings and every column ends up
    # as text — kept as-is for compatibility; confirm this is intended.
    x = np.array(atom_features_list)
    df = pd.DataFrame(x, columns = ['AtomicNum','Hybridization','ring','TotalDegree','FormalCharge','NumRadicalElectrons','Aromatic','NumO','NumC','NumN'])
    return df

In [20]:
# SMILES column -> molecule objects -> adjacency matrices (built with useBO=True)
# -> two lists of matrices from utils.adj_to_trans1 / utils.adj_to_trans2
# (presumably two transition-matrix constructions, going by the names — verify in utils).
smiles_list = df['SMILES']
mol_list = list(map(Chem.MolFromSmiles, smiles_list))
A_list = [Chem.GetAdjacencyMatrix(molecule, useBO=True) for molecule in mol_list]
P_list = list(map(utils.adj_to_trans1, A_list))
P_list2 = list(map(utils.adj_to_trans2, A_list))

In [21]:
# Print the first-dimension size of every matrix in P_list, one per line.
for x in P_list:
    n_rows = x.shape[0]
    print(n_rows)

62
50
43
70
38
54
54
36
110
60
50
47
113
50


In [22]:
# Build the per-atom feature table once per molecule and reuse it for every
# attribute list (previously feat_vec was re-run four times per SMILES string).
feat_df_list = [feat_vec(smiles) for smiles in smiles_list]

# One column (a per-atom Series) per molecule for each attribute of interest.
AN_list = [feat_df['AtomicNum'] for feat_df in feat_df_list]
Hybrid_list = [feat_df['Hybridization'] for feat_df in feat_df_list]
ring_list = [feat_df['ring'] for feat_df in feat_df_list]
aroma_list = [feat_df['Aromatic'] for feat_df in feat_df_list]

# First example: two networks with 62 and 70 nodes.

In [23]:
# Select the pair of molecules to compare (positions in P_list / the attribute lists).
i = 0
j = 3
P1 = P_list[i]
P2 = P_list[j]
print(P1.shape[0], P2.shape[0])

# Node attribute used for the cost: the per-atom 'Aromatic' column.
lst = aroma_list
cost = utils.get_01_cost(lst[i], lst[j])

62 70


## This is the original exactOTC algorithm. Takes 482 seconds.

In [17]:
# Time NetOTC.exact_otc_lp on the current (P1, P2, cost) and print the first
# returned value followed by the elapsed seconds.
# NOTE(review): `NetOTC` is not imported in any visible cell (only `NetOTC_torch`
# is) — this raises NameError on a fresh kernel; confirm the intended module.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result1, Px, stat_dist1 = NetOTC.exact_otc_lp(P1, P2, cost)
now = time.perf_counter()
print(result1)
print(now - start_time)

0.08091488932927283
482.6685380935669


## This is exactOTC with emd function from POT. Takes 92 seconds.

In [10]:
# Time NetOTC.exact_otc on the current (P1, P2, cost) and print the first
# returned value followed by the elapsed seconds.
# NOTE(review): `NetOTC` is not imported in any visible cell (only `NetOTC_torch`
# is) — this raises NameError on a fresh kernel; confirm the intended module.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result2, Py, stat_dist2 = NetOTC.exact_otc(P1, P2, cost)
now = time.perf_counter()
print(result2)
print(now - start_time)

0.0809148893292851
92.50548076629639


## This is original entropicOTC, takes 19 seconds.

In [24]:
# Time EOTC.entropic_otc (sink_iter = 20, get_sd = False) on the current
# (P1, P2, cost) and print the first returned value followed by the elapsed seconds.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result_e1, P_e1, stat_dist_e1 = EOTC.entropic_otc(P1, P2, cost, sink_iter = 20, get_sd = False)
now = time.perf_counter()
print(result_e1)
print(now - start_time)

0.2428947954512574
19.092622756958008


## This is entropicOTC using the Sinkhorn function from POT. It takes slightly longer, but it exposes more arguments (e.g. the regularization strength). The resulting distance is also slightly larger, suggesting that the optimization is not as good as with the original implementation.

In [26]:
# Time EOTC.entropic_otc1 (reg_num = 0.1, sink_iter = 20, get_sd = False) on the
# current (P1, P2, cost) and print the first returned value followed by the elapsed seconds.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result_e2, P_e2, stat_dist_e2 = EOTC.entropic_otc1(P1, P2, cost, reg_num = 0.1, sink_iter = 20, get_sd = False)
now = time.perf_counter()
print(result_e2)
print(now - start_time)

0.243070560335461
19.353610277175903


# Second example: two networks with 110 and 113 nodes.

In [27]:
# Select the pair of molecules to compare (positions in P_list / the attribute lists).
i = 8
j = 12
P1 = P_list[i]
P2 = P_list[j]
print(P1.shape[0], P2.shape[0])

# Node attribute used for the cost: the per-atom 'Aromatic' column.
lst = aroma_list
cost = utils.get_01_cost(lst[i], lst[j])

110 113


## Original exactOTC algorithm.

In [12]:
# Time NetOTC.exact_otc on the current (P1, P2, cost) and print the first
# returned value followed by the elapsed seconds.
# NOTE(review): `NetOTC` is not imported in any visible cell (only `NetOTC_torch`
# is) — this raises NameError on a fresh kernel; confirm the intended module.
# NOTE(review): the heading calls this the "original" exactOTC, but the first
# example uses `exact_otc_lp` for the original and `exact_otc` for the POT
# variant — confirm which function was meant here.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result1, Px, stat_dist1 = NetOTC.exact_otc(P1, P2, cost)
now = time.perf_counter()
print(result1)
print(now - start_time)

0.20467463121546883
9304.753986597061


## exactOTC with emd from POT.

In [13]:
# Time NetOTC.exact_otc_ot on the current (P1, P2, cost) and print the first
# returned value followed by the elapsed seconds.
# NOTE(review): `NetOTC` is not imported in any visible cell (only `NetOTC_torch`
# is) — this raises NameError on a fresh kernel; confirm the intended module.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result2, Py, stat_dist2 = NetOTC.exact_otc_ot(P1, P2, cost)
now = time.perf_counter()
print(result2)
print(now - start_time)

0.20467463077813536
5506.978949308395


## Original entropicOTC.

In [33]:
# Time EOTC.entropic_otc (sink_iter = 10, get_sd = False) on the current
# (P1, P2, cost) and print the first returned value followed by the elapsed seconds.
# NOTE(review): sink_iter is 10 here but 20 in the first example — confirm intended.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result_e1, P_e1, stat_dist_e1 = EOTC.entropic_otc(P1, P2, cost, sink_iter = 10, get_sd = False)
now = time.perf_counter()
print(result_e1)
print(now - start_time)

0.5076235337328329
151.5682475566864


## entropicOTC with sinkhorn from POT.

In [32]:
# Time EOTC.entropic_otc1 (reg_num = 0.1, sink_iter = 20, get_sd = False) on the
# current (P1, P2, cost) and print the first returned value followed by the elapsed seconds.
# perf_counter is used because it is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
result_e2, P_e2, stat_dist_e2 = EOTC.entropic_otc1(P1, P2, cost, reg_num = 0.1, sink_iter = 20, get_sd = False)
now = time.perf_counter()
print(result_e2)
print(now - start_time)

0.5079334918995498
150.21376729011536
