In [1]:
import pandas as pd
from pandas import Series
import numpy as np
import os
import pickle
import random
from collections import Counter
from operator import itemgetter
import joblib
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from scipy.stats import rankdata
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn import svm
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from rdkit import Chem
from rdkit.Chem.rdmolfiles import MolToSmiles, MolFromSmiles
from rdkit.Chem import Lipinski, Descriptors
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
import tmap as tm
from faerun import Faerun
from map4 import MAP4Calculator

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Root directory holding the COCONUT dataframe and the trained SVM pickle.
# Defaults to the original location; set the LIPONP_DATA_DIR environment
# variable to run the notebook elsewhere without editing this cell.
# os.path.join(..., "") guarantees the trailing separator that the
# `folder + "<file name>"` concatenations further down rely on.
folder = os.path.join(os.environ.get("LIPONP_DATA_DIR", "/data/lipoNP/"), "")

# TEST with Epicospirocins, penicimeroterpenoids, and Phakefustatin A + SI structures
We considered the class *bacterium* to be the positive class and the class *fungus* to be the negative one. 

In [111]:
# SMILES strings of the four epicospirocin query structures.
epicospirocins_smiles_list = [
    "OC1=C(O)C(O)=C(C(OC)OC23CC(C(O)=C(O)C(O)=C4C)=C4CO3)C2=C1C",
    "OC1=C(O)C(C)=C(C(OC)OC23CC(C(O)=C(O)C(O)=C4C)=C4CO3)C2=C1O",
    "OC5=C(O)C(O)=C(C(O)OC67CC(C(O)=C(O)C(O)=C8C)=C8CO7)C6=C5C",
    "OC9=C(O)C(C)=C(C(O)OC%10%11CC(C(O)=C(O)C(O)=C%12C)=C%12CO%11)C%10=C9O",
]

# Rhizolutin as a SMILES string. Raw string literal: the backslashes are SMILES
# double-bond direction markers, and in a normal literal "\[" is an invalid
# escape sequence (a warning today, an error in future Python versions).
Rhizolutin_smiles = r"C/C1=C\[C@H](O2)[C@@](/C=C/C(C)=C\[C@]3([H])[C@]1([H])C[C@H](CC)OC3=O)([H])C[C@H](O)CC2=O"

# SMILES strings of the three penicimeroterpenoid query structures.
penicimeroterpenoids_smiles_list = [
    "O=C1[C@@](C)(O)C([C@@]2(C(OC)=O)[C@@]3(C)C[C@H](OC4=O)[C@]5([H])C(C)(C)[C@@H](OC(C)=O)CC[C@@]54[C@]3([H])[C@@H]1C(C)=C2C)=O",
    "C[C@]12C[C@H](OC3=O)[C@]4([H])C(C)(C)[C@@H](OC(C)=O)CC[C@@]43[C@]1([H])C=C(C)[C@]5(C)[C@@]2(C(OC)=O)C([C@]5(O)C)=O",
    "C[C@]12C[C@H](OC3=O)[C@]4([H])C(C)(C)[C@@H](OC(C)=O)CC[C@@]43[C@]1([H])C=C(C)[C@]5(C)[C@@]2(C(OC)=O)[C@@](O)(C)C5=O",
]

# Bosamycin A query structure as a SMILES string.
BosamycinA_smiles = "N[C@H](CC1=CC=C(O)C=C1)C(N[C@@H](CC(C)C)C(N[C@H]([C@H](C(O)=O)O)C(N[C@@H](CO)C(N[C@H](CC2=CC=C(O)C=C2OC)C(N[C@@H](CC(C)C)C(NCC(O)=O)=O)=O)=O)=O)=O)=O"

# Phakefustatin A query structure as a SMILES string.
phakefustatin_smiles = "O=C(N1[C@@H](CCC1)C(N[C@@H](CC2=CC=CC=C2)C(N3)=O)=O)[C@H](C(C)C)NC([C@H]4N(C([C@H]5N(C([C@H](CCCNC(N)=N)NC([C@@H]3CC(C6=CC=CC=C6N)=O)=O)=O)CCC5)=O)CCC4)=O"

# SI test structures as SMILES strings, in the order used when the predictions
# are interpreted at the end of the notebook. The farnesol entry is a raw
# string because its trailing "\C" would otherwise be an invalid escape
# sequence; the runtime value is unchanged.
SI_smiles_list = [
    # salidroside
    "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](O)[C@@H]1OCCC2=CC=C(O)C=C2",
    # prostacyclin
    "OC(CCC/C=C1C[C@@H]2[C@H]([C@@H](C[C@@H]2O/1)O)/C=C/[C@H](CCCCC)O)=O",
    # serricorole
    "CC[C@H]1[C@@H](C)C(C(C)=C([C@H]([C@@H](O)CC)C)O1)=O",
    # cholesterol
    "C[C@@H]([C@H]1CC[C@@H]2[C@]1(C)CC[C@H]3[C@H]2CC=C4[C@]3(C)CC[C@H](O)C4)CCCC(C)C",
    # farnesol
    r"OC/C=C(C)/CC/C=C(C)/CC/C=C(C)\C",
    # menthol
    "O[C@@H]1[C@H](CC[C@@H](C1)C)C(C)C",
    # benzaldehyde
    "O=CC1=CC=CC=C1",
    # conotoxin
    "CC1C(=O)NC(C(=O)NC2CSSCC3C(=O)NC(C(=O)NC(C(=O)NCC(=O)NC(C(=O)NC(CSSCC(C(=O)NC(CSSCC(C(=O)NC(C(=O)NCC(=O)NC(C(=O)NCC(=O)N1)CCCCN)CCCCN)N)C(=O)NC(C(=O)NCC(=O)NC(C(=O)N3)CO)C(C)O)NC(=O)C(NC(=O)C(NC(=O)C(NC(=O)C(NC(=O)C(NC(=O)C(NC2=O)CO)CCCNC(=N)N)CC(C)C)CCSC)CC4=CC=C(C=C4)O)CC(=O)O)C(=O)N)CCCCN)CO)CCCNC(=N)N)CCCCN",
]



# Additional plant test structures ("new_plants"), one SMILES string each.
# hunzeylanineA_smiles is a raw string because it contains "\C", which would
# otherwise be an invalid escape sequence; the runtime value is unchanged.
hunzeylanineA_smiles = r"[H][C@]1(C(OC)=O)N2C3=CC=CC=C3[C@]45[C@@]2(OC6=CC7=C([C@@]8(C9(C(OC)=O)CO%10)[C@]%10([C@@](C[C@H]9/C(C%11)=C\C)([H])N%11CC8)N7C)C=C6C5)[C@@](C[C@@]1([H])/C(C%12)=C\C)([H])N%12CC4"
hyperfolB_smiles = "O=C([C@@]1(C/C=C(C)/C)C[C@H](C/C=C(C)/C)/C(C)=C2/C(C(C)C)=O)[C@@](C/C=C(C)/C)(OC1=O)C2=O"
pegaharmolA_smiles = "OC1=CC=C2C(NC3=C2C=CN=C3C4=NC(N)=NC=C4)=C1[C@@]5([H])N6C([C@@H](O)CC6)=NC7=C5C=CC=C7"
perovsfolinA_smiles = "OC(C(O)=C1)=CC2=C1[C@H]3[C@H](C(O[C@H](CC4=CC=C(O)C(O)=C4)C(OC)=O)=O)[C@@H]5C6=C7[C@@]2(O3)C(C(C(C)C)=CC7=CC=C6C(C)(C)CC5)=O"
meloyunnanineA_smiles = "O[C@@]12[C@]3(C(OC)=O)[C@]4(C5=CC=CC=C5N2)[C@H]6N(CC4)CC=C[C@]6(C3)[C@H]1C"
mucroniferalA_smiles = "O=C(OC)[C@@H]([C@H]1O2)[C@@H](C(O)=O)[C@H]2C3=C1C=CC4=C3OCO4"

In [4]:
# Read the pickled COCONUT dataframe once and bind it to both names used in
# this notebook (`frame` and `coconut` refer to the same object).
# NOTE(review): read_pickle unpickles the file — only load trusted data.
frame = coconut = pd.read_pickle(folder + "coconut_4classifier_less_classes.pkl")

In [72]:
# Shared MAP4 fingerprint calculator, configured with dimensions=1024.
MAP4 = MAP4Calculator(dimensions=1024)


def calc_map4(smiles):
    """Compute the MAP4 fingerprint of a molecule given as a SMILES string.

    The input is parsed, written back out with ``isomericSmiles=False``
    and parsed again, so the fingerprint is calculated on the structure
    without the stereochemical annotations of the input string.

    Arguments:
      smiles {str} -- SMILES string of the molecule.

    Returns:
      numpy.ndarray -- The MAP4 fingerprint produced by the module-level
      ``MAP4`` calculator.

    Raises:
      ValueError -- If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here,
        # naming the offending input, instead of deeper inside RDKit/MAP4.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    flat_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
    flat_mol = Chem.MolFromSmiles(flat_smiles)
    return np.array(MAP4.calculate(flat_mol))

def map4_kernel_SVM(a, b=None):
    """Pairwise similarity matrix between two collections of fingerprints.

    Entry ``[i, j]`` is the fraction of positions at which fingerprint
    ``a[i]`` and fingerprint ``b[j]`` hold the same value.

    Arguments:
      a {sequence of numpy.ndarray} -- Equal-length fingerprints (rows).
      b {sequence of numpy.ndarray} -- Equal-length fingerprints (columns);
        defaults to ``a`` when omitted.

    Returns:
      numpy.ndarray -- Float matrix of shape ``(len(a), len(b))``.
    """
    if b is None:
        b = a
    rows = np.asarray(a)
    cols = np.asarray(b)
    JS_all_pairs = np.zeros((len(rows), len(cols)))
    if len(rows) == 0 or len(cols) == 0:
        # Nothing to compare; return the correctly shaped empty matrix.
        return JS_all_pairs
    for i, fp1 in enumerate(rows):
        # Compare one fingerprint against every column fingerprint at once.
        # Built-in float replaces np.float, which was removed in NumPy 1.24.
        JS_all_pairs[i, :] = np.count_nonzero(cols == fp1, axis=1) / float(len(fp1))
    return JS_all_pairs

def map4_kernel_NN(fp1, fp2):
    """Distance between two fingerprints: 1 minus the fraction of equal positions.

    Arguments:
      fp1 {numpy.ndarray} -- First fingerprint.
      fp2 {numpy.ndarray} -- Second fingerprint, same length as ``fp1``.

    Returns:
      float -- 0.0 for identical fingerprints, 1.0 when no position matches.
    """
    # Built-in float replaces np.float, which was removed in NumPy 1.24.
    return 1 - float(np.count_nonzero(fp1 == fp2)) / float(len(fp1))


def distance(a, b):
    """Estimates the Jaccard distance of two fingerprints from their hash values.

    Arguments:
      a {numpy.ndarray} -- An array containing hash values.
      b {numpy.ndarray} -- An array containing hash values, same length as ``a``.

    Returns:
      float -- The estimated Jaccard distance.
    """
    # The estimate is 1 minus the fraction of positions holding equal hash
    # values. Built-in float replaces np.float, removed in NumPy 1.24.
    return 1.0 - float(np.count_nonzero(a == b)) / float(len(a))

def find_map_seqNN(fp, dataframe = frame[frame.Set=="training"]):
    """Find the nearest neighbour of a fingerprint among reference molecules.

    Arguments:
      fp {numpy.ndarray} -- Query fingerprint.
      dataframe {pandas.DataFrame} -- Reference molecules with a "MAP4"
        column (fingerprints) and a "coconut_id" column. The default is the
        training subset of ``frame``; it is evaluated once, when this
        function is defined, so re-run this cell if ``frame`` changes.

    Returns:
      tuple -- ``(distance, coconut_id)`` of the closest reference molecule.

    Raises:
      ValueError -- If ``dataframe`` has no rows.
    """
    if len(dataframe) == 0:
        raise ValueError("dataframe contains no reference fingerprints")
    dists = dataframe["MAP4"].map(lambda fp2: distance(fp, fp2))
    # Take the argmin on the raw values so the result is always a position
    # (usable with .iloc) and never an index label, whatever the pandas version.
    NNi = int(np.argmin(np.asarray(dists)))
    return dists.iloc[NNi], dataframe["coconut_id"].iloc[NNi]

In [73]:
# Load the previously trained SVM classifier from the data folder.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
# NOTE(review): if the model was fitted with a callable kernel such as
# map4_kernel_SVM, that function presumably has to be defined before this
# cell runs — confirm.
SVM = joblib.load(folder + "SVM-coconut_less_classes.pkl")

In [112]:
# MAP4 fingerprints of every query structure, computed in the same order as
# the SMILES were declared: lists stay lists, single structures are unpacked
# into their individual names.
epicospirocins_map4_list = list(map(calc_map4, epicospirocins_smiles_list))
penicimeroterpenoids_map4_list = list(map(calc_map4, penicimeroterpenoids_smiles_list))
Rhizolutin_map4, BosamycinA_map4, phakefustatin_map4 = map(
    calc_map4,
    (Rhizolutin_smiles, BosamycinA_smiles, phakefustatin_smiles),
)
SI_map4_list = list(map(calc_map4, SI_smiles_list))
(
    hunzeylanine_map4,
    hyperfol_map4,
    pegaharmol_map4,
    perovsfolin_map4,
    meloyunnanine_map4,
    mucroniferal_map4,
) = map(
    calc_map4,
    (
        hunzeylanineA_smiles,
        hyperfolB_smiles,
        pegaharmolA_smiles,
        perovsfolinA_smiles,
        meloyunnanineA_smiles,
        mucroniferalA_smiles,
    ),
)

# Nearest training-set neighbour of every query fingerprint; each result is a
# (distance, coconut_id) tuple as returned by find_map_seqNN.
epicospirocins_NN_dist_list = list(map(find_map_seqNN, epicospirocins_map4_list))
penicimeroterpenoids_NN_dist_list = list(map(find_map_seqNN, penicimeroterpenoids_map4_list))
Rhizolutin_NN_dist, BosamycinA_NN_dist, phakefustatin_NN_dist = map(
    find_map_seqNN,
    (Rhizolutin_map4, BosamycinA_map4, phakefustatin_map4),
)
SI_NN_dist_list = list(map(find_map_seqNN, SI_map4_list))
(
    hunzeylanine_NN_dist,
    hyperfol_NN_dist,
    pegaharmol_NN_dist,
    perovsfolin_NN_dist,
    meloyunnanine_NN_dist,
    mucroniferal_NN_dist,
) = map(
    find_map_seqNN,
    (
        hunzeylanine_map4,
        hyperfol_map4,
        pegaharmol_map4,
        perovsfolin_map4,
        meloyunnanine_map4,
        mucroniferal_map4,
    ),
)

In [88]:
epicospirocins_NN_dist_list

[(0.7607421875, 'CNP0378686'),
 (0.6201171875, 'CNP0378686'),
 (0.6748046875, 'CNP0299907'),
 (0.556640625, 'CNP0299907')]

In [89]:
penicimeroterpenoids_NN_dist_list

[(0.6083984375, 'CNP0145120'),
 (0.4365234375, 'CNP0145120'),
 (0.4873046875, 'CNP0184369')]

In [90]:
Rhizolutin_NN_dist

(0.7998046875, 'CNP0120181')

In [91]:
BosamycinA_NN_dist

(0.7978515625, 'CNP0426295')

In [92]:
phakefustatin_NN_dist

(0.7021484375, 'CNP0426614')

In [93]:
SI_NN_dist_list

[(0.279296875, 'CNP0167785'),
 (0.7060546875, 'CNP0085851'),
 (0.8154296875, 'CNP0299326'),
 (0.3271484375, 'CNP0211064'),
 (0.0, 'CNP0232612'),
 (0.501953125, 'CNP0206281'),
 (0.685546875, 'CNP0132365'),
 (0.7509765625, 'CNP0343194')]

In [94]:
hunzeylanine_NN_dist

(0.806640625, 'CNP0270143')

In [95]:
hyperfol_NN_dist

(0.7470703125, 'CNP0167147')

In [96]:
pegaharmol_NN_dist

(0.8818359375, 'CNP0181707')

In [97]:
perovsfolin_NN_dist

(0.830078125, 'CNP0254054')

In [98]:
meloyunnanine_NN_dist

(0.548828125, 'CNP0229684')

In [113]:
mucroniferal_NN_dist

(0.80078125, 'CNP0345199')

In [114]:
# SVM class predictions for every query structure (per the legend at the end
# of the notebook: 0 = plant, 1 = fungi, 2 = bacteria). Lists of fingerprints
# are predicted in one call; single fingerprints are wrapped in a one-element
# list so that predict receives a batch.
pred_epicospirocins_SVM = SVM.predict(epicospirocins_map4_list)
pred_penicimeroterpenoids_SVM = SVM.predict(penicimeroterpenoids_map4_list)
pred_Rhizolutin_SVM, pred_BosamycinA_SVM, pred_phakefustatin_SVM = (
    SVM.predict([single_fp])
    for single_fp in (Rhizolutin_map4, BosamycinA_map4, phakefustatin_map4)
)
pred_SI_SVM = SVM.predict(SI_map4_list)
(
    pred_hunzeylanine_SVM,
    pred_hyperfol_SVM,
    pred_pegaharmol_SVM,
    pred_perovsfolin_SVM,
    pred_meloyunnanine_SVM,
    pred_mucroniferal_SVM,
) = (
    SVM.predict([single_fp])
    for single_fp in (
        hunzeylanine_map4,
        hyperfol_map4,
        pegaharmol_map4,
        perovsfolin_map4,
        meloyunnanine_map4,
        mucroniferal_map4,
    )
)

In [100]:
print(pred_epicospirocins_SVM) # predicted fungal origin

[1. 1. 1. 1.]


In [101]:
pred_Rhizolutin_SVM

array([0.])

In [102]:
print(pred_penicimeroterpenoids_SVM) # predicted fungal origin

[1. 1. 1.]


In [103]:
print(pred_BosamycinA_SVM)

[2.]


In [104]:
print(pred_phakefustatin_SVM) # predicted bacterial origin

[2.]


In [105]:
print(pred_SI_SVM)

[0. 1. 2. 0. 0. 0. 0. 2.]


In [106]:
pred_hunzeylanine_SVM

array([0.])

In [107]:
pred_hyperfol_SVM

array([0.])

In [108]:
pred_pegaharmol_SVM

array([0.])

In [109]:
pred_perovsfolin_SVM

array([0.])

In [110]:
pred_meloyunnanine_SVM

array([0.])

In [115]:
pred_mucroniferal_SVM

array([0.])

## Farnesol is in the training set! (Its nearest-neighbour distance is 0.0, to CNP0232612.)

### 0 = plant, 1 = fungi, 2 = bacteria 
Predicted class for each SI structure, with the true origin in parentheses:
salidroside (plant) → 0, prostacyclin (animal) → 1, serricorole (animal) → 2, cholesterol (animal) → 0,
farnesol (plant) → 0, menthol (plant) → 0, benzaldehyde (?) → 0, conotoxin (animal) → 2