In [1]:
import pandas as pd
from matplotlib.colors import ListedColormap
from faerun import Faerun
from rdkit import Chem 
from rdkit.Chem import Lipinski, Descriptors, rdMolDescriptors, AllChem, PandasTools, rdchem
from pandarallel import pandarallel
import numpy as np
pandarallel.initialize(progress_bar=False)
import tmap as tm
from map4 import MAP4Calculator
import os
import seaborn as sns
import joblib
import matplotlib.pyplot as plt
from rdkit.Chem.AtomPairs import Pairs
folder = "/data/coconut/"

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
coconut = pd.read_pickle(folder + "coconut_prop.pkl")

In [3]:
coconut.origin.unique()

array([ 0.,  3., nan,  1.,  2.])

In [4]:
extra = coconut.query("simple_tax=='other'").copy()

In [5]:
coconut = coconut.query("origin == 0 or origin == 1 or origin == 2").copy()
coconut.replace(["NaN", 'NaT'], np.nan, inplace = True)
coconut = coconut.dropna()

  mask = arr == x
  mask |= arr == x


In [6]:
len(coconut)

60171

In [7]:
np.max(np.array(coconut.MW.to_list()))

2899.245115524003

In [8]:
np.min(np.array(coconut.MW.to_list()))

80.037448128

In [9]:
coconut["LOW_fcsp3"] = coconut.fcsp3.map(lambda x: x<=0.2)
coconut["MED_fcsp3"] = coconut.fcsp3.map(lambda x: x<0.8)
coconut["HIGH_fcsp3"] = coconut.fcsp3.map(lambda x: x>=0.8)
coconut["LOW_mw"] = coconut.MW.map(lambda x: x<=300)
coconut["MED_mw"] = coconut.MW.map(lambda x: x<800)
coconut["HIGH_mw"] = coconut.MW.map(lambda x: x>=800)

In [10]:
coconut.alogp

1356       -1.143500000000003
2262       -4.003099999999998
2281       0.5711000000000008
2292       -4.090200000000002
2298        3.803999999999998
                 ...         
377234    -13.033400000000007
379207                -4.0466
381009     -2.045899999999998
385490    -4.9348000000000045
401521     1.3658999999999992
Name: alogp, Length: 60171, dtype: object

In [11]:
coconut["LOW_alogp"] = coconut.alogp.map(lambda x: float(x)<=-2)
coconut["MED_alogp"] = coconut.alogp.map(lambda x: float(x)<8)
coconut["HIGH_alogp"] = coconut.alogp.map(lambda x: float(x)>=8)

In [12]:
LOW_alogp = len(coconut.query("LOW_alogp==True"))
print("LOW_alogp", LOW_alogp)
LOW_fcsp3_plants = len(coconut.query("LOW_alogp==True and origin==0"))
print("LOW_alogp_plants", LOW_fcsp3_plants)
LOW_alogp_fungi = len(coconut.query("LOW_alogp==True and origin==1"))
print("LOW_alogp_fungi", LOW_alogp_fungi)
LOW_alogp_bacteria = len(coconut.query("LOW_alogp==True and origin==2"))
print("LOW_alogp_bacteria", LOW_alogp_bacteria)

LOW_alogp 6674
LOW_alogp_plants 4855
LOW_alogp_fungi 373
LOW_alogp_bacteria 1446


In [13]:
MED_alogp = len(coconut.query("MED_alogp==True and LOW_fcsp3==False"))
print("MED_alogp", MED_alogp)
MED_alogp_plant = len(coconut.query("MED_alogp==True and LOW_alogp==False and origin==0"))
print("MED_alogp_plant", MED_alogp_plant)
MED_alogp_fungi = len(coconut.query("MED_alogp==True and LOW_alogp==False and origin==1"))
print("MED_alogp_fungi", MED_alogp_fungi)
MED_alogp_bacteria = len(coconut.query("MED_alogp==True and LOW_alogp==False and origin==2"))
print("MED_alogp_bacteria", MED_alogp_bacteria)

MED_alogp 52030
MED_alogp_plant 28315
MED_alogp_fungi 15000
MED_alogp_bacteria 8906


In [14]:
HIGH_alogp = len(coconut.query("HIGH_alogp==True"))
print("HIGH_alogp", HIGH_alogp)
HIGH_alogp_plants = len(coconut.query("HIGH_alogp==True and origin==0"))
print("HIGH_alogp_plants", HIGH_alogp_plants)
HIGH_alogp_fungi = len(coconut.query("HIGH_alogp==True and origin==1"))
print("HIGH_alogp_fungi", HIGH_alogp_fungi)
HIGH_alogp_bacteria = len(coconut.query("HIGH_alogp==True and origin==2"))
print("HIGH_alogp_bacteria", HIGH_alogp_bacteria)

HIGH_alogp 1276
HIGH_alogp_plants 602
HIGH_alogp_fungi 275
HIGH_alogp_bacteria 399


In [15]:
LOW_fcsp3 = len(coconut.query("LOW_fcsp3==True"))
print("LOW_fcsp3", LOW_fcsp3)
LOW_fcsp3_plants = len(coconut.query("LOW_fcsp3==True and origin==0"))
print("LOW_fcsp3_plants", LOW_fcsp3_plants)
LOW_fcsp3_fungi = len(coconut.query("LOW_fcsp3==True and origin==1"))
print("LOW_fcsp3_fungi", LOW_fcsp3_fungi)
LOW_fcsp3_bacteria = len(coconut.query("LOW_fcsp3==True and origin==2"))
print("LOW_fcsp3_bacteria", LOW_fcsp3_bacteria)

LOW_fcsp3 6866
LOW_fcsp3_plants 4213
LOW_fcsp3_fungi 1580
LOW_fcsp3_bacteria 1073


In [16]:
MED_fcsp3 = len(coconut.query("MED_fcsp3==True and LOW_fcsp3==False"))
print("MED_fcsp3", MED_fcsp3)
MED_fcsp3_plant = len(coconut.query("MED_fcsp3==True and LOW_fcsp3==False and origin==0"))
print("MED_fcsp3_plant", MED_fcsp3_plant)
MED_fcsp3_fungi = len(coconut.query("MED_fcsp3==True and LOW_fcsp3==False and origin==1"))
print("MED_fcsp3_fungi", MED_fcsp3_fungi)
MED_fcsp3_bacteria = len(coconut.query("MED_fcsp3==True and LOW_fcsp3==False and origin==2"))
print("MED_fcsp3_bacteria", MED_fcsp3_bacteria)

MED_fcsp3 41352
MED_fcsp3_plant 22032
MED_fcsp3_fungi 11334
MED_fcsp3_bacteria 7986


In [17]:
HIGH_fcsp3 = len(coconut.query("HIGH_fcsp3==True"))
print("HIGH_fcsp3", HIGH_fcsp3)
HIGH_fcsp3_plants = len(coconut.query("HIGH_fcsp3==True and origin==0"))
print("HIGH_fcsp3_plants", HIGH_fcsp3_plants)
HIGH_fcsp3_fungi = len(coconut.query("HIGH_fcsp3==True and origin==1"))
print("HIGH_fcsp3_fungi", HIGH_fcsp3_fungi)
HIGH_fcsp3_bacteria = len(coconut.query("HIGH_fcsp3==True and origin==2"))
print("HIGH_fcsp3_bacteria", HIGH_fcsp3_bacteria)

HIGH_fcsp3 11953
HIGH_fcsp3_plants 7527
HIGH_fcsp3_fungi 2734
HIGH_fcsp3_bacteria 1692


In [18]:
HIGH_fcsp3+MED_fcsp3+LOW_fcsp3

60171

In [19]:
LOW_mw= len(coconut.query("LOW_mw==True"))
print("LOW_mw", LOW_mw)
LOW_mw_plant = len(coconut.query("LOW_mw==True and origin ==0"))
print("LOW_mw_plant", LOW_mw_plant)
LOW_mw_fungi = len(coconut.query("LOW_mw==True and origin ==1"))
print("LOW_mw_fungi", LOW_mw_fungi)
LOW_mw_bacteria = len(coconut.query("LOW_mw==True and origin ==2"))
print("LOW_mw_bacteria", LOW_mw_bacteria)

LOW_mw 14228
LOW_mw_plant 7072
LOW_mw_fungi 4919
LOW_mw_bacteria 2237


In [20]:
HIGH_mw=len(coconut.query("HIGH_mw==True"))
print("HIGH_mw", HIGH_mw)
HIGH_mw_plant=len(coconut.query("HIGH_mw==True and origin==0"))
print("HIGH_mw_plant", HIGH_mw_plant)
HIGH_mw_fungi=len(coconut.query("HIGH_mw==True and origin==1"))
print("HIGH_mw_fungi", HIGH_mw_fungi)
HIGH_mw_bacteria=len(coconut.query("HIGH_mw==True and origin==2"))
print("HIGH_mw_bacteria", HIGH_mw_bacteria)

HIGH_mw 5688
HIGH_mw_plant 2622
HIGH_mw_fungi 618
HIGH_mw_bacteria 2448


In [21]:
HIGH_mw_bacteria + HIGH_mw_fungi+ HIGH_mw_plant

5688

In [22]:
MED_mw=len(coconut.query("MED_mw==True and LOW_mw==False"))
print("MED_mw", MED_mw)
MED_mw_plants=len(coconut.query("MED_mw==True and LOW_mw==False and origin==0"))
print("MED_mw_plants", MED_mw_plants)
MED_mw_fungi=len(coconut.query("MED_mw==True and LOW_mw==False and origin ==1"))
print("MED_mw_fungi", MED_mw_fungi)
MED_mw_bacteria=len(coconut.query("MED_mw==True and LOW_mw==False and origin==2"))
print("MED_mw_bacteria", MED_mw_bacteria)

MED_mw 40255
MED_mw_plants 24078
MED_mw_fungi 10111
MED_mw_bacteria 6066


In [23]:
MED_mw+HIGH_mw+LOW_mw

60171

In [24]:
len(coconut)

60171

In [25]:
peptide=len(coconut.query("isPeptide==True"))
print("peptide", peptide)
peptide_plant=len(coconut.query("isPeptide==True and origin==0"))
print("peptide_plant", peptide_plant)
peptide_fungi=len(coconut.query("isPeptide==True and origin==1"))
print("peptide_fungi", peptide_fungi)
peptide_bacteria=len(coconut.query("isPeptide==True and origin==2"))
print("peptide_bacteria", peptide_bacteria)

peptide 2923
peptide_plant 194
peptide_fungi 676
peptide_bacteria 2053


In [26]:
sugar=len(coconut.query("hasSugar==True"))
print("sugar", sugar)
sugar_plants=len(coconut.query("hasSugar==True and origin ==0"))
print("sugar_plants", sugar_plants)
sugar_fungi=len(coconut.query("hasSugar==True and origin ==1"))
print("sugar_fungi", sugar_fungi)
sugar_bacteria=len(coconut.query("hasSugar==True and origin ==2"))
print("sugar_bacteria", sugar_bacteria)

sugar 10850
sugar_plants 8260
sugar_fungi 797
sugar_bacteria 1793


In [27]:
plant = len(coconut.query("origin==0"))
print("plant", plant)
fungi = len(coconut.query("origin==1"))
print("fungi", fungi)
bacteria = len(coconut.query("origin==2"))
print("bacteria", bacteria)

plant 33772
fungi 15648
bacteria 10751


In [28]:
SVM = joblib.load(folder + f"MAP4-SVM-coconut.all.pkl")

In [29]:
MAP4 = MAP4Calculator(dimensions=1024)
def calc_map4(smiles):
    mol = Chem.MolFromSmiles(smiles)
    smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
    mol = Chem.MolFromSmiles(smiles)
    map4 = MAP4.calculate(mol)
    return np.array(map4)

def map4_kernel_SVM(a, b=None):
    if b is None:
        b=a
    JS_all_pairs = np.zeros((len(a),len(b)))
    for i,fp1 in enumerate(a):
        for j,fp2 in enumerate(b):
            JS_all_pairs[i,j] = np.float(np.count_nonzero(fp1 == fp2)) / np.float(len(fp1))
    return JS_all_pairs

In [30]:
extra["map4"] = extra.SMILES.map(calc_map4)

In [None]:
extra["prediction"] = extra.map4.map(lambda x : SVM.predict_proba([x]))

In [None]:
extra["prediction_"] = extra.map4.map(lambda x : SVM.predict([x]))

In [None]:
def clean_pred(pred_):
    dict = {0:"plants", 1:"fungi", 2:"bacteria"}
    return dict[pred_[0]]

In [None]:
extra["predicted_organism"] = extra.prediction_.map(clean_pred)

In [None]:
extra_=extra[["coconut_id", "textTaxa", "SMILES", 'citationDOI', 'name', 'synonyms']].copy()
PandasTools.AddMoleculeColumnToFrame(extra_, "SMILES", "molecule", includeFingerprints=False)
PandasTools.SaveXlsxFromFrame(extra_, "other_taxa_with_prediction.xlsx", "molecule")

In [36]:
extra["LOW_fcsp3"] = extra.fcsp3.map(lambda x: x<=0.2)
extra["MED_fcsp3"] = extra.fcsp3.map(lambda x: x<0.8)
extra["HIGH_fcsp3"] = extra.fcsp3.map(lambda x: x>=0.8)
extra["LOW_mw"] = extra.MW.map(lambda x: x<=300)
extra["MED_mw"] = extra.MW.map(lambda x: x<800)
extra["HIGH_mw"] = extra.MW.map(lambda x: x>=800)

In [42]:
len(extra.query("predicted_organism == 'bacteria'"))

430

In [46]:
LOW_fcsp3 = len(extra.query("LOW_fcsp3==True"))
print("LOW_fcsp3", LOW_fcsp3)
LOW_fcsp3_plants = len(extra.query("LOW_fcsp3==True and predicted_organism == 'plants'"))
print("LOW_fcsp3_plants", LOW_fcsp3_plants)
LOW_fcsp3_fungi = len(extra.query("LOW_fcsp3==True and predicted_organism == 'fungi'"))
print("LOW_fcsp3_fungi", LOW_fcsp3_fungi)
LOW_fcsp3_bacteria = len(extra.query("LOW_fcsp3==True and predicted_organism == 'bacteria'"))
print("LOW_fcsp3_bacteria", LOW_fcsp3_bacteria)
print("******")

MED_fcsp3 = len(extra.query("MED_fcsp3==True and LOW_fcsp3==False"))
print("MED_fcsp3", MED_fcsp3)
MED_fcsp3_plant = len(extra.query("MED_fcsp3==True and LOW_fcsp3==False and predicted_organism == 'plants'"))
print("MED_fcsp3_plant", MED_fcsp3_plant)
MED_fcsp3_fungi = len(extra.query("MED_fcsp3==True and LOW_fcsp3==False and predicted_organism == 'fungi'"))
print("MED_fcsp3_fungi", MED_fcsp3_fungi)
MED_fcsp3_bacteria = len(extra.query("MED_fcsp3==True and LOW_fcsp3==False and predicted_organism == 'bacteria'"))
print("MED_fcsp3_bacteria", MED_fcsp3_bacteria)
print("******")

HIGH_fcsp3 = len(extra.query("HIGH_fcsp3==True"))
print("HIGH_fcsp3", HIGH_fcsp3)
HIGH_fcsp3_plants = len(extra.query("HIGH_fcsp3==True and predicted_organism == 'plants'"))
print("HIGH_fcsp3_plants", HIGH_fcsp3_plants)
HIGH_fcsp3_fungi = len(extra.query("HIGH_fcsp3==True and predicted_organism == 'fungi'"))
print("HIGH_fcsp3_fungi", HIGH_fcsp3_fungi)
HIGH_fcsp3_bacteria = len(extra.query("HIGH_fcsp3==True and predicted_organism == 'bacteria'"))
print("HIGH_fcsp3_bacteria", HIGH_fcsp3_bacteria)
print("******")

LOW_mw= len(extra.query("LOW_mw==True"))
print("LOW_mw", LOW_mw)
LOW_mw_plant = len(extra.query("LOW_mw==True and predicted_organism == 'plants'"))
print("LOW_mw_plant", LOW_mw_plant)
LOW_mw_fungi = len(extra.query("LOW_mw==True and predicted_organism == 'fungi'"))
print("LOW_mw_fungi", LOW_mw_fungi)
LOW_mw_bacteria = len(extra.query("LOW_mw==True and predicted_organism == 'bacteria'"))
print("LOW_mw_bacteria", LOW_mw_bacteria)
print(LOW_mw_bacteria+LOW_mw_fungi+LOW_mw_plant)
print("******")

HIGH_mw=len(extra.query("HIGH_mw==True"))
print("HIGH_mw", HIGH_mw)
HIGH_mw_plant=len(extra.query("HIGH_mw==True and predicted_organism == 'plants'"))
print("HIGH_mw_plant", HIGH_mw_plant)
HIGH_mw_fungi=len(extra.query("HIGH_mw==True and predicted_organism == 'fungi'"))
print("HIGH_mw_fungi", HIGH_mw_fungi)
HIGH_mw_bacteria=len(extra.query("HIGH_mw==True and predicted_organism == 'bacteria'"))
print("HIGH_mw_bacteria", HIGH_mw_bacteria)
print(HIGH_mw_plant+HIGH_mw_fungi+HIGH_mw_bacteria)
print("******")

MED_mw=len(extra.query("MED_mw==True and LOW_mw==False"))
print("MED_mw", MED_mw)
MED_mw_plants=len(extra.query("MED_mw==True and LOW_mw==False and predicted_organism == 'plants'"))
print("MED_mw_plants", MED_mw_plants)
MED_mw_fungi=len(extra.query("MED_mw==True and LOW_mw==False and predicted_organism == 'fungi'"))
print("MED_mw_fungi", MED_mw_fungi)
MED_mw_bacteria=len(extra.query("MED_mw==True and LOW_mw==False and predicted_organism == 'bacteria'"))
print("MED_mw_bacteria", MED_mw_bacteria)
print(MED_mw_bacteria+MED_mw_fungi+MED_mw_plants)
print("******")

MED_mw=len(extra.query("MW <= 800 and MW >= 300"))
print("MED_mw", MED_mw)
MED_mw_plants=len(extra.query("MW <= 800 and MW >= 300 and predicted_organism == 'plants'"))
print("MED_mw_plants", MED_mw_plants)
MED_mw_fungi=len(extra.query("MW <= 800 and MW >= 300 and predicted_organism == 'fungi'"))
print("MED_mw_fungi", MED_mw_fungi)
MED_mw_bacteria=len(extra.query("MW <= 800 and MW >= 300 and predicted_organism == 'bacteria'"))
print("MED_mw_bacteria", MED_mw_bacteria)
print(MED_mw_bacteria+MED_mw_fungi+MED_mw_plants)
print("******")



peptide=len(extra.query("isPeptide==True"))
print("peptide", peptide)
peptide_plant=len(extra.query("isPeptide==True and predicted_organism == 'plants'"))
print("peptide_plant", peptide_plant)
peptide_fungi=len(extra.query("isPeptide==True and predicted_organism == 'fungi'"))
print("peptide_fungi", peptide_fungi)
peptide_bacteria=len(extra.query("isPeptide==True and predicted_organism == 'bacteria'"))
print("peptide_bacteria", peptide_bacteria)
print("******")

sugar=len(extra.query("hasSugar==True"))
print("sugar", sugar)
sugar_plants=len(extra.query("hasSugar==True and origin ==0"))
print("sugar_plants", sugar_plants)
sugar_fungi=len(extra.query("hasSugar==True and predicted_organism == 'fungi'"))
print("sugar_fungi", sugar_fungi)
sugar_bacteria=len(extra.query("hasSugar==True and predicted_organism == 'bacteria'"))
print("sugar_bacteria", sugar_bacteria)
print("******")

plant = len(extra.query("predicted_organism == 'plants'"))
print("plant", plant)
fungi = len(extra.query("predicted_organism == 'fungi'"))
print("fungi", fungi)
bacteria = len(extra.query("predicted_organism == 'bacteria'"))
print("bacteria", bacteria)
print("******")

LOW_fcsp3 237
LOW_fcsp3_plants 164
LOW_fcsp3_fungi 25
LOW_fcsp3_bacteria 48
******
MED_fcsp3 2267
MED_fcsp3_plant 1714
MED_fcsp3_fungi 242
MED_fcsp3_bacteria 311
******
HIGH_fcsp3 860
HIGH_fcsp3_plants 663
HIGH_fcsp3_fungi 126
HIGH_fcsp3_bacteria 71
******
LOW_mw 590
LOW_mw_plant 408
LOW_mw_fungi 104
LOW_mw_bacteria 78
590
******
HIGH_mw 333
HIGH_mw_plant 191
HIGH_mw_fungi 25
HIGH_mw_bacteria 117
333
******
MED_mw 2441
MED_mw_plants 1942
MED_mw_fungi 264
MED_mw_bacteria 235
2441
******
MED_mw 2441
MED_mw_plants 1942
MED_mw_fungi 264
MED_mw_bacteria 235
2441
******
peptide 130
peptide_plant 12
peptide_fungi 17
peptide_bacteria 101
******
sugar 462
sugar_plants 0
sugar_fungi 38
sugar_bacteria 52
******
plant 2541
fungi 393
bacteria 430
******
