In [22]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import Lipinski, Descriptors, rdMolDescriptors, AllChem, PandasTools
from pandarallel import pandarallel
import numpy as np
pandarallel.initialize(progress_bar=False)
import tmap as tm
from map4 import MAP4Calculator
import os
import joblib
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn import svm
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
# Base directory for the input pickle and for the cached LSH forest / layout
# files used below. NOTE(review): hardcoded absolute path — adjust per machine.
folder = "/data/coconut/"

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [23]:
# Load the precomputed COCONUT property table.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
coconut_pickle_path = folder + "coconut_prop.pkl"
coconut = pd.read_pickle(coconut_pickle_path)

In [24]:
# Taxonomy labels; the position in this list is the integer origin code.
origins = [
    "plants",
    "fungi",
    "bacteria",
    "animal",
    "Homo_sapiens",
    "marine",
    "other",
]


def origin(simple_tax):
    """Return the integer code of ``simple_tax`` (its position in ``origins``).

    Returns ``None`` when the value is not one of the known taxonomy labels.
    """
    try:
        return origins.index(simple_tax)
    except ValueError:
        # Unknown label: same implicit ``None`` the membership test produced.
        return None


In [25]:
# Encode the taxonomy label of every row as its integer origin code.
# NOTE: a later cell rebinds the name `origin` to a plain list, so this cell
# only works when run before that cell (or after re-running the `origin` def).
coconut["origin"] = coconut["simple_tax"].map(origin)

In [26]:
def norm_smiles(smiles):
    """Return the RDKit canonical SMILES of ``smiles`` without stereo information.

    Parameters
    ----------
    smiles : str
        Input SMILES string.

    Returns
    -------
    str or None
        Non-isomeric SMILES written by RDKit, or ``None`` when RDKit cannot
        parse the input.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals a parse failure by returning None; passing that on to
        # MolToSmiles would raise and abort the whole column mapping.
        return None
    return Chem.MolToSmiles(mol, isomericSmiles=False)

# Store the stereo-free canonical SMILES next to the original SMILES column.
coconut["norm_smiles"] = coconut.SMILES.map(norm_smiles)

In [27]:
# Drop origin codes 3-6 (animal, Homo_sapiens, marine, other per `origins`),
# then order rows by descending origin code.
# NOTE(review): rows with a missing origin presumably pass this filter and are
# only removed by the later dropna() — confirm.
origin_filter = "origin != 6 and origin != 5 and origin != 4 and origin != 3"
coconut = (
    coconut
    .query(origin_filter)
    .sort_values("origin", ascending=False)
)

In [28]:
# Turn the textual placeholders "NaN"/"NaT" into real missing values, then
# discard every row that has a missing value in any column.
coconut = coconut.replace(["NaN", "NaT"], np.nan).dropna()

  mask = arr == x
  mask |= arr == x


In [29]:
# Number of rows left after the origin filter and the dropna() above.
coconut.shape[0]

60171

In [30]:
# Descriptor columns that are cast to float below.
# NOTE(review): the "MW" column used by later cells is not in this list —
# presumably it is already numeric; confirm.
features = [
    "molecular_weight",
    "number_of_carbons",
    "number_of_nitrogens",
    "number_of_oxygens",
    "total_atom_number",
    "bond_count",
    "topoPSA",
    "fcsp3",
    "HBA",
    "HBD",
    "aLogP",
]
for column in features:
    coconut[column] = coconut[column].map(float)

In [31]:
# Build the LSH forest over the MAP4 fingerprints, or restore it from disk.
# NOTE: the cache is keyed on the file name only — if the rows of `coconut`
# change, delete the cached file so the forest matches the table again.
lsh_forest_path = folder + "coconut_LSHforest_Micro_Plants"

# tmap signature is LSHForest(d, l) — TODO confirm that d=1024 matches the
# length of the stored MAP4 fingerprints.
lf = tm.LSHForest(1024, 32)

if os.path.exists(lsh_forest_path):
    lf.restore(lsh_forest_path)
else:
    # tmap expects each fingerprint wrapped in a VectorUint.
    lf.batch_add([tm.VectorUint(fp) for fp in coconut["MAP4"].tolist()])
    lf.index()
    lf.store(lsh_forest_path)

In [32]:
# Compute the TMAP layout from the LSH forest, or load the cached one.
# `tm_layout` holds the point coordinates ("x", "y") and the tree edges
# ("s" -> "t") as plain lists.
layout_path = folder + "coconut_tm_layout_Micro_Plants.pkl"

if os.path.exists(layout_path):
    tm_layout = joblib.load(layout_path)
else:
    # Layout configuration
    cfg = tm.LayoutConfiguration()
    cfg.node_size = 1 / 30
    cfg.mmm_repeats = 2
    cfg.sl_extra_scaling_steps = 5
    cfg.k = 20
    cfg.sl_scaling_type = tm.RelativeToAvgLength

    # The fifth return value of the layout call is not used here.
    xs, ys, edge_from, edge_to, _gp = tm.layout_from_lsh_forest(lf, cfg)
    tm_layout = {
        "x": list(xs),
        "y": list(ys),
        "s": list(edge_from),
        "t": list(edge_to),
    }
    joblib.dump(tm_layout, layout_path)

In [33]:
# Boolean LOW / MED / HIGH bins for Fsp3 and molecular weight.
# The cut-offs match the legend labels used for the "Table1" categories
# (Fsp3 <= 0.2 / >= 0.8, MW <= 300 Da / >= 800 Da).
# MED is the open interval between the LOW and HIGH cut-offs, so the three
# bins are mutually exclusive (previously MED also contained every LOW row).
fcsp3_values = coconut["fcsp3"]
mw_values = coconut["MW"]

coconut["LOW_fcsp3"] = fcsp3_values <= 0.2
coconut["MED_fcsp3"] = (fcsp3_values > 0.2) & (fcsp3_values < 0.8)
coconut["HIGH_fcsp3"] = fcsp3_values >= 0.8

coconut["LOW_mw"] = mw_values <= 300
coconut["MED_mw"] = (mw_values > 300) & (mw_values < 800)
coconut["HIGH_mw"] = mw_values >= 800

In [74]:
def smarts(row):
    """Encode the sugar / peptide flags of ``row`` as one category code.

    Reads ``row.hasSugar`` and ``row.isPeptide`` and returns:
    0 = sugar only, 1 = peptide only, 2 = both, 3 = neither.
    """
    codes = {
        (True, False): 0,
        (False, True): 1,
        (True, True): 2,
        (False, False): 3,
    }
    return codes[(bool(row.hasSugar), bool(row.isPeptide))]

In [75]:
def assign_table_color(row):
    """Return the category code of ``row`` for the "Table1" colouring.

    The boolean flags are checked in priority order and the first one that is
    set wins: 0 = hasSugar, 1 = isPeptide, 2 = HIGH_mw, 3 = HIGH_fcsp3,
    4 = LOW_fcsp3, 5 = LOW_mw. Rows with none of the flags set get 6.
    """
    priority = (
        "hasSugar",
        "isPeptide",
        "HIGH_mw",
        "HIGH_fcsp3",
        "LOW_fcsp3",
        "LOW_mw",
    )
    for code, flag in enumerate(priority):
        if getattr(row, flag):
            return code
    return len(priority)

In [76]:
# Row-wise sugar/peptide category code (see `smarts`).
# NOTE: a later cell rebinds the name `smarts` to a plain list, so this cell
# only works when run before that cell (or after re-running the `smarts` def).
coconut["smarts"] = coconut.apply(smarts, axis=1)

In [77]:
# Row-wise "Table1" category code (see `assign_table_color`).
coconut["color_table"] = coconut.apply(assign_table_color, axis=1)

 Category priority: sugar > peptide > highMW > highfsp3 > lowfsp3 > lowMW

In [78]:
# Work on a copy so the clipping below never touches `coconut` itself.
tmp = coconut.copy()

# Saturate the colour scales: values beyond these bounds are clipped so that
# a few extreme molecules do not compress the rest of the colour range.
# Series.clip works positionally; the previous `.index` + `.loc` write-back
# was label-based and would also overwrite in-range rows that merely share an
# index label with an out-of-range row.
tmp["MW"] = tmp["MW"].clip(upper=1000)
tmp["HBA"] = tmp["HBA"].clip(upper=20)
tmp["HBD"] = tmp["HBD"].clip(upper=10)
tmp["aLogP"] = tmp["aLogP"].clip(lower=-2, upper=8)

# Per-point value lists, all in the row order of `tmp`.
n_C = tmp.number_of_carbons.tolist()
n_O = tmp.number_of_oxygens.tolist()
n_N = tmp.number_of_nitrogens.tolist()
n_a = tmp.total_atom_number.tolist()
n_b = tmp.bond_count.tolist()
TPSA = tmp.topoPSA.tolist()
MW = tmp.MW.tolist()
fcsp3 = tmp.fcsp3.tolist()
HBA = tmp.HBA.tolist()
HBD = tmp.HBD.tolist()
alogp = tmp.aLogP.tolist()

isLipinski = tmp.isLipinski.tolist()

isPeptide = tmp.isPeptide.tolist()
hasSugar = tmp.hasSugar.tolist()
labels = tmp.TMAPlabel.tolist()
SMILES = tmp.SMILES.tolist()
ID = tmp.coconut_id.tolist()
# NOTE: the next assignments to `origin` and `smarts` replace the helper
# functions of the same names with plain lists. The cells that call
# `.map(origin)` / `.apply(smarts, ...)` cannot be re-run after this cell
# unless the function definitions are re-run first.
origin = tmp.origin.tolist()
color_table = tmp.color_table.tolist()
smarts = tmp.smarts.tolist()

In [109]:
from matplotlib.colors import ListedColormap
from faerun import Faerun
# --- Categorical colour maps -------------------------------------------------
# Binary and three-way maps (not referenced by the plotting cells below).
custom_cmap = ListedColormap(["lightgray", "magenta"], name="custom")
custom_cmap2 = ListedColormap(["limegreen", "lightgray"], name="custom2")
custom_cmap3 = ListedColormap(["lightgray", "cyan", "magenta"], name="custom3")

# "Origin" series: custom_cmap4 for the first plot, custom_cmap4_b for the "_b" plot.
custom_cmap4 = ListedColormap(
    ["limegreen", "blue", "tomato", "magenta", "green", "red", "lightgray"],
    name="custom4",
)
custom_cmap4_b = ListedColormap(
    ["limegreen", "cyan", "tomato", "magenta", "pink", "cyan", "darkgray"],
    name="custom4",
)

# "Table1 Categories" series (seven classes); both plots use custom_cmap5_b.
# custom_cmap5 has the same colours and is not referenced by the plotting cells.
custom_cmap5 = ListedColormap(
    ["blue", "cyan", "green", "yellow", "orange", "red", "gray"],
    name="custom4",
)
custom_cmap5_b = ListedColormap(
    ["blue", "cyan", "green", "yellow", "orange", "red", "gray"],
    name="custom4",
)

# Sugar/peptide substructure series (four classes):
# custom_cmap6_b for the first plot, custom_cmap6 for the "_b" plot.
custom_cmap6 = ListedColormap(["cyan", "limegreen", "magenta", "gray"], name="custom6")
custom_cmap6_b = ListedColormap(["blue", "green", "magenta", "lightgray"], name="custom6")


# --- Legend categories -------------------------------------------------------
# Faerun.create_categories returns a pair; the first element is what the
# plotting cells pass as `legend_labels`.
labels_groups, groups = Faerun.create_categories(["0-No", "1-Yes"])

# Origin codes kept after filtering (animal, Homo sapiens, marine and other were removed).
labels_groups4, groups4 = Faerun.create_categories(
    ["0-Plants", "1-Fungi", "2-Bacteria"]
)

# Codes produced by `smarts`.
labels_groups3, groups3 = Faerun.create_categories(
    ["0-CyclicAcetals", "1-Peptide", "2-CyclicAcetals and Peptide", "3-None"]
)

# Codes produced by `assign_table_color`.
labels_groups2, groups2 = Faerun.create_categories(
    [
        "0-Glycoside",
        "1-Peptide",
        "2-HighMW(>=800Da)",
        "3-Highfsp3(>=0.8)",
        "4-Lowfsp3(<=0.2)",
        "5-LowMW(<=300Da)",
        "6-Other",
    ]
)

The origin, sugar, peptide and Table 1 categories would be listed just below the MW and fsp3 categories.

In [110]:
# Interactive TMAP on a white background. One scatter layer carries six colour
# series (MW, Fsp3, origin, sugar/peptide substructure, Table1 category, AlogP);
# every per-series list below has one entry per series, in that order.
plot_name = "MAP4_curatedCOCONUT_TMAP_MicrobialAndPlants"

faerun = Faerun(view="front", coords=False, title=plot_name, clear_color="#ffffff")
faerun.add_scatter(
    plot_name,
    {
        "x": tm.VectorFloat(tm_layout["x"]),
        "y": tm.VectorFloat(tm_layout["y"]),
        "c": [MW, fcsp3, origin, smarts, color_table, alogp],
        "labels": labels,
    },
    has_legend=True,
    colormap=["rainbow", "rainbow", custom_cmap4, custom_cmap6_b, custom_cmap5_b, "rainbow"],
    point_scale=2,
    categorical=[False, False, True, True, True, False],
    series_title=[
        "MW",
        "Fsp3",
        "Origin",
        "Cyclic Acetals and Peptide substructure",
        "Table1 Categories",
        "AlogP",
    ],
    # MW and AlogP were clipped before plotting, hence the ">=" / "<=" labels.
    max_legend_label=[">=1000", str(round(max(fcsp3), 2)), None, None, None, ">=8"],
    # AlogP is clipped at -2 from below, so the label is "<=-2" (it read "<=2").
    min_legend_label=[str(round(min(MW), 2)), str(round(min(fcsp3), 2)), None, None, None, "<=-2"],
    legend_labels=[None, None, labels_groups4, labels_groups3, labels_groups2, None],
)

# Draw the tree edges on top of the scatter layer's points.
faerun.add_tree(
    plot_name + "_tree",
    {"from": tm.VectorUint(tm_layout["s"]), "to": tm.VectorUint(tm_layout["t"])},
    point_helper=plot_name,
    color="aaaaaa",
)
faerun.plot(plot_name, template="smiles")

In [111]:
# Variant "_b" of the interactive TMAP: default background colour and the
# alternative palettes for the origin and substructure series. One scatter
# layer carries six colour series (MW, Fsp3, origin, sugar/peptide
# substructure, Table1 category, AlogP); every per-series list below has one
# entry per series, in that order.
plot_name = "MAP4_curatedCOCONUT_TMAP_MicrobialAndPlants_b"

faerun = Faerun(view="front", coords=False, title=plot_name)
faerun.add_scatter(
    plot_name,
    {
        "x": tm.VectorFloat(tm_layout["x"]),
        "y": tm.VectorFloat(tm_layout["y"]),
        "c": [MW, fcsp3, origin, smarts, color_table, alogp],
        "labels": labels,
    },
    has_legend=True,
    colormap=["rainbow", "rainbow", custom_cmap4_b, custom_cmap6, custom_cmap5_b, "rainbow"],
    point_scale=2,
    categorical=[False, False, True, True, True, False],
    series_title=[
        "MW",
        "Fsp3",
        "Origin",
        "Glycoside and Peptide substructure",
        "Table1 Categories",
        "AlogP",
    ],
    # MW and AlogP were clipped before plotting, hence the ">=" / "<=" labels.
    max_legend_label=[">=1000", str(round(max(fcsp3), 2)), None, None, None, ">=8"],
    # AlogP is clipped at -2 from below, so the label is "<=-2" (it read "<=2").
    min_legend_label=[str(round(min(MW), 2)), str(round(min(fcsp3), 2)), None, None, None, "<=-2"],
    legend_labels=[None, None, labels_groups4, labels_groups3, labels_groups2, None],
)

# Draw the tree edges on top of the scatter layer's points.
faerun.add_tree(
    plot_name + "_tree",
    {"from": tm.VectorUint(tm_layout["s"]), "to": tm.VectorUint(tm_layout["t"])},
    point_helper=plot_name,
    color="aaaaaa",
)
faerun.plot(plot_name, template="smiles")