In [1]:
import pickle
import numpy as np
import tmap as tm
import pandas as pd
import scipy.stats as ss
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from mhfp.encoder import MHFPEncoder
from faerun import Faerun
from collections import Counter
from matplotlib.colors import ListedColormap
from matplotlib import pyplot as plt
import util

In [2]:
import warnings
warnings.filterwarnings('ignore')


In [None]:
from matplotlib import pylab as plt
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
# Load the dataset; the loop below reads the "smile", "Bonds",
# "FunctionalGroup" and "struct_id" columns of every row.
df = pd.read_csv('datafiles/g_dataset.csv')

# The encoder and the LSH forest are both created with the same size (1024);
# a single constant keeps the two values in sync.
N_PERMUTATIONS = 1024
enc = MHFPEncoder(N_PERMUTATIONS)
lf = tm.LSHForest(N_PERMUTATIONS, 64)


def _flatten_dict_repr(text, separator):
    """Flatten the string form of a dict into a quote-free, delimited string.

    Leading "{" and trailing "}" characters are stripped, the remainder is
    split on ",", single quotes are removed from every piece, ": " is padded
    to " : ", and the pieces are joined with ``separator``.

    Args:
        text: String such as "{'a': 1, 'b': 2}".
        separator: String placed between the cleaned pieces.

    Returns:
        The cleaned pieces joined by ``separator``.
    """
    pieces = text.lstrip("{").rstrip("}").split(",")
    return separator.join(
        piece.replace("'", "").replace(": ", " : ") for piece in pieces
    )


# Per-molecule outputs; every list gets exactly one entry per dataframe row,
# so position k in each list refers to row k.
labels = []
has_DBDO = []
fps = []
tpsa = []
logp = []
mw = []
h_acceptors = []
h_donors = []
ring_count = []
numberoffragments = []

fng_methoxy = []
fng_ether = []
fng_phenol = []
fng_aliphaticOH_secondary = []
fng_aliphaticOH_primary = []
fng_cinnamyl_alcohol_endgrp = []

# Group name passed to util.find_functional_groups_counts -> list that
# collects the returned value for each molecule.
group_count_targets = (
    ("benzene", ring_count),
    ("methoxy", fng_methoxy),
    ("ether", fng_ether),
    ("phenol", fng_phenol),
    ("aliphaticOH_secondary", fng_aliphaticOH_secondary),
    ("aliphaticOH_primary", fng_aliphaticOH_primary),
    ("cinnamyl_alcohol", fng_cinnamyl_alcohol_endgrp),
)

for i, row in df.iterrows():
    # Progress report (percentage of rows processed) every 1000 rows.
    if i != 0 and i % 1000 == 0:
        print(100 * i / len(df))

    compound = row["smile"]
    mol = Chem.MolFromSmiles(compound)
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None; fail here with
        # the offending row instead of deep inside a descriptor call. Skipping
        # the row is not an option because the lists must stay aligned with df.
        raise ValueError(f"Row {i}: RDKit could not parse SMILES {compound!r}")

    # "." separates disconnected components in a SMILES string.
    numberoffragments.append(len(compound.split(".")))

    tpsa.append(Descriptors.TPSA(mol))
    logp.append(Descriptors.MolLogP(mol))
    mw.append(Descriptors.MolWt(mol))
    h_acceptors.append(Descriptors.NumHAcceptors(mol))
    h_donors.append(Descriptors.NumHDonors(mol))

    has_DBDO.append("DBDO" in row["Bonds"])

    for group_name, counts in group_count_targets:
        counts.append(util.find_functional_groups_counts(compound, group_name))

    bonds_text = _flatten_dict_repr(row["Bonds"], ",")
    groups_text = _flatten_dict_repr(row["FunctionalGroup"], "__")
    lg_id = row["struct_id"]

    fps.append(tm.VectorUint(enc.encode_mol(mol)))

    # Fields are separated by runs of underscores; the exact run lengths
    # (4, 6, 6) are kept because the viewer's label parsing and
    # `selected_labels` depend on the resulting positions.
    labels.append(
        f"{compound!s}____{lg_id!s}______{bonds_text!s}______{groups_text!s}"
    )
# Hand all fingerprints to the LSH forest in one call, then build its index.
lf.batch_add(fps)
lf.index()
#lf.store("lf_sg.dat")

# Save the per-molecule property lists as a single tuple. The element order
# is the order any later pickle.load must unpack. `labels`, `has_DBDO` and
# `fps` are not part of this tuple.
props = (
    tpsa,
    logp,
    mw,
    h_acceptors,
    h_donors,
    ring_count,
    numberoffragments,
    fng_methoxy,
    fng_ether,
    fng_phenol,
    fng_aliphaticOH_secondary,
    fng_aliphaticOH_primary,
    fng_cinnamyl_alcohol_endgrp,
)
with open("props_sg.pickle", "wb+") as props_file:
    pickle.dump(props, props_file, protocol=pickle.HIGHEST_PROTOCOL)
    
#lf = tm.LSHForest(1024, 64)
#lf.restore("lf_g.dat")
#tpsa, logp, mw, h_acceptors,h_donors, ring_count, numberoffragments, fng_methoxy,fng_ether, fng_phenol,fng_aliphaticOH_secondary, fng_aliphaticOH_primary,fng_cinnamyl_alcohol_endgrp = pickle.load(
#    open("props_g.pickle", "rb")
#)

#c_frak_ranked = ss.rankdata(np.array(c_frac) / max(c_frac)) / len(c_frac)

# Layout settings for the TMAP, kept in one table and applied in order.
# See the tmap LayoutConfiguration documentation for what each field means.
layout_settings = {
    "k": 20,
    "kc": 20,
    "sl_scaling_min": 1.0,
    "sl_scaling_max": 1.0,
    "sl_repeats": 1,
    "sl_extra_scaling_steps": 2,
    "placer": tm.Placer.Barycenter,
    "merger": tm.Merger.LocalBiconnected,
    "merger_factor": 2.0,
    "merger_adjustment": 0,
    "fme_iterations": 500,
    "sl_scaling_type": tm.ScalingType.RelativeToDesiredLength,
    "node_size": 1 / 37,
    "mmm_repeats": 1,
}

cfg = tm.LayoutConfiguration()
for option_name, option_value in layout_settings.items():
    setattr(cfg, option_name, option_value)



# Encode the "Type" column for Faerun: `type_labels` is passed on as legend
# labels and `type_data` as the first colour series.
type_labels, type_data = Faerun.create_categories(df["Type"])

# Colormaps.
# One RGB triple (alpha dropped) per distinct type, sampled evenly from "rainbow".
n_types = len(set(type_data))
rainbow = plt.get_cmap("rainbow")
colors = [rgba[:3].tolist() for rgba in rainbow(np.linspace(0, 1, n_types))]
custom_cm = LinearSegmentedColormap.from_list("my_map", colors, N=len(colors))

# Two-colour map built from the ninth "Set1" colour and a fixed purple.
set1 = plt.get_cmap("Set1").colors
bin_cmap = ListedColormap([set1[8], "#5400F6"], name="bin_cmap")
# NOTE(review): `custom_cm` and `bin_cmap` are not referenced by the scatter
# call in this cell (it passes "tab10"/"rainbow" by name) - confirm whether
# they were meant to be used.

# Compute the layout: node coordinates (x, y) and tree edges (s -> t).
layout = tm.layout_from_lsh_forest(lf, cfg)
x, y, s, t, _ = layout



# (The former `df["smile"] = (df["smile"])` self-assignment was a no-op and
# has been removed.)

# Each entry pairs a series title with the values that colour the points for
# that series. Keeping title and data side by side guarantees they cannot get
# out of step: previously the "OH-Aliphatic (primary)" / "(secondary)" titles
# were listed in the opposite order to their data, so each was shown under the
# other's name. The data order itself is unchanged.
series = [
    ("Type", type_data),
    ("DBDO Linkage", has_DBDO),
    ("TPSA", tpsa),
    ("LogP", logp),
    ("Molecular Weight", mw),
    ("H Acceptors", h_acceptors),
    ("H Donors", h_donors),
    ("Ring count", ring_count),
    ("Number of fragments", numberoffragments),
    ("Methoxy content", fng_methoxy),
    ("Ether linkages", fng_ether),
    ("OH-Phenolic (free)", fng_phenol),
    ("OH-Aliphatic (secondary)", fng_aliphaticOH_secondary),
    ("OH-Aliphatic (primary)", fng_aliphaticOH_primary),
    ("End group - Cinnamyl alcohol", fng_cinnamyl_alcohol_endgrp),
]
series_titles = [title for title, _values in series]
series_values = [values for _title, values in series]

f = Faerun(view="front", coords=False, clear_color="#ffffff")
f.add_scatter(
    "GTypeStructures",
    {
        "x": x,
        "y": y,
        "c": series_values,
        "labels": labels,
    },
    shader="sphere",
    point_scale=10.0,
    max_point_size=20,
    # NOTE(review): `legend_labels`, `categorical` and `colormap` have fewer
    # entries than there are series; presumably Faerun pads them - confirm.
    # Also, "DBDO Linkage" is given No/Yes legend labels but is marked
    # non-categorical - confirm that this is intended.
    legend_labels=[type_labels, [(0, "No"), (1, "Yes")]],
    categorical=[True, False, False, False, False, False],
    colormap=["tab10", "rainbow", "rainbow", "rainbow", "rainbow", "rainbow"],
    selected_labels=["", "Lignin_ID", "", "", "Linkages", "", "", "Functional Groups"],
    series_title=series_titles,
    has_legend=True,
)
# Draw the tree edges (s -> t) on top of the scatter points.
f.add_tree("np_atlas_tree", {"from": s, "to": t}, point_helper="GTypeStructures")
f.plot("lignindatasetview_g_jan26_3", template="smiles")

15.782828282828282
31.565656565656564
47.34848484848485
63.13131313131313
