In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import bokeh
import holoviews as hv
import hvplot.pandas
hv.extension('bokeh')
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [119]:
def plot_two_cond(df1,df2,alpha,col_name=None,type_name=None):
    """Build two scatter panels contrasting ``df1`` ("target") with ``df2`` ("background").

    The first panel draws ``df1`` at ``alpha`` with ``df2`` overlaid at ``alpha/20``;
    the second panel draws ``df2`` at ``alpha`` with ``df1`` overlaid at ``alpha/20``.
    Both frames are plotted on their ``pc_1`` / ``pc_2`` columns.

    Parameters
    ----------
    df1, df2 : objects exposing ``.hvplot`` (and, for ``df1``, ``.loc`` / column lookup)
        Frames holding the ``pc_1`` and ``pc_2`` coordinates.
    alpha : number
        Opacity of the highlighted points; the faded overlay uses ``alpha/20``.
    col_name : optional
        With ``type_name`` omitted, the column used to colour ``df1`` (wider
        600px panel). With ``type_name`` given, the column that is filtered on.
    type_name : optional
        Only used when ``col_name`` is given: restricts the highlighted ``df1``
        points to rows where ``df1[col_name] == type_name`` and becomes the title.

    Returns
    -------
    The composed object ``a*not_a + b*not_b``.
    """
    def scatter(frame, **plot_kwargs):
        # Every panel is a pc_1 vs pc_2 scatter; only the colour mapping varies.
        return frame.hvplot(kind='scatter', x='pc_1', y='pc_2', **plot_kwargs)

    def styled(plot, alpha_value, width=300, **extra):
        # Display options shared by all four scatters.
        return plot.opts(tools=["hover"], legend_position="right", width=width, height=300,
                         size=1, padding=0.2, alpha=alpha_value, **extra)

    if col_name is None:
        a = styled(scatter(df1), alpha, title="target")
    elif type_name is None:
        # Colour by column; the panel is widened to 600px (the legend sits on the right).
        a = styled(scatter(df1, color=col_name), alpha, width=600, title="target")
    else:
        subset = df1.loc[df1[col_name] == type_name, :]
        # str() so a non-string label (e.g. 1) is a valid title, as in plot_one_vs_rest.
        a = styled(scatter(subset), alpha, title=str(type_name))

    not_a = styled(scatter(df2), alpha / 20)
    b = styled(scatter(df2), alpha, title="background")
    not_b = styled(scatter(df1), alpha / 20)
    return a*not_a + b*not_b

def plot_one_vs_rest(df_plot,col_name,type_name,alpha,size = None,contrast = 10):
    """Overlay the rows of one category on top of all remaining rows.

    Rows where ``df_plot[col_name] == type_name`` are drawn at ``alpha``; every
    other row is drawn with marker size 5 at ``alpha/contrast``. Both scatters
    use the ``pc_1`` / ``pc_2`` columns.

    Parameters
    ----------
    df_plot : frame exposing ``.loc`` and ``.hvplot``
        Must contain ``pc_1``, ``pc_2`` and ``col_name``.
    col_name : column label
        Column that holds the category.
    type_name : value
        Category to highlight; ``str(type_name)`` is used as the panel title.
    alpha : number
        Opacity of the highlighted points.
    size : number, optional
        Marker size of the highlighted points; falls back to 1 when omitted.
    contrast : number, default 10
        Divisor applied to ``alpha`` for the non-highlighted points.

    Returns
    -------
    The overlay ``a*not_a``.
    """
    # Only an omitted size falls back to 1; an explicit 0 is passed through.
    marker_size = 1 if size is None else size

    a = df_plot.loc[df_plot[col_name] == type_name, :].hvplot(kind='scatter', x='pc_1', y='pc_2').opts(
        tools=["hover"], legend_position="right", width=300, height=300,
        size=marker_size, padding=0.2, alpha=alpha, title=str(type_name))

    not_a = df_plot.loc[df_plot[col_name] != type_name, :].hvplot(kind='scatter', x='pc_1', y='pc_2').opts(
        tools=["hover"], legend_position="right", width=300, height=300,
        size=5, padding=0.2, alpha=alpha / contrast)
    return a*not_a



## Load Data

In [17]:
# Sampled protein table that the rest of the notebook visualises.
thermo = pd.read_csv(
    "../data/thermo/for_vis/thermo_sampled.csv",
)

In [31]:
# Number of rows labelled thermophilic. The vectorised Series.sum() avoids
# iterating the column element by element; int() keeps a plain Python integer.
int((thermo["is_thermophilic"] == 1).sum())

28266

In [19]:
# Pre-computed UMAP coordinate arrays: one from the scaled embedding and one
# from its "reduced" variant (as named by the files).
umap_full, umap_reduced = (
    np.load("../data/thermo/for_vis/thermo_sampled_embed_scaled_umap.npy"),
    np.load("../data/thermo/for_vis/thermo_sampled_embed_scaled_reduced_umap.npy"),
)

In [20]:
# Attach the two UMAP coordinates of the full embedding to the protein table.
# NOTE: df_plot is another name for the same object as thermo (no copy), so the
# pc_1 / pc_2 columns are added to thermo as well.
df_plot = thermo
df_plot["pc_1"], df_plot["pc_2"] = umap_full[:, 0], umap_full[:, 1]
# Non-null count of the first column per clan, used here as a per-clan row count.
df_plot.groupby("clan").count().iloc[:, 0]

clan
actin_like          7208
p_loop_gtpase      48764
tubulin_binding      556
tubulin_c              4
Name: Unnamed: 0, dtype: int64

In [21]:
# One "this clan vs. every other clan" panel per clan, shown in a two-column grid.
p_loop_gtpase, actin_like, tubulin_c, tubulin_binding = (
    plot_one_vs_rest(df_plot, "clan", clan_label, 0.2)
    for clan_label in ("p_loop_gtpase", "actin_like", "tubulin_c", "tubulin_binding")
)
(p_loop_gtpase + actin_like + tubulin_c + tubulin_binding).cols(2)

In [34]:
# Within the p_loop_gtpase clan only: rows with is_thermophilic == 1 against the rest.
plot_one_vs_rest(
    df_plot[df_plot["clan"] == "p_loop_gtpase"],
    col_name="is_thermophilic",
    type_name=1,
    alpha=0.3,
)

In [49]:
# The seven pfam_id values of the p_loop_gtpase clan with the highest
# uniprot_id count, in descending order.
top_fam = (
    df_plot.loc[df_plot["clan"] == "p_loop_gtpase", :]
    .groupby("pfam_id")
    .count()
    .reset_index()
    .sort_values(["uniprot_id"], ascending=False)
    .head(7)["pfam_id"]
)

In [53]:
# For every family currently held in top_fam, draw the thermophilic rows
# (is_thermophilic == 1) against the remaining rows of that same family.
ps = []
for family in top_fam:
    print(family)
    family_rows = df_plot.loc[df_plot["pfam_id"] == family, :]
    ps.append(plot_one_vs_rest(family_rows, "is_thermophilic", 1, 0.5))
# Chain the panels left to right, then arrange them two per row.
p = ps[0]
for panel in ps[1:]:
    p = p + panel
p.cols(2)

PF00005
PF01926
PF01656
PF00006
PF00004
PF00437
PF00448


#### Observation: Thermophilic and non-thermophilic proteins are grouped first by Pfam family, but within each family they form sub-clusters that resemble distinct populations.

In [61]:
# Up to seven pfam_id values of the tubulin_binding clan, ordered by
# descending uniprot_id count.
top_fam = (
    df_plot[df_plot["clan"] == "tubulin_binding"]
    .groupby("pfam_id")
    .count()
    .reset_index()
    .sort_values(["uniprot_id"], ascending=False)
    .head(7)["pfam_id"]
)

In [62]:
# For every family currently held in top_fam, draw the thermophilic rows
# (is_thermophilic == 1) against the remaining rows of that same family.
ps = []
for family in top_fam:
    print(family)
    family_rows = df_plot.loc[df_plot["pfam_id"] == family, :]
    ps.append(plot_one_vs_rest(family_rows, "is_thermophilic", 1, 0.5))
# Chain the panels left to right, then arrange them two per row.
p = ps[0]
for panel in ps[1:]:
    p = p + panel
p.cols(2)

PF00091
PF13809


## Inspect the incorrectly-predicted ones

In [76]:
# True where the predicted label (pred_y) differs from is_thermophilic.
df_plot["incorrect"] = thermo["pred_y"].ne(thermo["is_thermophilic"])

In [86]:
# The three (clan, pfam_id) groups with the most misclassified rows.
(
    df_plot.loc[df_plot["incorrect"] == 1, :]
    .groupby(["clan", "pfam_id"])
    .count()
    .reset_index()
    .sort_values(["uniprot_id"], ascending=False)
    .head(3)
)


Unnamed: 0.1,clan,pfam_id,Unnamed: 0,uniprot_id,is_thermophilic,token,seq,pred_y,pc_1,pc_2,incorrect
24,p_loop_gtpase,PF00005,1485,1485,1485,1485,1485,1485,1485,1485,1485
55,p_loop_gtpase,PF01926,192,192,192,192,192,192,192,192,192
25,p_loop_gtpase,PF00006,161,161,161,161,161,161,161,161,161


In [105]:
# Per-(clan, pfam_id) counts: misclassified rows only, and all rows.
tmp_incorrect = df_plot.loc[df_plot["incorrect"]==1,:].groupby(["clan","pfam_id"]).count().reset_index()
tmp_all = df_plot.groupby(["clan","pfam_id"]).count().reset_index()
# Join on the full group key. Both tables are grouped by (clan, pfam_id), so
# joining on pfam_id alone would duplicate rows for a family listed under more
# than one clan. how="left" plus validate="one_to_one" guarantees exactly one
# output row per tmp_incorrect row, in the same order, so tmp_merged stays
# row-aligned with tmp_incorrect. Count columns carry the suffix _x
# (misclassified) or _y (all rows).
tmp_merged = tmp_incorrect.merge(
    tmp_all,
    on=["clan", "pfam_id"],
    how="left",
    validate="one_to_one",
)

In [107]:
# Share of misclassified rows per group: count among misclassified rows (_x)
# divided by count among all rows (_y). The result is assigned by index label,
# so tmp_merged and tmp_incorrect must share the same row index.
tmp_incorrect["incorrect_prop"] = tmp_merged["uniprot_id_x"].div(tmp_merged["uniprot_id_y"])

In [112]:
# The five pfam_id values with the highest incorrect_prop.
top_fam = (
    tmp_incorrect
    .sort_values(["incorrect_prop"], ascending=False)
    .loc[:, "pfam_id"]
    .head(5)
)

In [109]:
# Number of (clan, pfam_id) groups without a single misclassified row:
# groups present in tmp_all minus groups present in tmp_incorrect.
len(tmp_all) - len(tmp_incorrect)

26

In [97]:
# Family PF00005: misclassified rows (incorrect == 1) against the rest.
# With the default contrast of 10, alpha=3 puts the non-highlighted rows at 0.3.
# NOTE(review): an alpha above 1 for the highlighted rows looks unintended - confirm.
plot_one_vs_rest(
    df_plot.loc[df_plot["pfam_id"] == "PF00005", :],
    "incorrect",
    1,
    alpha=3,
    size=0.5,
)

In [120]:
# For every family currently held in top_fam, draw the thermophilic rows
# (is_thermophilic == 1) against the rest of that family. contrast=1 keeps
# both groups at the same opacity, and size=5 matches the marker size of the rest.
ps = []
for family in top_fam:
    print(family)
    family_rows = df_plot.loc[df_plot["pfam_id"] == family, :]
    ps.append(plot_one_vs_rest(family_rows, "is_thermophilic", 1, 0.5, size=5, contrast=1))
# Chain the panels left to right, then arrange them two per row.
p = ps[0]
for panel in ps[1:]:
    p = p + panel
p.cols(2)

PF00931
PF12327
PF12128
PF05378
PF12848


In [121]:
# For every family currently held in top_fam, draw the misclassified rows
# (incorrect == 1) against the rest of that family. contrast=1 keeps both
# groups at the same opacity, and size=5 matches the marker size of the rest.
ps = []
for family in top_fam:
    print(family)
    family_rows = df_plot.loc[df_plot["pfam_id"] == family, :]
    ps.append(plot_one_vs_rest(family_rows, "incorrect", 1, 0.5, size=5, contrast=1))
# Chain the panels left to right, then arrange them two per row.
p = ps[0]
for panel in ps[1:]:
    p = p + panel
p.cols(2)

PF00931
PF12327
PF12128
PF05378
PF12848


## The performance of the predictor is poor when two proteins overlap strongly in the embedded space, or lie very close together in the 2D projection, but come from different types of organism (thermophilic/non-thermophilic).