In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import pandas as pd
import numpy as np

import os
os.chdir("..")
%aimport AD_comparison_tools
%aimport AD_predictor_tools
%aimport ADpred_LambertTFs_helper

os.chdir("Yeast TFs")
import protfasta
import matplotlib.pyplot as plt
import seaborn as sns
import metapredict as meta
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, auc

from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap, BoundaryNorm
from matplotlib.lines import Line2D

sns.set_theme(rc={'figure.figsize':(6,4), 'figure.dpi' : 300})
sns.set_style("white")

In [3]:
# 1. Loading in predictor outputs on Lambert TFs
yeast_TFs = pd.read_csv("../../output/yeast_TF_seqs.csv", index_col = 0)
yeast_TFs = yeast_TFs.rename(columns = {"seq" : "sequence"})
yeast_TFs["uniprotID"] = yeast_TFs["id"].str.split("|").str[1]
yeast_TFs["length"] = yeast_TFs["sequence"].str.len()
yeast_TFs

Unnamed: 0,id,sequence,uniprotID,length
0,sp|O14467|MBF1_YEAST Multiprotein-bridging fac...,MSDWDTNTIIGSRARAGGSGPRANVARSQGQINAARRQGLVVSVDK...,O14467,151
1,sp|O93958|MATA2_YARLL Mating-type protein A2 O...,MENTILHIHSFQLPQTEQPYPEAMLFDRDTSDSRTVLTQKPNGLEI...,O93958,291
2,sp|P03069|GCN4_YEAST General control transcrip...,MSEYQPSLFALNPMGFSPLDGSKSTNENVSASTSTAKPMVGQLIFD...,P03069,281
3,sp|P04386|GAL4_YEAST Regulatory protein GAL4 O...,MKLLSSIEQACDICRLKKLKCSKEKPKCAKCLKNNWECRYSPKTKR...,P04386,881
4,sp|P04387|GAL80_YEAST Galactose/lactose metabo...,MDYNKRSSVSTVPNAAPIRVGFVGLNAAKGWAIKTHYPAILQLSSQ...,P04387,435
...,...,...,...,...
242,sp|Q707Y3|MATA1_YARLL Mating-type protein A1 O...,MPSRTPTDIWRCQRLILAARKGETTCQALHEQSIEISSSLKWFEEI...,Q707Y3,176
243,sp|Q707Y6|MATA1_PICAN Mating-type protein A1 O...,MQFTILNEPSLDSQRREGDLASENYVFGDIRKEGVRILEDSLRSER...,Q707Y6,181
244,sp|Q708A1|MATA1_NAKDE Mating-type protein A1 O...,MNVQEIHNIREACITILSGTKHNSVLFEPCDKFDEVINSLDIDPDS...,Q708A1,122
245,sp|Q9HG12|MATA1_KLULA Mating-type protein A1 O...,MCDNDMADIQSKLSSFCEEIRALALKEGYNLEGDKSPSSKPYFMSW...,Q9HG12,228


In [4]:
adhunter = pd.read_csv("../../output/yeast_TFs_preds/cleaned/adhunter.csv", index_col=0)
adhunter["predictor"] = "adhunter"
adhunter

Unnamed: 0,uniprotID,Start,End,predictor
0,O93958,127,167,adhunter
1,O93958,186,291,adhunter
2,P03069,58,146,adhunter
3,P04386,134,210,adhunter
4,P04386,242,307,adhunter
...,...,...,...,...
447,Q12457,128,184,adhunter
448,Q12531,66,108,adhunter
449,Q707Y3,13,82,adhunter
450,Q707Y6,52,92,adhunter


In [5]:
tada = pd.read_csv("../../output/yeast_TFs_preds/cleaned/tada.csv", index_col=0)
tada["predictor"] = "tada"
tada

Unnamed: 0,uniprotID,Start,End,predictor
0,O93958,192,290,tada
1,P03069,40,149,tada
2,P04386,135,207,tada
3,P04386,830,880,tada
4,P04387,364,410,tada
...,...,...,...,...
291,Q12363,49,94,tada
292,Q12457,129,180,tada
293,Q12753,272,337,tada
294,Q12753,651,694,tada


In [27]:
adpred = pd.read_csv("../../output/yeast_TFs_preds/cleaned/adpred.csv", index_col=0)
adpred["predictor"] = "adpred"
adpred

Unnamed: 0,GeneName,Start,End,predictor
0,O93958,211,225,adpred
1,O93958,256,273,adpred
2,P03069,109,124,adpred
3,P04386,167,177,adpred
4,P04386,853,863,adpred
...,...,...,...,...
316,Q12340,360,374,adpred
317,Q12340,542,559,adpred
318,Q12340,752,780,adpred
319,Q707Y3,34,59,adpred


In [28]:
mechanistic = pd.read_csv("../../output/yeast_TFs_preds/cleaned/composition.csv", index_col=0)
mechanistic["predictor"] = "mechanistic"
mechanistic

Unnamed: 0,uniprotID,Start,End,predictor
0,P33400,128,172,mechanistic
1,P33400,520,618,mechanistic
2,P13574,229,341,mechanistic
3,P53968,504,543,mechanistic
4,P21192,28,86,mechanistic
...,...,...,...,...
99,P38830,10,49,mechanistic
100,P14681,310,368,mechanistic
101,P32896,449,505,mechanistic
102,P26370,358,396,mechanistic


In [29]:
paddle = pd.read_csv("../../output/yeast_TFs_preds/cleaned/paddle_noSS.csv", index_col=0)
paddle["predictor"] = "paddle"
paddle

Unnamed: 0,uniprotID,Start,End,predictor
1,O93958,223,262,paddle
2,P03069,69,135,paddle
3,P04386,149,154,paddle
3,P04386,155,180,paddle
3,P04386,837,854,paddle
...,...,...,...,...
236,Q12363,317,339,paddle
236,Q12363,340,350,paddle
239,Q12457,143,175,paddle
241,Q12753,284,317,paddle


In [30]:
all_models = pd.concat([adhunter, tada, adpred, mechanistic, paddle])
all_models

Unnamed: 0,uniprotID,Start,End,predictor,GeneName
0,O93958,127,167,adhunter,
1,O93958,186,291,adhunter,
2,P03069,58,146,adhunter,
3,P04386,134,210,adhunter,
4,P04386,242,307,adhunter,
...,...,...,...,...,...
236,Q12363,317,339,paddle,
236,Q12363,340,350,paddle,
239,Q12457,143,175,paddle,
241,Q12753,284,317,paddle,


In [33]:
def combine_intervals(uniprotID):
    all_models_rows = all_models[all_models["uniprotID"] == uniprotID]
    TF_length = yeast_TFs[yeast_TFs["uniprotID"] == uniprotID]["length"].iloc[0].astype(int)
    
    if len(all_models_rows) > 0:
        preds = []

        for i in range(len(all_models_rows)):
            row_preds = np.zeros(TF_length)
            row = all_models_rows.iloc[i]
            row_preds[row["Start"]:row["End"]] = 1
            preds.append(row_preds)

        all_models_rows["preds"] = preds

        return sum(all_models_rows["preds"]) / 5

    else:
        return np.zeros(TF_length)
        

In [55]:
yeast_TFs_with_preds = yeast_TFs[["uniprotID", "sequence"]]
yeast_TFs_with_preds["combined_preds"] = yeast_TFs_with_preds["uniprotID"].apply(lambda id: combine_intervals(id))
yeast_TFs_with_preds

Unnamed: 0,uniprotID,sequence,combined_preds
0,O14467,MSDWDTNTIIGSRARAGGSGPRANVARSQGQINAARRQGLVVSVDK...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,O93958,MENTILHIHSFQLPQTEQPYPEAMLFDRDTSDSRTVLTQKPNGLEI...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,P03069,MSEYQPSLFALNPMGFSPLDGSKSTNENVSASTSTAKPMVGQLIFD...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,P04386,MKLLSSIEQACDICRLKKLKCSKEKPKCAKCLKNNWECRYSPKTKR...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,P04387,MDYNKRSSVSTVPNAAPIRVGFVGLNAAKGWAIKTHYPAILQLSSQ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
242,Q707Y3,MPSRTPTDIWRCQRLILAARKGETTCQALHEQSIEISSSLKWFEEI...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
243,Q707Y6,MQFTILNEPSLDSQRREGDLASENYVFGDIRKEGVRILEDSLRSER...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
244,Q708A1,MNVQEIHNIREACITILSGTKHNSVLFEPCDKFDEVINSLDIDPDS...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
245,Q9HG12,MCDNDMADIQSKLSSFCEEIRALALKEGYNLEGDKSPSSKPYFMSW...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [56]:
# Convert each list in combined_preds into separate columns
preds_df = yeast_TFs_with_preds['combined_preds'].apply(pd.Series)

# Join the new columns back to the original DataFrame (drop old combined_preds)
yeast_TFs_with_preds = yeast_TFs_with_preds.drop(columns=['combined_preds'])
yeast_TFs_with_preds = pd.concat([yeast_TFs_with_preds, preds_df], axis=1)

# Save as proper TSV
yeast_TFs_with_preds.to_csv('../../output/yeast_TF_aggreg_preds.tsv', sep='\t', index=False, header=False)

In [57]:
yeast_TFs_with_preds

Unnamed: 0,uniprotID,sequence,0,1,2,3,4,5,6,7,...,1638,1639,1640,1641,1642,1643,1644,1645,1646,1647
0,O14467,MSDWDTNTIIGSRARAGGSGPRANVARSQGQINAARRQGLVVSVDK...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,O93958,MENTILHIHSFQLPQTEQPYPEAMLFDRDTSDSRTVLTQKPNGLEI...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,P03069,MSEYQPSLFALNPMGFSPLDGSKSTNENVSASTSTAKPMVGQLIFD...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,P04386,MKLLSSIEQACDICRLKKLKCSKEKPKCAKCLKNNWECRYSPKTKR...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,P04387,MDYNKRSSVSTVPNAAPIRVGFVGLNAAKGWAIKTHYPAILQLSSQ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,Q707Y3,MPSRTPTDIWRCQRLILAARKGETTCQALHEQSIEISSSLKWFEEI...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
243,Q707Y6,MQFTILNEPSLDSQRREGDLASENYVFGDIRKEGVRILEDSLRSER...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
244,Q708A1,MNVQEIHNIREACITILSGTKHNSVLFEPCDKFDEVINSLDIDPDS...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
245,Q9HG12,MCDNDMADIQSKLSSFCEEIRALALKEGYNLEGDKSPSSKPYFMSW...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
