In [2]:
%load_ext autoreload

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import protfasta 
import re

from Bio import pairwise2
from Bio.Seq import Seq 

%autoreload 2
%aimport AD_predictor_tools
%aimport AD_comparison_tools
%aimport PlottingTools



In [4]:
# Loading in this 
TF_prot_coords = pd.read_csv("../data/SFARI_TFs_with_knownADs_coords_ENSG.csv", index_col = 0)
TF_prot_coords

Unnamed: 0,uniprotID,Start,End,ENSG
0,Q9HBZ2,1,717,ENSG00000172379
1,Q96QS3,1,562,ENSG00000004848
2,Q86V15,1,1759,ENSG00000130940
3,Q6P1N0,1,951,ENSG00000132024
4,O94983,1,1202,ENSG00000108509
5,O60479,1,287,ENSG00000064195
6,Q9H4W6,1,596,ENSG00000108001
7,Q06889,1,387,ENSG00000179388
8,Q92731,1,530,ENSG00000140009
9,O95718,1,433,ENSG00000119715


In [5]:
for uniprotID in TF_prot_coords["uniprotID"]:
    print(uniprotID)

Q9HBZ2
Q96QS3
Q86V15
Q6P1N0
O94983
O60479
Q9H4W6
Q06889
Q92731
O95718
Q8NBF1
P11308
Q5T1R4
Q13422
Q9UGL1
Q9BXK1
O75840
Q03164
Q06413
O14770
P35548
Q14872
Q9UL68
Q15788
Q9Y4A8
Q12857
Q14938
P20393
P43354
O95096
P32242
Q02548
P26367
P78337
P35398
Q13485
Q6ZRS2
Q9Y458
Q9UGU0
P15884
Q9NQB0
Q6N021
P19532
P10827
P25490
P11473


In [6]:
biomart_output = pd.read_csv('../data/TF_CDS_biomart_output.txt')
# biomart_output = biomart_output[["Chromosome/scaffold name", "Genomic coding start", "Genomic coding end", "Gene stable ID", "Transcript stable ID", "Strand"]]
biomart_output = biomart_output.dropna()
biomart_output["Genomic coding start"] -= 1 # because BED format
biomart_output["Genomic coding start"] = biomart_output["Genomic coding start"].astype(int)
biomart_output["Genomic coding end"] = biomart_output["Genomic coding end"].astype(int)

In [7]:
biomart_output["Strand"] = biomart_output["Strand"].map({-1 : "-", +1 : "+"})
biomart_output

Unnamed: 0,Gene stable ID,Genomic coding start,Genomic coding end,Strand,Transcript stable ID,Chromosome/scaffold name
0,ENSG00000134138,37099454,37099466,-,ENST00000561208,15
1,ENSG00000134138,37097966,37098199,-,ENST00000561208,15
2,ENSG00000134138,37096288,37096430,-,ENST00000561208,15
3,ENSG00000134138,37095563,37095614,-,ENST00000561208,15
4,ENSG00000134138,37094526,37094577,-,ENST00000561208,15
...,...,...,...,...,...,...
2457,ENSG00000277800,74638,74813,-,ENST00000619509,HSCHRX_2_CTG12
2458,ENSG00000050344,26152498,26153068,+,ENST00000056233,7
2459,ENSG00000050344,26177942,26178122,+,ENST00000056233,7
2460,ENSG00000050344,26183700,26183784,+,ENST00000056233,7


In [8]:
biomart_output[biomart_output["Gene stable ID"] == "ENSG00000157554"]

Unnamed: 0,Gene stable ID,Genomic coding start,Genomic coding end,Strand,Transcript stable ID,Chromosome/scaffold name
303,ENSG00000157554,38403505,38403709,-,ENST00000398897,21
304,ENSG00000157554,38402556,38402637,-,ENST00000398897,21
305,ENSG00000157554,38392375,38392444,-,ENST00000398897,21
306,ENSG00000157554,38391658,38391715,-,ENST00000398897,21
307,ENSG00000157554,38390994,38391042,-,ENST00000398897,21
308,ENSG00000157554,38383402,38383923,-,ENST00000398897,21
310,ENSG00000157554,38423409,38423521,-,ENST00000398897,21
311,ENSG00000157554,38445403,38445621,-,ENST00000398911,21
312,ENSG00000157554,38423409,38423561,-,ENST00000398911,21
313,ENSG00000157554,38403505,38403709,-,ENST00000398911,21


In [9]:
len(set(biomart_output['Gene stable ID']))

50

In [10]:
len(TF_prot_coords["uniprotID"])

46

In [11]:
# Matches whats expected
set(TF_prot_coords["ENSG"]) - set(biomart_output['Gene stable ID'])

set()

In [12]:
set(biomart_output['Gene stable ID']) - set(TF_prot_coords["ENSG"])

{'ENSG00000262024', 'ENSG00000276461', 'ENSG00000277800', 'ENSG00000288293'}

In [13]:
uniprotID_ENST_mapping = pd.read_csv("../data/SFARI_TFs_with_ENST_corrected.csv")
uniprotID_ENST_mapping = uniprotID_ENST_mapping[["uniprotID", "ENST"]]
uniprotID_ENST_mapping["ENST"] = uniprotID_ENST_mapping["ENST"].str.split(".").str[0]
uniprotID_ENST_mapping_dict= dict(zip(uniprotID_ENST_mapping["uniprotID"], uniprotID_ENST_mapping["ENST"]))
# New ones after redone with ADs I found in canonical
uniprotID_ENST_mapping_dict['O60479'] = 'ENST00000434704'
# uniprotID_ENST_mapping_dict['O95718'] = 
# I couldn't find a ENST code corresponding to O95718

In [14]:
# Saving bed files now
for i in TF_prot_coords.index:
    ENSG = TF_prot_coords["ENSG"].loc[i]
    uniprotID = TF_prot_coords["uniprotID"].loc[i]
    if uniprotID == "O95718":
        continue
    ENST = uniprotID_ENST_mapping_dict[uniprotID]
    biomart_output_for_ENSG = biomart_output[biomart_output["Gene stable ID"] == ENSG]
    biomart_output_for_ENSG = biomart_output_for_ENSG[biomart_output_for_ENSG["Transcript stable ID"] == ENST]
    print(uniprotID)
    
    biomart_output_for_ENSG["uniprotID"] = uniprotID
    biomart_output_for_ENSG = biomart_output_for_ENSG[["Chromosome/scaffold name", "Genomic coding start", 
                                                       "Genomic coding end", "uniprotID", "Transcript stable ID", "Strand"]]
    display(biomart_output_for_ENSG)

    filepath = "../soto_analysis/outputs/mutations/cds_bed_format/" + ENST
    biomart_output_for_ENSG.to_csv(filepath, index = False, sep = '\t', header = None)


#12	47878967	47879113	P11473	ENST00000229022	-


Q9HBZ2


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2037,15,80404515,80404546,Q9HBZ2,ENST00000303329,+
2038,15,80450879,80450994,Q9HBZ2,ENST00000303329,+
2039,15,80457928,80457976,Q9HBZ2,ENST00000303329,+
2040,15,80470217,80470431,Q9HBZ2,ENST00000303329,+
2041,15,80475009,80475223,Q9HBZ2,ENST00000303329,+
2042,15,80508155,80508258,Q9HBZ2,ENST00000303329,+
2043,15,80513910,80513976,Q9HBZ2,ENST00000303329,+
2044,15,80514319,80514405,Q9HBZ2,ENST00000303329,+
2045,15,80551198,80551275,Q9HBZ2,ENST00000303329,+
2046,15,80552639,80552774,Q9HBZ2,ENST00000303329,+


Q96QS3


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1991,X,25015541,25015737,Q96QS3,ENST00000379044,-
1992,X,25012921,25013798,Q96QS3,ENST00000379044,-
1993,X,25010259,25010305,Q96QS3,ENST00000379044,-
1994,X,25007110,25007439,Q96QS3,ENST00000379044,-
1995,X,25004669,25004910,Q96QS3,ENST00000379044,-


Q86V15


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1879,1,10693873,10693889,Q86V15,ENST00000377022,-
1880,1,10665082,10665571,Q86V15,ENST00000377022,-
1881,1,10659701,10660536,Q86V15,ENST00000377022,-
1882,1,10658507,10658576,Q86V15,ENST00000377022,-
1883,1,10656645,10656736,Q86V15,ENST00000377022,-
1884,1,10655648,10655813,Q86V15,ENST00000377022,-
1885,1,10654418,10654591,Q86V15,ENST00000377022,-
1886,1,10653376,10654218,Q86V15,ENST00000377022,-
1887,1,10650940,10651076,Q86V15,ENST00000377022,-
1888,1,10650691,10650755,Q86V15,ENST00000377022,-


Q6P1N0


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1721,19,13906441,13906501,Q6P1N0,ENST00000318003,+
1722,19,13909822,13909958,Q6P1N0,ENST00000318003,+
1723,19,13912322,13912438,Q6P1N0,ENST00000318003,+
1724,19,13912527,13912593,Q6P1N0,ENST00000318003,+
1725,19,13913167,13913302,Q6P1N0,ENST00000318003,+
1726,19,13913403,13913638,Q6P1N0,ENST00000318003,+
1727,19,13918069,13918194,Q6P1N0,ENST00000318003,+
1728,19,13918503,13918576,Q6P1N0,ENST00000318003,+
1729,19,13918745,13918817,Q6P1N0,ENST00000318003,+
1730,19,13918911,13919042,Q6P1N0,ENST00000318003,+


O94983


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
203,17,4985879,4985983,O94983,ENST00000348066,-
204,17,4982975,4983043,O94983,ENST00000348066,-
205,17,4982756,4982892,O94983,ENST00000348066,-
206,17,4982088,4982160,O94983,ENST00000348066,-
207,17,4981677,4981831,O94983,ENST00000348066,-
208,17,4981224,4981359,O94983,ENST00000348066,-
209,17,4979683,4980621,O94983,ENST00000348066,-
210,17,4978503,4978630,O94983,ENST00000348066,-
211,17,4977057,4977192,O94983,ENST00000348066,-
212,17,4974384,4974500,O94983,ENST00000348066,-


O60479


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
108,17,49994673,49994998,O60479,ENST00000434704,-
109,17,49993399,49993590,O60479,ENST00000434704,-
110,17,49991516,49991864,O60479,ENST00000434704,-


Q9H4W6


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2021,10,129963634,129963768,Q9H4W6,ENST00000355311,-
2022,10,129963366,129963523,Q9H4W6,ENST00000355311,-
2023,10,129962941,129963005,Q9H4W6,ENST00000355311,-
2024,10,129962170,129962226,Q9H4W6,ENST00000355311,-
2025,10,129958933,129959007,Q9H4W6,ENST00000355311,-
2026,10,129957257,129957326,Q9H4W6,ENST00000355311,-
2027,10,129877767,129877849,Q9H4W6,ENST00000355311,-
2028,10,129873451,129873596,Q9H4W6,ENST00000355311,-
2029,10,129867781,129867912,Q9H4W6,ENST00000355311,-
2030,10,129867140,129867267,Q9H4W6,ENST00000355311,-


Q06889


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1348,8,22692790,22692944,Q06889,ENST00000317216,-
1349,8,22690472,22691482,Q06889,ENST00000317216,-


Q92731


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1982,14,64282623,64282985,Q92731,ENST00000341099,-
1983,14,64279980,64280153,Q92731,ENST00000341099,-
1984,14,64268794,64268911,Q92731,ENST00000341099,-
1985,14,64260448,64260748,Q92731,ENST00000341099,-
1986,14,64257225,64257364,Q92731,ENST00000341099,-
1987,14,64249545,64249679,Q92731,ENST00000341099,-
1988,14,64234969,64235150,Q92731,ENST00000341099,-
1990,14,64233136,64233323,Q92731,ENST00000341099,-


Q8NBF1


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1912,1,53594107,53594902,Q8NBF1,ENST00000312233,-
1913,1,53529790,53529952,Q8NBF1,ENST00000312233,-
1914,1,53524776,53524887,Q8NBF1,ENST00000312233,-
1915,1,53520633,53520766,Q8NBF1,ENST00000312233,-
1916,1,53514624,53514781,Q8NBF1,ENST00000312233,-
1917,1,53509848,53510027,Q8NBF1,ENST00000312233,-
1918,1,53509119,53509287,Q8NBF1,ENST00000312233,-
1919,1,53506618,53506776,Q8NBF1,ENST00000312233,-


P11308


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
332,21,38445403,38445621,P11308,ENST00000288319,-
333,21,38423409,38423561,P11308,ENST00000288319,-
334,21,38403505,38403709,P11308,ENST00000288319,-
335,21,38402556,38402637,P11308,ENST00000288319,-
336,21,38392375,38392444,P11308,ENST00000288319,-
337,21,38391658,38391715,P11308,ENST00000288319,-
338,21,38390994,38391042,P11308,ENST00000288319,-
339,21,38400573,38400645,P11308,ENST00000288319,-
340,21,38498362,38498380,P11308,ENST00000288319,-
341,21,38383402,38383923,P11308,ENST00000288319,-


Q5T1R4


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1624,1,41579736,41584797,Q5T1R4,ENST00000372583,-
1625,1,41575543,41575689,Q5T1R4,ENST00000372583,-
1626,1,41524734,41524910,Q5T1R4,ENST00000372583,-
1627,1,41518401,41518488,Q5T1R4,ENST00000372583,-
1628,1,41512815,41513750,Q5T1R4,ENST00000372583,-
1629,1,41510450,41511266,Q5T1R4,ENST00000372583,-


Q13422


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1396,7,50319061,50319101,Q13422,ENST00000331340,+
1397,7,50327637,50327757,Q13422,ENST00000331340,+
1398,7,50382539,50382707,Q13422,ENST00000331340,+
1399,7,50387344,50387470,Q13422,ENST00000331340,+
1400,7,50391728,50391863,Q13422,ENST00000331340,+
1401,7,50376532,50376793,Q13422,ENST00000331340,+
1403,7,50399917,50400627,Q13422,ENST00000331340,+


Q9UGL1


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2179,1,202808101,202808305,Q9UGL1,ENST00000367265,-
2180,1,202777016,202777094,Q9UGL1,ENST00000367265,-
2181,1,202774612,202774735,Q9UGL1,ENST00000367265,-
2182,1,202773117,202773288,Q9UGL1,ENST00000367265,-
2183,1,202766925,202767060,Q9UGL1,ENST00000367265,-
2184,1,202764048,202764145,Q9UGL1,ENST00000367265,-
2185,1,202762698,202762808,Q9UGL1,ENST00000367265,-
2186,1,202760414,202760573,Q9UGL1,ENST00000367265,-
2187,1,202758390,202758510,Q9UGL1,ENST00000367265,-
2188,1,202756357,202756516,Q9UGL1,ENST00000367265,-


Q9BXK1


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1999,19,1863040,1863497,Q9BXK1,ENST00000250916,-
2000,19,1854458,1854760,Q9BXK1,ENST00000250916,-


O75840


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
111,2,207165466,207165568,O75840,ENST00000309446,-
112,2,207123773,207124404,O75840,ENST00000309446,-
113,2,207088457,207088581,O75840,ENST00000309446,-
114,2,207081212,207081264,O75840,ENST00000309446,-


Q03164


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1105,11,118468774,118468844,Q03164,ENST00000389506,+
1106,11,118471661,118474315,Q03164,ENST00000389506,+
1107,11,118476804,118476982,Q03164,ENST00000389506,+
1108,11,118477966,118478201,Q03164,ENST00000389506,+
1109,11,118480173,118480238,Q03164,ENST00000389506,+
1110,11,118481714,118482092,Q03164,ENST00000389506,+
1111,11,118482421,118482495,Q03164,ENST00000389506,+
1112,11,118484182,118484314,Q03164,ENST00000389506,+
1113,11,118484861,118484975,Q03164,ENST00000389506,+
1114,11,118488613,118488760,Q03164,ENST00000389506,+


Q06413


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1265,5,88804597,88804801,Q06413,ENST00000437473,-
1266,5,88761184,88761328,Q06413,ENST00000437473,-
1267,5,88751856,88752043,Q06413,ENST00000437473,-
1268,5,88749069,88749117,Q06413,ENST00000437473,-
1269,5,88731728,88731901,Q06413,ENST00000437473,-
1270,5,88730210,88730234,Q06413,ENST00000437473,-
1271,5,88729217,88729347,Q06413,ENST00000437473,-
1272,5,88728492,88728628,Q06413,ENST00000437473,-
1273,5,88823734,88823788,Q06413,ENST00000437473,-
1275,5,88722603,88722925,Q06413,ENST00000437473,-


O14770


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
0,15,37099454,37099466,O14770,ENST00000561208,-
1,15,37097966,37098199,O14770,ENST00000561208,-
2,15,37096288,37096430,O14770,ENST00000561208,-
3,15,37095563,37095614,O14770,ENST00000561208,-
4,15,37094526,37094577,O14770,ENST00000561208,-
5,15,37093580,37093730,O14770,ENST00000561208,-
6,15,37083770,37083885,O14770,ENST00000561208,-
7,15,37036813,37036959,O14770,ENST00000561208,-
8,15,36950323,36950400,O14770,ENST00000561208,-
9,15,36896627,36896686,O14770,ENST00000561208,-


P35548


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
997,5,174724659,174725038,P35548,ENST00000239243,+
998,5,174729158,174729583,P35548,ENST00000239243,+


Q14872


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1455,1,37857250,37857658,Q14872,ENST00000373036,-
1456,1,37839919,37840158,Q14872,ENST00000373036,-
1457,1,37838624,37838756,Q14872,ENST00000373036,-
1458,1,37835670,37835744,Q14872,ENST00000373036,-
1459,1,37835078,37835215,Q14872,ENST00000373036,-
1460,1,37832244,37832322,Q14872,ENST00000373036,-
1461,1,37823709,37823812,Q14872,ENST00000373036,-
1462,1,37822120,37822716,Q14872,ENST00000373036,-
1463,1,37817418,37817482,Q14872,ENST00000373036,-
1464,1,37815135,37815566,Q14872,ENST00000373036,-


Q9UL68


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2383,2,1979722,1979777,Q9UL68,ENST00000428368,-
2384,2,1979520,1979554,Q9UL68,ENST00000428368,-
2385,2,1979164,1979227,Q9UL68,ENST00000428368,-
2386,2,1942981,1943334,Q9UL68,ENST00000428368,-
2387,2,1917204,1917339,Q9UL68,ENST00000428368,-
2388,2,1912019,1912110,Q9UL68,ENST00000428368,-
2389,2,1910239,1910347,Q9UL68,ENST00000428368,-
2390,2,1903079,1903294,Q9UL68,ENST00000428368,-
2391,2,1892036,1892287,Q9UL68,ENST00000428368,-
2392,2,1889240,1889477,Q9UL68,ENST00000428368,-


Q15788


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1597,2,24658677,24658766,Q15788,ENST00000348332,+
1598,2,24665748,24665915,Q15788,ENST00000348332,+
1599,2,24673365,24673463,Q15788,ENST00000348332,+
1600,2,24682950,24683128,Q15788,ENST00000348332,+
1601,2,24691480,24691660,Q15788,ENST00000348332,+
1602,2,24693251,24693347,Q15788,ENST00000348332,+
1603,2,24697657,24697798,Q15788,ENST00000348332,+
1604,2,24705085,24705233,Q15788,ENST00000348332,+
1605,2,24706567,24707888,Q15788,ENST00000348332,+
1606,2,24710930,24711111,Q15788,ENST00000348332,+


Q9Y4A8


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2458,7,26152498,26153068,Q9Y4A8,ENST00000056233,+
2459,7,26177942,26178122,Q9Y4A8,ENST00000056233,+
2460,7,26183700,26183784,Q9Y4A8,ENST00000056233,+
2461,7,26184532,26185783,Q9Y4A8,ENST00000056233,+


Q12857


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1375,1,61088148,61088680,Q12857,ENST00000403491,+
1376,1,61277519,61277585,Q12857,ENST00000403491,+
1377,1,61332511,61332586,Q12857,ENST00000403491,+
1378,1,61352449,61352567,Q12857,ENST00000403491,+
1379,1,61359146,61359274,Q12857,ENST00000403491,+
1380,1,61383236,61383365,Q12857,ENST00000403491,+
1381,1,61404103,61404282,Q12857,ENST00000403491,+
1382,1,61406561,61406727,Q12857,ENST00000403491,+
1383,1,61426464,61426556,Q12857,ENST00000403491,+
1384,1,61082791,61082818,Q12857,ENST00000403491,+


Q14938


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1465,19,12995837,12995864,Q14938,ENST00000592199,+
1466,19,13025020,13025552,Q14938,ENST00000592199,+
1467,19,13073046,13073109,Q14938,ENST00000592199,+
1468,19,13073421,13073496,Q14938,ENST00000592199,+
1469,19,13073905,13074026,Q14938,ENST00000592199,+
1470,19,13075534,13075671,Q14938,ENST00000592199,+
1471,19,13078612,13078735,Q14938,ENST00000592199,+
1472,19,13081679,13081855,Q14938,ENST00000592199,+
1473,19,13087988,13088136,Q14938,ENST00000592199,+
1474,19,13090298,13090390,Q14938,ENST00000592199,+


P20393


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
781,17,40100063,40100094,P20393,ENST00000246672,-
782,17,40097064,40097403,P20393,ENST00000246672,-
783,17,40096690,40096779,P20393,ENST00000246672,-
784,17,40096442,40096587,P20393,ENST00000246672,-
785,17,40095443,40096087,P20393,ENST00000246672,-
786,17,40094934,40095120,P20393,ENST00000246672,-
787,17,40093911,40094122,P20393,ENST00000246672,-
788,17,40093082,40093282,P20393,ENST00000246672,-


P43354


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1000,2,156329322,156330186,P43354,ENST00000339562,-
1001,2,156328403,156328533,P43354,ENST00000339562,-
1002,2,156327850,156328014,P43354,ENST00000339562,-
1003,2,156326717,156326920,P43354,ENST00000339562,-
1004,2,156326149,156326328,P43354,ENST00000339562,-
1005,2,156325743,156326000,P43354,ENST00000339562,-


O95096


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
226,20,21513410,21513669,O95096,ENST00000377142,-
227,20,21511922,21512485,O95096,ENST00000377142,-


P32242


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
971,2,63052990,63053087,P32242,ENST00000282549,+
972,2,63054046,63054198,P32242,ENST00000282549,+
973,2,63055500,63056316,P32242,ENST00000282549,+


Q02548


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1027,9,37033985,37034031,Q02548,ENST00000358127,-
1028,9,37020635,37020801,Q02548,ENST00000358127,-
1029,9,37014996,37015194,Q02548,ENST00000358127,-
1030,9,37006472,37006537,Q02548,ENST00000358127,-
1031,9,37002647,37002776,Q02548,ENST00000358127,-
1032,9,36966548,36966724,Q02548,ENST00000358127,-
1033,9,36923354,36923484,Q02548,ENST00000358127,-
1034,9,36882003,36882105,Q02548,ENST00000358127,-
1035,9,36846842,36846929,Q02548,ENST00000358127,-
1036,9,36840559,36840636,Q02548,ENST00000358127,-


P26367


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
946,11,31806401,31806411,P26367,ENST00000241001,-
947,11,31802703,31802834,P26367,ENST00000241001,-
948,11,31801560,31801776,P26367,ENST00000241001,-
949,11,31800690,31800856,P26367,ENST00000241001,-
950,11,31794629,31794788,P26367,ENST00000241001,-
951,11,31794031,31794114,P26367,ENST00000241001,-
952,11,31793651,31793802,P26367,ENST00000241001,-
953,11,31793437,31793553,P26367,ENST00000241001,-
954,11,31790709,31790860,P26367,ENST00000241001,-
955,11,31789933,31790019,P26367,ENST00000241001,-


P78337


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1020,5,135033712,135033881,P78337,ENST00000265340,-
1021,5,135031275,135031508,P78337,ENST00000265340,-
1022,5,135028778,135029321,P78337,ENST00000265340,-


P35398


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
974,15,61229052,61229218,P35398,ENST00000335670,-
975,15,60678656,60678686,P35398,ENST00000335670,-
976,15,60531765,60531851,P35398,ENST00000335670,-
977,15,60514615,60514757,P35398,ENST00000335670,-
978,15,60511225,60511621,P35398,ENST00000335670,-
979,15,60505507,60505629,P35398,ENST00000335670,-
980,15,60503534,60503667,P35398,ENST00000335670,-
981,15,60502759,60502867,P35398,ENST00000335670,-
982,15,60500958,60501069,P35398,ENST00000335670,-
983,15,60499891,60500004,P35398,ENST00000335670,-


Q13485


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1442,18,51047046,51047295,Q13485,ENST00000342988,+
1443,18,51048685,51048860,Q13485,ENST00000342988,+
1444,18,51049294,51049324,Q13485,ENST00000342988,+
1445,18,51054780,51054993,Q13485,ENST00000342988,+
1446,18,51058124,51058244,Q13485,ENST00000342988,+
1447,18,51058339,51058456,Q13485,ENST00000342988,+
1448,18,51059865,51059916,Q13485,ENST00000342988,+
1449,18,51065422,51065606,Q13485,ENST00000342988,+
1450,18,51067018,51067187,Q13485,ENST00000342988,+
1451,18,51076637,51076776,Q13485,ENST00000342988,+


Q6ZRS2


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1846,16,30700824,30700878,Q6ZRS2,ENST00000262518,+
1847,16,30704063,30704315,Q6ZRS2,ENST00000262518,+
1848,16,30707182,30707368,Q6ZRS2,ENST00000262518,+
1849,16,30707571,30707712,Q6ZRS2,ENST00000262518,+
1850,16,30709512,30709735,Q6ZRS2,ENST00000262518,+
1851,16,30709850,30710128,Q6ZRS2,ENST00000262518,+
1852,16,30710753,30710847,Q6ZRS2,ENST00000262518,+
1853,16,30710998,30711088,Q6ZRS2,ENST00000262518,+
1854,16,30711570,30711744,Q6ZRS2,ENST00000262518,+
1855,16,30711834,30712157,Q6ZRS2,ENST00000262518,+


Q9Y458


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2424,X,80022269,80022444,Q9Y458,ENST00000373294,+
2425,X,80023059,80023240,Q9Y458,ENST00000373294,+
2426,X,80024062,80024164,Q9Y458,ENST00000373294,+
2427,X,80025602,80025777,Q9Y458,ENST00000373294,+
2428,X,80026703,80026868,Q9Y458,ENST00000373294,+
2429,X,80027255,80027320,Q9Y458,ENST00000373294,+
2430,X,80027990,80028076,Q9Y458,ENST00000373294,+
2431,X,80030497,80031111,Q9Y458,ENST00000373294,+


Q9UGU0


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2257,22,42209650,42215305,Q9UGU0,ENST00000359486,-
2258,22,42179608,42179702,Q9UGU0,ENST00000359486,-
2259,22,42169846,42169896,Q9UGU0,ENST00000359486,-
2260,22,42168652,42168736,Q9UGU0,ENST00000359486,-


P15884


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
426,18,55587044,55587116,P15884,ENST00000356073,-
427,18,55585279,55585352,P15884,ENST00000356073,-
428,18,55464075,55464137,P15884,ENST00000356073,-
429,18,55461018,55461115,P15884,ENST00000356073,-
430,18,55403453,55403518,P15884,ENST00000356073,-
431,18,55350873,55351003,P15884,ENST00000356073,-
432,18,55350358,55350408,P15884,ENST00000356073,-
433,18,55279550,55279656,P15884,ENST00000356073,-
434,18,55275618,55275752,P15884,ENST00000356073,-
435,18,55269830,55269963,P15884,ENST00000356073,-


Q9NQB0


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
2094,10,112950756,112950945,Q9NQB0,ENST00000355995,+
2095,10,112951206,112951273,Q9NQB0,ENST00000355995,+
2096,10,112951482,112951607,Q9NQB0,ENST00000355995,+
2097,10,112964555,112964624,Q9NQB0,ENST00000355995,+
2098,10,113040024,113040126,Q9NQB0,ENST00000355995,+
2099,10,113141183,113141316,Q9NQB0,ENST00000355995,+
2100,10,113143922,113144025,Q9NQB0,ENST00000355995,+
2101,10,113146010,113146097,Q9NQB0,ENST00000355995,+
2102,10,113150997,113151123,Q9NQB0,ENST00000355995,+
2103,10,113151724,113151884,Q9NQB0,ENST00000355995,+


Q6N021


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
1654,4,105241338,105241429,Q6N021,ENST00000380013,+
1655,4,105242833,105242927,Q6N021,ENST00000380013,+
1656,4,105243569,105243778,Q6N021,ENST00000380013,+
1657,4,105259618,105259769,Q6N021,ENST00000380013,+
1658,4,105261758,105261848,Q6N021,ENST00000380013,+
1659,4,105269609,105269747,Q6N021,ENST00000380013,+
1660,4,105272563,105272918,Q6N021,ENST00000380013,+
1661,4,105233942,105237351,Q6N021,ENST00000380013,+
1662,4,105275047,105276519,Q6N021,ENST00000380013,+


P19532


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
767,X,49043110,49043226,P19532,ENST00000315869,-
768,X,49040454,49040568,P19532,ENST00000315869,-
769,X,49039106,49039410,P19532,ENST00000315869,-
770,X,49038196,49038442,P19532,ENST00000315869,-
771,X,49038009,49038114,P19532,ENST00000315869,-
772,X,49034133,49034251,P19532,ENST00000315869,-
773,X,49033725,49033782,P19532,ENST00000315869,-
774,X,49033464,49033540,P19532,ENST00000315869,-
775,X,49031396,49031544,P19532,ENST00000315869,-
776,X,49030157,49030601,P19532,ENST00000315869,-


P10827


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
275,17,40074488,40074541,P10827,ENST00000264637,+
276,17,40076870,40076938,P10827,ENST00000264637,+
277,17,40077507,40077608,P10827,ENST00000264637,+
278,17,40083834,40083982,P10827,ENST00000264637,+
279,17,40084609,40084815,P10827,ENST00000264637,+
280,17,40086706,40086853,P10827,ENST00000264637,+
281,17,40088241,40088500,P10827,ENST00000264637,+
282,17,40089205,40089333,P10827,ENST00000264637,+
284,17,40093019,40093382,P10827,ENST00000264637,+


P25490


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
789,14,100239244,100239923,P25490,ENST00000262238,+
790,14,100262303,100262466,P25490,ENST00000262238,+
791,14,100274697,100274758,P25490,ENST00000262238,+
792,14,100276489,100276648,P25490,ENST00000262238,+
793,14,100277417,100277600,P25490,ENST00000262238,+


P11473


Unnamed: 0,Chromosome/scaffold name,Genomic coding start,Genomic coding end,uniprotID,Transcript stable ID,Strand
373,12,47878967,47879113,P11473,ENST00000395324,-
374,12,47865046,47865177,P11473,ENST00000395324,-
375,12,47857503,47857688,P11473,ENST00000395324,-
376,12,47857128,47857249,P11473,ENST00000395324,-
377,12,47855629,47855801,P11473,ENST00000395324,-
378,12,47846656,47846808,P11473,ENST00000395324,-
379,12,47846334,47846451,P11473,ENST00000395324,-
380,12,47844745,47845005,P11473,ENST00000395324,-


In [15]:
# Now, preparing the format of the domain input to get_DBD_ED_coords.py

In [16]:
AD_coords = pd.read_csv("../data/SFARI_ADs_AA_coords_ENSG.csv", index_col = 0)
AD_coords

Unnamed: 0,uniprotID,Start,End,ENSG
0,Q9HBZ2,524,717,ENSG00000172379
1,Q96QS3,472,562,ENSG00000004848
2,Q6P1N0,22,60,ENSG00000132024
3,O94983,285,468,ENSG00000108509
4,O94983,472,581,ENSG00000108509
...,...,...,...,...
58,P10827,1,52,ENSG00000126351
59,P11473,415,427,ENSG00000111424
60,P11473,195,238,ENSG00000111424
61,P25490,1,69,ENSG00000100811


In [17]:
AD_coords["coord"] = AD_coords["Start"].astype(str)+"-"+AD_coords["End"].astype(str)
AD_coords

Unnamed: 0,uniprotID,Start,End,ENSG,coord
0,Q9HBZ2,524,717,ENSG00000172379,524-717
1,Q96QS3,472,562,ENSG00000004848,472-562
2,Q6P1N0,22,60,ENSG00000132024,22-60
3,O94983,285,468,ENSG00000108509,285-468
4,O94983,472,581,ENSG00000108509,472-581
...,...,...,...,...,...
58,P10827,1,52,ENSG00000126351,1-52
59,P11473,415,427,ENSG00000111424,415-427
60,P11473,195,238,ENSG00000111424,195-238
61,P25490,1,69,ENSG00000100811,1-69


In [18]:
merged_AD_coords = AD_coords[["uniprotID", "coord"]].groupby("uniprotID").agg(lambda c: ",".join(c))
merged_AD_coords = merged_AD_coords.drop(["O95718"])
merged_AD_coords = merged_AD_coords.reset_index()
merged_AD_coords

Unnamed: 0,uniprotID,coord
0,O14770,340-477
1,O60479,"199-263,2-91"
2,O75840,2-101
3,O94983,"285-468,472-581"
4,O95096,220-273
5,P10827,1-52
6,P11308,"433-479,118-261"
7,P11473,"415-427,195-238"
8,P15884,"1-100,340-400"
9,P19532,"212-333,472-575,1-127"


In [19]:
TF_table = merged_AD_coords.rename(columns = {"coord" : "AD_coords"})
TF_table = pd.merge(TF_table, AD_coords[["uniprotID", "ENSG"]], on = "uniprotID", how = "left").drop_duplicates()
TF_table = TF_table.reset_index(drop = True)
TF_table

Unnamed: 0,uniprotID,AD_coords,ENSG
0,O14770,340-477,ENSG00000134138
1,O60479,"199-263,2-91",ENSG00000064195
2,O75840,2-101,ENSG00000118263
3,O94983,"285-468,472-581",ENSG00000108509
4,O95096,220-273,ENSG00000125820
5,P10827,1-52,ENSG00000126351
6,P11308,"433-479,118-261",ENSG00000157554
7,P11473,"415-427,195-238",ENSG00000111424
8,P15884,"1-100,340-400",ENSG00000196628
9,P19532,"212-333,472-575,1-127",ENSG00000068323


In [20]:
TF_table["ENST"] = TF_table["uniprotID"].map(uniprotID_ENST_mapping_dict)
TF_table["DBD_coords"] = "NA-NA"
TF_table["RD_coords"] = ""
TF_table["Bif_coords"] = ""
TF_table["1"] = ""
TF_table["2"] = ""
TF_table["length"] = "1"
TF_table

Unnamed: 0,uniprotID,AD_coords,ENSG,ENST,DBD_coords,RD_coords,Bif_coords,1,2,length
0,O14770,340-477,ENSG00000134138,ENST00000561208,NA-NA,,,,,1
1,O60479,"199-263,2-91",ENSG00000064195,ENST00000434704,NA-NA,,,,,1
2,O75840,2-101,ENSG00000118263,ENST00000309446,NA-NA,,,,,1
3,O94983,"285-468,472-581",ENSG00000108509,ENST00000348066,NA-NA,,,,,1
4,O95096,220-273,ENSG00000125820,ENST00000377142,NA-NA,,,,,1
5,P10827,1-52,ENSG00000126351,ENST00000264637,NA-NA,,,,,1
6,P11308,"433-479,118-261",ENSG00000157554,ENST00000288319,NA-NA,,,,,1
7,P11473,"415-427,195-238",ENSG00000111424,ENST00000395324,NA-NA,,,,,1
8,P15884,"1-100,340-400",ENSG00000196628,ENST00000356073,NA-NA,,,,,1
9,P19532,"212-333,472-575,1-127",ENSG00000068323,ENST00000315869,NA-NA,,,,,1


Format of text table seems to be 

"../outputs/TFs_table_proteins.txt"

- 0
- 1
- 2
- 3 uniprotID
- 4 ensg
- 5 enst
- 6 dbd_coords 
- 7 ad_coords
- 8 rd_coords
- 9 bif_coords
- -1 len

the coords look like 1-3,4-5,6-8 etc.

In [21]:
TF_table_formatted = TF_table[["1","2", "uniprotID", "ENSG", "ENST", "DBD_coords", "AD_coords", "RD_coords", "Bif_coords", "length"]]
TF_table_formatted

Unnamed: 0,1,2,uniprotID,ENSG,ENST,DBD_coords,AD_coords,RD_coords,Bif_coords,length
0,,,O14770,ENSG00000134138,ENST00000561208,NA-NA,340-477,,,1
1,,,O60479,ENSG00000064195,ENST00000434704,NA-NA,"199-263,2-91",,,1
2,,,O75840,ENSG00000118263,ENST00000309446,NA-NA,2-101,,,1
3,,,O94983,ENSG00000108509,ENST00000348066,NA-NA,"285-468,472-581",,,1
4,,,O95096,ENSG00000125820,ENST00000377142,NA-NA,220-273,,,1
5,,,P10827,ENSG00000126351,ENST00000264637,NA-NA,1-52,,,1
6,,,P11308,ENSG00000157554,ENST00000288319,NA-NA,"433-479,118-261",,,1
7,,,P11473,ENSG00000111424,ENST00000395324,NA-NA,"415-427,195-238",,,1
8,,,P15884,ENSG00000196628,ENST00000356073,NA-NA,"1-100,340-400",,,1
9,,,P19532,ENSG00000068323,ENST00000315869,NA-NA,"212-333,472-575,1-127",,,1


In [22]:
TF_table_formatted.to_csv("../soto_analysis/outputs/TFs_table_proteins.txt", sep = "\t")

In [23]:
TF_table_formatted[["uniprotID"]].to_csv("../data/TF_table_uniprotIDs.txt",
                               header=None, index=None, sep=' ', mode='a')

In [24]:
uniprot_DBD_output = pd.read_csv("../data/TF_table_DBDs.txt", sep = "\t")
uniprot_DBD_output["DBD_split"] = uniprot_DBD_output["DNA binding"].str.split("DNA_bind")
uniprot_DBD_output

Unnamed: 0,From,Entry,DNA binding,DBD_split
0,O14770,O14770,"DNA_BIND 276..338; /note=""Homeobox; TALE-type""...","[DNA_BIND 276..338; /note=""Homeobox; TALE-type..."
1,O60479,O60479,"DNA_BIND 129..188; /note=""Homeobox""; /evidence...","[DNA_BIND 129..188; /note=""Homeobox""; /evidenc..."
2,O75840,O75840,,
3,O94983,O94983,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO...","[DNA_BIND 30..155; /note=""CG-1""; /evidence=""EC..."
4,O95096,O95096,"DNA_BIND 128..187; /note=""Homeobox""; /evidence...","[DNA_BIND 128..187; /note=""Homeobox""; /evidenc..."
5,P10827,P10827,"DNA_BIND 53..127; /note=""Nuclear receptor""; /e...","[DNA_BIND 53..127; /note=""Nuclear receptor""; /..."
6,P11308,P11308,"DNA_BIND 311..391; /note=""ETS""; /evidence=""ECO...","[DNA_BIND 311..391; /note=""ETS""; /evidence=""EC..."
7,P11473,P11473,"DNA_BIND 21..96; /note=""Nuclear receptor""; /ev...","[DNA_BIND 21..96; /note=""Nuclear receptor""; /e..."
8,P15884,P15884,,
9,P19532,P19532,,


In [25]:
DBDs = uniprot_DBD_output.explode("DBD_split")
DBDs = DBDs.dropna()
DBDs["Start"] = DBDs["DBD_split"].str.extract(r'DNA_BIND (\d*)\.\.')
DBDs["End"] = DBDs["DBD_split"].str.extract(r'DNA_BIND \d*\.\.(\d*)')
DBDs = DBDs[["From", "Start", "End"]]
DBDs = DBDs.rename(columns = {"From" : "uniprotID"})
DBDs["Start"] = DBDs["Start"].astype(int)
DBDs["End"] = DBDs["End"].astype(int)
DBDs["coord"] = DBDs["Start"].astype(str)+"-"+DBDs["End"].astype(str)
DBDs

Unnamed: 0,uniprotID,Start,End,coord
0,O14770,276,338,276-338
1,O60479,129,188,129-188
3,O94983,30,155,30-155
4,O95096,128,187,128-187
5,P10827,53,127,53-127
6,P11308,311,391,311-391
7,P11473,21,96,21-96
10,P20393,129,205,129-205
12,P26367,4,130,4-130
13,P32242,38,97,38-97


In [26]:
merged_DBD_coords = DBDs[["uniprotID", "coord"]].groupby("uniprotID").agg(lambda c: ",".join(c))
# merged_DBD_coords = merged_DBD_coords.drop(["O95718"])
merged_DBD_coords = merged_DBD_coords.reset_index()
merged_DBD_coords

Unnamed: 0,uniprotID,coord
0,O14770,276-338
1,O60479,129-188
2,O94983,30-155
3,O95096,128-187
4,P10827,53-127
5,P11308,311-391
6,P11473,21-96
7,P20393,129-205
8,P26367,4-130
9,P32242,38-97


In [27]:
TF_table_formatted = pd.merge(TF_table_formatted, merged_DBD_coords, how = "left", on = "uniprotID")
TF_table_formatted["DBD_coords"] = TF_table_formatted["coord"]
TF_table_formatted = TF_table_formatted.drop(columns = ["coord"])
TF_table_formatted

Unnamed: 0,1,2,uniprotID,ENSG,ENST,DBD_coords,AD_coords,RD_coords,Bif_coords,length
0,,,O14770,ENSG00000134138,ENST00000561208,276-338,340-477,,,1
1,,,O60479,ENSG00000064195,ENST00000434704,129-188,"199-263,2-91",,,1
2,,,O75840,ENSG00000118263,ENST00000309446,,2-101,,,1
3,,,O94983,ENSG00000108509,ENST00000348066,30-155,"285-468,472-581",,,1
4,,,O95096,ENSG00000125820,ENST00000377142,128-187,220-273,,,1
5,,,P10827,ENSG00000126351,ENST00000264637,53-127,1-52,,,1
6,,,P11308,ENSG00000157554,ENST00000288319,311-391,"433-479,118-261",,,1
7,,,P11473,ENSG00000111424,ENST00000395324,21-96,"415-427,195-238",,,1
8,,,P15884,ENSG00000196628,ENST00000356073,,"1-100,340-400",,,1
9,,,P19532,ENSG00000068323,ENST00000315869,,"212-333,472-575,1-127",,,1


In [28]:
TF_table_formatted = TF_table_formatted[["1","2", "uniprotID", "ENSG", "ENST", "DBD_coords", "AD_coords", "RD_coords", "Bif_coords", "length"]]
TF_table_formatted["DBD_coords"] = TF_table_formatted["DBD_coords"].fillna("NA-NA")
TF_table_formatted

Unnamed: 0,1,2,uniprotID,ENSG,ENST,DBD_coords,AD_coords,RD_coords,Bif_coords,length
0,,,O14770,ENSG00000134138,ENST00000561208,276-338,340-477,,,1
1,,,O60479,ENSG00000064195,ENST00000434704,129-188,"199-263,2-91",,,1
2,,,O75840,ENSG00000118263,ENST00000309446,NA-NA,2-101,,,1
3,,,O94983,ENSG00000108509,ENST00000348066,30-155,"285-468,472-581",,,1
4,,,O95096,ENSG00000125820,ENST00000377142,128-187,220-273,,,1
5,,,P10827,ENSG00000126351,ENST00000264637,53-127,1-52,,,1
6,,,P11308,ENSG00000157554,ENST00000288319,311-391,"433-479,118-261",,,1
7,,,P11473,ENSG00000111424,ENST00000395324,21-96,"415-427,195-238",,,1
8,,,P15884,ENSG00000196628,ENST00000356073,NA-NA,"1-100,340-400",,,1
9,,,P19532,ENSG00000068323,ENST00000315869,NA-NA,"212-333,472-575,1-127",,,1


In [29]:
TF_table_formatted.to_csv("../soto_analysis/outputs/TFs_table_proteins.txt", sep = "\t")