In [245]:
import pandas as pd

## Adding UniProt DBDs to Lambert TFs

In [247]:
# Read supplemental table S1 of Lambert et al. 2018 and keep only the rows
# whose "Is TF?" column is "Yes" (original row labels are preserved).
lambert_TFs = pd.read_excel(
    "../data/lambert_supp_tables.xlsx",
    sheet_name="Table S1. Related to Figure 1B",
)
is_tf = lambert_TFs["Is TF?"].eq("Yes")
lambert_TFs = lambert_TFs.loc[is_tf]
lambert_TFs.head()

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
1,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
2,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
3,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,No,Yes,
4,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Has known motif,1 Monomer or homomultimer,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,
5,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,


In [248]:
# Export the Ensembl gene IDs (ENSG codes) of all Lambert TFs to a CSV with
# no header row and no index column, one ID per line.
lambert_TFs_ENSG = lambert_TFs.loc[:, ["Gene Information"]]
lambert_TFs_ENSG.to_csv(
    "../data/LambertTFs_ENSG.csv",
    header=None,
    index=False,
)
lambert_TFs_ENSG

Unnamed: 0,Gene Information
1,ENSG00000137203
2,ENSG00000008196
3,ENSG00000087510
4,ENSG00000008197
5,ENSG00000116819
...,...
2760,ENSG00000177683
2761,ENSG00000174796
2762,ENSG00000184436
2763,ENSG00000161277


Steps
1. Uploaded to UniProt ID mapping: https://www.uniprot.org/id-mapping
2. Selected: From database "Ensembl" To database "UniProtKB" then hit enter
3. Once search completed, filtered to only reviewed entries
4. Downloaded TSV, and added "DNA binding" column

In [250]:
# Load the tab-separated UniProt ID-mapping export (one row per mapped entry,
# including the "DNA binding" feature column).
uniprot_DBD_output = pd.read_csv(
    "../data/Lambert_TFs_DBD_uniprot.txt",
    sep="\t",
)
uniprot_DBD_output

Unnamed: 0,From,Entry,Reviewed,Entry Name,Gene Names,DNA binding
0,ENSG00000233757,A0A087WUV0,reviewed,ZN892_HUMAN,ZNF892,
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN..."
2,ENSG00000128610,A0PJY2,reviewed,FEZF1_HUMAN,FEZF1 FEZ ZNF312B,
3,ENSG00000164334,A1A519,reviewed,F170A_HUMAN,FAM170A ZNFD,
4,ENSG00000184828,A1YPR0,reviewed,ZBT7C_HUMAN,ZBTB7C APM1 ZBTB36 ZNF857C,
...,...,...,...,...,...,...
1608,ENSG00000124151,Q9Y6Q9,reviewed,NCOA3_HUMAN,NCOA3 AIB1 BHLHE42 RAC3 TRAM1,
1609,ENSG00000128000,Q9Y6R6,reviewed,Z780B_HUMAN,ZNF780B ZNF779,
1610,ENSG00000152217,Q9Y6X0,reviewed,SETBP_HUMAN,SETBP1 KIAA0437,"DNA_BIND 584..596; /note=""A.T hook 1""; DNA_BIN..."
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden..."


In [251]:
# Count the DNA-binding regions annotated per TF: each region in the
# "DNA binding" string is introduced by the "DNA_BIND" key. Entries with no
# annotation stay NaN.
dbd_per_entry = uniprot_DBD_output["DNA binding"].str.count("DNA_BIND")
uniprot_DBD_output = uniprot_DBD_output.assign(DBD_count=dbd_per_entry)
uniprot_DBD_output

Unnamed: 0,From,Entry,Reviewed,Entry Name,Gene Names,DNA binding,DBD_count
0,ENSG00000233757,A0A087WUV0,reviewed,ZN892_HUMAN,ZNF892,,
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0
2,ENSG00000128610,A0PJY2,reviewed,FEZF1_HUMAN,FEZF1 FEZ ZNF312B,,
3,ENSG00000164334,A1A519,reviewed,F170A_HUMAN,FAM170A ZNFD,,
4,ENSG00000184828,A1YPR0,reviewed,ZBT7C_HUMAN,ZBTB7C APM1 ZBTB36 ZNF857C,,
...,...,...,...,...,...,...,...
1608,ENSG00000124151,Q9Y6Q9,reviewed,NCOA3_HUMAN,NCOA3 AIB1 BHLHE42 RAC3 TRAM1,,
1609,ENSG00000128000,Q9Y6R6,reviewed,Z780B_HUMAN,ZNF780B ZNF779,,
1610,ENSG00000152217,Q9Y6X0,reviewed,SETBP_HUMAN,SETBP1 KIAA0437,"DNA_BIND 584..596; /note=""A.T hook 1""; DNA_BIN...",3.0
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0


In [252]:
# Extracting DBDs and saving as integers

# One row per DBD: split the UniProt "DNA binding" string on the "DNA_BIND"
# feature key and explode the resulting lists into separate rows. The text
# before the first "DNA_BIND" is an empty fragment; it holds no coordinates
# and is discarded in the next cell when Start/End cannot be extracted.
uniprot_DBD_output["DBD_split"] = uniprot_DBD_output["DNA binding"].str.split("DNA_BIND")
DBDs = uniprot_DBD_output.explode("DBD_split")

# Drop only the entries that have no DNA-binding annotation at all. A bare
# dropna() would also remove annotated TFs that merely have a missing value
# in an unrelated column (e.g. "Gene Names").
DBDs = DBDs.dropna(subset=["DBD_split"])
DBDs

Unnamed: 0,From,Entry,Reviewed,Entry Name,Gene Names,DNA binding,DBD_count,DBD_split
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0,
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0,"113..182; /evidence=""ECO:0000255"";"
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0,"261..347; /evidence=""ECO:0000255"""
6,ENSG00000188816,A2RU54,reviewed,HMX2_HUMAN,HMX2,"DNA_BIND 149..208; /note=""Homeobox""; /evidence...",1.0,
6,ENSG00000188816,A2RU54,reviewed,HMX2_HUMAN,HMX2,"DNA_BIND 149..208; /note=""Homeobox""; /evidence...",1.0,"149..208; /note=""Homeobox""; /evidence=""ECO:00..."
...,...,...,...,...,...,...,...,...
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0,"439..501; /note=""Homeobox 2""; /evidence=""ECO:..."
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0,"530..591; /note=""Homeobox 3""; /evidence=""ECO:..."
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0,"628..690; /note=""Homeobox 4""; /evidence=""ECO:..."
1612,ENSG00000171735,Q9Y6Y1,reviewed,CMTA1_HUMAN,CAMTA1 KIAA0833 MSTP023,"DNA_BIND 63..188; /note=""CG-1""; /evidence=""ECO...",1.0,


In [253]:
# Formatting: regex to extract the DBD start/end coordinates.
# Each fragment looks like ' 113..182; /evidence="..."'. Both coordinates are
# captured from the same "start..end" match, and at least one digit is
# required on each side (\d+), so a fragment without a numeric range yields
# NaN (and is dropped) instead of an empty string that would make the integer
# conversion below raise.
dbd_coords = DBDs["DBD_split"].str.extract(r'(?P<Start>\d+)\.\.(?P<End>\d+)')

DBDs = (
    DBDs[["From"]]
    # "From" holds the Ensembl gene ID; rename it to match the Lambert table.
    .rename(columns={"From": "Gene Information"})
    .assign(Start=dbd_coords["Start"], End=dbd_coords["End"])
    # Removes the empty leading fragments and anything without coordinates.
    .dropna()
    .astype({"Start": int, "End": int})
    .drop_duplicates()
)
DBDs

Unnamed: 0,Gene Information,Start,End
1,ENSG00000129173,113,182
1,ENSG00000129173,261,347
6,ENSG00000188816,149,208
8,ENSG00000180053,132,191
11,ENSG00000204595,16,75
...,...,...,...
1611,ENSG00000178764,263,324
1611,ENSG00000178764,439,501
1611,ENSG00000178764,530,591
1611,ENSG00000178764,628,690


In [254]:
# Merging back to original Lambert TF table.
# Left join so every Lambert TF is kept; a TF with several DBDs appears once
# per DBD, and a TF without any gets NaN in Start/End (which is why those
# columns become floats). The join key is named explicitly so the merge cannot
# silently start matching on additional columns the two tables may share.
lambert_TFs_with_DBDs = pd.merge(
    lambert_TFs,
    DBDs,
    how="left",
    on="Gene Information",
)
lambert_TFs_with_DBDs

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Start,End
0,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
1,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
2,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,Yes,,,
3,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,,,
4,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,ENSG00000177683,THAP5,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1718,ENSG00000174796,THAP6,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1719,ENSG00000184436,THAP7,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1720,ENSG00000161277,THAP8,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,PMID: 12575992 says it has unique DBD,need revisit as it has C2CH signature,c,Yes,Yes,No,No,,,


In [255]:
# Number of distinct TF genes for which no DBD coordinates were merged in
rows_without_dbd = lambert_TFs_with_DBDs[lambert_TFs_with_DBDs["Start"].isna()]
len(set(rows_without_dbd["Gene Information"]))

1124

In [256]:
# Number of distinct TF genes that have at least one DBD with coordinates
rows_with_dbd = lambert_TFs_with_DBDs[lambert_TFs_with_DBDs["Start"].notna()]
len(set(rows_with_dbd["Gene Information"]))

515

In [257]:
# Total number of rows in the filtered Lambert TF table
lambert_TFs.shape[0]

1639

In [270]:
# Save the merged TF + DBD table (the row index is written as the first column)
lambert_TFs_with_DBDs.to_csv(
    "../output/lambert_TFs_with_uniprot_DBDs.csv",
)

In [282]:
# Rows of the merged table that carry DBD coordinates (one row per DBD)
has_dbd = lambert_TFs_with_DBDs["Start"].notna()
DBDs_found = lambert_TFs_with_DBDs[has_dbd]
DBDs_found

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Start,End
11,ENSG00000189079,ARID2,ARID/BRIGHT; RFX,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,x,Yes,Yes,TF Gene_Transcription Factor Binding: tf co-fa...,No,,524.0,603.0
12,ENSG00000153207,AHCTF1,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Recruits nucleoporins to chromatin [PMID: 1723...,Should not be initial assessment 1a1?,c,No,No,No,No,,1971.0,1983.0
13,ENSG00000126705,AHDC1,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,x,No,No,No,No,,396.0,408.0
14,ENSG00000126705,AHDC1,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,x,No,No,No,No,,544.0,556.0
15,ENSG00000106948,AKNA,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,Binds AT-rich promoters of CD40 and CD40L and ...,,...,Debatable - binds AT-rich promoters of CD40 an...,Debatable - binds AT-rich promoters of CD40 an...,c,No,No,No,Yes,,1115.0,1123.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1701,ENSG00000149922,TBX6,T-box,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,No,,100.0,273.0
1706,ENSG00000187079,TEAD1,TEA,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,No,,28.0,104.0
1707,ENSG00000074219,TEAD2,TEA,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,38.0,114.0
1708,ENSG00000007866,TEAD3,TEA,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,No,,28.0,104.0


In [284]:
# Rows of the merged table with no DBD coordinates (Start is missing)
lacks_dbd = lambert_TFs_with_DBDs["Start"].isna()
DBDs_not_found = lambert_TFs_with_DBDs[lacks_dbd]
DBDs_not_found

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Start,End
0,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
1,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
2,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,Yes,,,
3,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,,,
4,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,ENSG00000177683,THAP5,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1718,ENSG00000174796,THAP6,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1719,ENSG00000184436,THAP7,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1720,ENSG00000161277,THAP8,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,PMID: 12575992 says it has unique DBD,need revisit as it has C2CH signature,c,Yes,Yes,No,No,,,


In [286]:
# List every column of the no-DBD table: the Lambert S1 columns plus the merged Start/End
DBDs_not_found.columns

Index(['Gene Information', 'Unnamed: 1', 'Unnamed: 2', 'Is TF?',
       'Final Assesment', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7',
       'Unnamed: 8', 'Notes from re-reviewed genes', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Curator Disagreement Flags', 'Unnamed: 15', 'Curator 1', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Curator 2', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Prior classifications',
       'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30',
       'Unnamed: 31', 'Start', 'End'],
      dtype='object')

In [300]:
# Preview a second, tab-separated DBD annotation table. header=7 takes the
# column names from zero-indexed row 7 and discards the rows before it.
# NOTE(review): the file name suggests a journal supplementary table -
# confirm and record the source.
pd.read_csv(
    "../data/DBDs/S3_41576_2009_BFnrg2538_MOESM6_ESM.txt",
    sep="\t",
    header=7,
)

Unnamed: 0,Class,Ensembl ID,IPI ID,Interpro DBD,Interpro DNA-binding family,HGNC symbol,Tissue-specificity
0,a,ENSG00000001167,IPI00218470;IPI00333568,,IPR001289,NFYA,
1,a,ENSG00000004848,IPI00045066,IPR000047;IPR001356;IPR009057;IPR012287,,ARX,
2,a,ENSG00000005073,IPI00010754;IPI00807582,IPR001356;IPR009057;IPR012287,,HOXA11,uterus
3,a,ENSG00000005513,IPI00299069;IPI00386303,IPR000910;IPR009071,,SOX8,
4,a,ENSG00000005889,IPI00012336;IPI00169385;IPI00179465;IPI00607877,IPR007087;IPR015880,IPR006794,ZFX,general
...,...,...,...,...,...,...,...
1982,x,ENSG00000215812,IPI00874268,IPR007087;IPR015880,,,
1983,x,ENSG00000215887,IPI00873740,IPR007087;IPR015880,,,
1984,x,ENSG00000218788,IPI00902657,IPR004826;IPR004827;IPR008917,,,
1985,x,ENSG00000219340,IPI00397740,IPR007087;IPR015880,,,
