In [262]:
import pandas as pd
import re

## Adding UniProt DBDs to Lambert TFs

In [510]:
# Loading in supplemental table 1 from Lambert et al 2018
lambert_TFs = pd.read_excel("../data/lambert_supp_tables.xlsx", sheet_name = "Table S1. Related to Figure 1B")
lambert_TFs = lambert_TFs[lambert_TFs["Is TF?"] == "Yes"]
lambert_TFs.head()

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
1,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
2,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
3,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,No,Yes,
4,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Has known motif,1 Monomer or homomultimer,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,
5,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,


In [511]:
# Downloading all Lambert TF ENSG codes
lambert_TFs_ENSG = lambert_TFs[["Gene Information"]]
lambert_TFs_ENSG.to_csv("../data/LambertTFs_ENSG.csv", header = None, index = False)
lambert_TFs_ENSG

Unnamed: 0,Gene Information
1,ENSG00000137203
2,ENSG00000008196
3,ENSG00000087510
4,ENSG00000008197
5,ENSG00000116819
...,...
2760,ENSG00000177683
2761,ENSG00000174796
2762,ENSG00000184436
2763,ENSG00000161277


Steps
1. Uploaded to UniProt ID mapping: https://www.uniprot.org/id-mapping
2. Selected: From database "Ensembl" To database "UniProtKB" then hit enter
4. Once search completed, filtered to only reviewed entries
5. Downloaded TSV, and added "DNA binding" column and "regions"

In [513]:
# Output from uniprot
uniprot_DBD_output = pd.read_csv("../data/Lambert_TFs_DBD_uniprot_more_info.txt", sep = "\t")
uniprot_DBD_output

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,DNA binding,Coiled coil,Compositional bias,Domain [CC],Domain [FT],Region,Repeat,Protein families,Motif,Sequence similarities,Zinc finger
0,ENSG00000233757,A0A087WUV0,reviewed,ZN892_HUMAN,Zinc finger protein 892,ZNF892,Homo sapiens (Human),522,,,"COMPBIAS 1..19; /note=""Basic and acidic residu...",,,"REGION 1..22; /note=""Disordered""; /evidence=""E...",,Krueppel C2H2-type zinc-finger protein family,,SIMILARITY: Belongs to the krueppel C2H2-type ...,"ZN_FING 221..243; /note=""C2H2-type 1""; /eviden..."
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,Transcription factor E2F8 (E2F-8),E2F8,Homo sapiens (Human),867,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",,"COMPBIAS 410..432; /note=""Polar residues""; /ev...",DOMAIN: In contrast to classical members of th...,,"REGION 38..58; /note=""Disordered""; /evidence=""...",,E2F/DP family,,SIMILARITY: Belongs to the E2F/DP family. {ECO...,
2,ENSG00000128610,A0PJY2,reviewed,FEZF1_HUMAN,Fez family zinc finger protein 1 (Zinc finger ...,FEZF1 FEZ ZNF312B,Homo sapiens (Human),475,,,"COMPBIAS 436..466; /note=""Pro residues""; /evid...",,,"REGION 428..475; /note=""Disordered""; /evidence...",,Krueppel C2H2-type zinc-finger protein family,"MOTIF 28..43; /note=""Engrailed homology 1 repr...",SIMILARITY: Belongs to the krueppel C2H2-type ...,"ZN_FING 260..282; /note=""C2H2-type 1""; /eviden..."
3,ENSG00000164334,A1A519,reviewed,F170A_HUMAN,Protein FAM170A (Zinc finger domain-containing...,FAM170A ZNFD,Homo sapiens (Human),330,,,"COMPBIAS 30..54; /note=""Polar residues""; /evid...",DOMAIN: The N-terminus is necessary for nuclea...,,"REGION 1..54; /note=""Disordered""; /evidence=""E...",,FAM170 family,,SIMILARITY: Belongs to the FAM170 family. {ECO...,"ZN_FING 228..252; /note=""C2H2-type; degenerate"""
4,ENSG00000184828,A1YPR0,reviewed,ZBT7C_HUMAN,Zinc finger and BTB domain-containing protein ...,ZBTB7C APM1 ZBTB36 ZNF857C,Homo sapiens (Human),619,,,"COMPBIAS 130..176; /note=""Acidic residues""; /e...",,"DOMAIN 34..101; /note=""BTB""; /evidence=""ECO:00...","REGION 129..218; /note=""Disordered""; /evidence...",,,,,"ZN_FING 364..386; /note=""C2H2-type 1""; /eviden..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1608,ENSG00000124151,Q9Y6Q9,reviewed,NCOA3_HUMAN,Nuclear receptor coactivator 3 (NCoA-3) (EC 2....,NCOA3 AIB1 BHLHE42 RAC3 TRAM1,Homo sapiens (Human),1424,,,"COMPBIAS 503..520; /note=""Polar residues""; /ev...",DOMAIN: Contains three Leu-Xaa-Xaa-Leu-Leu (LX...,"DOMAIN 25..82; /note=""bHLH""; /evidence=""ECO:00...","REGION 1..38; /note=""Disordered""; /evidence=""E...",,SRC/p160 nuclear receptor coactivator family,"MOTIF 685..689; /note=""LXXLL motif 1""; MOTIF 7...",SIMILARITY: Belongs to the SRC/p160 nuclear re...,
1609,ENSG00000128000,Q9Y6R6,reviewed,Z780B_HUMAN,Zinc finger protein 780B (Zinc finger protein ...,ZNF780B ZNF779,Homo sapiens (Human),833,,,,,"DOMAIN 6..77; /note=""KRAB""; /evidence=""ECO:000...",,,Krueppel C2H2-type zinc-finger protein family,,SIMILARITY: Belongs to the krueppel C2H2-type ...,"ZN_FING 165..187; /note=""C2H2-type 1""; /eviden..."
1610,ENSG00000152217,Q9Y6X0,reviewed,SETBP_HUMAN,SET-binding protein (SEB),SETBP1 KIAA0437,Homo sapiens (Human),1596,"DNA_BIND 584..596; /note=""A.T hook 1""; DNA_BIN...",,"COMPBIAS 145..163; /note=""Basic and acidic res...",,,"REGION 1..83; /note=""Disordered""; /evidence=""E...","REPEAT 1520..1527; /note=""1""; REPEAT 1528..153...",,,,
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,Zinc fingers and homeoboxes protein 2 (Alpha-f...,ZHX2 AFR1 KIAA0854 RAF,Homo sapiens (Human),837,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",,"COMPBIAS 164..182; /note=""Polar residues""; /ev...",,,"REGION 27..77; /note=""Interaction with EFNB1"";...",,ZHX family,,SIMILARITY: Belongs to the ZHX family. {ECO:00...,"ZN_FING 78..101; /note=""C2H2-type 1""; /evidenc..."


#### Plan
- Get info from:
1. DNA binding
2. Domain [FT]
3. Region
4. Zinc finger

In [515]:
uniprot_DBD_output = uniprot_DBD_output.fillna("")

In [516]:
# NO info for these
uniprot_DBD_output[(uniprot_DBD_output["DNA binding"] == "") & (uniprot_DBD_output["Region"] == "") & (uniprot_DBD_output["Zinc finger"] == "") & (uniprot_DBD_output["Domain [FT]"] == "")]

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,DNA binding,Coiled coil,Compositional bias,Domain [CC],Domain [FT],Region,Repeat,Protein families,Motif,Sequence similarities,Zinc finger
33,ENSG00000169689,A8MT69,reviewed,CENPX_HUMAN,Centromere protein X (CENP-X) (FANCM-associate...,CENPX FAAP10 MHF2 STRA13,Homo sapiens (Human),81,,,,,,,,CENP-X/MHF2 family,,SIMILARITY: Belongs to the CENP-X/MHF2 family....,
230,ENSG00000237765,P0CF97,reviewed,F200B_HUMAN,Protein FAM200B,FAM200B C4orf53,Homo sapiens (Human),657,,,,,,,,FAM200 family,,SIMILARITY: Belongs to the FAM200 family. {ECO...,
511,ENSG00000028839,P62380,reviewed,TBPL1_HUMAN,TATA box-binding protein-like 1 (TBP-like 1) (...,TBPL1 TLF TLP TLP21 TRF2 TRP,Homo sapiens (Human),186,,,,,,,,TBP family,,SIMILARITY: Belongs to the TBP family. {ECO:00...,
654,ENSG00000257923,Q13948,reviewed,CASP_HUMAN,Protein CASP,CUX1 CUTL1,Homo sapiens (Human),678,,"COILED 67..450; /evidence=""ECO:0000255""; COILE...",,,,,,CASP family,,SIMILARITY: Belongs to the CASP family. {ECO:0...,
775,ENSG00000120832,Q49AM1,reviewed,MTEF2_HUMAN,"Transcription termination factor 2, mitochondr...",MTERF2 MTERFD3,Homo sapiens (Human),385,,,,,,,,MTERF family,,SIMILARITY: Belongs to the mTERF family. {ECO:...,
844,ENSG00000174428,Q6EKJ0,reviewed,GTD2B_HUMAN,General transcription factor II-I repeat domai...,GTF2IRD2B,Homo sapiens (Human),949,,,,,,,"REPEAT 98..192; /note=""GTF2I-like 1""; REPEAT 3...",TFII-I family,,SIMILARITY: Belongs to the TFII-I family. {ECO...,
951,ENSG00000196275,Q86UP8,reviewed,GTD2A_HUMAN,General transcription factor II-I repeat domai...,GTF2IRD2 GTF2IRD2A,Homo sapiens (Human),949,,,,,,,"REPEAT 98..192; /note=""GTF2I-like 1""; REPEAT 3...",TFII-I family,,SIMILARITY: Belongs to the TFII-I family. {ECO...,
1120,ENSG00000198715,Q8WWB7,reviewed,GLMP_HUMAN,Glycosylated lysosomal membrane protein (Lysos...,GLMP C1orf85 PSEC0030 UNQ2553/PRO6182,Homo sapiens (Human),406,,,,,,,,GLMP family,"MOTIF 402..406; /note=""Lysosomal targeting mot...",SIMILARITY: Belongs to the GLMP family. {ECO:0...,
1495,ENSG00000163320,Q9UFW8,reviewed,CGBP1_HUMAN,CGG triplet repeat-binding protein 1 (CGG-bind...,CGGBP1 CGGBP,Homo sapiens (Human),167,,,,,,,,,"MOTIF 80..84; /note=""Nuclear localization sign...",,


In [517]:
# Will add to this
DBD_list = uniprot_DBD_output.copy(deep = True)[["From", "Entry", "Entry Name"]]
DBD_list

Unnamed: 0,From,Entry,Entry Name
0,ENSG00000233757,A0A087WUV0,ZN892_HUMAN
1,ENSG00000129173,A0AVK6,E2F8_HUMAN
2,ENSG00000128610,A0PJY2,FEZF1_HUMAN
3,ENSG00000164334,A1A519,F170A_HUMAN
4,ENSG00000184828,A1YPR0,ZBT7C_HUMAN
...,...,...,...
1608,ENSG00000124151,Q9Y6Q9,NCOA3_HUMAN
1609,ENSG00000128000,Q9Y6R6,Z780B_HUMAN
1610,ENSG00000152217,Q9Y6X0,SETBP_HUMAN
1611,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN


1. DNA Binding column

In [519]:
# DNA_binding_col_matches = uniprot_DBD_output["DNA binding"].str.extractall(r'DNA_BIND (\d+\.\.\d+)')
# DNA_binding_col_matches = DNA_binding_col_matches.reset_index()
# DNA_binding_col_matches

In [520]:
DBD_list = pd.merge(DBD_list, DNA_binding_col_matches, left_index = True, right_on = "level_0", how = "left")
DBD_list

Unnamed: 0,From,Entry,Entry Name,level_0,match,0
,ENSG00000233757,A0A087WUV0,ZN892_HUMAN,0,,
0.0,ENSG00000129173,A0AVK6,E2F8_HUMAN,1,0.0,113..182
1.0,ENSG00000129173,A0AVK6,E2F8_HUMAN,1,1.0,261..347
,ENSG00000128610,A0PJY2,FEZF1_HUMAN,2,,
,ENSG00000164334,A1A519,F170A_HUMAN,3,,
...,...,...,...,...,...,...
593.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,0.0,263..324
594.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,1.0,439..501
595.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,2.0,530..591
596.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,3.0,628..690


In [521]:
DBD_list = DBD_list.fillna("")
DBD_list

Unnamed: 0,From,Entry,Entry Name,level_0,match,0
,ENSG00000233757,A0A087WUV0,ZN892_HUMAN,0,,
0.0,ENSG00000129173,A0AVK6,E2F8_HUMAN,1,0.0,113..182
1.0,ENSG00000129173,A0AVK6,E2F8_HUMAN,1,1.0,261..347
,ENSG00000128610,A0PJY2,FEZF1_HUMAN,2,,
,ENSG00000164334,A1A519,F170A_HUMAN,3,,
...,...,...,...,...,...,...
593.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,0.0,263..324
594.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,1.0,439..501
595.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,2.0,530..591
596.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,3.0,628..690


In [522]:
start_ints = DBD_list[0].str.extract(r'(\d+)\.\.')[0]
end_ints =  DBD_list[0].str.extract(r'\.\.(\d+)')[0]
DBD_list[0] = list(zip(start_ints, end_ints))
DBD_list

Unnamed: 0,From,Entry,Entry Name,level_0,match,0
,ENSG00000233757,A0A087WUV0,ZN892_HUMAN,0,,"(nan, nan)"
0.0,ENSG00000129173,A0AVK6,E2F8_HUMAN,1,0.0,"(113, 182)"
1.0,ENSG00000129173,A0AVK6,E2F8_HUMAN,1,1.0,"(261, 347)"
,ENSG00000128610,A0PJY2,FEZF1_HUMAN,2,,"(nan, nan)"
,ENSG00000164334,A1A519,F170A_HUMAN,3,,"(nan, nan)"
...,...,...,...,...,...,...
593.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,0.0,"(263, 324)"
594.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,1.0,"(439, 501)"
595.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,2.0,"(530, 591)"
596.0,ENSG00000178764,Q9Y6X8,ZHX2_HUMAN,1611,3.0,"(628, 690)"


In [526]:
#DBD_list[0] = DBD_list[0].astype(str)
DBD_list_func_joined = DBD_list.groupby(["From", "Entry", "Entry Name"])[0].apply(list).reset_index()
DBD_list_func_joined

Unnamed: 0,From,Entry,Entry Name,0
0,ENSG00000001167,P23511,NFYA_HUMAN,"[(296, 321)]"
1,ENSG00000004848,Q96QS3,ARX_HUMAN,"[(328, 387)]"
2,ENSG00000005073,P31270,HXA11_HUMAN,"[(241, 300)]"
3,ENSG00000005102,P50221,MEOX1_HUMAN,"[(171, 230)]"
4,ENSG00000005513,P57073,SOX8_HUMAN,"[(102, 170)]"
...,...,...,...,...
1608,ENSG00000277258,P35227,PCGF2_HUMAN,"[(nan, nan)]"
1609,ENSG00000277462,Q9BS34,ZN670_HUMAN,"[(nan, nan)]"
1610,ENSG00000278129,P17098,ZNF8_HUMAN,"[(nan, nan)]"
1611,ENSG00000278318,Q9UJW7,ZN229_HUMAN,"[(nan, nan)]"


In [535]:
DBD_list_func_joined = DBD_list_func_joined.rename(columns = {0: "Function"})
DBD_list_func_joined

Unnamed: 0,From,Entry,Entry Name,Function
0,ENSG00000001167,P23511,NFYA_HUMAN,"[(296, 321)]"
1,ENSG00000004848,Q96QS3,ARX_HUMAN,"[(328, 387)]"
2,ENSG00000005073,P31270,HXA11_HUMAN,"[(241, 300)]"
3,ENSG00000005102,P50221,MEOX1_HUMAN,"[(171, 230)]"
4,ENSG00000005513,P57073,SOX8_HUMAN,"[(102, 170)]"
...,...,...,...,...
1608,ENSG00000277258,P35227,PCGF2_HUMAN,"[(nan, nan)]"
1609,ENSG00000277462,Q9BS34,ZN670_HUMAN,"[(nan, nan)]"
1610,ENSG00000278129,P17098,ZNF8_HUMAN,"[(nan, nan)]"
1611,ENSG00000278318,Q9UJW7,ZN229_HUMAN,"[(nan, nan)]"


---
2. Now, using Regions

In [538]:
uniprot_DBD_output["Region"]

0       REGION 1..22; /note="Disordered"; /evidence="E...
1       REGION 38..58; /note="Disordered"; /evidence="...
2       REGION 428..475; /note="Disordered"; /evidence...
3       REGION 1..54; /note="Disordered"; /evidence="E...
4       REGION 129..218; /note="Disordered"; /evidence...
                              ...                        
1608    REGION 1..38; /note="Disordered"; /evidence="E...
1609                                                     
1610    REGION 1..83; /note="Disordered"; /evidence="E...
1611    REGION 27..77; /note="Interaction with EFNB1";...
1612    REGION 283..375; /note="Disordered"; /evidence...
Name: Region, Length: 1613, dtype: object

In [540]:
for _ in uniprot_DBD_output["Region"].str.extractall(r'note="(.+?)"').reset_index().sort_values(by = 0)[0].unique():
    print(_)

10 X 4 AA tandem repeats of S-P-[RGMKC]-[RK]
11 X 7 AA tandem repeats of [DR]-P-Y-R-[LI][AG][QHP]
13 X 2 AA tandem repeats of G-Q
14 X 6 AA repeats of [ED]-R-S-M-M-S
15 X 9 AA tandem repeats of P-P-x-x-P-x-P-P-x
17 X 10 AA tandem repeats of L-A-[ST]-[NSG]-[TS]-MDSQM
2 X 19 AA repeats of P-S-R-R-R-R-S-R-S-V-V-R-R-R-S-F-S-I-S
2 X 2 AA tandem repeats of G-Q
2 approximate SP repeats
3 X 11 AA tandem repats of P-P-L-P-P-E-E-P-P-[TME]-[MTG]
3 X 12 AA approximate repeats
3 X 7 AA repeat of P-E-V-E-A-A-E
3 X 8 AA tandem repeats of P-P-L-P-P-P-P-P
3 X SP repeats
3 X approximate SP repeats
3 X tandem repeats of [ST]-P-[VLI]-R-[RL]-[RK]-[RF]-S-R
4 X 5 AA repeat of P-X-G-E-A
4 X 67 AA tandem repeats
4 X 7 AA repeats of P-[LV]-T-[IL]-T-[ST]-P
4 X 8 AA tandem repeats of V-L-E-SS-[AVT]-VT
4 X approximate tandem repeats
6 X 2 AA tandem repeats of K-G
6 X 4 AA tandem repeats of S-P-X-[RK]
7 X 7 AA repeats of P-S-R-R-S-R-[TS]
8 X repeats starting with a Trp in each unit
A domain
AF-2
AF-2 domain
AF1; me

In [542]:
for _ in uniprot_DBD_output["Region"].str.extractall(r'note="(.+?)"').reset_index().sort_values(by = 0)[0].unique():
    if "DNA" in _:
        print(_)

Basic (repression of DNA-binding)
Binding to DNA
Core promoter DNA-binding
Critical for glycosaminoglycan, lipid A, lysozyme and DNA binding
DNA replication foci-targeting sequence
DNA-binding
DNA-binding domain
DNA-binding regulation
DNA-binding; major groove
DNA-binding; minor groove
Important for DNA and nucleosome binding
Important for flexibility of DNA ends that protrude from nucleosomes
Important for interaction with target DNA
Interaction with 5-mCpG DNA
Interaction with DNA
Interaction with ss-DNA
Interaction with target DNA
Involved in DNA-binding
Mediates dimerization, DNA-binding, transcription repression of CCNA2 and interaction with HMGA2
Mediates promoter DNA-binding and activation of transcription
No DNA binding activity or transactivation activity, but complete prevention of TRAF-dependent NF-Kappa-B activation; associates with TRAF2 and JUN
Required for DNA binding
Required for DNA-PK heterotrimer
Required for DNA-binding
Required for DNA-binding and interaction with 

Keep the following regions:

DNA-binding
DNA-binding domain
DNA-binding; major groove
DNA-binding; minor groove

In [545]:
DNA_binding_region_terms = ["DNA-binding", "DNA-binding domain", "DNA-binding; major groove", "DNA-binding; minor groove"]
DNA_binding_region_terms

['DNA-binding',
 'DNA-binding domain',
 'DNA-binding; major groove',
 'DNA-binding; minor groove']

In [547]:
pat = r'(REGION (\d+\.\.\d+?); \/note="(.+?)"?;)'
string_to_search = 'REGION 1..22; /note="DNA-binding; minor groove"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 96..124; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"'
print(string_to_search)

REGION 1..22; /note="DNA-binding; minor groove"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 96..124; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"


In [549]:
re.findall(pat, string_to_search)

[('REGION 1..22; /note="DNA-binding;', '1..22', 'DNA-binding'),
 ('REGION 96..124; /note="Disordered";', '96..124', 'Disordered')]

In [551]:
import numpy as np
np.nan

nan

In [553]:
DBD_list_func_joined

Unnamed: 0,From,Entry,Entry Name,Function
0,ENSG00000001167,P23511,NFYA_HUMAN,"[(296, 321)]"
1,ENSG00000004848,Q96QS3,ARX_HUMAN,"[(328, 387)]"
2,ENSG00000005073,P31270,HXA11_HUMAN,"[(241, 300)]"
3,ENSG00000005102,P50221,MEOX1_HUMAN,"[(171, 230)]"
4,ENSG00000005513,P57073,SOX8_HUMAN,"[(102, 170)]"
...,...,...,...,...
1608,ENSG00000277258,P35227,PCGF2_HUMAN,"[(nan, nan)]"
1609,ENSG00000277462,Q9BS34,ZN670_HUMAN,"[(nan, nan)]"
1610,ENSG00000278129,P17098,ZNF8_HUMAN,"[(nan, nan)]"
1611,ENSG00000278318,Q9UJW7,ZN229_HUMAN,"[(nan, nan)]"


In [555]:
region_DBDs = []

for i in uniprot_DBD_output.index:
    row_DBDs = []
    
    row = uniprot_DBD_output.iloc[i]
    region_descrip = row["Region"]
    pat = r'(REGION (\d+\.\.\d+?); \/note="(.+?)"?;)'
    regex_find = re.findall(pat, region_descrip)
    
    for find in regex_find:
        if find[2] in DNA_binding_region_terms:
            coords = find[1]
            start = re.search(r'(\d+)', coords)[0]
            end = re.search(r'\d+\.\.(\d+)', coords)[1]
            row_DBDs.append((start, end))
            
    if len(row_DBDs) == 0:
        row_DBDs.append((np.nan, np.nan))
    else:
        print(row["Entry Name"])
        print(row_DBDs)
        
        
    region_DBDs.append(row_DBDs)

uniprot_DBD_output["Region_extrac"] = region_DBDs
uniprot_DBD_output

P73_HUMAN
[('131', '310')]
ARNT_HUMAN
[('88', '128')]
AHR_HUMAN
[('38', '66')]
HSF1_HUMAN
[('15', '120')]
FOXK2_HUMAN
[('300', '318'), ('328', '332'), ('348', '353')]
PO5F1_HUMAN
[('180', '186'), ('193', '196')]
SUH_HUMAN
[('57', '67'), ('165', '170'), ('192', '197')]
FOXO1_HUMAN
[('211', '218'), ('234', '237')]
TFCP2_HUMAN
[('133', '395')]
HIF1A_HUMAN
[('21', '30')]
LIN54_HUMAN
[('523', '536'), ('599', '612')]
NPAS3_HUMAN
[('53', '64')]
PURB_HUMAN
[('28', '254')]
EPAS1_HUMAN
[('26', '53')]
PRDM9_HUMAN
[('730', '820')]
RBPJL_HUMAN
[('78', '88'), ('193', '198'), ('220', '225')]


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,DNA binding,Coiled coil,Compositional bias,Domain [CC],Domain [FT],Region,Repeat,Protein families,Motif,Sequence similarities,Zinc finger,Region_extrac
0,ENSG00000233757,A0A087WUV0,reviewed,ZN892_HUMAN,Zinc finger protein 892,ZNF892,Homo sapiens (Human),522,,,"COMPBIAS 1..19; /note=""Basic and acidic residu...",,,"REGION 1..22; /note=""Disordered""; /evidence=""E...",,Krueppel C2H2-type zinc-finger protein family,,SIMILARITY: Belongs to the krueppel C2H2-type ...,"ZN_FING 221..243; /note=""C2H2-type 1""; /eviden...","[(nan, nan)]"
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,Transcription factor E2F8 (E2F-8),E2F8,Homo sapiens (Human),867,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",,"COMPBIAS 410..432; /note=""Polar residues""; /ev...",DOMAIN: In contrast to classical members of th...,,"REGION 38..58; /note=""Disordered""; /evidence=""...",,E2F/DP family,,SIMILARITY: Belongs to the E2F/DP family. {ECO...,,"[(nan, nan)]"
2,ENSG00000128610,A0PJY2,reviewed,FEZF1_HUMAN,Fez family zinc finger protein 1 (Zinc finger ...,FEZF1 FEZ ZNF312B,Homo sapiens (Human),475,,,"COMPBIAS 436..466; /note=""Pro residues""; /evid...",,,"REGION 428..475; /note=""Disordered""; /evidence...",,Krueppel C2H2-type zinc-finger protein family,"MOTIF 28..43; /note=""Engrailed homology 1 repr...",SIMILARITY: Belongs to the krueppel C2H2-type ...,"ZN_FING 260..282; /note=""C2H2-type 1""; /eviden...","[(nan, nan)]"
3,ENSG00000164334,A1A519,reviewed,F170A_HUMAN,Protein FAM170A (Zinc finger domain-containing...,FAM170A ZNFD,Homo sapiens (Human),330,,,"COMPBIAS 30..54; /note=""Polar residues""; /evid...",DOMAIN: The N-terminus is necessary for nuclea...,,"REGION 1..54; /note=""Disordered""; /evidence=""E...",,FAM170 family,,SIMILARITY: Belongs to the FAM170 family. {ECO...,"ZN_FING 228..252; /note=""C2H2-type; degenerate""","[(nan, nan)]"
4,ENSG00000184828,A1YPR0,reviewed,ZBT7C_HUMAN,Zinc finger and BTB domain-containing protein ...,ZBTB7C APM1 ZBTB36 ZNF857C,Homo sapiens (Human),619,,,"COMPBIAS 130..176; /note=""Acidic residues""; /e...",,"DOMAIN 34..101; /note=""BTB""; /evidence=""ECO:00...","REGION 129..218; /note=""Disordered""; /evidence...",,,,,"ZN_FING 364..386; /note=""C2H2-type 1""; /eviden...","[(nan, nan)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1608,ENSG00000124151,Q9Y6Q9,reviewed,NCOA3_HUMAN,Nuclear receptor coactivator 3 (NCoA-3) (EC 2....,NCOA3 AIB1 BHLHE42 RAC3 TRAM1,Homo sapiens (Human),1424,,,"COMPBIAS 503..520; /note=""Polar residues""; /ev...",DOMAIN: Contains three Leu-Xaa-Xaa-Leu-Leu (LX...,"DOMAIN 25..82; /note=""bHLH""; /evidence=""ECO:00...","REGION 1..38; /note=""Disordered""; /evidence=""E...",,SRC/p160 nuclear receptor coactivator family,"MOTIF 685..689; /note=""LXXLL motif 1""; MOTIF 7...",SIMILARITY: Belongs to the SRC/p160 nuclear re...,,"[(nan, nan)]"
1609,ENSG00000128000,Q9Y6R6,reviewed,Z780B_HUMAN,Zinc finger protein 780B (Zinc finger protein ...,ZNF780B ZNF779,Homo sapiens (Human),833,,,,,"DOMAIN 6..77; /note=""KRAB""; /evidence=""ECO:000...",,,Krueppel C2H2-type zinc-finger protein family,,SIMILARITY: Belongs to the krueppel C2H2-type ...,"ZN_FING 165..187; /note=""C2H2-type 1""; /eviden...","[(nan, nan)]"
1610,ENSG00000152217,Q9Y6X0,reviewed,SETBP_HUMAN,SET-binding protein (SEB),SETBP1 KIAA0437,Homo sapiens (Human),1596,"DNA_BIND 584..596; /note=""A.T hook 1""; DNA_BIN...",,"COMPBIAS 145..163; /note=""Basic and acidic res...",,,"REGION 1..83; /note=""Disordered""; /evidence=""E...","REPEAT 1520..1527; /note=""1""; REPEAT 1528..153...",,,,,"[(nan, nan)]"
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,Zinc fingers and homeoboxes protein 2 (Alpha-f...,ZHX2 AFR1 KIAA0854 RAF,Homo sapiens (Human),837,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",,"COMPBIAS 164..182; /note=""Polar residues""; /ev...",,,"REGION 27..77; /note=""Interaction with EFNB1"";...",,ZHX family,,SIMILARITY: Belongs to the ZHX family. {ECO:00...,"ZN_FING 78..101; /note=""C2H2-type 1""; /evidenc...","[(nan, nan)]"


In [561]:
DBD_list_func_joined = pd.merge(DBD_list_func_joined, uniprot_DBD_output[["Entry", "Region_extrac"]], on = "Entry", how = "left")
DBD_list_func_joined

Unnamed: 0,From,Entry,Entry Name,Function,Region_extrac
0,ENSG00000001167,P23511,NFYA_HUMAN,"[(296, 321)]","[(nan, nan)]"
1,ENSG00000004848,Q96QS3,ARX_HUMAN,"[(328, 387)]","[(nan, nan)]"
2,ENSG00000005073,P31270,HXA11_HUMAN,"[(241, 300)]","[(nan, nan)]"
3,ENSG00000005102,P50221,MEOX1_HUMAN,"[(171, 230)]","[(nan, nan)]"
4,ENSG00000005513,P57073,SOX8_HUMAN,"[(102, 170)]","[(nan, nan)]"
...,...,...,...,...,...
1608,ENSG00000277258,P35227,PCGF2_HUMAN,"[(nan, nan)]","[(nan, nan)]"
1609,ENSG00000277462,Q9BS34,ZN670_HUMAN,"[(nan, nan)]","[(nan, nan)]"
1610,ENSG00000278129,P17098,ZNF8_HUMAN,"[(nan, nan)]","[(nan, nan)]"
1611,ENSG00000278318,Q9UJW7,ZN229_HUMAN,"[(nan, nan)]","[(nan, nan)]"


In [563]:
null_val = DBD_list_func_joined["Function"].value_counts().reset_index()["Function"].iloc[0]
null_val

[(nan, nan)]

In [565]:
null_val[0]

(nan, nan)

In [569]:
DBD_list_func_joined[[null_val[0] not in _ for _ in DBD_list_func_joined["Function"]] and [null_val[0] not in _ for _ in DBD_list_func_joined["Region_extrac"]]]

Unnamed: 0,From,Entry,Entry Name,Function,Region_extrac
105,ENSG00000078900,O15350,P73_HUMAN,"[(nan, nan)]","[(131, 310)]"
158,ENSG00000100644,Q16665,HIF1A_HUMAN,"[(nan, nan)]","[(21, 30)]"
229,ENSG00000106546,P35869,AHR_HUMAN,"[(nan, nan)]","[(38, 66)]"
305,ENSG00000116016,Q99814,EPAS1_HUMAN,"[(nan, nan)]","[(26, 53)]"
400,ENSG00000124232,Q9UBG7,RBPJL_HUMAN,"[(nan, nan)]","[(78, 88), (193, 198), (220, 225)]"
533,ENSG00000135457,Q12800,TFCP2_HUMAN,"[(nan, nan)]","[(133, 395)]"
602,ENSG00000141568,Q01167,FOXK2_HUMAN,"[(258, 353)]","[(300, 318), (328, 332), (348, 353)]"
630,ENSG00000143437,P27540,ARNT_HUMAN,"[(nan, nan)]","[(88, 128)]"
650,ENSG00000146676,Q96QR8,PURB_HUMAN,"[(nan, nan)]","[(28, 254)]"
677,ENSG00000150907,Q12778,FOXO1_HUMAN,"[(159, 235)]","[(211, 218), (234, 237)]"


In [None]:
filtered_list = [item for item in DBD_list_func_joined["Function"] if null_val[0] in item]


In [484]:
uniprot_DBD_output[uniprot_DBD_output["Entry Name"] == "SOX4_HUMAN"]["Region"].iloc[0]

'REGION 1..58; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 128..228; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 262..286; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 302..416; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"'

In [486]:
uniprot_DBD_output[uniprot_DBD_output["Entry Name"] == "SOX4_HUMAN"]["DNA binding"].iloc[0]

'DNA_BIND 59..127; /note="HMG box"; /evidence="ECO:0000255|PROSITE-ProRule:PRU00267"'

In [251]:
# Checking how many DBDs found per TF
uniprot_DBD_output["DBD_count"] = uniprot_DBD_output["DNA binding"].str.count("DNA_BIND")
uniprot_DBD_output

Unnamed: 0,From,Entry,Reviewed,Entry Name,Gene Names,DNA binding,DBD_count
0,ENSG00000233757,A0A087WUV0,reviewed,ZN892_HUMAN,ZNF892,,
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0
2,ENSG00000128610,A0PJY2,reviewed,FEZF1_HUMAN,FEZF1 FEZ ZNF312B,,
3,ENSG00000164334,A1A519,reviewed,F170A_HUMAN,FAM170A ZNFD,,
4,ENSG00000184828,A1YPR0,reviewed,ZBT7C_HUMAN,ZBTB7C APM1 ZBTB36 ZNF857C,,
...,...,...,...,...,...,...,...
1608,ENSG00000124151,Q9Y6Q9,reviewed,NCOA3_HUMAN,NCOA3 AIB1 BHLHE42 RAC3 TRAM1,,
1609,ENSG00000128000,Q9Y6R6,reviewed,Z780B_HUMAN,ZNF780B ZNF779,,
1610,ENSG00000152217,Q9Y6X0,reviewed,SETBP_HUMAN,SETBP1 KIAA0437,"DNA_BIND 584..596; /note=""A.T hook 1""; DNA_BIN...",3.0
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0


In [252]:
# Extracting DBDs and saving as integers

# One row per DBD
uniprot_DBD_output["DBD_split"] = uniprot_DBD_output["DNA binding"].str.split("DNA_BIND")
DBDs = uniprot_DBD_output.explode("DBD_split")
DBDs = DBDs.dropna()
DBDs

Unnamed: 0,From,Entry,Reviewed,Entry Name,Gene Names,DNA binding,DBD_count,DBD_split
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0,
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0,"113..182; /evidence=""ECO:0000255"";"
1,ENSG00000129173,A0AVK6,reviewed,E2F8_HUMAN,E2F8,"DNA_BIND 113..182; /evidence=""ECO:0000255""; DN...",2.0,"261..347; /evidence=""ECO:0000255"""
6,ENSG00000188816,A2RU54,reviewed,HMX2_HUMAN,HMX2,"DNA_BIND 149..208; /note=""Homeobox""; /evidence...",1.0,
6,ENSG00000188816,A2RU54,reviewed,HMX2_HUMAN,HMX2,"DNA_BIND 149..208; /note=""Homeobox""; /evidence...",1.0,"149..208; /note=""Homeobox""; /evidence=""ECO:00..."
...,...,...,...,...,...,...,...,...
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0,"439..501; /note=""Homeobox 2""; /evidence=""ECO:..."
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0,"530..591; /note=""Homeobox 3""; /evidence=""ECO:..."
1611,ENSG00000178764,Q9Y6X8,reviewed,ZHX2_HUMAN,ZHX2 AFR1 KIAA0854 RAF,"DNA_BIND 263..324; /note=""Homeobox 1""; /eviden...",4.0,"628..690; /note=""Homeobox 4""; /evidence=""ECO:..."
1612,ENSG00000171735,Q9Y6Y1,reviewed,CMTA1_HUMAN,CAMTA1 KIAA0833 MSTP023,"DNA_BIND 63..188; /note=""CG-1""; /evidence=""ECO...",1.0,


In [253]:
# Formatting, regex to extact DBDs
DBDs["Start"] = DBDs["DBD_split"].str.extract(r'(\d*)\.\.')
DBDs["End"] = DBDs["DBD_split"].str.extract(r'\d*\.\.(\d*)')
DBDs = DBDs[["From", "Start", "End"]]
DBDs = DBDs.rename(columns = {"From" : "Gene Information"})
DBDs = DBDs.dropna()
DBDs["Start"] = DBDs["Start"].astype(int)
DBDs["End"] = DBDs["End"].astype(int)
DBDs = DBDs.drop_duplicates()
DBDs

Unnamed: 0,Gene Information,Start,End
1,ENSG00000129173,113,182
1,ENSG00000129173,261,347
6,ENSG00000188816,149,208
8,ENSG00000180053,132,191
11,ENSG00000204595,16,75
...,...,...,...
1611,ENSG00000178764,263,324
1611,ENSG00000178764,439,501
1611,ENSG00000178764,530,591
1611,ENSG00000178764,628,690


In [254]:
# Merging back to original Lambert TF table
lambert_TFs_with_DBDs = pd.merge(lambert_TFs, DBDs, how = "left")
lambert_TFs_with_DBDs

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Start,End
0,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
1,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
2,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,Yes,,,
3,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,,,
4,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,ENSG00000177683,THAP5,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1718,ENSG00000174796,THAP6,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1719,ENSG00000184436,THAP7,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1720,ENSG00000161277,THAP8,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,PMID: 12575992 says it has unique DBD,need revisit as it has C2CH signature,c,Yes,Yes,No,No,,,


In [255]:
len(set(lambert_TFs_with_DBDs[lambert_TFs_with_DBDs["Start"].isna()]["Gene Information"]))

1124

In [256]:
len(set(lambert_TFs_with_DBDs[~lambert_TFs_with_DBDs["Start"].isna()]["Gene Information"]))

515

In [257]:
len(lambert_TFs)

1639

In [270]:
lambert_TFs_with_DBDs.to_csv("../output/lambert_TFs_with_uniprot_DBDs.csv")

In [282]:
DBDs_found = lambert_TFs_with_DBDs[~lambert_TFs_with_DBDs["Start"].isna()]
DBDs_found

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Start,End
11,ENSG00000189079,ARID2,ARID/BRIGHT; RFX,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,x,Yes,Yes,TF Gene_Transcription Factor Binding: tf co-fa...,No,,524.0,603.0
12,ENSG00000153207,AHCTF1,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Recruits nucleoporins to chromatin [PMID: 1723...,Should not be initial assessment 1a1?,c,No,No,No,No,,1971.0,1983.0
13,ENSG00000126705,AHDC1,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,x,No,No,No,No,,396.0,408.0
14,ENSG00000126705,AHDC1,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,x,No,No,No,No,,544.0,556.0
15,ENSG00000106948,AKNA,AT hook,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,Binds AT-rich promoters of CD40 and CD40L and ...,,...,Debatable - binds AT-rich promoters of CD40 an...,Debatable - binds AT-rich promoters of CD40 an...,c,No,No,No,Yes,,1115.0,1123.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1701,ENSG00000149922,TBX6,T-box,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,No,,100.0,273.0
1706,ENSG00000187079,TEAD1,TEA,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,No,,28.0,104.0
1707,ENSG00000074219,TEAD2,TEA,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,38.0,114.0
1708,ENSG00000007866,TEAD3,TEA,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,No,,28.0,104.0


In [284]:
DBDs_not_found = lambert_TFs_with_DBDs[lambert_TFs_with_DBDs["Start"].isna()]
DBDs_not_found

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Start,End
0,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
1,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
2,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,No,Yes,,,
3,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,,,
4,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,ENSG00000177683,THAP5,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1718,ENSG00000174796,THAP6,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1719,ENSG00000184436,THAP7,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,,,c,Yes,Yes,No,No,,,
1720,ENSG00000161277,THAP8,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,PMID: 12575992 says it has unique DBD,need revisit as it has C2CH signature,c,Yes,Yes,No,No,,,


In [286]:
DBDs_not_found.columns

Index(['Gene Information', 'Unnamed: 1', 'Unnamed: 2', 'Is TF?',
       'Final Assesment', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7',
       'Unnamed: 8', 'Notes from re-reviewed genes', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Curator Disagreement Flags', 'Unnamed: 15', 'Curator 1', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Curator 2', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Prior classifications',
       'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30',
       'Unnamed: 31', 'Start', 'End'],
      dtype='object')

In [300]:
pd.read_csv("../data/DBDs/S3_41576_2009_BFnrg2538_MOESM6_ESM.txt", header = 7, sep = "\t")

Unnamed: 0,Class,Ensembl ID,IPI ID,Interpro DBD,Interpro DNA-binding family,HGNC symbol,Tissue-specificity
0,a,ENSG00000001167,IPI00218470;IPI00333568,,IPR001289,NFYA,
1,a,ENSG00000004848,IPI00045066,IPR000047;IPR001356;IPR009057;IPR012287,,ARX,
2,a,ENSG00000005073,IPI00010754;IPI00807582,IPR001356;IPR009057;IPR012287,,HOXA11,uterus
3,a,ENSG00000005513,IPI00299069;IPI00386303,IPR000910;IPR009071,,SOX8,
4,a,ENSG00000005889,IPI00012336;IPI00169385;IPI00179465;IPI00607877,IPR007087;IPR015880,IPR006794,ZFX,general
...,...,...,...,...,...,...,...
1982,x,ENSG00000215812,IPI00874268,IPR007087;IPR015880,,,
1983,x,ENSG00000215887,IPI00873740,IPR007087;IPR015880,,,
1984,x,ENSG00000218788,IPI00902657,IPR004826;IPR004827;IPR008917,,,
1985,x,ENSG00000219340,IPI00397740,IPR007087;IPR015880,,,
