In [7]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import protfasta 
import re

from Bio import pairwise2
from Bio.Seq import Seq 

%autoreload 2
%aimport AD_predictor_tools
%aimport AD_comparison_tools
%aimport PlottingTools

Goal: Regenerating the merged datasets, using ENSG

Plan:

1. Reading in the dataset of known ADs, where the coordinates of non-canonical ADs have been updated if there was an exact sequence match in the canonical AD
2. Merging SFARI Gene and Lambert TFs -> SFARI TFs
3. Merging known ADs and SFARI TFs -> SFARI TF known ADs
4. Merging SFARI TFs and SFARI TF known ADs -> SFARI TFs with known ADs
5. Find DBDs on SFARI TFs with known ADs -> DBDs on SFARI TFs with known ADs
6. Run variant analysis scripts

---
1. Reading in the dataset of known ADs, where the coordinates of non-canonical ADs have been updated if there was an exact sequence match in the canonical AD

In [57]:
# From: http://localhost:8888/notebooks/Desktop/Staller_Lab/SFARI/notebooks/Looking%20for%20non-canonical%20ADs.ipynb
known_ADs = pd.read_csv("../output/known_ADs_considering_isoforms_and_canonical.csv", index_col = None)
known_ADs

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
0,ABRAXAS1,121,200,Q6UWZ7,[['Q6UWZ7']],ENST00000321945,Q6UWZ7,DelRosso et al.,CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...,
1,AEBP1,1088,1158,Q8IUX7,[['Q8IUX7']],,Q8IUX7,Staller Activity Data,TF,EVVTEFGTEVEPEFGTKVEPEFETQLEPEFETQLEPEFEEEEEEEK...,
2,AHCTF1,1445,1698,Q8WYP5,"[['Q8WYP5'], ['Q8WYP5']]",nan / nan,Q8WYP5 / Q8WYP5,"PMID: 11952839, Soto / transcriptionalactivity...",TF,IRANDNKSMADVLGDGGNSSLTISEGPIVSERRLNQEVALNLKEDH...,
3,AHR,118,126,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,LLQALNGFV,
4,AHR,266,268,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,FAI,
...,...,...,...,...,...,...,...,...,...,...,...
739,ZSCAN20,262,341,P17040,"[['P17040', 'P17040-3']]",ENST00000361328,P17040,DelRosso et al.,TF,PSNTSEKEQGPEFWGLSLINSGKRSTADYSLDNEPAQALTWRDSRA...,
740,ZXDA,572,699,P98168,"[['P98168'], ['P98168']]",nan / nan,P98168 / P98168 / P98168,"PMID: 17493635, Soto / R4TA_regions.txt / acti...",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
741,ZXDB,576,703,P98169,[['P98169']],,P98169 / P98169,"R4TA_regions.txt / activation_regions.txt, GSL",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
742,ZXDC,579,688,Q2QGD7,"[['Q2QGD7', 'Q2QGD7-2'], ['Q2QGD7', 'Q2QGD7-2']]",nan / nan,Q2QGD7 / Q2QGD7 / Q2QGD7,"PMID: 16600381, Soto / R4TA_regions.txt / acti...",TF,DSPLVLGTAATVLQQGSFSVDDVQTVSAGALGCLVALPMKNLSDDP...,


---

2. Merging SFARI Gene and Lambert TFs -> SFARI TFs

In [10]:
SFARI_Gene = pd.read_csv("../data/SFARI-Gene_genes_01-23-2023release_03-21-2023export.csv")
SFARI_Gene

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
0,9,ABAT,4-aminobutyrate aminotransferase,ENSG00000183044,16,"Rare Single Gene Mutation, Genetic Association",2.0,0,,7
1,9,ABCA10,"ATP-binding cassette, sub-family A (ABC1), mem...",ENSG00000154263,17,Rare Single Gene Mutation,2.0,0,,2
2,9,ABCA13,ATP binding cassette subfamily A member 13,ENSG00000179869,7,"Rare Single Gene Mutation, Functional",2.0,0,,10
3,9,ABCA7,"ATP-binding cassette, sub-family A (ABC1), mem...",ENSG00000064687,19,Rare Single Gene Mutation,2.0,0,,5
4,9,ABL2,"ABL proto-oncogene 2, non-receptor tyrosine ki...",ENSG00000143322,1,"Rare Single Gene Mutation, Functional",3.0,0,,10
...,...,...,...,...,...,...,...,...,...,...
1113,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4
1114,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16
1115,9,ZNF827,Zinc finger protein 827,ENSG00000151612,4,"Rare Single Gene Mutation, Genetic Association",2.0,0,,3
1116,9,ZSWIM6,zinc finger SWIM-type containing 6,ENSG00000130449,5,"Rare Single Gene Mutation, Syndromic, Genetic ...",,1,,6


In [11]:
# Redownloading lambert TFs from uniprot.

# First, downloaded tables s1-s4 from lambert review
lambert_table_s1 = pd.read_excel(open('../data/lambert_supp_tables.xlsx', 'rb'),
                               sheet_name='Table S1. Related to Figure 1B')
lambert_table_s1

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
0,ID,Name,DBD,,TF assessment,Binding mode,Motif status,Notes,Comments,Committee notes,...,Assesment2,Binding2,Comment2,Notes2,Vaquerizas 2009 TF classification,CisBP considers it as a TF?,TFclass considers it as a TF?,TF-CAT classification,Is a GO TF,PDB
1,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
2,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
3,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,No,Yes,
4,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Has known motif,1 Monomer or homomultimer,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2761,ENSG00000174796,THAP6,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Likely to be sequence specific TF,1 Monomer or homomultimer,,,c,Yes,Yes,No,No,
2762,ENSG00000184436,THAP7,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Likely to be sequence specific TF,1 Monomer or homomultimer,,,c,Yes,Yes,No,No,
2763,ENSG00000161277,THAP8,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Likely to be sequence specific TF,,PMID: 12575992 says it has unique DBD,need revisit as it has C2CH signature,c,Yes,Yes,No,No,
2764,ENSG00000168152,THAP9,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,Human THAP9 Gene Encodes an Active P-Element D...,,...,Likely to be sequence specific TF,1 Monomer or homomultimer,,,c,Yes,Yes,No,No,


In [12]:
# Keeping TF rows
lambert_table_s1_TF_rows = lambert_table_s1[lambert_table_s1["Is TF?"] == "Yes"]
lambert_table_s1_TF_rows

Unnamed: 0,Gene Information,Unnamed: 1,Unnamed: 2,Is TF?,Final Assesment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Notes from re-reviewed genes,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Prior classifications,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
1,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
2,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
3,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,No,Yes,
4,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,1 Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,,...,Has known motif,1 Monomer or homomultimer,Source of Hocomoco motif is unclear,,a,Yes,Yes,No,Yes,
5,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,1 Monomer or homomultimer,High-throughput in vitro,,,,...,Has known motif,1 Monomer or homomultimer,,,a,Yes,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2760,ENSG00000177683,THAP5,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Likely to be sequence specific TF,1 Monomer or homomultimer,,,c,Yes,Yes,No,No,
2761,ENSG00000174796,THAP6,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Likely to be sequence specific TF,1 Monomer or homomultimer,,,c,Yes,Yes,No,No,
2762,ENSG00000184436,THAP7,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Likely to be sequence specific TF,1 Monomer or homomultimer,,,c,Yes,Yes,No,No,
2763,ENSG00000161277,THAP8,THAP finger,Yes,Likely to be sequence specific TF,1 Monomer or homomultimer,No motif,,,,...,Likely to be sequence specific TF,,PMID: 12575992 says it has unique DBD,need revisit as it has C2CH signature,c,Yes,Yes,No,No,


In [13]:
lambert_TF_ensg_codes = lambert_table_s1_TF_rows["Gene Information"]
lambert_TF_ensg_codes

1       ENSG00000137203
2       ENSG00000008196
3       ENSG00000087510
4       ENSG00000008197
5       ENSG00000116819
             ...       
2760    ENSG00000177683
2761    ENSG00000174796
2762    ENSG00000184436
2763    ENSG00000161277
2764    ENSG00000168152
Name: Gene Information, Length: 1639, dtype: object

In [14]:
# Rows of SFARI Gene with ENSG codes in lambert TFs
SFARI_TFs = SFARI_Gene[SFARI_Gene["ensembl-id"].isin(lambert_TF_ensg_codes)]
SFARI_TFs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
16,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64
31,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24
60,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12
61,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15
62,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24
...,...,...,...,...,...,...,...,...,...,...
1111,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5
1112,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3
1113,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4
1114,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16


---
3. Merging known ADs and SFARI TFs -> SFARI TF known ADs

In [15]:
SFARI_TFs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
16,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64
31,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24
60,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12
61,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15
62,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24
...,...,...,...,...,...,...,...,...,...,...
1111,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5
1112,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3
1113,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4
1114,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16


In [16]:
# Adding uniprotIDs using ENSG
SFARI_TFs[["ensembl-id"]].to_csv("../data/SFARI_TF_ENSG_codes.txt", 
                                                 header=None, index=None, sep=' ', mode='a')

In [17]:
output = protfasta.read_fasta("../data/SFARI_TFs_recent.fasta")
keys = output.keys()
vals = output.values()
SFARI_TFs_seqs = pd.DataFrame({"GeneName": keys, "ProteinSeq": vals})
SFARI_TFs_seqs

Unnamed: 0,GeneName,ProteinSeq
0,sp|O00712|NFIB_HUMAN Nuclear factor 1 B-type O...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
1,sp|O14529|CUX2_HUMAN Homeobox protein cut-like...,MAANVGSMFQYWKRFDLRRLQKELNSVASELSARQEESEHSHKHLI...
2,sp|O14770|MEIS2_HUMAN Homeobox protein Meis2 O...,MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...
3,sp|O15266|SHOX_HUMAN Short stature homeobox pr...,MEELTAFVSKSFDQKSKDGNGGGGGGGGKKDSITYREVLESGLARS...
4,sp|O15409|FOXP2_HUMAN Forkhead box protein P2 ...,MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVST...
...,...,...
121,sp|Q9Y2K7|KDM2A_HUMAN Lysine-specific demethyl...,MEPEEERIRYSQRLRGTMRRRYEDDGISDDEIEGKRTFDLEEKLHT...
122,sp|Q9Y458|TBX22_HUMAN T-box transcription fact...,MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...
123,sp|Q9Y462|ZN711_HUMAN Zinc finger protein 711 ...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
124,sp|Q9Y4A8|NF2L3_HUMAN Nuclear factor erythroid...,MKHLKRWWSAGGGLLHLTLLLSLAGLRVDLDLYLLLPPPTLLQDEL...


In [18]:
SFARI_TFs_seq_DBD = pd.read_csv("../data/SFARI_TFs_Seq_DBD.txt", sep = "\t")
SFARI_TFs_seq_DBD

Unnamed: 0,From,Entry,DNA binding,Sequence
0,ENSG00000147862,O00712,"DNA_BIND 1..195; /note=""CTF/NF-I""; /evidence=""...",MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
1,ENSG00000111249,O14529,"DNA_BIND 544..631; /note=""CUT 1""; /evidence=""E...",MAANVGSMFQYWKRFDLRRLQKELNSVASELSARQEESEHSHKHLI...
2,ENSG00000134138,O14770,"DNA_BIND 276..338; /note=""Homeobox; TALE-type""...",MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...
3,ENSG00000185960,O15266,"DNA_BIND 117..176; /note=""Homeobox""; /evidence...",MEELTAFVSKSFDQKSKDGNGGGGGGGGKKDSITYREVLESGLARS...
4,ENSG00000128573,O15409,"DNA_BIND 504..594; /note=""Fork-head""; /evidenc...",MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVST...
...,...,...,...,...
121,ENSG00000173120,Q9Y2K7,,MEPEEERIRYSQRLRGTMRRRYEDDGISDDEIEGKRTFDLEEKLHT...
122,ENSG00000122145,Q9Y458,"DNA_BIND 96..283; /note=""T-box""; /evidence=""EC...",MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...
123,ENSG00000147180,Q9Y462,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
124,ENSG00000050344,Q9Y4A8,,MKHLKRWWSAGGGLLHLTLLLSLAGLRVDLDLYLLLPPPTLLQDEL...


In [19]:
SFARI_TFs = pd.merge(SFARI_TFs, SFARI_TFs_seq_DBD, left_on = "ensembl-id", right_on = "From", how = "left")
SFARI_TFs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,Entry,DNA binding,Sequence
0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,ENSG00000101126,Q9H2P0,"DNA_BIND 754..814; /note=""Homeobox""; /evidence...",MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...
1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,ENSG00000126705,Q5TGY3,"DNA_BIND 396..408; /note=""A.T hook 1""; DNA_BIN...",MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...
2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,ENSG00000189079,Q68CP9,"DNA_BIND 524..603; /note=""RFX-type winged-heli...",MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,ENSG00000004848,Q96QS3,"DNA_BIND 328..387; /note=""Homeobox""; /evidence...",MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,ENSG00000147180,Q9Y462,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,ENSG00000178665,Q8N859,,MPSQNAVFSQEGNMEEEEMNDGSQMVRSQESLTFQDVAVDFTREEW...
124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,ENSG00000196391,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...
125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,ENSG00000170396,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...


In [20]:
SFARI_TFs['gene-symbol'].value_counts()

gene-symbol
CUX1      2
ADNP      1
SATB2     1
TBR1      1
SRCAP     1
         ..
KLF16     1
KDM5B     1
KDM2A     1
IKZF1     1
ZNF827    1
Name: count, Length: 126, dtype: int64

In [21]:
SFARI_TFs[SFARI_TFs["gene-symbol"] == "CUX1"]

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,Entry,DNA binding,Sequence
14,9,CUX1,cut like homeobox 1,ENSG00000257923,7,"Rare Single Gene Mutation, Functional",2.0,0,,11,ENSG00000257923,P39880,"DNA_BIND 542..629; /note=""CUT 1""; /evidence=""E...",MLCVAGARLKRELDATATVLANRQDESEQSRKRLIEQSREFKKNTP...
15,9,CUX1,cut like homeobox 1,ENSG00000257923,7,"Rare Single Gene Mutation, Functional",2.0,0,,11,ENSG00000257923,Q13948,,MAANVGSMFQYWKRFDLQQLQRELDATATVLANRQDESEQSRKRLI...


In [22]:
SFARI_TFs = SFARI_TFs.rename(columns = {"Entry" : "uniprotID"})

In [23]:
known_ADs_on_SFARI_TFs = pd.merge(known_ADs, SFARI_TFs.drop(columns = ["Sequence"]), on = "uniprotID")
known_ADs_on_SFARI_TFs

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,...,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,DNA binding
0,ARNT2,524,717,Q9HBZ2,[['Q9HBZ2']],,Q9HBZ2,"PMID: 8657146, Soto",TF,QGSPFPSGHSGKAFSSSVVHVPGVNDIQSSSSTGQNMSQISRQLNQ...,...,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,
1,ARX,472,562,Q96QS3,[['Q96QS3']],,Q96QS3,"PMID: 17331656, Soto",TF,RHPAFISPAFGRLFSTMAPLTSASTAAALLRQPTPAVEGAVASGAL...,...,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.8,24,ENSG00000004848,"DNA_BIND 328..387; /note=""Homeobox""; /evidence..."
2,C2D1A,22,60,Q6P1N0,"[['Q6P1N0', 'Q6P1N0-2']]",,Q6P1N0,Staller Activity Data,TF,GLLVDLSPDGLMIPEDGANDEELEAEFLALVGGQPPALE,...,Coiled-coil and C2 domain containing 1A,ENSG00000132024,19,"Rare Single Gene Mutation, Functional",2.0,0,,15,ENSG00000132024,
3,CAMTA2,285,468,O94983,"[['O94983', 'O94983-2'], ['O94983', 'O94983-2']]",nan / ENST00000348066,O94983 / O94983,"PMID: 16678093, Soto / DelRosso et al.",TF,KAHTSPSSSSSSSSSGFAEPLEIRPSPPTSRGGSSRGGTAILLLTG...,...,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO..."
4,CAMTA2,472,581,O94983,"[['O94983', 'O94983-2']]",ENST00000348066,O94983,DelRosso et al.,TF,SPAPLEPSSRVGRGEALFGGPVGASELEPFSLSSFPDLMGELISDE...,...,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,THRA,1,52,P10827,"[['P10827', 'P10827-2', 'P10827-3', 'P10827-4']]",,P10827,"PMID: 27347890, Soto",TF,MEQKPSKVECGSDPEENSARSPDGKRKRKNGQCSLKTSMSGYIPSY...,...,thyroid hormone receptor alpha,ENSG00000126351,17,"Rare Single Gene Mutation, Functional",2.0,0,,5,ENSG00000126351,"DNA_BIND 53..127; /note=""Nuclear receptor""; /e..."
59,VDR,415,427,P11473,[['P11473']],,P11473,"PMID: 15908514, Soto",TF,TPLVLEVFGNEIS,...,vitamin D receptor,ENSG00000111424,12,"Genetic Association, Functional",2.0,0,,10,ENSG00000111424,"DNA_BIND 21..96; /note=""Nuclear receptor""; /ev..."
60,VDR,195,238,P11473,"[['P11473'], ['P11473']]",nan / nan,P11473 / P11473,"Choi 2000 list OR uniprot. check, GSL / Stalle...",TF,DMMDSSSFSNLDLSEEDSDDPSVTLELSQLSMLPHLADLVSYSI,...,vitamin D receptor,ENSG00000111424,12,"Genetic Association, Functional",2.0,0,,10,ENSG00000111424,"DNA_BIND 21..96; /note=""Nuclear receptor""; /ev..."
61,YY1,1,69,P25490,[['P25490']],,P25490,"PMID: 7731805, Soto",TF,MASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,...,YY1transcription factor,ENSG00000100811,14,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,,7,ENSG00000100811,


---

4. Merging SFARI TFs and SFARI TF known ADs -> SFARI TFs with known ADs

In [24]:
SFARI_TFs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,uniprotID,DNA binding,Sequence
0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,ENSG00000101126,Q9H2P0,"DNA_BIND 754..814; /note=""Homeobox""; /evidence...",MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...
1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,ENSG00000126705,Q5TGY3,"DNA_BIND 396..408; /note=""A.T hook 1""; DNA_BIN...",MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...
2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,ENSG00000189079,Q68CP9,"DNA_BIND 524..603; /note=""RFX-type winged-heli...",MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,ENSG00000004848,Q96QS3,"DNA_BIND 328..387; /note=""Homeobox""; /evidence...",MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,ENSG00000147180,Q9Y462,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,ENSG00000178665,Q8N859,,MPSQNAVFSQEGNMEEEEMNDGSQMVRSQESLTFQDVAVDFTREEW...
124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,ENSG00000196391,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...
125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,ENSG00000170396,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...


In [25]:
known_ADs_on_SFARI_TFs

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,...,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,DNA binding
0,ARNT2,524,717,Q9HBZ2,[['Q9HBZ2']],,Q9HBZ2,"PMID: 8657146, Soto",TF,QGSPFPSGHSGKAFSSSVVHVPGVNDIQSSSSTGQNMSQISRQLNQ...,...,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,
1,ARX,472,562,Q96QS3,[['Q96QS3']],,Q96QS3,"PMID: 17331656, Soto",TF,RHPAFISPAFGRLFSTMAPLTSASTAAALLRQPTPAVEGAVASGAL...,...,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.8,24,ENSG00000004848,"DNA_BIND 328..387; /note=""Homeobox""; /evidence..."
2,C2D1A,22,60,Q6P1N0,"[['Q6P1N0', 'Q6P1N0-2']]",,Q6P1N0,Staller Activity Data,TF,GLLVDLSPDGLMIPEDGANDEELEAEFLALVGGQPPALE,...,Coiled-coil and C2 domain containing 1A,ENSG00000132024,19,"Rare Single Gene Mutation, Functional",2.0,0,,15,ENSG00000132024,
3,CAMTA2,285,468,O94983,"[['O94983', 'O94983-2'], ['O94983', 'O94983-2']]",nan / ENST00000348066,O94983 / O94983,"PMID: 16678093, Soto / DelRosso et al.",TF,KAHTSPSSSSSSSSSGFAEPLEIRPSPPTSRGGSSRGGTAILLLTG...,...,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO..."
4,CAMTA2,472,581,O94983,"[['O94983', 'O94983-2']]",ENST00000348066,O94983,DelRosso et al.,TF,SPAPLEPSSRVGRGEALFGGPVGASELEPFSLSSFPDLMGELISDE...,...,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,THRA,1,52,P10827,"[['P10827', 'P10827-2', 'P10827-3', 'P10827-4']]",,P10827,"PMID: 27347890, Soto",TF,MEQKPSKVECGSDPEENSARSPDGKRKRKNGQCSLKTSMSGYIPSY...,...,thyroid hormone receptor alpha,ENSG00000126351,17,"Rare Single Gene Mutation, Functional",2.0,0,,5,ENSG00000126351,"DNA_BIND 53..127; /note=""Nuclear receptor""; /e..."
59,VDR,415,427,P11473,[['P11473']],,P11473,"PMID: 15908514, Soto",TF,TPLVLEVFGNEIS,...,vitamin D receptor,ENSG00000111424,12,"Genetic Association, Functional",2.0,0,,10,ENSG00000111424,"DNA_BIND 21..96; /note=""Nuclear receptor""; /ev..."
60,VDR,195,238,P11473,"[['P11473'], ['P11473']]",nan / nan,P11473 / P11473,"Choi 2000 list OR uniprot. check, GSL / Stalle...",TF,DMMDSSSFSNLDLSEEDSDDPSVTLELSQLSMLPHLADLVSYSI,...,vitamin D receptor,ENSG00000111424,12,"Genetic Association, Functional",2.0,0,,10,ENSG00000111424,"DNA_BIND 21..96; /note=""Nuclear receptor""; /ev..."
61,YY1,1,69,P25490,[['P25490']],,P25490,"PMID: 7731805, Soto",TF,MASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,...,YY1transcription factor,ENSG00000100811,14,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,,7,ENSG00000100811,


In [26]:
SFARI_TFs_with_known_ADs = SFARI_TFs[SFARI_TFs["ensembl-id"].isin(known_ADs_on_SFARI_TFs["ensembl-id"])]
SFARI_TFs_with_known_ADs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,uniprotID,DNA binding,Sequence
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.8,24,ENSG00000004848,Q96QS3,"DNA_BIND 328..387; /note=""Homeobox""; /evidence...",MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...
8,9,CASZ1,castor zinc finger 1,ENSG00000130940,1,Rare Single Gene Mutation,1.0,0,9.55,6,ENSG00000130940,Q86V15,,MDLGTAEGTRCTDPPAGKPAMAPKRKGGLKLNAICAKLSRQVVVEK...
9,9,CC2D1A,Coiled-coil and C2 domain containing 1A,ENSG00000132024,19,"Rare Single Gene Mutation, Functional",2.0,0,,15,ENSG00000132024,Q6P1N0,,MHKRKGPPGPPGRGAAAARQLGLLVDLSPDGLMIPEDGANDEELEA...
10,9,CAMTA2,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,O94983,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO...",MNTKDTTEVAENSHHLKIFLPKKLLECLPRCPLLPPERLRWNTNEE...
18,9,DLX3,distal-less homeobox 3,ENSG00000064195,17,Rare Single Gene Mutation,2.0,0,,4,ENSG00000064195,O60479,"DNA_BIND 129..188; /note=""Homeobox""; /evidence...",MSGSFDRKLSSILTDISSSLSCHAGSKDSPTLPESSVTDLGYYSAP...
21,9,EBF3,early B-cell factor 3,ENSG00000108001,10,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,9.75,18,ENSG00000108001,Q9H4W6,,MFGIQENIPRGGTTMKEEPLGSGMNPVRSWMHTAGVVDANTAAQSG...
22,9,EGR3,early growth response 3,ENSG00000179388,8,Rare Single Gene Mutation,2.0,0,,3,ENSG00000179388,Q06889,,MTGKLAEKLPVTMSSLLNQLPDNLYPEEIPSALNLFSGSSDSVVHY...
24,9,ESR2,estrogen receptor 2 (ER beta),ENSG00000140009,14,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,9,ENSG00000140009,Q92731,"DNA_BIND 149..214; /note=""Nuclear receptor""; /...",MDIKNSPSSLNSPSSYNCSQSILPLEHGSIYIPSSYVDSHHEYPAM...
25,9,ESRRB,estrogen-related receptor beta,ENSG00000119715,14,"Rare Single Gene Mutation, Genetic Association",2.0,0,,9,ENSG00000119715,O95718,"DNA_BIND 100..186; /note=""Nuclear receptor""; /...",MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...


In [27]:
len(SFARI_TFs_with_known_ADs)

46

In [28]:
len(known_ADs_on_SFARI_TFs["ensembl-id"].unique())

46

---

5. Find DBDs on SFARI TFs with known ADs -> DBDs on SFARI TFs with known ADs

In [29]:
sum(SFARI_TFs_with_known_ADs["DNA binding"].isna())

20

In [30]:
SFARI_TFs_with_known_ADs["DNA binding"]

3                                                    NaN
4      DNA_BIND 328..387; /note="Homeobox"; /evidence...
8                                                    NaN
9                                                    NaN
10     DNA_BIND 30..155; /note="CG-1"; /evidence="ECO...
18     DNA_BIND 129..188; /note="Homeobox"; /evidence...
21                                                   NaN
22                                                   NaN
24     DNA_BIND 149..214; /note="Nuclear receptor"; /...
25     DNA_BIND 100..186; /note="Nuclear receptor"; /...
30                                                   NaN
31     DNA_BIND 311..391; /note="ETS"; /evidence="ECO...
34                                                   NaN
36                                                   NaN
38                                                   NaN
39                                                   NaN
40                                                   NaN
41     DNA_BIND 169..180; /note

In [31]:
SFARI_TFs_with_known_ADs[SFARI_TFs_with_known_ADs["DNA binding"].isna()]

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,uniprotID,DNA binding,Sequence
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...
8,9,CASZ1,castor zinc finger 1,ENSG00000130940,1,Rare Single Gene Mutation,1.0,0,9.55,6,ENSG00000130940,Q86V15,,MDLGTAEGTRCTDPPAGKPAMAPKRKGGLKLNAICAKLSRQVVVEK...
9,9,CC2D1A,Coiled-coil and C2 domain containing 1A,ENSG00000132024,19,"Rare Single Gene Mutation, Functional",2.0,0,,15,ENSG00000132024,Q6P1N0,,MHKRKGPPGPPGRGAAAARQLGLLVDLSPDGLMIPEDGANDEELEA...
21,9,EBF3,early B-cell factor 3,ENSG00000108001,10,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,9.75,18,ENSG00000108001,Q9H4W6,,MFGIQENIPRGGTTMKEEPLGSGMNPVRSWMHTAGVVDANTAAQSG...
22,9,EGR3,early growth response 3,ENSG00000179388,8,Rare Single Gene Mutation,2.0,0,,3,ENSG00000179388,Q06889,,MTGKLAEKLPVTMSSLLNQLPDNLYPEEIPSALNLFSGSSDSVVHY...
30,9,GLIS1,GLIS family zinc finger 1,ENSG00000174332,1,"Rare Single Gene Mutation, Genetic Association",2.0,0,,3,ENSG00000174332,Q8NBF1,,MAEARTSLSAHCRGPLATGLHPDLDLPGRSLATPAPSCYLLGSEPS...
34,9,HIVEP3,human immunodeficiency virus type I enhancer b...,ENSG00000127124,1,"Rare Single Gene Mutation, Genetic Association",2.0,0,,8,ENSG00000127124,Q5T1R4,,MDPEQSVKGTKKAEGSPRKRLTKGEAIQTSVSSSVPYPGSGTAATQ...
36,9,IKZF1,IKAROS family zinc finger 1,ENSG00000185811,7,Rare Single Gene Mutation,3.0,0,,6,ENSG00000185811,Q13422,,MDADEGQDMSQVSGKESPPVSDTPDEGDEPMPIPEDLSTTSGGQQS...
38,9,KDM5B,Lysine (K)-specific demethylase 5B,ENSG00000117139,1,"Rare Single Gene Mutation, Syndromic, Functional",1.0,0,2.8,19,ENSG00000117139,Q9UGL1,,MEAATTLHPGPRPALPLGGPGPLGEFLPPPECPVFEPSWEEFADPF...
39,9,KLF16,Kruppel like factor 16,ENSG00000129911,19,Rare Single Gene Mutation,2.0,0,,1,ENSG00000129911,Q9BXK1,,MSAAVACVDYFAADVLMAISSGAVVHRGRPGPEGAGPAAGLDVRAA...


In [32]:
SFARI_TFs_with_known_ADs[~SFARI_TFs_with_known_ADs["DNA binding"].isna()]

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,uniprotID,DNA binding,Sequence
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.8,24,ENSG00000004848,Q96QS3,"DNA_BIND 328..387; /note=""Homeobox""; /evidence...",MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...
10,9,CAMTA2,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,O94983,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO...",MNTKDTTEVAENSHHLKIFLPKKLLECLPRCPLLPPERLRWNTNEE...
18,9,DLX3,distal-less homeobox 3,ENSG00000064195,17,Rare Single Gene Mutation,2.0,0,,4,ENSG00000064195,O60479,"DNA_BIND 129..188; /note=""Homeobox""; /evidence...",MSGSFDRKLSSILTDISSSLSCHAGSKDSPTLPESSVTDLGYYSAP...
24,9,ESR2,estrogen receptor 2 (ER beta),ENSG00000140009,14,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,9,ENSG00000140009,Q92731,"DNA_BIND 149..214; /note=""Nuclear receptor""; /...",MDIKNSPSSLNSPSSYNCSQSILPLEHGSIYIPSSYVDSHHEYPAM...
25,9,ESRRB,estrogen-related receptor beta,ENSG00000119715,14,"Rare Single Gene Mutation, Genetic Association",2.0,0,,9,ENSG00000119715,O95718,"DNA_BIND 100..186; /note=""Nuclear receptor""; /...",MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...
31,9,ERG,"ERG, ETS transcription factor",ENSG00000157554,21,Genetic Association,2.0,0,,1,ENSG00000157554,P11308,"DNA_BIND 311..391; /note=""ETS""; /evidence=""ECO...",MASTIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSSDYGQTSK...
41,9,KMT2A,Lysine (K)-specific methyltransferase 2A,ENSG00000118058,11,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,,40,ENSG00000118058,Q03164,"DNA_BIND 169..180; /note=""A.T hook 1""; DNA_BIN...",MAHSCRWRFPARPGTTGGGGGGGRRGLGGAPRQRVPALLLPPGPPV...
49,9,MEF2C,myocyte enhancer factor 2C,ENSG00000081189,5,"Rare Single Gene Mutation, Syndromic, Genetic ...",1.0,1,9.85,46,ENSG00000081189,Q06413,"DNA_BIND 58..86; /note=""Mef2-type""; /evidence=...",MGRKKIQITRIMDERNRQVTFTKRKFGLMKKAYELSVLCDCEIALI...
50,9,MEIS2,Meis homeobox 2,ENSG00000134138,15,"Rare Single Gene Mutation, Syndromic",1.0,1,7.5,12,ENSG00000134138,O14770,"DNA_BIND 276..338; /note=""Homeobox; TALE-type""...",MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...
52,9,MSX2,msh homeobox 2,ENSG00000120149,5,"Rare Single Gene Mutation, Syndromic",3.0,1,,2,ENSG00000120149,P35548,"DNA_BIND 142..201; /note=""Homeobox""; /evidence...",MASPSKGNDLFSPDEEGPAVVAGPGPGPGGAEGAAEERRVKVSSLP...


In [33]:
SFARI_TFs_with_known_ADs["DBD_Start"] = SFARI_TFs_with_known_ADs["DNA binding"].str.findall(r'DNA_BIND (\d*)\.\.')
SFARI_TFs_with_known_ADs["DBD_End"] = SFARI_TFs_with_known_ADs["DNA binding"].str.findall(r'DNA_BIND \d*\.\.(\d*)')
SFARI_TFs_with_known_ADs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,uniprotID,DNA binding,Sequence,DBD_Start,DBD_End
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,,
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.8,24,ENSG00000004848,Q96QS3,"DNA_BIND 328..387; /note=""Homeobox""; /evidence...",MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,[328],[387]
8,9,CASZ1,castor zinc finger 1,ENSG00000130940,1,Rare Single Gene Mutation,1.0,0,9.55,6,ENSG00000130940,Q86V15,,MDLGTAEGTRCTDPPAGKPAMAPKRKGGLKLNAICAKLSRQVVVEK...,,
9,9,CC2D1A,Coiled-coil and C2 domain containing 1A,ENSG00000132024,19,"Rare Single Gene Mutation, Functional",2.0,0,,15,ENSG00000132024,Q6P1N0,,MHKRKGPPGPPGRGAAAARQLGLLVDLSPDGLMIPEDGANDEELEA...,,
10,9,CAMTA2,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,O94983,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO...",MNTKDTTEVAENSHHLKIFLPKKLLECLPRCPLLPPERLRWNTNEE...,[30],[155]
18,9,DLX3,distal-less homeobox 3,ENSG00000064195,17,Rare Single Gene Mutation,2.0,0,,4,ENSG00000064195,O60479,"DNA_BIND 129..188; /note=""Homeobox""; /evidence...",MSGSFDRKLSSILTDISSSLSCHAGSKDSPTLPESSVTDLGYYSAP...,[129],[188]
21,9,EBF3,early B-cell factor 3,ENSG00000108001,10,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,9.75,18,ENSG00000108001,Q9H4W6,,MFGIQENIPRGGTTMKEEPLGSGMNPVRSWMHTAGVVDANTAAQSG...,,
22,9,EGR3,early growth response 3,ENSG00000179388,8,Rare Single Gene Mutation,2.0,0,,3,ENSG00000179388,Q06889,,MTGKLAEKLPVTMSSLLNQLPDNLYPEEIPSALNLFSGSSDSVVHY...,,
24,9,ESR2,estrogen receptor 2 (ER beta),ENSG00000140009,14,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,9,ENSG00000140009,Q92731,"DNA_BIND 149..214; /note=""Nuclear receptor""; /...",MDIKNSPSSLNSPSSYNCSQSILPLEHGSIYIPSSYVDSHHEYPAM...,[149],[214]
25,9,ESRRB,estrogen-related receptor beta,ENSG00000119715,14,"Rare Single Gene Mutation, Genetic Association",2.0,0,,9,ENSG00000119715,O95718,"DNA_BIND 100..186; /note=""Nuclear receptor""; /...",MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,[100],[186]


In [39]:
SFARI_TFs_with_known_ADs = SFARI_TFs_with_known_ADs.reset_index(drop = True)
SFARI_TFs_with_known_ADs_DBDs_expanded = SFARI_TFs_with_known_ADs.apply(pd.Series.explode)
SFARI_TFs_with_known_ADs_DBDs_expanded

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,uniprotID,DNA binding,Sequence,DBD_Start,DBD_End
0,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,,
1,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.8,24,ENSG00000004848,Q96QS3,"DNA_BIND 328..387; /note=""Homeobox""; /evidence...",MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,328.0,387.0
2,9,CASZ1,castor zinc finger 1,ENSG00000130940,1,Rare Single Gene Mutation,1.0,0,9.55,6,ENSG00000130940,Q86V15,,MDLGTAEGTRCTDPPAGKPAMAPKRKGGLKLNAICAKLSRQVVVEK...,,
3,9,CC2D1A,Coiled-coil and C2 domain containing 1A,ENSG00000132024,19,"Rare Single Gene Mutation, Functional",2.0,0,,15,ENSG00000132024,Q6P1N0,,MHKRKGPPGPPGRGAAAARQLGLLVDLSPDGLMIPEDGANDEELEA...,,
4,9,CAMTA2,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,O94983,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO...",MNTKDTTEVAENSHHLKIFLPKKLLECLPRCPLLPPERLRWNTNEE...,30.0,155.0
5,9,DLX3,distal-less homeobox 3,ENSG00000064195,17,Rare Single Gene Mutation,2.0,0,,4,ENSG00000064195,O60479,"DNA_BIND 129..188; /note=""Homeobox""; /evidence...",MSGSFDRKLSSILTDISSSLSCHAGSKDSPTLPESSVTDLGYYSAP...,129.0,188.0
6,9,EBF3,early B-cell factor 3,ENSG00000108001,10,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,9.75,18,ENSG00000108001,Q9H4W6,,MFGIQENIPRGGTTMKEEPLGSGMNPVRSWMHTAGVVDANTAAQSG...,,
7,9,EGR3,early growth response 3,ENSG00000179388,8,Rare Single Gene Mutation,2.0,0,,3,ENSG00000179388,Q06889,,MTGKLAEKLPVTMSSLLNQLPDNLYPEEIPSALNLFSGSSDSVVHY...,,
8,9,ESR2,estrogen receptor 2 (ER beta),ENSG00000140009,14,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,9,ENSG00000140009,Q92731,"DNA_BIND 149..214; /note=""Nuclear receptor""; /...",MDIKNSPSSLNSPSSYNCSQSILPLEHGSIYIPSSYVDSHHEYPAM...,149.0,214.0
9,9,ESRRB,estrogen-related receptor beta,ENSG00000119715,14,"Rare Single Gene Mutation, Genetic Association",2.0,0,,9,ENSG00000119715,O95718,"DNA_BIND 100..186; /note=""Nuclear receptor""; /...",MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,100.0,186.0


In [40]:
DBDs = SFARI_TFs_with_known_ADs_DBDs_expanded[["ensembl-id", "uniprotID", "DBD_Start", "DBD_End"]]
DBDs = DBDs.dropna()
DBDs

Unnamed: 0,ensembl-id,uniprotID,DBD_Start,DBD_End
1,ENSG00000004848,Q96QS3,328,387
4,ENSG00000108509,O94983,30,155
5,ENSG00000064195,O60479,129,188
8,ENSG00000140009,Q92731,149,214
9,ENSG00000119715,O95718,100,186
11,ENSG00000157554,P11308,311,391
17,ENSG00000118058,Q03164,169,180
17,ENSG00000118058,Q03164,217,227
17,ENSG00000118058,Q03164,301,309
18,ENSG00000081189,Q06413,58,86


---
## Preparing for variant analysis script format

In [41]:
# CDS
SFARI_TFs_with_known_ADs["Start"] = 1
SFARI_TFs_with_known_ADs["End"] = SFARI_TFs_with_known_ADs["Sequence"].str.len()
SFARI_TFs_with_known_ADs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,From,uniprotID,DNA binding,Sequence,DBD_Start,DBD_End,Start,End
0,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,ENSG00000172379,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,,,1,717
1,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.8,24,ENSG00000004848,Q96QS3,"DNA_BIND 328..387; /note=""Homeobox""; /evidence...",MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,[328],[387],1,562
2,9,CASZ1,castor zinc finger 1,ENSG00000130940,1,Rare Single Gene Mutation,1.0,0,9.55,6,ENSG00000130940,Q86V15,,MDLGTAEGTRCTDPPAGKPAMAPKRKGGLKLNAICAKLSRQVVVEK...,,,1,1759
3,9,CC2D1A,Coiled-coil and C2 domain containing 1A,ENSG00000132024,19,"Rare Single Gene Mutation, Functional",2.0,0,,15,ENSG00000132024,Q6P1N0,,MHKRKGPPGPPGRGAAAARQLGLLVDLSPDGLMIPEDGANDEELEA...,,,1,951
4,9,CAMTA2,calmodulin binding transcription activator 2,ENSG00000108509,17,"Rare Single Gene Mutation, Syndromic",1.0,0,,3,ENSG00000108509,O94983,"DNA_BIND 30..155; /note=""CG-1""; /evidence=""ECO...",MNTKDTTEVAENSHHLKIFLPKKLLECLPRCPLLPPERLRWNTNEE...,[30],[155],1,1202
5,9,DLX3,distal-less homeobox 3,ENSG00000064195,17,Rare Single Gene Mutation,2.0,0,,4,ENSG00000064195,O60479,"DNA_BIND 129..188; /note=""Homeobox""; /evidence...",MSGSFDRKLSSILTDISSSLSCHAGSKDSPTLPESSVTDLGYYSAP...,[129],[188],1,287
6,9,EBF3,early B-cell factor 3,ENSG00000108001,10,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,9.75,18,ENSG00000108001,Q9H4W6,,MFGIQENIPRGGTTMKEEPLGSGMNPVRSWMHTAGVVDANTAAQSG...,,,1,596
7,9,EGR3,early growth response 3,ENSG00000179388,8,Rare Single Gene Mutation,2.0,0,,3,ENSG00000179388,Q06889,,MTGKLAEKLPVTMSSLLNQLPDNLYPEEIPSALNLFSGSSDSVVHY...,,,1,387
8,9,ESR2,estrogen receptor 2 (ER beta),ENSG00000140009,14,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,9,ENSG00000140009,Q92731,"DNA_BIND 149..214; /note=""Nuclear receptor""; /...",MDIKNSPSSLNSPSSYNCSQSILPLEHGSIYIPSSYVDSHHEYPAM...,[149],[214],1,530
9,9,ESRRB,estrogen-related receptor beta,ENSG00000119715,14,"Rare Single Gene Mutation, Genetic Association",2.0,0,,9,ENSG00000119715,O95718,"DNA_BIND 100..186; /note=""Nuclear receptor""; /...",MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,[100],[186],1,433


In [42]:
SFARI_TFs_final_cols = SFARI_TFs_with_known_ADs[["uniprotID", "Start", "End", "ensembl-id"]]
SFARI_TFs_final_cols = SFARI_TFs_final_cols.rename(columns = {"ensembl-id" : "ENSG"})
SFARI_TFs_final_cols

Unnamed: 0,uniprotID,Start,End,ENSG
0,Q9HBZ2,1,717,ENSG00000172379
1,Q96QS3,1,562,ENSG00000004848
2,Q86V15,1,1759,ENSG00000130940
3,Q6P1N0,1,951,ENSG00000132024
4,O94983,1,1202,ENSG00000108509
5,O60479,1,287,ENSG00000064195
6,Q9H4W6,1,596,ENSG00000108001
7,Q06889,1,387,ENSG00000179388
8,Q92731,1,530,ENSG00000140009
9,O95718,1,433,ENSG00000119715


In [43]:
# ADs
known_ADs_on_SFARI_TFs_final_cols = known_ADs_on_SFARI_TFs[["uniprotID", "Start", "End", "From"]]
known_ADs_on_SFARI_TFs_final_cols = known_ADs_on_SFARI_TFs_final_cols.rename(columns = {"From" :"ENSG"})
known_ADs_on_SFARI_TFs_final_cols

Unnamed: 0,uniprotID,Start,End,ENSG
0,Q9HBZ2,524,717,ENSG00000172379
1,Q96QS3,472,562,ENSG00000004848
2,Q6P1N0,22,60,ENSG00000132024
3,O94983,285,468,ENSG00000108509
4,O94983,472,581,ENSG00000108509
...,...,...,...,...
58,P10827,1,52,ENSG00000126351
59,P11473,415,427,ENSG00000111424
60,P11473,195,238,ENSG00000111424
61,P25490,1,69,ENSG00000100811


In [44]:
CDS_uniprot_IDs = set(SFARI_TFs_final_cols["uniprotID"])
AD_uniprot_IDs = set(known_ADs_on_SFARI_TFs_final_cols["uniprotID"])
DBD_uniprot_IDs = set(DBDs["uniprotID"])

In [45]:
AD_uniprot_IDs - CDS_uniprot_IDs

set()

In [46]:
CDS_uniprot_IDs - AD_uniprot_IDs

set()

In [47]:
CDS_uniprot_IDs - DBD_uniprot_IDs

{'O75840',
 'P15884',
 'P19532',
 'P25490',
 'Q06889',
 'Q13422',
 'Q13485',
 'Q14872',
 'Q15788',
 'Q5T1R4',
 'Q6N021',
 'Q6P1N0',
 'Q86V15',
 'Q8NBF1',
 'Q9BXK1',
 'Q9H4W6',
 'Q9HBZ2',
 'Q9UGL1',
 'Q9UL68',
 'Q9Y4A8'}

In [48]:
DBD_uniprot_IDs - AD_uniprot_IDs

set()

In [49]:
AD_uniprot_IDs - DBD_uniprot_IDs

{'O75840',
 'P15884',
 'P19532',
 'P25490',
 'Q06889',
 'Q13422',
 'Q13485',
 'Q14872',
 'Q15788',
 'Q5T1R4',
 'Q6N021',
 'Q6P1N0',
 'Q86V15',
 'Q8NBF1',
 'Q9BXK1',
 'Q9H4W6',
 'Q9HBZ2',
 'Q9UGL1',
 'Q9UL68',
 'Q9Y4A8'}

In [50]:
DBDs_final_cols = DBDs[["uniprotID", "DBD_Start", "DBD_End"]]
DBDs_final_cols = DBDs_final_cols.rename(columns = {"DBD_Start" : "Start", "DBD_End" : "End"})
DBDs_final_cols

Unnamed: 0,uniprotID,Start,End
1,Q96QS3,328,387
4,O94983,30,155
5,O60479,129,188
8,Q92731,149,214
9,O95718,100,186
11,P11308,311,391
17,Q03164,169,180
17,Q03164,217,227
17,Q03164,301,309
18,Q06413,58,86


In [51]:
SFARI_TFs_final_cols.to_csv("../data/SFARI_TFs_with_knownADs_coords_ENSG.csv")
known_ADs_on_SFARI_TFs_final_cols.to_csv("../data/SFARI_ADs_AA_coords_ENSG.csv")
DBDs_final_cols.to_csv("../data/SFARI_TFs_with_known_ADs_DBD_coords_ENSG.csv")