In [2]:
%load_ext autoreload

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%autoreload 2
%aimport AD_predictor_tools
%aimport AD_comparison_tools
%aimport PlottingTools

**Which SPARK transcription factors have no effector domains?**

In [4]:
# Load every source of known effector domains (EDs) and reduce each one to the
# same five columns so they can be merged later:
# GSL, Staller activity data, Soto ED list, Nicole (Stanford) hits.
# Every table is an explicit .copy() so the column assignments below (and the
# in-place edits in the merging cell) act on an independent frame rather than
# on a slice, which avoids pandas' SettingWithCopyWarning.
ED_COLUMNS = ["GeneName", "Start", "End", "uniprotID", "Reference"]

# GSL
GSL = pd.read_csv("../data/GSL.csv")[ED_COLUMNS].copy()
GSL["Reference"] = GSL["Reference"] + ", GSL"

# Active predictions
activity_data = pd.read_csv('../data/Staller2021/SupplementalDataSet4_ActivityData_PredictedADs_renorm20210708_uniprotIDs_added.csv', index_col = 0)
activity_data = activity_data[activity_data["RegionType"] == "Prediction"]
# NOTE(review): 221 is an unexplained Activity_mean cutoff -- document where it comes from.
active = activity_data.loc[activity_data["Activity_mean"] > 221,
                           ["GeneName", "Start", "End", "uniprotID"]].copy()
active["Reference"] = "Staller Activity Data"
# Keep only the text between the second "|" and the following "_".
# NOTE(review): presumably GeneName looks like "sp|P23511|NFYA_HUMAN", which would
# make this the UniProt entry mnemonic rather than the HGNC symbol -- confirm.
active["GeneName"] = active["GeneName"].str.extract(r'\|.*\|(.*)_')

# Downloading Soto effector domain list (not just activation domains)
# From: https://pubmed.ncbi.nlm.nih.gov/34863368/
Soto = pd.read_excel("../data/Soto_supplemental_table", sheet_name = 'Table S2')
Soto = Soto.rename(columns = {"Uniprot ID" : "uniprotID"})
Soto["GeneName"] = ""  # left blank here; this table is not read for a gene name
# NOTE(review): string concatenation assumes "Reference (PMID)" is read as text -- confirm.
Soto["Reference"] = "PMID: " + Soto["Reference (PMID)"] + ", Soto " + Soto["Domain type"]
# "Coordinates" is split on "-" into start and end; both stay strings at this point.
coordinate_parts = Soto["Coordinates"].str.split("-")
Soto["Start"] = coordinate_parts.str[0]
Soto["End"] = coordinate_parts.str[1]
Soto = Soto[ED_COLUMNS].copy()

# Nicole
stanford_path = "../data/Stanford_Supplementary Table 2 Domains from Tiles.xlsx"
# First sheet (pandas default), dropping its last three columns.
Stanford_ADs = pd.read_excel(stanford_path).iloc[:, :-3]
Stanford_ADs["Reference"] = "Stanford AD"

# "Repression Domains" sheet, dropping its last two columns.
Stanford_RDs = pd.read_excel(stanford_path, sheet_name = "Repression Domains").iloc[:, :-2]
Stanford_RDs["Reference"] = "Stanford RD"

Stanford = pd.concat([Stanford_ADs, Stanford_RDs])
Stanford = Stanford.rename(columns = {"UniProt ID" : "uniprotID", "HGNC symbol" : "GeneName"})
Stanford = Stanford[ED_COLUMNS].copy()

In [5]:
# Inspect the combined Stanford activation- and repression-domain table.
Stanford

Unnamed: 0,GeneName,Start,End,uniprotID,Reference
0,ABRAXAS1,121,200,Q6UWZ7,Stanford AD
1,AHR,531,640,P35869,Stanford AD
2,AHR,641,720,P35869,Stanford AD
3,AKAP8,1,80,O43823,Stanford AD
4,AKAP8L,1,80,Q9ULX6,Stanford AD
...,...,...,...,...,...
3895,ZSCAN5C,81,200,A6NGD5,Stanford RD
3896,ZSCAN9,21,120,O15535,Stanford RD
3897,ZUFSP,271,350,P10070,Stanford RD
3898,ZXDC,591,730,Q2QGD7,Stanford RD


In [6]:
# Combine the four source tables into a single list of known effector domains.
# Each table is edited in place first: coordinates are cast to int, and empty
# "List" and "AD name" columns are added before the merge helper is called.
for ed_table in (GSL, Soto, Stanford, active):
    for coordinate_column in ("Start", "End"):
        ed_table[coordinate_column] = ed_table[coordinate_column].astype(int)
    for blank_column in ("List", "AD name"):
        ed_table[blank_column] = ""

known_EDs = AD_comparison_tools.return_merged_list([GSL, Soto, Stanford, active])
known_EDs

Unnamed: 0,GeneName,AD name,Start,End,uniprotID,Reference,List
0,TADA2A,,11,90,A0A024R0Y4,Stanford RD,
1,TADA2A,,111,210,A0A024R0Y4,Stanford AD,
2,BHLHE23,,161,240,A0A087WXG3,Stanford AD,
3,SMARCA1,/,11,140,A0A0A0MRP6,Stanford RD / Stanford RD,/
4,SMARCA1,,561,650,A0A0A0MRP6,Stanford RD,
...,...,...,...,...,...,...,...
3315,,,263,446,Q9Y6X8,"PMID: 12741956, Soto RD",
3316,ZHX2,,1,80,Q9Y6X8,Stanford RD,
3317,CAMTA1,,911,1050,Q9Y6Y1,Stanford RD,
3318,CAMTA1,/ / / / / /,421,910,Q9Y6Y1,Stanford AD / Stanford RD / Stanford RD / Stan...,/ / / / / /


In [7]:
# Adding primary gene names to the Known ED list using uniprotIDs:
# write the UniProt IDs, one per line with no header or index, to a text file.
# The file is overwritten (default mode='w'); the previous mode='a' appended
# another full copy of the list every time this cell was re-run.
# Duplicate IDs are dropped since one line per protein is enough for a lookup.
(
    known_EDs[["uniprotID"]]
    .drop_duplicates()
    .to_csv("../data/known_ED_uniprotIDs.txt", header=False, index=False, sep=' ')
)

In [8]:
# Load the uniprotID -> primary gene name table and normalise its column names;
# the "Entry" column is not needed and is dropped.
known_ED_gene_names = (
    pd.read_csv("../data/known_ED_gene_names.tsv", sep="\t")
    .rename(columns={"From": "uniprotID", "Gene Names (primary)": "gene-symbol"})
    .drop(columns="Entry")
)
known_ED_gene_names

Unnamed: 0,uniprotID,gene-symbol
0,A0A024R0Y4,TADA2A
1,A0A087WXG3,BHLHE23
2,A1YPR0,ZBTB7C
3,A2RRD8,ZNF320
4,A6NFD8,HELT
...,...,...
781,Q9Y6F1,PARP3
782,Q9Y6J9,TAF6L
783,Q9Y6Q9,NCOA3
784,Q9Y6X8,ZHX2


In [9]:
# Attach the primary gene symbol to every known ED; the left join keeps EDs
# whose uniprotID has no entry in the mapping (gene-symbol is NaN for those).
known_EDs_with_gene_symbol = known_EDs.merge(
    known_ED_gene_names,
    how="left",
    on="uniprotID",
)
known_EDs_with_gene_symbol

Unnamed: 0,GeneName,AD name,Start,End,uniprotID,Reference,List,gene-symbol
0,TADA2A,,11,90,A0A024R0Y4,Stanford RD,,TADA2A
1,TADA2A,,111,210,A0A024R0Y4,Stanford AD,,TADA2A
2,BHLHE23,,161,240,A0A087WXG3,Stanford AD,,BHLHE23
3,SMARCA1,/,11,140,A0A0A0MRP6,Stanford RD / Stanford RD,/,
4,SMARCA1,,561,650,A0A0A0MRP6,Stanford RD,,
...,...,...,...,...,...,...,...,...
3315,,,263,446,Q9Y6X8,"PMID: 12741956, Soto RD",,ZHX2
3316,ZHX2,,1,80,Q9Y6X8,Stanford RD,,ZHX2
3317,CAMTA1,,911,1050,Q9Y6Y1,Stanford RD,,CAMTA1
3318,CAMTA1,/ / / / / /,421,910,Q9Y6Y1,Stanford AD / Stanford RD / Stanford RD / Stan...,/ / / / / /,CAMTA1


In [10]:
# Show the known EDs that did not receive a gene symbol from the mapping.
known_EDs_with_gene_symbol.loc[known_EDs_with_gene_symbol["gene-symbol"].isna()]

Unnamed: 0,GeneName,AD name,Start,End,uniprotID,Reference,List,gene-symbol
3,SMARCA1,/,11,140,A0A0A0MRP6,Stanford RD / Stanford RD,/,
4,SMARCA1,,561,650,A0A0A0MRP6,Stanford RD,,
5,SMARCA1,,881,960,A0A0A0MRP6,Stanford RD,,
6,ZNF107,,351,430,A0A0B4J2G0,Stanford RD,,
7,ZNF107,,1,80,A0A0B4J2G0,Stanford RD,,
...,...,...,...,...,...,...,...,...
3299,HCFC2,,521,610,Q9Y5Z7,Stanford RD,,
3307,USP3,,111,190,Q9Y6I4,Stanford RD,,
3308,USP3,,421,510,Q9Y6I4,Stanford RD,,
3313,SETBP1,,361,440,Q9Y6X0,Stanford RD,,


In [11]:
# Replacing null gene symbols with values from the GeneName column.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df["gene-symbol"] is chained assignment, which
# raises a FutureWarning in pandas 2.x and does not modify the DataFrame under
# Copy-on-Write.
known_EDs_with_gene_symbol["gene-symbol"] = known_EDs_with_gene_symbol["gene-symbol"].fillna(
    known_EDs_with_gene_symbol["GeneName"]
)
known_EDs_with_gene_symbol

Unnamed: 0,GeneName,AD name,Start,End,uniprotID,Reference,List,gene-symbol
0,TADA2A,,11,90,A0A024R0Y4,Stanford RD,,TADA2A
1,TADA2A,,111,210,A0A024R0Y4,Stanford AD,,TADA2A
2,BHLHE23,,161,240,A0A087WXG3,Stanford AD,,BHLHE23
3,SMARCA1,/,11,140,A0A0A0MRP6,Stanford RD / Stanford RD,/,SMARCA1
4,SMARCA1,,561,650,A0A0A0MRP6,Stanford RD,,SMARCA1
...,...,...,...,...,...,...,...,...
3315,,,263,446,Q9Y6X8,"PMID: 12741956, Soto RD",,ZHX2
3316,ZHX2,,1,80,Q9Y6X8,Stanford RD,,ZHX2
3317,CAMTA1,,911,1050,Q9Y6Y1,Stanford RD,,CAMTA1
3318,CAMTA1,/ / / / / /,421,910,Q9Y6Y1,Stanford AD / Stanford RD / Stanford RD / Stan...,/ / / / / /,CAMTA1


In [12]:
# Gene symbols associated with the known effector domains.
# This is the "gene-symbol" column itself (a pandas Series, one entry per ED row,
# so symbols repeat); it is only used for membership tests with .isin() below.
known_ED_gene_names_list = known_EDs_with_gene_symbol["gene-symbol"]
known_ED_gene_names_list

0        TADA2A
1        TADA2A
2       BHLHE23
3       SMARCA1
4       SMARCA1
         ...   
3315       ZHX2
3316       ZHX2
3317     CAMTA1
3318     CAMTA1
3319     CAMTA1
Name: gene-symbol, Length: 3320, dtype: object

In [13]:
# Loading in SFARI genes (SFARI Gene 01-23-2023 release, exported 03-21-2023,
# per the file name). The "gene-symbol" column is the key used for matching below.
SFARI = pd.read_csv("../data/SFARI-Gene_genes_01-23-2023release_03-21-2023export.csv")
SFARI

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
0,9,ABAT,4-aminobutyrate aminotransferase,ENSG00000183044,16,"Rare Single Gene Mutation, Genetic Association",2.0,0,,7
1,9,ABCA10,"ATP-binding cassette, sub-family A (ABC1), mem...",ENSG00000154263,17,Rare Single Gene Mutation,2.0,0,,2
2,9,ABCA13,ATP binding cassette subfamily A member 13,ENSG00000179869,7,"Rare Single Gene Mutation, Functional",2.0,0,,10
3,9,ABCA7,"ATP-binding cassette, sub-family A (ABC1), mem...",ENSG00000064687,19,Rare Single Gene Mutation,2.0,0,,5
4,9,ABL2,"ABL proto-oncogene 2, non-receptor tyrosine ki...",ENSG00000143322,1,"Rare Single Gene Mutation, Functional",3.0,0,,10
...,...,...,...,...,...,...,...,...,...,...
1113,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4
1114,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16
1115,9,ZNF827,Zinc finger protein 827,ENSG00000151612,4,"Rare Single Gene Mutation, Genetic Association",2.0,0,,3
1116,9,ZSWIM6,zinc finger SWIM-type containing 6,ENSG00000130449,5,"Rare Single Gene Mutation, Syndromic, Genetic ...",,1,,6


In [7]:
# Load the Lambert TF table and derive the UniProt accession, i.e. the second
# "|"-separated field of GeneName (e.g. "sp|P23511|NFYA_HUMAN" -> "P23511").
lambert_tfs = pd.read_csv("../data/LambertTFs.csv", index_col=0)
gene_name_fields = lambert_tfs["GeneName"].str.split("|")
lambert_tfs["uniprotID"] = gene_name_fields.str[1]
lambert_tfs

Unnamed: 0,GeneName,ProteinSeq,uniprotID
0,sp|P23511|NFYA_HUMAN,MEQYTANSNSSTEQIVVQAGQIQQQQQGGVTAVQLQTEAQVASASG...,P23511
1,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3
2,sp|P31270|HXA11_HUMAN,MDFDERGPCSSNMYLPSCTYYVSGPDFSSLPSFLPQTPSSRPMTYS...,P31270
3,sp|P50221|MEOX1_HUMAN,MDPAASSCMRSLQPPAPVWGCLRNPHSEGNGASGLPHYPPTPFSFH...,P50221
4,sp|P57073|SOX8_HUMAN,MLDMSEARSQPPCSPSGTASSMSHVEDSDSDAPPSPAGSEGLGRAG...,P57073
...,...,...,...
1603,sp|P35227|PCGF2_HUMAN,MHRTTRIKITELNPHLMCALCGGYFIDATTIVECLHSFCKTCIVRY...,P35227
1604,sp|Q9BS34|ZN670_HUMAN,MDSVSFEDVAVAFTQEEWALLDPSQKNLYRDVMQEIFRNLASVGNK...,Q9BS34
1605,sp|P17098|ZNF8_HUMAN,MDPEDEGVAGVMSVGPPAARLQEPVTFRDVAVDFTQEEWGQLDPTQ...,P17098
1606,sp|Q9UJW7|ZN229_HUMAN,METLTSRHEKRALHSQASAISQDREEKIMSQEPLSFKDVAVVFTEE...,Q9UJW7


In [8]:
# Adding primary gene names to the lambert TFs using uniprotIDs:
# write the UniProt IDs, one per line with no header or index, to a text file.
# The file is overwritten (default mode='w'); the previous mode='a' appended
# another full copy of the list every time this cell was re-run.
# Duplicate IDs are dropped since one line per protein is enough for a lookup.
(
    lambert_tfs[["uniprotID"]]
    .drop_duplicates()
    .to_csv("../data/lambert_tfs_uniprotIDs.txt", header=False, index=False, sep=' ')
)

In [9]:
# Load the uniprotID -> primary gene name table for the Lambert TFs and
# normalise its column names; the "Entry" column is not needed and is dropped.
lambert_gene_names = (
    pd.read_csv("../data/lambert_tfs_gene_names.tsv", sep="\t")
    .rename(columns={"From": "uniprotID", "Gene Names (primary)": "gene-symbol"})
    .drop(columns="Entry")
)
lambert_gene_names

Unnamed: 0,uniprotID,gene-symbol
0,P23511,NFYA
1,Q96QS3,ARX
2,P31270,HOXA11
3,P50221,MEOX1
4,P57073,SOX8
...,...,...
1603,P35227,PCGF2
1604,Q9BS34,ZNF670
1605,P17098,ZNF8
1606,Q9UJW7,ZNF229


In [10]:
# Add the primary gene symbol to each Lambert TF (default inner join on uniprotID,
# so TFs without a mapping entry are dropped).
# Caution: this overwrites lambert_tfs, so re-running the cell without reloading
# lambert_tfs merges a second time and produces suffixed gene-symbol columns.
lambert_tfs = lambert_tfs.merge(lambert_gene_names, on="uniprotID")
lambert_tfs

Unnamed: 0,GeneName,ProteinSeq,uniprotID,gene-symbol
0,sp|P23511|NFYA_HUMAN,MEQYTANSNSSTEQIVVQAGQIQQQQQGGVTAVQLQTEAQVASASG...,P23511,NFYA
1,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3,ARX
2,sp|P31270|HXA11_HUMAN,MDFDERGPCSSNMYLPSCTYYVSGPDFSSLPSFLPQTPSSRPMTYS...,P31270,HOXA11
3,sp|P50221|MEOX1_HUMAN,MDPAASSCMRSLQPPAPVWGCLRNPHSEGNGASGLPHYPPTPFSFH...,P50221,MEOX1
4,sp|P57073|SOX8_HUMAN,MLDMSEARSQPPCSPSGTASSMSHVEDSDSDAPPSPAGSEGLGRAG...,P57073,SOX8
...,...,...,...,...
1603,sp|P35227|PCGF2_HUMAN,MHRTTRIKITELNPHLMCALCGGYFIDATTIVECLHSFCKTCIVRY...,P35227,PCGF2
1604,sp|Q9BS34|ZN670_HUMAN,MDSVSFEDVAVAFTQEEWALLDPSQKNLYRDVMQEIFRNLASVGNK...,Q9BS34,ZNF670
1605,sp|P17098|ZNF8_HUMAN,MDPEDEGVAGVMSVGPPAARLQEPVTFRDVAVDFTQEEWGQLDPTQ...,P17098,ZNF8
1606,sp|Q9UJW7|ZN229_HUMAN,METLTSRHEKRALHSQASAISQDREEKIMSQEPLSFKDVAVVFTEE...,Q9UJW7,ZNF229


In [18]:
# Keep the SFARI genes whose symbol matches a Lambert transcription factor.
is_lambert_tf = SFARI["gene-symbol"].isin(lambert_tfs["gene-symbol"])
SFARI_tfs = SFARI.loc[is_lambert_tf]
SFARI_tfs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
16,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64
31,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24
60,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12
61,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15
62,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24
...,...,...,...,...,...,...,...,...,...,...
1111,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5
1112,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3
1113,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4
1114,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16


In [19]:
# Bring the Lambert columns (GeneName, ProteinSeq, uniprotID) onto the SFARI TF
# rows, matching on gene symbol and keeping every SFARI row.
SFARI_tfs = SFARI_tfs.merge(
    lambert_tfs,
    how="left",
    on="gene-symbol",
)
SFARI_tfs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID
0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,Q9H2P0
1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,Q5TGY3
2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,Q8N859
124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,Q6NX45
125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,Q7Z570


In [20]:
# Write the SFARI TF table to ../data/SFARI_TFs.csv. index=False is not passed,
# so the DataFrame index is written as the first, unnamed column.
SFARI_tfs.to_csv("../data/SFARI_TFs.csv")

In [21]:
# SFARI TFs whose gene symbol does not occur among the known effector domains.
has_known_ED = SFARI_tfs["gene-symbol"].isin(known_ED_gene_names_list)
no_ED_SFARI_tfs = SFARI_tfs[~has_known_ED]
no_ED_SFARI_tfs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID
47,9,MBD6,Methyl-CpG binding domain protein 6,ENSG00000166987,12,Rare Single Gene Mutation,2.0,0,,3,sp|Q96DN6|MBD6_HUMAN,MNGGNESSGADRAGGPVATSVPIGWQRCVREGAVLYISPSGTELSS...,Q96DN6
73,9,POU3F3,POU class 3 homeobox 3,ENSG00000198914,2,"Rare Single Gene Mutation, Syndromic",,1,,2,sp|P20264|PO3F3_HUMAN,MATAASNPYLPGNSLLAAGSIVHSDAAGAGGGGGGGGGGGGGGAGG...,P20264
75,9,RFX3,regulatory factor X3,ENSG00000080298,9,"Rare Single Gene Mutation, Syndromic",1.0,0,15.95,10,sp|P48380|RFX3_HUMAN,MQTSETGSDTGSTVTLQTSVASQAAVPTQVVQQVPVQQQVQQVQTV...,P48380
87,9,SKI,SKIproto-oncogene,ENSG00000157933,1,Rare Single Gene Mutation,1.0,0,4.25,5,sp|P12755|SKI_HUMAN,MEAAAGGRGCFQPHPGLQKTLEQFHLSSMSSLGGPAAFSARWAQEA...,P12755
114,9,ZNF18,zinc finger protein 18,ENSG00000154957,17,Rare Single Gene Mutation,2.0,0,,2,sp|P17022|ZNF18_HUMAN,MPVDLGQALGLLPSLAKAEDSQFSESDAALQEELSSPETARQLFRQ...,P17022
118,9,ZNF517,Zinc finger protein 517,ENSG00000197363,8,Rare Single Gene Mutation,2.0,0,,5,sp|Q6ZMY9|ZN517_HUMAN,MAMALPMPGPQEAVVFEDVAVYFTRIEWSCLAPDQQALYRDVMLEN...,Q6ZMY9
119,9,ZNF548,zinc finger protein 548,ENSG00000188785,19,Rare Single Gene Mutation,2.0,0,,4,sp|Q8NEK5|ZN548_HUMAN,MNLTEGRVVFEDVAIYFSQEEWGHLDEAQRLLYRDVMLENLALLSS...,Q8NEK5
120,9,ZNF559,Zinc finger protein 559,ENSG00000188321,19,Rare Single Gene Mutation,2.0,0,,8,sp|Q9BR84|ZN559_HUMAN,MVAGWLTNYSQDSVTFEDVAVDFTQEEWTLLDQTQRNLYRDVMLEN...,Q9BR84
121,9,ZNF626,zinc finger protein 626,ENSG00000188171,19,Rare Single Gene Mutation,2.0,0,,3,sp|Q68DY1|ZN626_HUMAN,MGPLQFRDVAIEFSLEEWHCLDTAQRNLYRNVMLENYSNLVFLGIT...,Q68DY1
123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,Q8N859


In [22]:
# Number of SFARI TF rows with no known effector domain.
len(no_ED_SFARI_tfs)

11

In [23]:
# Total number of SFARI TF rows, for comparison with the count above.
len(SFARI_tfs)

127

Adding sequences using lambert tfs

In [27]:
# Make sure every row carries the Lambert columns (notably ProteinSeq), then mark
# the whole protein as the region of interest: Start = 1, End = sequence length.
#
# no_ED_SFARI_tfs is derived from SFARI_tfs, which was already merged with
# lambert_tfs, so GeneName / ProteinSeq / uniprotID are normally present.
# Merging all of lambert_tfs again would rename them to ProteinSeq_x /
# ProteinSeq_y, uniprotID_x / uniprotID_y, etc., which turns the later
# "ProteinSeq" -> "Sequence" rename into a no-op and makes the later "uniprotID"
# lookups fail. Only columns that are actually missing are merged in, which also
# makes this cell safe to re-run.
missing_lambert_columns = [
    column for column in lambert_tfs.columns if column not in no_ED_SFARI_tfs.columns
]
if missing_lambert_columns:
    no_ED_SFARI_tfs = pd.merge(
        no_ED_SFARI_tfs,
        lambert_tfs[["gene-symbol"] + missing_lambert_columns],
        on = "gene-symbol",
        how = "left",
    )
# assign() returns a new frame, so nothing is written into a slice of SFARI_tfs.
no_ED_SFARI_tfs = no_ED_SFARI_tfs.assign(
    Start = 1,
    End = no_ED_SFARI_tfs["ProteinSeq"].str.len(),
)

In [None]:
# Expose the protein sequence under the column name "Sequence".
# rename() silently ignores labels that are absent, so this only has an effect
# when a column named exactly "ProteinSeq" exists.
sequence_column_rename = {"ProteinSeq": "Sequence"}
no_ED_SFARI_tfs = no_ED_SFARI_tfs.rename(columns=sequence_column_rename)
no_ED_SFARI_tfs

Did Nicole screen any of the TFs that have no effector domains?

In [None]:
# Load the "CRTF Proteins Summary" sheet of DelRosso 2023 supplementary table 1.
# NOTE(review): the path has no file extension -- confirm it matches the file on disk.
screened = pd.read_excel("../data/DelRosso_2023_supp_table_1", sheet_name = "CRTF Proteins Summary")
screened

In [None]:
# SFARI TFs with no known effector domain whose gene symbol is also absent from
# the screened-protein table ("HGNC Symbol" column), i.e. not screened by Nicole.
was_screened = no_ED_SFARI_tfs["gene-symbol"].isin(screened["HGNC Symbol"])
no_ED_or_screened = no_ED_SFARI_tfs[~was_screened]
no_ED_or_screened

In [None]:
# Taipale 2022 paper - ZNF626 not in ORFeome screen
# Tycko 2022 list - tiling repressors supplemental sheet in table s4 does not have any of the TFs above

How many tiles would be needed to cover the 56 SPARK TFs with no known EDs using 40-AA tiles spaced every 10 AA?

In [None]:
# Tile all SFARI TFs with no known ED: window_size 40, window_spacing 10.
# NOTE(review): presumably the helper reads the "Sequence", "Start" and "End" columns -- confirm.
AD_predictor_tools.makeTilingDF_fromDF(no_ED_SFARI_tfs, window_size = 40, window_spacing = 10)

In [None]:
# Same table, denser tiling: window_size 40, window_spacing 5.
AD_predictor_tools.makeTilingDF_fromDF(no_ED_SFARI_tfs, window_size = 40, window_spacing = 5)

In [None]:
# Tile only the TFs that have no known ED and were not screened: window_size 40, window_spacing 10.
AD_predictor_tools.makeTilingDF_fromDF(no_ED_or_screened, window_size = 40, window_spacing = 10)

In [None]:
# Same unscreened subset, denser tiling: window_size 40, window_spacing 5.
AD_predictor_tools.makeTilingDF_fromDF(no_ED_or_screened, window_size = 40, window_spacing = 5)

---

**Which SPARK transcription factors have really long known ADs?**

In [None]:
# Stack the four source tables as-is (no merging of overlapping regions) and
# record each domain's length, counting both end points (End - Start + 1).
known_EDs_unmerged = pd.concat([GSL, Soto, Stanford, active])
domain_lengths = known_EDs_unmerged["End"] - known_EDs_unmerged["Start"] + 1
known_EDs_unmerged["Length"] = domain_lengths
known_EDs_unmerged

In [None]:
# Attach the primary gene symbol to every unmerged ED; the left join keeps EDs
# whose uniprotID has no entry in the mapping (gene-symbol is NaN for those).
known_EDs_unmerged_with_gene_symbol = known_EDs_unmerged.merge(
    known_ED_gene_names,
    how="left",
    on="uniprotID",
)
known_EDs_unmerged_with_gene_symbol

In [None]:
# Show the unmerged EDs that did not receive a gene symbol from the mapping.
known_EDs_unmerged_with_gene_symbol.loc[
    known_EDs_unmerged_with_gene_symbol["gene-symbol"].isna()
]

In [None]:
# Replacing null gene symbols with values from the GeneName column.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df["gene-symbol"] is chained assignment, which
# raises a FutureWarning in pandas 2.x and does not modify the DataFrame under
# Copy-on-Write.
known_EDs_unmerged_with_gene_symbol["gene-symbol"] = (
    known_EDs_unmerged_with_gene_symbol["gene-symbol"]
    .fillna(known_EDs_unmerged_with_gene_symbol["GeneName"])
)
known_EDs_unmerged_with_gene_symbol

In [None]:
# Median length (End - Start + 1) over all unmerged known EDs.
np.median(known_EDs_unmerged["Length"])

In [None]:
# 85th percentile of the unmerged known-ED lengths.
np.percentile(known_EDs_unmerged["Length"], q = 85)

In [None]:
# Distribution of unmerged known-ED lengths, 20 bins.
known_EDs_unmerged["Length"].hist(bins = 20)

In [None]:
# Unmerged known EDs longer than 80 residues.
known_EDs_unmerged_with_gene_symbol.loc[
    known_EDs_unmerged_with_gene_symbol["Length"] > 80
]

In [None]:
# "Long" known EDs: unmerged domains longer than 150 residues.
is_long_domain = known_EDs_unmerged_with_gene_symbol["Length"] > 150
long_known_EDs_unmerged_with_gene_symbol = known_EDs_unmerged_with_gene_symbol[is_long_domain]
long_known_EDs_unmerged_with_gene_symbol

In [None]:
# SFARI TFs whose gene symbol has at least one long (> 150 residue) known ED.
SFARI_tfs.loc[
    SFARI_tfs["gene-symbol"].isin(long_known_EDs_unmerged_with_gene_symbol["gene-symbol"])
]

## Checking for KRAB domains

In [None]:
# Re-display the SFARI TFs with no known effector domain before checking them for KRAB domains.
no_ED_SFARI_tfs

In [None]:
# Print one UniProt ID per line.
# NOTE(review): presumably this list is pasted into UniProt to produce
# ../data/no_ED_SFARI_TF_domains.tsv, which is read below -- confirm.
# This requires a column named exactly "uniprotID" in no_ED_SFARI_tfs.
for uniprotID in no_ED_SFARI_tfs["uniprotID"]:
    print(uniprotID)

In [None]:
# Load the per-protein domain annotations and flag rows whose "Domain [FT]"
# text mentions KRAB. Rows with a missing annotation get NaN, not False.
domains = pd.read_csv("../data/no_ED_SFARI_TF_domains.tsv", sep="\t")
mentions_KRAB = domains["Domain [FT]"].str.contains("KRAB")
domains["KRAB"] = mentions_KRAB
domains

In [None]:
# Keep only the accession and the KRAB flag, naming the accession column
# "uniprotID" so it can serve as the merge key.
domains = domains[["From", "KRAB"]].rename(columns={"From": "uniprotID"})
domains

In [None]:
# Add the KRAB flag to each TF without a known ED; the left join keeps TFs that
# have no row in the domain table (KRAB is NaN for those).
no_ED_SFARI_tfs_KRAB_status = no_ED_SFARI_tfs.merge(
    domains,
    how="left",
    on="uniprotID",
)
no_ED_SFARI_tfs_KRAB_status

In [None]:
# TFs not flagged as KRAB-containing. Comparing against True (rather than
# negating the column) also keeps rows whose KRAB value is missing.
not_flagged_KRAB = no_ED_SFARI_tfs_KRAB_status["KRAB"] != True
no_ED_SFARI_tfs_KRAB_status[not_flagged_KRAB]