In [3]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%autoreload 2
%aimport AD_predictor_tools
%aimport AD_comparison_tools
%aimport PlottingTools

## Steps to replicate Soto gnomAD analysis
Source: https://www.cell.com/molecular-cell/pdfExtended/S1097-2765(21)00957-6

1. Get list of SFARI TFs
2. Use uniprotID of each TF to find its ENST code
3. Use the ENST code transcript ID to obtain the nucleotide coordinates for the exons
4. To verify, translate all nucleotide coordinates to their respective amino acid sequences and compare to TF's amino acid sequence.
5. Obtain nucleotide coordinates for each domain from their respective amino acid positions
6. Use bedtools intersect to determine the location of variants as either in activation domains, DBD, or the rest of the full length protein.
7. Look at the density of non-synonymous variants
8. Later: replace gnomAD varaint data with the SPARK data

---

1. Get list of SFARI TFs

In [5]:
SFARI_tfs = pd.read_csv("../data/SFARI_TFs.csv")
SFARI_tfs

Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID
0,0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,Q9H2P0
1,1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,Q5TGY3
2,2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9
3,3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2
4,4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
123,123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,Q8N859
124,124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,Q6NX45
125,125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,Q7Z570


2. Use uniprotID of each TF to find its ENST code

In [6]:
# Downloading the uniprot IDs to get the corresponding ENST codes
SFARI_tfs[["uniprotID"]].to_csv("../data/SFARI_tf_uniprotIDs.txt", 
                                                 header=None, index=None, sep=' ', mode='a')

In [7]:
# Loading in the table from uniprot 
SFARI_tf_ensembl_matches = pd.read_csv("../data/SFARI_tf_enst_codes.tsv", sep = "\t")
SFARI_tf_ensembl_matches

Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,ENST00000349014.8;ENST00000371602.9;ENST000003...
1,Q5TGY3,Q5TGY3,ENST00000247087.10;ENST00000374011.6;ENST00000...
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1];
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1];ENST00000527771.5...
4,Q96QS3,Q96QS3,ENST00000379044.5;
...,...,...,...
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1];ENST00000360700.4...
123,Q8N859,Q8N859,ENST00000429591.4;
124,Q6NX45,Q6NX45,ENST00000354377.8;
125,Q7Z570,Q7Z570,ENST00000302277.7;


In [8]:
# Looking at uniprotIDs that have multiple isoforms
multiple_isoforms = SFARI_tf_ensembl_matches[SFARI_tf_ensembl_matches["Ensembl"].str.contains("\[") == True]
multiple_isoforms

Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1];
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1];ENST00000527771.5...
5,P10275,P10275,ENST00000374690.9 [P10275-1];ENST00000504326.5...
6,Q9NR48,Q9NR48,ENST00000368346.7 [Q9NR48-1];ENST00000392403.8...
7,Q9UIF8,Q9UIF8,ENST00000392782.5 [Q9UIF8-5];ENST00000392783.7...
...,...,...,...
119,Q8NEK5,Q8NEK5,ENST00000336128.12 [Q8NEK5-2];ENST00000366197....
120,Q9BR84,Q9BR84,ENST00000317221.11 [Q9BR84-2];ENST00000393883....
121,Q68DY1,Q68DY1,ENST00000291750.6 [Q68DY1-3];ENST00000601440.6...
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1];ENST00000360700.4...


In [9]:
# Looking at uniprotIDs that do not have multiple isoforms
non_isoforms = SFARI_tf_ensembl_matches[SFARI_tf_ensembl_matches["Ensembl"].str.contains("\[") == False]
non_isoforms

Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,ENST00000349014.8;ENST00000371602.9;ENST000003...
1,Q5TGY3,Q5TGY3,ENST00000247087.10;ENST00000374011.6;ENST00000...
4,Q96QS3,Q96QS3,ENST00000379044.5;
12,Q96JM3,Q96JM3,ENST00000361283.4;ENST00000643483.2;ENST000006...
13,Q96RK0,Q96RK0,ENST00000575354.6;
17,O14529,O14529,ENST00000261726.11;
23,P19622,P19622,ENST00000297375.4;
27,P55316,P55316,ENST00000313071.7;
30,Q8NBF1,Q8NBF1,ENST00000312233.4;
33,P31629,P31629,ENST00000012134.7;ENST00000367603.8;ENST000003...


In [10]:
non_isoforms['Ensembl'] = non_isoforms['Ensembl'].str.split(";").str[:-1]
non_isoforms

Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,"[ENST00000349014.8, ENST00000371602.9, ENST000..."
1,Q5TGY3,Q5TGY3,"[ENST00000247087.10, ENST00000374011.6, ENST00..."
4,Q96QS3,Q96QS3,[ENST00000379044.5]
12,Q96JM3,Q96JM3,"[ENST00000361283.4, ENST00000643483.2, ENST000..."
13,Q96RK0,Q96RK0,[ENST00000575354.6]
17,O14529,O14529,[ENST00000261726.11]
23,P19622,P19622,[ENST00000297375.4]
27,P55316,P55316,[ENST00000313071.7]
30,Q8NBF1,Q8NBF1,[ENST00000312233.4]
33,P31629,P31629,"[ENST00000012134.7, ENST00000367603.8, ENST000..."


In [11]:
# Arbitrarily choosing the first transcript to keep 
# since all of the transcripts correspond to the same protein
non_isoforms['ENST'] = non_isoforms['Ensembl'].str[0]
non_isoforms

Unnamed: 0,From,Entry,Ensembl,ENST
0,Q9H2P0,Q9H2P0,"[ENST00000349014.8, ENST00000371602.9, ENST000...",ENST00000349014.8
1,Q5TGY3,Q5TGY3,"[ENST00000247087.10, ENST00000374011.6, ENST00...",ENST00000247087.10
4,Q96QS3,Q96QS3,[ENST00000379044.5],ENST00000379044.5
12,Q96JM3,Q96JM3,"[ENST00000361283.4, ENST00000643483.2, ENST000...",ENST00000361283.4
13,Q96RK0,Q96RK0,[ENST00000575354.6],ENST00000575354.6
17,O14529,O14529,[ENST00000261726.11],ENST00000261726.11
23,P19622,P19622,[ENST00000297375.4],ENST00000297375.4
27,P55316,P55316,[ENST00000313071.7],ENST00000313071.7
30,Q8NBF1,Q8NBF1,[ENST00000312233.4],ENST00000312233.4
33,P31629,P31629,"[ENST00000012134.7, ENST00000367603.8, ENST000...",ENST00000012134.7


In [12]:
# Formatting
non_isoforms = non_isoforms.rename(columns = {"From" : "uniprotID"})
non_isoforms = non_isoforms[["uniprotID", "ENST"]]
non_isoforms

Unnamed: 0,uniprotID,ENST
0,Q9H2P0,ENST00000349014.8
1,Q5TGY3,ENST00000247087.10
4,Q96QS3,ENST00000379044.5
12,Q96JM3,ENST00000361283.4
13,Q96RK0,ENST00000575354.6
17,O14529,ENST00000261726.11
23,P19622,ENST00000297375.4
27,P55316,ENST00000313071.7
30,Q8NBF1,ENST00000312233.4
33,P31629,ENST00000012134.7


In [13]:
# Now, need to figure out which isoform corresponds to the TF
multiple_isoforms["Ensembl"] = multiple_isoforms["Ensembl"].str.split(";").str[:-1]
multiple_isoforms

Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,[ENST00000334344.11 [Q68CP9-1]]
3,Q9HBZ2,Q9HBZ2,"[ENST00000303329.9 [Q9HBZ2-1], ENST00000527771..."
5,P10275,P10275,"[ENST00000374690.9 [P10275-1], ENST00000504326..."
6,Q9NR48,Q9NR48,"[ENST00000368346.7 [Q9NR48-1], ENST00000392403..."
7,Q9UIF8,Q9UIF8,"[ENST00000392782.5 [Q9UIF8-5], ENST00000392783..."
...,...,...,...
119,Q8NEK5,Q8NEK5,"[ENST00000336128.12 [Q8NEK5-2], ENST0000036619..."
120,Q9BR84,Q9BR84,"[ENST00000317221.11 [Q9BR84-2], ENST0000039388..."
121,Q68DY1,Q68DY1,"[ENST00000291750.6 [Q68DY1-3], ENST00000601440..."
122,Q9Y462,Q9Y462,"[ENST00000276123.7 [Q9Y462-1], ENST00000360700..."


In [14]:
# Obtaining all isoform sequences from uniprot
multiple_isoforms[["From"]].to_csv("../data/SFARI_TF_isoform_uniprotIDs.txt", 
                                                 header=None, index=None, sep=' ', mode='a')

In [15]:
isoforms = multiple_isoforms.explode("Ensembl")

In [16]:
isoforms

Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1]
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1]
3,Q9HBZ2,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2]
3,Q9HBZ2,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2]
5,P10275,P10275,ENST00000374690.9 [P10275-1]
...,...,...,...
122,Q9Y462,Q9Y462,ENST00000373165.7 [Q9Y462-1]
122,Q9Y462,Q9Y462,ENST00000674551.1 [Q9Y462-3]
126,Q17R98,Q17R98,ENST00000379448.9 [Q17R98-2]
126,Q17R98,Q17R98,ENST00000508784.6 [Q17R98-1]


In [17]:
isoforms = isoforms.rename(columns = {"From" : "uniprotID"})
isoforms = isoforms[["uniprotID", "Ensembl"]]
isoforms

Unnamed: 0,uniprotID,Ensembl
2,Q68CP9,ENST00000334344.11 [Q68CP9-1]
3,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1]
3,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2]
3,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2]
5,P10275,ENST00000374690.9 [P10275-1]
...,...,...
122,Q9Y462,ENST00000373165.7 [Q9Y462-1]
122,Q9Y462,ENST00000674551.1 [Q9Y462-3]
126,Q17R98,ENST00000379448.9 [Q17R98-2]
126,Q17R98,ENST00000508784.6 [Q17R98-1]


In [18]:
isoforms["isoform_uniprotID"] = isoforms["Ensembl"].str.split("[").str[1].str.split("]").str[0]
isoforms

Unnamed: 0,uniprotID,Ensembl,isoform_uniprotID
2,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1
3,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1
3,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-2
3,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2],Q9HBZ2-2
5,P10275,ENST00000374690.9 [P10275-1],P10275-1
...,...,...,...
122,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1
122,Q9Y462,ENST00000674551.1 [Q9Y462-3],Q9Y462-3
126,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-2
126,Q17R98,ENST00000508784.6 [Q17R98-1],Q17R98-1


In [19]:
isoforms[isoforms["isoform_uniprotID"].str.contains("O00712-1")]

Unnamed: 0,uniprotID,Ensembl,isoform_uniprotID
59,O00712,ENST00000380959.7 [O00712-1],O00712-1


In [20]:
isoform_seqs = AD_predictor_tools.makeFullLengthProteinDF("../data/SFARI_TF_isoforms.fasta")
isoform_seqs

There are 355 proteins


Unnamed: 0,GeneName,AAseq
0,sp|O00712|NFIB_HUMAN Nuclear factor 1 B-type O...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
1,sp|O00712-2|NFIB_HUMAN Isoform 3 of Nuclear fa...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
2,sp|O00712-4|NFIB_HUMAN Isoform 4 of Nuclear fa...,MERIPVSVDFWVVCCAVLKCNPGIPMERIPVSVDFWVVCCAVLKCN...
3,sp|O00712-5|NFIB_HUMAN Isoform 5 of Nuclear fa...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
4,sp|O00712-6|NFIB_HUMAN Isoform 6 of Nuclear fa...,MNSGVNLQRSLSSPPSSKRPKTISIDENMEPSPTGDFYPSPSSPAA...
...,...,...
350,sp|Q9Y462|ZN711_HUMAN Zinc finger protein 711 ...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
351,sp|Q9Y462-2|ZN711_HUMAN Isoform 2 of Zinc fing...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
352,sp|Q9Y462-3|ZN711_HUMAN Isoform 3 of Zinc fing...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
353,sp|Q9Y6X0|SETBP_HUMAN SET-binding protein OS=H...,MESRETLSSSRQRGGESDFLPVSSAKPPAAPGCAGEPLLSTPGPGK...


In [21]:
isoform_seqs["isoform_uniprotID"] = isoform_seqs["GeneName"].str.split("|").str[1]
isoform_seqs

Unnamed: 0,GeneName,AAseq,isoform_uniprotID
0,sp|O00712|NFIB_HUMAN Nuclear factor 1 B-type O...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712
1,sp|O00712-2|NFIB_HUMAN Isoform 3 of Nuclear fa...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712-2
2,sp|O00712-4|NFIB_HUMAN Isoform 4 of Nuclear fa...,MERIPVSVDFWVVCCAVLKCNPGIPMERIPVSVDFWVVCCAVLKCN...,O00712-4
3,sp|O00712-5|NFIB_HUMAN Isoform 5 of Nuclear fa...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712-5
4,sp|O00712-6|NFIB_HUMAN Isoform 6 of Nuclear fa...,MNSGVNLQRSLSSPPSSKRPKTISIDENMEPSPTGDFYPSPSSPAA...,O00712-6
...,...,...,...
350,sp|Q9Y462|ZN711_HUMAN Zinc finger protein 711 ...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
351,sp|Q9Y462-2|ZN711_HUMAN Isoform 2 of Zinc fing...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462-2
352,sp|Q9Y462-3|ZN711_HUMAN Isoform 3 of Zinc fing...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462-3
353,sp|Q9Y6X0|SETBP_HUMAN SET-binding protein OS=H...,MESRETLSSSRQRGGESDFLPVSSAKPPAAPGCAGEPLLSTPGPGK...,Q9Y6X0


In [22]:
isoform_seqs = isoform_seqs[["isoform_uniprotID", "AAseq"]]
isoform_seqs

Unnamed: 0,isoform_uniprotID,AAseq
0,O00712,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
1,O00712-2,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
2,O00712-4,MERIPVSVDFWVVCCAVLKCNPGIPMERIPVSVDFWVVCCAVLKCN...
3,O00712-5,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
4,O00712-6,MNSGVNLQRSLSSPPSSKRPKTISIDENMEPSPTGDFYPSPSSPAA...
...,...,...
350,Q9Y462,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
351,Q9Y462-2,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
352,Q9Y462-3,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
353,Q9Y6X0,MESRETLSSSRQRGGESDFLPVSSAKPPAAPGCAGEPLLSTPGPGK...


In [23]:
# Using uniprot to get the isoform uniprot ID of the canonical sequences
isoform_uniprotIDs = pd.read_csv("../data/SFARI_TF_isoform_uniprotIDs.tsv", sep = "\t")
isoform_uniprotIDs

Unnamed: 0,From,Entry,Alternative products (isoforms)
0,Q68CP9,Q68CP9,ALTERNATIVE PRODUCTS: Event=Alternative splic...
1,Q9HBZ2,Q9HBZ2,ALTERNATIVE PRODUCTS: Event=Alternative splic...
2,P10275,P10275,ALTERNATIVE PRODUCTS: Event=Alternative splic...
3,Q9NR48,Q9NR48,ALTERNATIVE PRODUCTS: Event=Alternative splic...
4,Q9UIF8,Q9UIF8,ALTERNATIVE PRODUCTS: Event=Alternative splic...
...,...,...,...
85,Q8NEK5,Q8NEK5,ALTERNATIVE PRODUCTS: Event=Alternative splic...
86,Q9BR84,Q9BR84,ALTERNATIVE PRODUCTS: Event=Alternative splic...
87,Q68DY1,Q68DY1,ALTERNATIVE PRODUCTS: Event=Alternative splic...
88,Q9Y462,Q9Y462,ALTERNATIVE PRODUCTS: Event=Alternative splic...


In [24]:
isoform_uniprotIDs[isoform_uniprotIDs["From"] == "Q9H334"]["Alternative products (isoforms)"].iloc[0]

'ALTERNATIVE PRODUCTS:  Event=Alternative splicing; Named isoforms=7; Comment=Additional isoforms seem to exist.; Name=1; IsoId=Q9H334-1; Sequence=Displayed; Name=3; IsoId=Q9H334-3; Sequence=VSP_001555, VSP_001556; Name=4; IsoId=Q9H334-4; Sequence=VSP_001555; Name=5; IsoId=Q9H334-5; Sequence=VSP_043462, VSP_043463; Name=6; IsoId=Q9H334-6; Sequence=VSP_001556; Name=7; IsoId=Q9H334-7; Sequence=VSP_046930; Name=8; Synonyms=FOXP1-ES; IsoId=Q9H334-8; Sequence=VSP_057341;'

In [25]:
pat1 = r'IsoId=(.*?);'
isoform_uniprotIDs["canonical_uniprotID"] = isoform_uniprotIDs["Alternative products (isoforms)"].str.extract(pat1)
isoform_uniprotIDs

Unnamed: 0,From,Entry,Alternative products (isoforms),canonical_uniprotID
0,Q68CP9,Q68CP9,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q68CP9-1
1,Q9HBZ2,Q9HBZ2,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9HBZ2-1
2,P10275,P10275,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P10275-1
3,Q9NR48,Q9NR48,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9NR48-1
4,Q9UIF8,Q9UIF8,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9UIF8-1
...,...,...,...,...
85,Q8NEK5,Q8NEK5,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q8NEK5-1
86,Q9BR84,Q9BR84,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9BR84-1
87,Q68DY1,Q68DY1,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q68DY1-1
88,Q9Y462,Q9Y462,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9Y462-1


In [26]:
isoform_uniprotIDs = isoform_uniprotIDs.rename(columns = {"From" : "uniprotID"})
isoform_uniprotIDs

Unnamed: 0,uniprotID,Entry,Alternative products (isoforms),canonical_uniprotID
0,Q68CP9,Q68CP9,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q68CP9-1
1,Q9HBZ2,Q9HBZ2,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9HBZ2-1
2,P10275,P10275,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P10275-1
3,Q9NR48,Q9NR48,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9NR48-1
4,Q9UIF8,Q9UIF8,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9UIF8-1
...,...,...,...,...
85,Q8NEK5,Q8NEK5,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q8NEK5-1
86,Q9BR84,Q9BR84,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9BR84-1
87,Q68DY1,Q68DY1,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q68DY1-1
88,Q9Y462,Q9Y462,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9Y462-1


In [27]:
canonical_IDs = dict(zip(isoform_uniprotIDs["uniprotID"], isoform_uniprotIDs["canonical_uniprotID"]))
canonical_IDs

{'Q68CP9': 'Q68CP9-1',
 'Q9HBZ2': 'Q9HBZ2-1',
 'P10275': 'P10275-1',
 'Q9NR48': 'Q9NR48-1',
 'Q9UIF8': 'Q9UIF8-1',
 'Q9H165': 'Q9H165-1',
 'Q86V15': 'Q86V15-1',
 'Q6P1N0': 'Q6P1N0-1',
 'O94983': 'O94983-1',
 'P49711': 'P49711-1',
 'P39880': 'P39880-1',
 'Q13948': 'Q13948-1',
 'O75398': 'O75398-1',
 'P56179': 'P56179-1',
 'Q07687': 'Q07687-1',
 'Q9H4W6': 'Q9H4W6-1',
 'Q06889': 'Q06889-1',
 'Q92731': 'Q92731-1',
 'O95718': 'O95718-3',
 'Q8TBJ5': 'Q8TBJ5-1',
 'Q9H334': 'Q9H334-1',
 'O15409': 'O15409-1',
 'P11308': 'P11308-4',
 'P78347': 'P78347-1',
 'Q5T1R4': 'Q5T1R4-1',
 'Q13422': 'Q13422-1',
 'Q9Y2K7': 'Q9Y2K7-1',
 'Q9UGL1': 'Q9UGL1-1',
 'O75840': 'O75840-1',
 'Q03164': 'Q03164-1',
 'Q8NHM5': 'Q8NHM5-1',
 'O60663': 'O60663-1',
 'Q9UIS9': 'Q9UIS9-1',
 'O95983': 'O95983-1',
 'O95243': 'O95243-1',
 'P51608': 'P51608-1',
 'Q06413': 'Q06413-1',
 'O14770': 'O14770-1',
 'Q9UL68': 'Q9UL68-1',
 'Q15788': 'Q15788-1',
 'Q12857': 'Q12857-1',
 'O00712': 'O00712-1',
 'Q14938': 'Q14938-1',
 'P08235': 

In [28]:
# for uniprotID in canonical_IDs.keys():
#     pat = "^" + uniprotID + "$"
#     repl = canonical_IDs[uniprotID]
#     isoform_seqs["isoform_uniprotID"] = isoform_seqs["isoform_uniprotID"].str.replace(pat, repl)

In [29]:
isoform_seqs["isoform_uniprotID"] = isoform_seqs["isoform_uniprotID"].replace(canonical_IDs)
isoform_seqs

Unnamed: 0,isoform_uniprotID,AAseq
0,O00712-1,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
1,O00712-2,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
2,O00712-4,MERIPVSVDFWVVCCAVLKCNPGIPMERIPVSVDFWVVCCAVLKCN...
3,O00712-5,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
4,O00712-6,MNSGVNLQRSLSSPPSSKRPKTISIDENMEPSPTGDFYPSPSSPAA...
...,...,...
350,Q9Y462-1,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
351,Q9Y462-2,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
352,Q9Y462-3,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
353,Q9Y6X0-1,MESRETLSSSRQRGGESDFLPVSSAKPPAAPGCAGEPLLSTPGPGK...


In [30]:
isoform_seqs = isoform_seqs[isoform_seqs["isoform_uniprotID"].isin(isoforms["isoform_uniprotID"])]
isoform_seqs

Unnamed: 0,isoform_uniprotID,AAseq
0,O00712-1,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
3,O00712-5,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...
4,O00712-6,MNSGVNLQRSLSSPPSSKRPKTISIDENMEPSPTGDFYPSPSSPAA...
5,O14770-1,MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...
6,O14770-2,MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...
...,...,...
348,Q9Y458-1,MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...
350,Q9Y462-1,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
352,Q9Y462-3,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
353,Q9Y6X0-1,MESRETLSSSRQRGGESDFLPVSSAKPPAAPGCAGEPLLSTPGPGK...


In [31]:
isoforms

Unnamed: 0,uniprotID,Ensembl,isoform_uniprotID
2,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1
3,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1
3,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-2
3,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2],Q9HBZ2-2
5,P10275,ENST00000374690.9 [P10275-1],P10275-1
...,...,...,...
122,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1
122,Q9Y462,ENST00000674551.1 [Q9Y462-3],Q9Y462-3
126,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-2
126,Q17R98,ENST00000508784.6 [Q17R98-1],Q17R98-1


In [32]:
isoforms = pd.merge(isoforms, isoform_seqs, on = "isoform_uniprotID", how = "left")
isoforms

Unnamed: 0,uniprotID,Ensembl,isoform_uniprotID,AAseq
0,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...
1,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...
2,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-2,MASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKRRSGMDFDDEDG...
3,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2],Q9HBZ2-2,MASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKRRSGMDFDDEDG...
4,P10275,ENST00000374690.9 [P10275-1],P10275-1,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...
...,...,...,...,...
428,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
429,Q9Y462,ENST00000674551.1 [Q9Y462-3],Q9Y462-3,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
430,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-2,MPRRKQEQPKRLPSHVSRQEEAEGELSEGEHWYGNSSETPSEASYG...
431,Q17R98,ENST00000508784.6 [Q17R98-1],Q17R98-1,MPRRKQEQPKRLPSHVSRQEEAEGELSEGEHWYGNSSETPSEASYG...


In [33]:
isoforms[isoforms["uniprotID"] == "P11308"]

Unnamed: 0,uniprotID,Ensembl,isoform_uniprotID,AAseq
94,P11308,ENST00000288319.12 [P11308-4],P11308-4,MASTIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSSDYGQTSK...
95,P11308,ENST00000398897.5 [P11308-2],P11308-2,MVGSPDTVGMNYGSYMEEKHMPPPNMTTNERRVIVPADPTLWSTDH...
96,P11308,ENST00000398911.5 [P11308-1],P11308-1,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...
97,P11308,ENST00000398919.6 [P11308-3],P11308-3,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...
98,P11308,ENST00000417133.6 [P11308-3],P11308-3,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...
99,P11308,ENST00000442448.5 [P11308-1],P11308-1,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...


In [34]:
isoforms[isoforms["uniprotID"] == "P11308"]

Unnamed: 0,uniprotID,Ensembl,isoform_uniprotID,AAseq
94,P11308,ENST00000288319.12 [P11308-4],P11308-4,MASTIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSSDYGQTSK...
95,P11308,ENST00000398897.5 [P11308-2],P11308-2,MVGSPDTVGMNYGSYMEEKHMPPPNMTTNERRVIVPADPTLWSTDH...
96,P11308,ENST00000398911.5 [P11308-1],P11308-1,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...
97,P11308,ENST00000398919.6 [P11308-3],P11308-3,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...
98,P11308,ENST00000417133.6 [P11308-3],P11308-3,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...
99,P11308,ENST00000442448.5 [P11308-1],P11308-1,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...


In [35]:
SFARI_tfs

Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID
0,0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,Q9H2P0
1,1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,Q5TGY3
2,2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9
3,3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2
4,4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
123,123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,Q8N859
124,124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,Q6NX45
125,125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,Q7Z570


In [36]:
isoforms_TFs_seq_merged = pd.merge(isoforms, SFARI_tfs, 
                                   left_on = "AAseq", right_on = "ProteinSeq", how = "left")
isoforms_TFs_seq_merged

Unnamed: 0.1,uniprotID_x,Ensembl,isoform_uniprotID,AAseq,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID_y
0,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,2.0,9.0,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1.0,,12.0,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9
1,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,3.0,9.0,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0.0,,15.0,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2
2,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-2,MASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKRRSGMDFDDEDG...,,,,,,,,,,,,,,
3,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2],Q9HBZ2-2,MASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKRRSGMDFDDEDG...,,,,,,,,,,,,,,
4,P10275,ENST00000374690.9 [P10275-1],P10275-1,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,5.0,9.0,AR,androgen receptor,,X,Genetic Association,2.0,0.0,,6.0,sp|P10275|ANDR_HUMAN,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,P10275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,122.0,9.0,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0.0,,5.0,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
429,Q9Y462,ENST00000674551.1 [Q9Y462-3],Q9Y462-3,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,,,,,,,,,,,,,,
430,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-2,MPRRKQEQPKRLPSHVSRQEEAEGELSEGEHWYGNSSETPSEASYG...,,,,,,,,,,,,,,
431,Q17R98,ENST00000508784.6 [Q17R98-1],Q17R98-1,MPRRKQEQPKRLPSHVSRQEEAEGELSEGEHWYGNSSETPSEASYG...,126.0,9.0,ZNF827,Zinc finger protein 827,ENSG00000151612,4,"Rare Single Gene Mutation, Genetic Association",2.0,0.0,,3.0,sp|Q17R98|ZN827_HUMAN,MPRRKQEQPKRLPSHVSRQEEAEGELSEGEHWYGNSSETPSEASYG...,Q17R98


In [37]:
isoforms_TFs_seq_merged = isoforms_TFs_seq_merged[~isoforms_TFs_seq_merged["ProteinSeq"].isna()]
isoforms_TFs_seq_merged

Unnamed: 0.1,uniprotID_x,Ensembl,isoform_uniprotID,AAseq,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID_y
0,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,2.0,9.0,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1.0,,12.0,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9
1,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,3.0,9.0,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0.0,,15.0,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2
4,P10275,ENST00000374690.9 [P10275-1],P10275-1,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,5.0,9.0,AR,androgen receptor,,X,Genetic Association,2.0,0.0,,6.0,sp|P10275|ANDR_HUMAN,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,P10275
6,P10275,ENST00000612452.5 [P10275-1],P10275-1,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,5.0,9.0,AR,androgen receptor,,X,Genetic Association,2.0,0.0,,6.0,sp|P10275|ANDR_HUMAN,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,P10275
7,Q9NR48,ENST00000368346.7 [Q9NR48-1],Q9NR48-1,MDPRNTAMLGLGSDSEGFSRKSPSAISTGTLVSKREVELEKNTKEE...,6.0,9.0,ASH1L,"Ash1 (absent, small, or homeotic)-like (Drosop...",ENSG00000116539,1,"Rare Single Gene Mutation, Syndromic, Genetic ...",1.0,0.0,14.15,29.0,sp|Q9NR48|ASH1L_HUMAN,MDPRNTAMLGLGSDSEGFSRKSPSAISTGTLVSKREVELEKNTKEE...,Q9NR48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,Q9BR84,ENST00000603380.6 [Q9BR84-1],Q9BR84-1,MVAGWLTNYSQDSVTFEDVAVDFTQEEWTLLDQTQRNLYRDVMLEN...,120.0,9.0,ZNF559,Zinc finger protein 559,ENSG00000188321,19,Rare Single Gene Mutation,2.0,0.0,,8.0,sp|Q9BR84|ZN559_HUMAN,MVAGWLTNYSQDSVTFEDVAVDFTQEEWTLLDQTQRNLYRDVMLEN...,Q9BR84
425,Q68DY1,ENST00000601440.6 [Q68DY1-1],Q68DY1-1,MGPLQFRDVAIEFSLEEWHCLDTAQRNLYRNVMLENYSNLVFLGIT...,121.0,9.0,ZNF626,zinc finger protein 626,ENSG00000188171,19,Rare Single Gene Mutation,2.0,0.0,,3.0,sp|Q68DY1|ZN626_HUMAN,MGPLQFRDVAIEFSLEEWHCLDTAQRNLYRNVMLENYSNLVFLGIT...,Q68DY1
426,Q9Y462,ENST00000276123.7 [Q9Y462-1],Q9Y462-1,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,122.0,9.0,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0.0,,5.0,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
428,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,122.0,9.0,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0.0,,5.0,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462


In [38]:
# Arbitrarily choose the first of each isoform_uniprotID
isoforms_with_seqs = isoforms_TFs_seq_merged.groupby("isoform_uniprotID").agg(lambda sf: sf.iloc[0])
isoforms_with_seqs = isoforms_with_seqs.reset_index()
isoforms_with_seqs

Unnamed: 0.1,isoform_uniprotID,uniprotID_x,Ensembl,AAseq,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID_y
0,O00712-1,O00712,ENST00000380959.7 [O00712-1],MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,59.0,9.0,NFIB,nuclear factor I B,ENSG00000147862,9,"Rare Single Gene Mutation, Syndromic",2.0,1.0,,5.0,sp|O00712|NFIB_HUMAN,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712
1,O14770-1,O14770,ENST00000561208.6 [O14770-1],MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...,50.0,9.0,MEIS2,Meis homeobox 2,ENSG00000134138,15,"Rare Single Gene Mutation, Syndromic",1.0,1.0,7.50,12.0,sp|O14770|MEIS2_HUMAN,MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...,O14770
2,O15266-1,O15266,ENST00000381578.6 [O15266-1],MEELTAFVSKSFDQKSKDGNGGGGGGGGKKDSITYREVLESGLARS...,86.0,9.0,SHOX,short stature homeobox,ENSG00000185960,"X,Y",Rare Single Gene Mutation,2.0,0.0,,2.0,sp|O15266|SHOX_HUMAN,MEELTAFVSKSFDQKSKDGNGGGGGGGGKKDSITYREVLESGLARS...,O15266
3,O15409-1,O15409,ENST00000350908.9 [O15409-1],MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVST...,29.0,9.0,FOXP2,forkhead box P2,ENSG00000128573,7,"Rare Single Gene Mutation, Genetic Association...",1.0,0.0,,52.0,sp|O15409|FOXP2_HUMAN,MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVST...,O15409
4,O43435-1,O43435,ENST00000329705.11 [O43435-1],MHFSTVTRDMEAFTASSLSSLGAAGGFPGAASPGADPYGPREPPPP...,95.0,9.0,TBX1,T-box 1,ENSG00000184058,22,"Rare Single Gene Mutation, Syndromic, Functional",,1.0,,6.0,sp|O43435|TBX1_HUMAN,MHFSTVTRDMEAFTASSLSSLGAAGGFPGAASPGADPYGPREPPPP...,O43435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,Q9UPW6-1,Q9UPW6,ENST00000260926.9 [Q9UPW6-1],MERRSESPCLRDSPDRRSGSPDVKGPPPVKVARLEQNGSPMGARGR...,81.0,9.0,SATB2,SATB homeobox 2,ENSG00000119042,2,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,1.0,24.95,41.0,sp|Q9UPW6|SATB2_HUMAN,MERRSESPCLRDSPDRRSGSPDVKGPPPVKVARLEQNGSPMGARGR...,Q9UPW6
82,Q9Y2K7-1,Q9Y2K7,ENST00000529006.7 [Q9Y2K7-1],MEPEEERIRYSQRLRGTMRRRYEDDGISDDEIEGKRTFDLEEKLHT...,37.0,9.0,KDM2A,lysine demethylase 2A,ENSG00000173120,11,Rare Single Gene Mutation,3.0,0.0,,8.0,sp|Q9Y2K7|KDM2A_HUMAN,MEPEEERIRYSQRLRGTMRRRYEDDGISDDEIEGKRTFDLEEKLHT...,Q9Y2K7
83,Q9Y458-1,Q9Y458,ENST00000373294.8 [Q9Y458-1],MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...,96.0,9.0,TBX22,T-box transcription factor 22,ENSG00000122145,X,Rare Single Gene Mutation,3.0,0.0,,2.0,sp|Q9Y458|TBX22_HUMAN,MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...,Q9Y458
84,Q9Y462-1,Q9Y462,ENST00000276123.7 [Q9Y462-1],MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,122.0,9.0,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0.0,,5.0,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462


In [39]:
multiple_isoforms

Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,[ENST00000334344.11 [Q68CP9-1]]
3,Q9HBZ2,Q9HBZ2,"[ENST00000303329.9 [Q9HBZ2-1], ENST00000527771..."
5,P10275,P10275,"[ENST00000374690.9 [P10275-1], ENST00000504326..."
6,Q9NR48,Q9NR48,"[ENST00000368346.7 [Q9NR48-1], ENST00000392403..."
7,Q9UIF8,Q9UIF8,"[ENST00000392782.5 [Q9UIF8-5], ENST00000392783..."
...,...,...,...
119,Q8NEK5,Q8NEK5,"[ENST00000336128.12 [Q8NEK5-2], ENST0000036619..."
120,Q9BR84,Q9BR84,"[ENST00000317221.11 [Q9BR84-2], ENST0000039388..."
121,Q68DY1,Q68DY1,"[ENST00000291750.6 [Q68DY1-3], ENST00000601440..."
122,Q9Y462,Q9Y462,"[ENST00000276123.7 [Q9Y462-1], ENST00000360700..."


In [40]:
set(multiple_isoforms["From"]) - set(isoforms_with_seqs["uniprotID_x"])

{'O43151', 'O95718', 'P56179', 'Q569K4'}

In [41]:
lambert = pd.read_csv("../data/LambertTFs.csv")
lambert

Unnamed: 0.1,Unnamed: 0,GeneName,ProteinSeq
0,0,sp|P23511|NFYA_HUMAN,MEQYTANSNSSTEQIVVQAGQIQQQQQGGVTAVQLQTEAQVASASG...
1,1,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...
2,2,sp|P31270|HXA11_HUMAN,MDFDERGPCSSNMYLPSCTYYVSGPDFSSLPSFLPQTPSSRPMTYS...
3,3,sp|P50221|MEOX1_HUMAN,MDPAASSCMRSLQPPAPVWGCLRNPHSEGNGASGLPHYPPTPFSFH...
4,4,sp|P57073|SOX8_HUMAN,MLDMSEARSQPPCSPSGTASSMSHVEDSDSDAPPSPAGSEGLGRAG...
...,...,...,...
1603,1603,sp|P35227|PCGF2_HUMAN,MHRTTRIKITELNPHLMCALCGGYFIDATTIVECLHSFCKTCIVRY...
1604,1604,sp|Q9BS34|ZN670_HUMAN,MDSVSFEDVAVAFTQEEWALLDPSQKNLYRDVMQEIFRNLASVGNK...
1605,1605,sp|P17098|ZNF8_HUMAN,MDPEDEGVAGVMSVGPPAARLQEPVTFRDVAVDFTQEEWGQLDPTQ...
1606,1606,sp|Q9UJW7|ZN229_HUMAN,METLTSRHEKRALHSQASAISQDREEKIMSQEPLSFKDVAVVFTEE...


In [42]:
for uniprotID in set(multiple_isoforms["From"]) - set(isoforms_with_seqs["uniprotID_x"]):
    print("Unmatched uniprotID: " + uniprotID)
    print("TF length:")
    print("\t" + str(len(SFARI_tfs[SFARI_tfs["uniprotID"] == uniprotID]["ProteinSeq"].iloc[0])))
    print("Isoform length(s):")
    for AAseq in isoform_seqs[isoform_seqs["isoform_uniprotID"].str.contains(uniprotID)]["AAseq"]:
        print("\t" + str(len(AAseq)))
    # print(len(lambert[lambert["GeneName"].str.contains(uniprotID)]["ProteinSeq"].iloc[0]))
    print("---")

Unmatched uniprotID: P56179
TF length:
	175
Isoform length(s):
	293
---
Unmatched uniprotID: O95718
TF length:
	433
Isoform length(s):
	508
	500
---
Unmatched uniprotID: Q569K4
TF length:
	471
Isoform length(s):
	395
	369
---
Unmatched uniprotID: O43151
TF length:
	1660
Isoform length(s):
	1795
---


The problem for the four above is that the correct isoform does not have a corresponding ENST code- will have to proceed without these four for now.

In [43]:
isoforms_with_seqs

Unnamed: 0.1,isoform_uniprotID,uniprotID_x,Ensembl,AAseq,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID_y
0,O00712-1,O00712,ENST00000380959.7 [O00712-1],MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,59.0,9.0,NFIB,nuclear factor I B,ENSG00000147862,9,"Rare Single Gene Mutation, Syndromic",2.0,1.0,,5.0,sp|O00712|NFIB_HUMAN,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712
1,O14770-1,O14770,ENST00000561208.6 [O14770-1],MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...,50.0,9.0,MEIS2,Meis homeobox 2,ENSG00000134138,15,"Rare Single Gene Mutation, Syndromic",1.0,1.0,7.50,12.0,sp|O14770|MEIS2_HUMAN,MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...,O14770
2,O15266-1,O15266,ENST00000381578.6 [O15266-1],MEELTAFVSKSFDQKSKDGNGGGGGGGGKKDSITYREVLESGLARS...,86.0,9.0,SHOX,short stature homeobox,ENSG00000185960,"X,Y",Rare Single Gene Mutation,2.0,0.0,,2.0,sp|O15266|SHOX_HUMAN,MEELTAFVSKSFDQKSKDGNGGGGGGGGKKDSITYREVLESGLARS...,O15266
3,O15409-1,O15409,ENST00000350908.9 [O15409-1],MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVST...,29.0,9.0,FOXP2,forkhead box P2,ENSG00000128573,7,"Rare Single Gene Mutation, Genetic Association...",1.0,0.0,,52.0,sp|O15409|FOXP2_HUMAN,MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVST...,O15409
4,O43435-1,O43435,ENST00000329705.11 [O43435-1],MHFSTVTRDMEAFTASSLSSLGAAGGFPGAASPGADPYGPREPPPP...,95.0,9.0,TBX1,T-box 1,ENSG00000184058,22,"Rare Single Gene Mutation, Syndromic, Functional",,1.0,,6.0,sp|O43435|TBX1_HUMAN,MHFSTVTRDMEAFTASSLSSLGAAGGFPGAASPGADPYGPREPPPP...,O43435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,Q9UPW6-1,Q9UPW6,ENST00000260926.9 [Q9UPW6-1],MERRSESPCLRDSPDRRSGSPDVKGPPPVKVARLEQNGSPMGARGR...,81.0,9.0,SATB2,SATB homeobox 2,ENSG00000119042,2,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,1.0,24.95,41.0,sp|Q9UPW6|SATB2_HUMAN,MERRSESPCLRDSPDRRSGSPDVKGPPPVKVARLEQNGSPMGARGR...,Q9UPW6
82,Q9Y2K7-1,Q9Y2K7,ENST00000529006.7 [Q9Y2K7-1],MEPEEERIRYSQRLRGTMRRRYEDDGISDDEIEGKRTFDLEEKLHT...,37.0,9.0,KDM2A,lysine demethylase 2A,ENSG00000173120,11,Rare Single Gene Mutation,3.0,0.0,,8.0,sp|Q9Y2K7|KDM2A_HUMAN,MEPEEERIRYSQRLRGTMRRRYEDDGISDDEIEGKRTFDLEEKLHT...,Q9Y2K7
83,Q9Y458-1,Q9Y458,ENST00000373294.8 [Q9Y458-1],MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...,96.0,9.0,TBX22,T-box transcription factor 22,ENSG00000122145,X,Rare Single Gene Mutation,3.0,0.0,,2.0,sp|Q9Y458|TBX22_HUMAN,MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...,Q9Y458
84,Q9Y462-1,Q9Y462,ENST00000276123.7 [Q9Y462-1],MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,122.0,9.0,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0.0,,5.0,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462


In [44]:
isoforms_cleaned = isoforms_with_seqs[["uniprotID_x", "Ensembl"]]
isoforms_cleaned["Ensembl"] = isoforms_cleaned["Ensembl"].str.split(" ").str[0]
isoforms_cleaned = isoforms_cleaned.rename(columns = {"uniprotID_x" : "uniprotID",
                                                     "Ensembl" : "ENST"})
isoforms_cleaned

Unnamed: 0,uniprotID,ENST
0,O00712,ENST00000380959.7
1,O14770,ENST00000561208.6
2,O15266,ENST00000381578.6
3,O15409,ENST00000350908.9
4,O43435,ENST00000329705.11
...,...,...
81,Q9UPW6,ENST00000260926.9
82,Q9Y2K7,ENST00000529006.7
83,Q9Y458,ENST00000373294.8
84,Q9Y462,ENST00000276123.7


In [45]:
non_isoforms_cleaned = non_isoforms.reset_index(drop = True)
non_isoforms_cleaned

Unnamed: 0,uniprotID,ENST
0,Q9H2P0,ENST00000349014.8
1,Q5TGY3,ENST00000247087.10
2,Q96QS3,ENST00000379044.5
3,Q96JM3,ENST00000361283.4
4,Q96RK0,ENST00000575354.6
5,O14529,ENST00000261726.11
6,P19622,ENST00000297375.4
7,P55316,ENST00000313071.7
8,Q8NBF1,ENST00000312233.4
9,P31629,ENST00000012134.7


In [46]:
uniprotID_ENST_mapping_df = pd.concat([isoforms_cleaned, non_isoforms_cleaned])
uniprotID_ENST_mapping_df

Unnamed: 0,uniprotID,ENST
0,O00712,ENST00000380959.7
1,O14770,ENST00000561208.6
2,O15266,ENST00000381578.6
3,O15409,ENST00000350908.9
4,O43435,ENST00000329705.11
...,...,...
31,O95365,ENST00000322357.9
32,Q6ZMY9,ENST00000359971.4
33,Q8N859,ENST00000429591.4
34,Q6NX45,ENST00000354377.8


In [47]:
set(SFARI_tfs["uniprotID"]) - set(uniprotID_ENST_mapping_df["uniprotID"])
# P49639 has no ensembl transcript code
# The other four do not have an ensembl transcript codes that corresponds to the correct isoform

{'O43151', 'O95718', 'P49639', 'P56179', 'Q569K4'}

In [48]:
#uniprotID_ENST_mapping_df.to_csv("../data/SFARI_tf_ENST_codes.csv")

In [49]:
SFARI_TFs_with_ENST = pd.merge(SFARI_tfs, uniprotID_ENST_mapping_df, on = "uniprotID")
SFARI_TFs_with_ENST

Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
0,0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,Q9H2P0,ENST00000349014.8
1,1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,Q5TGY3,ENST00000247087.10
2,2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9,ENST00000334344.11
3,3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2,ENST00000303329.9
4,4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3,ENST00000379044.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462,ENST00000276123.7
118,123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,Q8N859,ENST00000429591.4
119,124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,Q6NX45,ENST00000354377.8
120,125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,Q7Z570,ENST00000302277.7


In [50]:
#SFARI_TFs_with_ENST.to_csv("../data/SFARI_TFs_with_ENST.csv")

In [51]:
SFARI_TFs_with_ENST[SFARI_TFs_with_ENST["uniprotID"] == "P11308"]

Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
29,31,9,ERG,"ERG, ETS transcription factor",ENSG00000157554,21,Genetic Association,2.0,0,,1,sp|P11308|ERG_HUMAN,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...,P11308,ENST00000398919.6


In [60]:
df1 = pd.read_csv("../data/SFARI_TFs_with_ENST_corrected.csv", index_col = 0).sort_values("uniprotID").reset_index(drop = True)
df1

Unnamed: 0,uniprotID,ENST
0,O00712,ENST00000380959.7
1,O14529,ENST00000261726.11
2,O14770,ENST00000561208.6
3,O15266,ENST00000381578.6
4,O15409,ENST00000350908.9
...,...,...
117,Q9Y2K7,ENST00000529006.7
118,Q9Y458,ENST00000373294.8
119,Q9Y462,ENST00000276123.7
120,Q9Y4A8,ENST00000056233.4


In [63]:
df2 = uniprotID_ENST_mapping_df.sort_values("uniprotID").reset_index(drop = True)
df2

Unnamed: 0,uniprotID,ENST
0,O00712,ENST00000380959.7
1,O14529,ENST00000261726.11
2,O14770,ENST00000561208.6
3,O15266,ENST00000381578.6
4,O15409,ENST00000350908.9
...,...,...
117,Q9Y2K7,ENST00000529006.7
118,Q9Y458,ENST00000373294.8
119,Q9Y462,ENST00000276123.7
120,Q9Y4A8,ENST00000056233.4


In [66]:
results = df1 == df2
print(len(results))
print(sum(results["uniprotID"]))
print(sum(results["ENST"]))

122
122
120


In [67]:
results[results["ENST"] == False]

Unnamed: 0,uniprotID,ENST
19,True,False
65,True,False


In [68]:
df1.iloc[19]

uniprotID                P11308
ENST         ENST00000288319.12
Name: 19, dtype: object

In [69]:
df2.iloc[19]

uniprotID               P11308
ENST         ENST00000398919.6
Name: 19, dtype: object