In [1]:
%load_ext autoreload

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%autoreload 2
#%aimport AD_predictor_tools
# %aimport AD_comparison_tools
# %aimport PlottingTools

## Steps to replicate Soto gnomAD analysis
Source: https://www.cell.com/molecular-cell/pdfExtended/S1097-2765(21)00957-6

1. Get list of SFARI TFs
2. Use uniprotID of each TF to find its ENST code
3. Use the ENST code transcript ID to obtain the nucleotide coordinates for the exons
4. To verify, translate all nucleotide coordinates to their respective amino acid sequences and compare to TF's amino acid sequence.
5. Obtain nucleotide coordinates for each domain from their respective amino acid positions
6. Use bedtools intersect to determine the location of variants as either in activation domains, DBD, or the rest of the full length protein.
7. Look at the density of non-synonymous variants
8. Later: replace gnomAD varaint data with the SPARK data

---

1. Get list of SFARI TFs

### Dataset generated at: SFARI/soto_analysis/notebooks/Adding%20isoforms.ipynb
SFARI_tfs = pd.read_csv("../data/SFARI_TFs_with_known_ADs_isoforms.csv")
SFARI_tfs

2. Use uniprotID of each TF to find its ENST code

In [3]:
# # Downloading the uniprot IDs to get the corresponding ENST codes
# SFARI_tfs[["uniprotID"]].to_csv("../data/SFARI_tf_uniprotIDs.txt", 
#                                                  header=None, index=None, sep=' ', mode='a')

In [106]:
# Loading in the table from uniprot 
SFARI_tf_ensembl_matches = pd.read_csv("../data/SFARI_TF_ENST_codes.txt", sep = "\t")
SFARI_tf_ensembl_matches

Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,ENST00000349014.8;ENST00000371602.9;ENST000003...
1,Q5TGY3,Q5TGY3,ENST00000247087.10;ENST00000374011.6;ENST00000...
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1];
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1];ENST00000527771.5...
4,Q96QS3,Q96QS3,ENST00000379044.5;
...,...,...,...
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1];ENST00000360700.4...
123,Q8N859,Q8N859,ENST00000429591.4;
124,Q6NX45,Q6NX45,ENST00000354377.8;
125,Q7Z570,Q7Z570,ENST00000302277.7;


In [107]:
# Looking at uniprotIDs that have multiple isoforms
multiple_isoforms = SFARI_tf_ensembl_matches[SFARI_tf_ensembl_matches["Ensembl"].str.contains("\[") == True]
multiple_isoforms

Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1];
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1];ENST00000527771.5...
5,P10275,P10275,ENST00000374690.9 [P10275-1];ENST00000504326.5...
6,Q9NR48,Q9NR48,ENST00000368346.7 [Q9NR48-1];ENST00000392403.8...
7,Q9UIF8,Q9UIF8,ENST00000392782.5 [Q9UIF8-5];ENST00000392783.7...
...,...,...,...
119,Q8NEK5,Q8NEK5,ENST00000336128.12 [Q8NEK5-2];ENST00000366197....
120,Q9BR84,Q9BR84,ENST00000317221.11 [Q9BR84-2];ENST00000393883....
121,Q68DY1,Q68DY1,ENST00000291750.6 [Q68DY1-3];ENST00000601440.6...
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1];ENST00000360700.4...


In [108]:
# Looking at uniprotIDs that do not have multiple isoforms
non_isoforms = SFARI_tf_ensembl_matches[SFARI_tf_ensembl_matches["Ensembl"].str.contains("\[") == False]
non_isoforms

Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,ENST00000349014.8;ENST00000371602.9;ENST000003...
1,Q5TGY3,Q5TGY3,ENST00000247087.10;ENST00000374011.6;ENST00000...
4,Q96QS3,Q96QS3,ENST00000379044.5;
12,Q96JM3,Q96JM3,ENST00000361283.4;ENST00000643483.2;ENST000006...
13,Q96RK0,Q96RK0,ENST00000575354.6;
17,O14529,O14529,ENST00000261726.11;
23,P19622,P19622,ENST00000297375.4;
27,P55316,P55316,ENST00000313071.7;ENST00000706482.1;
30,Q8NBF1,Q8NBF1,ENST00000312233.4;
33,P31629,P31629,ENST00000012134.7;ENST00000367603.8;ENST000003...


In [109]:
non_isoforms['Ensembl'] = non_isoforms['Ensembl'].str.split(";").str[:-1]
non_isoforms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_isoforms['Ensembl'] = non_isoforms['Ensembl'].str.split(";").str[:-1]


Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,"[ENST00000349014.8, ENST00000371602.9, ENST000..."
1,Q5TGY3,Q5TGY3,"[ENST00000247087.10, ENST00000374011.6, ENST00..."
4,Q96QS3,Q96QS3,[ENST00000379044.5]
12,Q96JM3,Q96JM3,"[ENST00000361283.4, ENST00000643483.2, ENST000..."
13,Q96RK0,Q96RK0,[ENST00000575354.6]
17,O14529,O14529,[ENST00000261726.11]
23,P19622,P19622,[ENST00000297375.4]
27,P55316,P55316,"[ENST00000313071.7, ENST00000706482.1]"
30,Q8NBF1,Q8NBF1,[ENST00000312233.4]
33,P31629,P31629,"[ENST00000012134.7, ENST00000367603.8, ENST000..."


In [110]:
# Arbitrarily choosing the first transcript to keep 
# since all of the transcripts correspond to the same protein
non_isoforms['ENST'] = non_isoforms['Ensembl'].str[0]
non_isoforms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_isoforms['ENST'] = non_isoforms['Ensembl'].str[0]


Unnamed: 0,From,Entry,Ensembl,ENST
0,Q9H2P0,Q9H2P0,"[ENST00000349014.8, ENST00000371602.9, ENST000...",ENST00000349014.8
1,Q5TGY3,Q5TGY3,"[ENST00000247087.10, ENST00000374011.6, ENST00...",ENST00000247087.10
4,Q96QS3,Q96QS3,[ENST00000379044.5],ENST00000379044.5
12,Q96JM3,Q96JM3,"[ENST00000361283.4, ENST00000643483.2, ENST000...",ENST00000361283.4
13,Q96RK0,Q96RK0,[ENST00000575354.6],ENST00000575354.6
17,O14529,O14529,[ENST00000261726.11],ENST00000261726.11
23,P19622,P19622,[ENST00000297375.4],ENST00000297375.4
27,P55316,P55316,"[ENST00000313071.7, ENST00000706482.1]",ENST00000313071.7
30,Q8NBF1,Q8NBF1,[ENST00000312233.4],ENST00000312233.4
33,P31629,P31629,"[ENST00000012134.7, ENST00000367603.8, ENST000...",ENST00000012134.7


In [111]:
# Formatting
non_isoforms = non_isoforms.rename(columns = {"From" : "uniprotID"})
non_isoforms = non_isoforms[["uniprotID", "ENST"]]
non_isoforms

Unnamed: 0,uniprotID,ENST
0,Q9H2P0,ENST00000349014.8
1,Q5TGY3,ENST00000247087.10
4,Q96QS3,ENST00000379044.5
12,Q96JM3,ENST00000361283.4
13,Q96RK0,ENST00000575354.6
17,O14529,ENST00000261726.11
23,P19622,ENST00000297375.4
27,P55316,ENST00000313071.7
30,Q8NBF1,ENST00000312233.4
33,P31629,ENST00000012134.7


In [112]:
# Now, need to figure out which isoform corresponds to the TF
multiple_isoforms["Ensembl"] = multiple_isoforms["Ensembl"].str.split(";").str[:-1]
multiple_isoforms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_isoforms["Ensembl"] = multiple_isoforms["Ensembl"].str.split(";").str[:-1]


Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,[ENST00000334344.11 [Q68CP9-1]]
3,Q9HBZ2,Q9HBZ2,"[ENST00000303329.9 [Q9HBZ2-1], ENST00000527771..."
5,P10275,P10275,"[ENST00000374690.9 [P10275-1], ENST00000504326..."
6,Q9NR48,Q9NR48,"[ENST00000368346.7 [Q9NR48-1], ENST00000392403..."
7,Q9UIF8,Q9UIF8,"[ENST00000392782.5 [Q9UIF8-5], ENST00000392783..."
...,...,...,...
119,Q8NEK5,Q8NEK5,"[ENST00000336128.12 [Q8NEK5-2], ENST0000036619..."
120,Q9BR84,Q9BR84,"[ENST00000317221.11 [Q9BR84-2], ENST0000039388..."
121,Q68DY1,Q68DY1,"[ENST00000291750.6 [Q68DY1-3], ENST00000601440..."
122,Q9Y462,Q9Y462,"[ENST00000276123.7 [Q9Y462-1], ENST00000360700..."


In [113]:
# # Obtaining all isoform sequences from uniprot
# multiple_isoforms[["From"]].to_csv("../data/SFARI_TF_isoform_uniprotIDs.txt", 
#                                                  header=None, index=None, sep=' ', mode='a')

In [114]:
isoforms = multiple_isoforms.explode("Ensembl")

In [115]:
isoforms["uniprotID"] = isoforms["Ensembl"].str.extract("\[(.*)\]")

In [116]:
isoforms = isoforms.drop_duplicates(subset=['uniprotID'])
isoforms

Unnamed: 0,From,Entry,Ensembl,uniprotID
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1
3,Q9HBZ2,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-2
5,P10275,P10275,ENST00000374690.9 [P10275-1],P10275-1
5,P10275,P10275,ENST00000504326.5 [P10275-3],P10275-3
...,...,...,...,...
121,Q68DY1,Q68DY1,ENST00000601440.6 [Q68DY1-1],Q68DY1-1
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1],Q9Y462-1
122,Q9Y462,Q9Y462,ENST00000360700.4 [Q9Y462-3],Q9Y462-3
126,Q17R98,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-2


In [117]:
isoforms["ENST"] = isoforms["Ensembl"].str.split(" ").str[0]
isoforms

Unnamed: 0,From,Entry,Ensembl,uniprotID,ENST
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1,ENST00000334344.11
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1,ENST00000303329.9
3,Q9HBZ2,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-2,ENST00000527771.5
5,P10275,P10275,ENST00000374690.9 [P10275-1],P10275-1,ENST00000374690.9
5,P10275,P10275,ENST00000504326.5 [P10275-3],P10275-3,ENST00000504326.5
...,...,...,...,...,...
121,Q68DY1,Q68DY1,ENST00000601440.6 [Q68DY1-1],Q68DY1-1,ENST00000601440.6
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1],Q9Y462-1,ENST00000276123.7
122,Q9Y462,Q9Y462,ENST00000360700.4 [Q9Y462-3],Q9Y462-3,ENST00000360700.4
126,Q17R98,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-2,ENST00000379448.9


In [118]:
isoforms = isoforms[["uniprotID", "ENST"]]

In [119]:
isoforms

Unnamed: 0,uniprotID,ENST
2,Q68CP9-1,ENST00000334344.11
3,Q9HBZ2-1,ENST00000303329.9
3,Q9HBZ2-2,ENST00000527771.5
5,P10275-1,ENST00000374690.9
5,P10275-3,ENST00000504326.5
...,...,...
121,Q68DY1-1,ENST00000601440.6
122,Q9Y462-1,ENST00000276123.7
122,Q9Y462-3,ENST00000360700.4
126,Q17R98-2,ENST00000379448.9


In [120]:
# Putting together the two
all_mappings = pd.concat([isoforms, non_isoforms])
all_mappings

Unnamed: 0,uniprotID,ENST
2,Q68CP9-1,ENST00000334344.11
3,Q9HBZ2-1,ENST00000303329.9
3,Q9HBZ2-2,ENST00000527771.5
5,P10275-1,ENST00000374690.9
5,P10275-3,ENST00000504326.5
...,...,...
111,O95365,ENST00000322357.9
118,Q6ZMY9,ENST00000359971.4
123,Q8N859,ENST00000429591.4
124,Q6NX45,ENST00000354377.8


In [121]:
all_mappings.to_csv("../data/uniprotID_ENST_SFARI_TF_mappings.csv")

In [122]:
# Now adding to SFARI TFs

In [123]:
SFARI_TFs = pd.read_csv("../data/SFARI_TFs_with_isoform_id.csv")

In [124]:
SFARI_TFs_with_ENST = pd.merge(SFARI_TFs, all_mappings, how = "left", on = "uniprotID")
SFARI_TFs_with_ENST

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID,ENST
0,0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,['Q9H2P0'],1,Q9H2P0,ENST00000349014.8
1,1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,['Q5TGY3'],1,Q5TGY3,ENST00000247087.10
2,2,ARID2,1,1835,Q68CP9-1,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,['Q68CP9'],1,Q68CP9,ENST00000334344.11
3,3,ARNT2,1,717,Q9HBZ2-1,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,['Q9HBZ2'],1,Q9HBZ2,ENST00000303329.9
4,4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,['Q96QS3'],1,Q96QS3,ENST00000379044.5
...,...,...,...,...,...,...,...,...,...,...,...
122,122,ZNF711,1,761,Q9Y462-1,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,['Q9Y462'],1,Q9Y462,ENST00000276123.7
123,123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859,
124,124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,['Q6NX45'],1,Q6NX45,ENST00000354377.8
125,125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,['Q7Z570'],1,Q7Z570,ENST00000302277.7


In [125]:
SFARI_TFs_with_ENST[SFARI_TFs_with_ENST["ENST"].isna()]

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID,ENST
19,19,DLX6,1,175,P56179-1,,MSHSQHSPYLQSYHNSSAAAQTRGDDTDQQKTTVIENGEIRFNGKG...,['P56179'],1,P56179,
25,25,ESRRB,1,433,O95718-3,,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,"['O95718', 'O95718-1']",2,O95718,
35,35,HOXA1,1,335,P49639-1,,MDNARMNSFLEYPILSSGDSGTCSARAYPSDHRITTFQSCAVSANS...,['P49639'],1,P49639,
102,102,TET3,1,1660,no match,,MDSGPVYHGDSRQLSASGVPVNGAREPAGPSLLGTGGPWRVDQKPD...,[],0,O43151,
116,116,ZNF385B,1,471,Q569K4-1,,MNMANFLRGFEEKGIKNDRPEDQLSKEKKKILFSFCEVCNIQLNSA...,['Q569K4'],1,Q569K4,
123,123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859,


In [126]:
set(SFARI_TFs["uniprotID"]) - set(all_mappings["uniprotID"])

{'O95718-3', 'P49639-1', 'P56179-1', 'Q569K4-1', 'no match'}

In [127]:
all_mappings[all_mappings["uniprotID"].str.contains('O95718')]

Unnamed: 0,uniprotID,ENST
25,O95718-1,ENST00000380887.7
25,O95718-2,ENST00000505752.6


In [128]:
all_mappings[all_mappings["uniprotID"].str.contains('P49639')]

Unnamed: 0,uniprotID,ENST


In [129]:
all_mappings[all_mappings["uniprotID"].str.contains('P56179')]

Unnamed: 0,uniprotID,ENST
19,P56179-3,ENST00000518156.3


In [130]:
# ISsue with these uniprot isoforms not having ENST IDs...
# For now, only O95718 is an AD
# Will just skip that TF
SFARI_TFs_with_ENST.dropna(subset = ["ENST"]).to_csv("../data/SFARI_TFs_with_ENST_nulls_dropped.csv")

In [131]:
SFARI_TFs_with_ENST[SFARI_TFs_with_ENST["uniprotID"].str.contains("O95718")]

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID,ENST
25,25,ESRRB,1,433,O95718-3,,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,"['O95718', 'O95718-1']",2,O95718,


In [None]:
# Future to do: adding the enst code for this manually, keeping track of coordinates

In [132]:
# Downloading the ENST codes corresponding ENSP codes
SFARI_TFs_with_ENST[["ENST"]].to_csv("../data/SFARI_TF_ENST_codes_only.txt", 
                                                 header=None, index=None, sep=' ', mode='a')

In [133]:
SFARI_TFs_with_ENST

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID,ENST
0,0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,['Q9H2P0'],1,Q9H2P0,ENST00000349014.8
1,1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,['Q5TGY3'],1,Q5TGY3,ENST00000247087.10
2,2,ARID2,1,1835,Q68CP9-1,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,['Q68CP9'],1,Q68CP9,ENST00000334344.11
3,3,ARNT2,1,717,Q9HBZ2-1,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,['Q9HBZ2'],1,Q9HBZ2,ENST00000303329.9
4,4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,['Q96QS3'],1,Q96QS3,ENST00000379044.5
...,...,...,...,...,...,...,...,...,...,...,...
122,122,ZNF711,1,761,Q9Y462-1,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,['Q9Y462'],1,Q9Y462,ENST00000276123.7
123,123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859,
124,124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,['Q6NX45'],1,Q6NX45,ENST00000354377.8
125,125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,['Q7Z570'],1,Q7Z570,ENST00000302277.7


In [142]:
ENST_to_ENSP = pd.read_csv("../data/SFARI_TF_ENST_to_ENSP.txt", sep = "\t")
ENST_to_ENSP = ENST_to_ENSP.drop_duplicates(subset = ["Gene stable ID", "Protein stable ID"])
ENST_to_ENSP

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Protein stable ID,Protein stable ID version,UniProtKB isoform ID,UniProtKB Gene Name symbol
0,ENSG00000010818,ENSG00000010818.11,ENST00000012134,ENST00000012134.7,ENSP00000012134,ENSP00000012134.2,,HIVEP2
1,ENSG00000050344,ENSG00000050344.9,ENST00000056233,ENST00000056233.4,ENSP00000056233,ENSP00000056233.3,,NFE2L3
2,ENSG00000070444,ENSG00000070444.15,ENST00000174618,ENST00000174618.5,ENSP00000174618,ENSP00000174618.4,,MNT
3,ENSG00000101883,ENSG00000101883.6,ENST00000217999,ENST00000217999.3,ENSP00000217999,ENSP00000217999.1,,RHOXF1
4,ENSG00000115844,ENSG00000115844.11,ENST00000234198,ENST00000234198.9,ENSP00000234198,ENSP00000234198.4,Q07687-1,DLX2
...,...,...,...,...,...,...,...,...
122,ENSG00000008441,ENSG00000008441.19,ENST00000592199,ENST00000592199.6,ENSP00000467512,ENSP00000467512.1,Q14938-1,NFIX
123,ENSG00000188171,ENSG00000188171.17,ENST00000601440,ENST00000601440.6,ENSP00000469958,ENSP00000469958.1,Q68DY1-1,ZNF626
124,ENSG00000119866,ENSG00000119866.22,ENST00000642384,ENST00000642384.2,ENSP00000496168,ENSP00000496168.1,Q9H165-1,BCL11A
125,ENSG00000152217,ENSG00000152217.20,ENST00000649279,ENST00000649279.2,ENSP00000497406,ENSP00000497406.1,Q9Y6X0-1,SETBP1


In [143]:
ENST_TFs_with_ENSP = pd.merge(SFARI_TFs_with_ENST, ENST_to_ENSP, left_on = "ENST", right_on = "Transcript stable ID version", how = "left")
ENST_TFs_with_ENSP

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID,ENST,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Protein stable ID,Protein stable ID version,UniProtKB isoform ID,UniProtKB Gene Name symbol
0,0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,['Q9H2P0'],1,Q9H2P0,ENST00000349014.8,ENSG00000101126,ENSG00000101126.18,ENST00000349014,ENST00000349014.8,ENSP00000342905,ENSP00000342905.3,,ADNP
1,1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,['Q5TGY3'],1,Q5TGY3,ENST00000247087.10,ENSG00000126705,ENSG00000126705.15,ENST00000247087,ENST00000247087.10,ENSP00000247087,ENSP00000247087.4,,AHDC1
2,2,ARID2,1,1835,Q68CP9-1,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,['Q68CP9'],1,Q68CP9,ENST00000334344.11,ENSG00000189079,ENSG00000189079.18,ENST00000334344,ENST00000334344.11,ENSP00000335044,ENSP00000335044.6,Q68CP9-1,ARID2
3,3,ARNT2,1,717,Q9HBZ2-1,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,['Q9HBZ2'],1,Q9HBZ2,ENST00000303329.9,ENSG00000172379,ENSG00000172379.22,ENST00000303329,ENST00000303329.9,ENSP00000307479,ENSP00000307479.4,Q9HBZ2-1,ARNT2
4,4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,['Q96QS3'],1,Q96QS3,ENST00000379044.5,ENSG00000004848,ENSG00000004848.8,ENST00000379044,ENST00000379044.5,ENSP00000368332,ENSP00000368332.4,,ARX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,122,ZNF711,1,761,Q9Y462-1,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,['Q9Y462'],1,Q9Y462,ENST00000276123.7,ENSG00000147180,ENSG00000147180.17,ENST00000276123,ENST00000276123.7,ENSP00000276123,ENSP00000276123.3,Q9Y462-1,ZNF711
123,123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859,,,,,,,,,
124,124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,['Q6NX45'],1,Q6NX45,ENST00000354377.8,ENSG00000196391,ENSG00000196391.11,ENST00000354377,ENST00000354377.8,ENSP00000346348,ENSP00000346348.3,,ZNF774
125,125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,['Q7Z570'],1,Q7Z570,ENST00000302277.7,ENSG00000170396,ENSG00000170396.9,ENST00000302277,ENST00000302277.7,ENSP00000303252,ENSP00000303252.6,,ZNF804A


In [144]:
ENST_TFs_with_ENSP[ENST_TFs_with_ENSP["Protein stable ID"].isna()]

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID,ENST,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Protein stable ID,Protein stable ID version,UniProtKB isoform ID,UniProtKB Gene Name symbol
19,19,DLX6,1,175,P56179-1,,MSHSQHSPYLQSYHNSSAAAQTRGDDTDQQKTTVIENGEIRFNGKG...,['P56179'],1,P56179,,,,,,,,,
25,25,ESRRB,1,433,O95718-3,,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,"['O95718', 'O95718-1']",2,O95718,,,,,,,,,
35,35,HOXA1,1,335,P49639-1,,MDNARMNSFLEYPILSSGDSGTCSARAYPSDHRITTFQSCAVSANS...,['P49639'],1,P49639,,,,,,,,,
102,102,TET3,1,1660,no match,,MDSGPVYHGDSRQLSASGVPVNGAREPAGPSLLGTGGPWRVDQKPD...,[],0,O43151,,,,,,,,,
116,116,ZNF385B,1,471,Q569K4-1,,MNMANFLRGFEEKGIKNDRPEDQLSKEKKKILFSFCEVCNIQLNSA...,['Q569K4'],1,Q569K4,,,,,,,,,
123,123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859,,,,,,,,,


In [145]:
ENST_TFs_with_ENSP = ENST_TFs_with_ENSP.rename(columns = {"Protein stable ID":"ENSP"})

In [146]:
columns_to_keep = SFARI_TFs_with_ENST.columns
columns_to_keep = list(columns_to_keep)
columns_to_keep.append("ENSP")
columns_to_keep

['Unnamed: 0',
 'GeneName',
 'Start',
 'End',
 'uniprotID',
 'Reference',
 'Sequence',
 'matching_isoforms',
 'isoforms',
 'orig_uniprotID',
 'ENST',
 'ENSP']

In [147]:
ENST_TFs_with_ENSP = ENST_TFs_with_ENSP[columns_to_keep]
ENST_TFs_with_ENSP

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID,ENST,ENSP
0,0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,['Q9H2P0'],1,Q9H2P0,ENST00000349014.8,ENSP00000342905
1,1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,['Q5TGY3'],1,Q5TGY3,ENST00000247087.10,ENSP00000247087
2,2,ARID2,1,1835,Q68CP9-1,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,['Q68CP9'],1,Q68CP9,ENST00000334344.11,ENSP00000335044
3,3,ARNT2,1,717,Q9HBZ2-1,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,['Q9HBZ2'],1,Q9HBZ2,ENST00000303329.9,ENSP00000307479
4,4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,['Q96QS3'],1,Q96QS3,ENST00000379044.5,ENSP00000368332
...,...,...,...,...,...,...,...,...,...,...,...,...
122,122,ZNF711,1,761,Q9Y462-1,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,['Q9Y462'],1,Q9Y462,ENST00000276123.7,ENSP00000276123
123,123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859,,
124,124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,['Q6NX45'],1,Q6NX45,ENST00000354377.8,ENSP00000346348
125,125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,['Q7Z570'],1,Q7Z570,ENST00000302277.7,ENSP00000303252


In [148]:
ENST_TFs_with_ENSP.dropna(subset = ["ENSP"]).to_csv("../data/SFARI_TFs_with_ENSP_nulls_dropped.csv")

In [16]:
isoforms = isoforms.rename(columns = {"From" : "uniprotID"})
isoforms = isoforms[["uniprotID", "Ensembl"]]
isoforms

Unnamed: 0,uniprotID,Ensembl
2,Q68CP9,ENST00000334344.11 [Q68CP9-1]
3,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1]
3,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2]
3,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2]
5,P10275,ENST00000374690.9 [P10275-1]
...,...,...
122,Q9Y462,ENST00000373165.7 [Q9Y462-1]
122,Q9Y462,ENST00000674551.1 [Q9Y462-3]
126,Q17R98,ENST00000379448.9 [Q17R98-2]
126,Q17R98,ENST00000508784.6 [Q17R98-1]


In [17]:
isoforms["isoform_uniprotID"] = isoforms["Ensembl"].str.split("[").str[1].str.split("]").str[0]
isoforms

Unnamed: 0,uniprotID,Ensembl,isoform_uniprotID
2,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1
3,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1
3,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-2
3,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2],Q9HBZ2-2
5,P10275,ENST00000374690.9 [P10275-1],P10275-1
...,...,...,...
122,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1
122,Q9Y462,ENST00000674551.1 [Q9Y462-3],Q9Y462-3
126,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-2
126,Q17R98,ENST00000508784.6 [Q17R98-1],Q17R98-1


In [85]:
isoforms[isoforms["uniprot"]]

KeyError: 'uniprot'

Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,ENST00000349014.8;ENST00000371602.9;ENST000003...
1,Q5TGY3,Q5TGY3,ENST00000247087.10;ENST00000374011.6;ENST00000...
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1];
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1];ENST00000527771.5...
4,Q96QS3,Q96QS3,ENST00000379044.5;
...,...,...,...
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1];ENST00000360700.4...
123,Q8N859,Q8N859,ENST00000429591.4;
124,Q6NX45,Q6NX45,ENST00000354377.8;
125,Q7Z570,Q7Z570,ENST00000302277.7;


In [None]:
isoform_seqs = AD_predictor_tools.makeFullLengthProteinDF("../data/SFARI_TF_isoforms.fasta")
isoform_seqs

In [None]:
isoform_seqs["isoform_uniprotID"] = isoform_seqs["GeneName"].str.split("|").str[1]
isoform_seqs

In [None]:
isoform_seqs = isoform_seqs[["isoform_uniprotID", "AAseq"]]
isoform_seqs

In [None]:
# Using uniprot to get the isoform uniprot ID of the canonical sequences
isoform_uniprotIDs = pd.read_csv("../data/SFARI_TF_isoform_uniprotIDs.tsv", sep = "\t")
isoform_uniprotIDs

In [None]:
isoform_uniprotIDs[isoform_uniprotIDs["From"] == "Q9H334"]["Alternative products (isoforms)"].iloc[0]

In [None]:
pat1 = r'IsoId=(.*?);'
isoform_uniprotIDs["canonical_uniprotID"] = isoform_uniprotIDs["Alternative products (isoforms)"].str.extract(pat1)
isoform_uniprotIDs

In [None]:
isoform_uniprotIDs = isoform_uniprotIDs.rename(columns = {"From" : "uniprotID"})
isoform_uniprotIDs

In [None]:
canonical_IDs = dict(zip(isoform_uniprotIDs["uniprotID"], isoform_uniprotIDs["canonical_uniprotID"]))
canonical_IDs

In [None]:
for uniprotID in canonical_IDs.keys():
    pat = "^" + uniprotID + "$"
    repl = canonical_IDs[uniprotID]
    isoform_seqs["isoform_uniprotID"] = isoform_seqs["isoform_uniprotID"].str.replace(pat, repl)

In [None]:
isoform_seqs = isoform_seqs[isoform_seqs["isoform_uniprotID"].isin(isoforms["isoform_uniprotID"])]
isoform_seqs

In [None]:
isoforms

In [None]:
isoforms = pd.merge(isoforms, isoform_seqs, on = "isoform_uniprotID", how = "left")
isoforms

In [None]:
SFARI_tfs

In [None]:
isoforms_TFs_seq_merged = pd.merge(isoforms, SFARI_tfs, 
                                   left_on = "AAseq", right_on = "ProteinSeq", how = "left")
isoforms_TFs_seq_merged

In [None]:
isoforms_TFs_seq_merged = isoforms_TFs_seq_merged[~isoforms_TFs_seq_merged["ProteinSeq"].isna()]
isoforms_TFs_seq_merged

In [None]:
# Arbitrarily choose the first of each isoform_uniprotID
isoforms_with_seqs = isoforms_TFs_seq_merged.groupby("isoform_uniprotID").agg(lambda sf: sf.iloc[0])
isoforms_with_seqs = isoforms_with_seqs.reset_index()
isoforms_with_seqs

In [None]:
multiple_isoforms

In [None]:
set(multiple_isoforms["From"]) - set(isoforms_with_seqs["uniprotID_x"])

In [None]:
lambert = pd.read_csv("../data/LambertTFs.csv")
lambert

In [None]:
for uniprotID in set(multiple_isoforms["From"]) - set(isoforms_with_seqs["uniprotID_x"]):
    print("Unmatched uniprotID: " + uniprotID)
    print("TF length:")
    print("\t" + str(len(SFARI_tfs[SFARI_tfs["uniprotID"] == uniprotID]["ProteinSeq"].iloc[0])))
    print("Isoform length(s):")
    for AAseq in isoform_seqs[isoform_seqs["isoform_uniprotID"].str.contains(uniprotID)]["AAseq"]:
        print("\t" + str(len(AAseq)))
    # print(len(lambert[lambert["GeneName"].str.contains(uniprotID)]["ProteinSeq"].iloc[0]))
    print("---")

The problem for the four above is that the correct isoform does not have a corresponding ENST code- will have to proceed without these four for now.

In [None]:
isoforms_with_seqs

In [None]:
isoforms_cleaned = isoforms_with_seqs[["uniprotID_x", "Ensembl"]]
isoforms_cleaned["Ensembl"] = isoforms_cleaned["Ensembl"].str.split(" ").str[0]
isoforms_cleaned = isoforms_cleaned.rename(columns = {"uniprotID_x" : "uniprotID",
                                                     "Ensembl" : "ENST"})
isoforms_cleaned

In [None]:
non_isoforms_cleaned = non_isoforms.reset_index(drop = True)
non_isoforms_cleaned

In [None]:
uniprotID_ENST_mapping_df = pd.concat([isoforms_cleaned, non_isoforms_cleaned])
uniprotID_ENST_mapping_df

In [None]:
set(SFARI_tfs["uniprotID"]) - set(uniprotID_ENST_mapping_df["uniprotID"])
# P49639 has no ensembl transcript code
# The other four do not have an ensembl transcript codes that corresponds to the correct isoform

In [None]:
# uniprotID_ENST_mapping_df.to_csv("../data/SFARI_tf_ENST_codes.csv")

In [None]:
SFARI_TFs_with_ENST = pd.merge(SFARI_tfs, uniprotID_ENST_mapping_df, on = "uniprotID")
SFARI_TFs_with_ENST

In [None]:
# SFARI_TFs_with_ENST.to_csv("../data/SFARI_TFs_with_ENST.csv")