In [1]:
import pandas as pd
import protfasta
import re

Plan:

1. Check SFARI_TFs_with_ENST.csv
2. Rebuild input table with ERG's correct ENST
3. Rerun input sequences (only ERG's should change)
   

---
1. Check SFARI_TFs_with_ENST.csv

In [2]:
curr_enst_mapping = pd.read_csv("../data/SFARI_TFs_with_ENST.csv")
curr_enst_mapping

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
0,0,0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,Q9H2P0,ENST00000349014.8
1,1,1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,Q5TGY3,ENST00000247087.10
2,2,2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9,ENST00000334344.11
3,3,3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2,ENST00000303329.9
4,4,4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3,ENST00000379044.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,117,122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462,ENST00000276123.7
118,118,123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,Q8N859,ENST00000429591.4
119,119,124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,Q6NX45,ENST00000354377.8
120,120,125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,Q7Z570,ENST00000302277.7


---
Steps: 

1A. Get list of SFARI TFs, using ENSG and new uniprot sequences
<br>1B. match TF sequences with all isoforms to get isoform ID
<br>1C. uniprot isoform id to ENST

In [3]:
# 1A. Get list of SFARI TFs, using ENSG and new uniprot sequences

In [4]:
# Using code from "Redoing datasets w/ found non-canonical ADs. ipynb"

# Redownloading lambert TFs from uniprot.
SFARI_Gene = pd.read_csv("../data/SFARI-Gene_genes_01-23-2023release_03-21-2023export.csv")

# First, downloaded tables s1-s4 from lambert review
lambert_table_s1 = pd.read_excel(open('../data/lambert_supp_tables.xlsx', 'rb'),
                               sheet_name='Table S1. Related to Figure 1B')

# Keeping TF rows
lambert_table_s1_TF_rows = lambert_table_s1[lambert_table_s1["Is TF?"] == "Yes"]
lambert_TF_ensg_codes = lambert_table_s1_TF_rows["Gene Information"]

# Rows of SFARI Gene with ENSG codes in lambert TFs
SFARI_TFs = SFARI_Gene[SFARI_Gene["ensembl-id"].isin(lambert_TF_ensg_codes)]

# Rows of SFARI Gene with ENSG codes in lambert TFs
SFARI_TFs = SFARI_Gene[SFARI_Gene["ensembl-id"].isin(lambert_TF_ensg_codes)]
SFARI_TFs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
16,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64
31,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24
60,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12
61,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15
62,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24
...,...,...,...,...,...,...,...,...,...,...
1111,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5
1112,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3
1113,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4
1114,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16


In [5]:
# Using ENSG to get (most recent) sequences from uniprot
output = protfasta.read_fasta("../data/SFARI_TFs_recent.fasta")
keys = output.keys()
vals = output.values()
SFARI_TFs_seqs = pd.DataFrame({"id": keys, "ProteinSeq": vals})
SFARI_TFs_seqs["GeneName"] = SFARI_TFs_seqs["id"].str.extract(r'GN=(.*) PE')
SFARI_TFs_seqs

Unnamed: 0,id,ProteinSeq,GeneName
0,sp|O00712|NFIB_HUMAN Nuclear factor 1 B-type O...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,NFIB
1,sp|O14529|CUX2_HUMAN Homeobox protein cut-like...,MAANVGSMFQYWKRFDLRRLQKELNSVASELSARQEESEHSHKHLI...,CUX2
2,sp|O14770|MEIS2_HUMAN Homeobox protein Meis2 O...,MAQRYDELPHYGGMDGVGVPASMYGDPHAPRPIPPVHHLNHGPPLH...,MEIS2
3,sp|O15266|SHOX_HUMAN Short stature homeobox pr...,MEELTAFVSKSFDQKSKDGNGGGGGGGGKKDSITYREVLESGLARS...,SHOX
4,sp|O15409|FOXP2_HUMAN Forkhead box protein P2 ...,MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVST...,FOXP2
...,...,...,...
121,sp|Q9Y2K7|KDM2A_HUMAN Lysine-specific demethyl...,MEPEEERIRYSQRLRGTMRRRYEDDGISDDEIEGKRTFDLEEKLHT...,KDM2A
122,sp|Q9Y458|TBX22_HUMAN T-box transcription fact...,MALSSRARAFSVEALVGRPSKRKLQDPIQAEQPELREKKGGEEEEE...,TBX22
123,sp|Q9Y462|ZN711_HUMAN Zinc finger protein 711 ...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,ZNF711
124,sp|Q9Y4A8|NF2L3_HUMAN Nuclear factor erythroid...,MKHLKRWWSAGGGLLHLTLLLSLAGLRVDLDLYLLLPPPTLLQDEL...,NFE2L3


In [6]:
SFARI_TFs = pd.merge(SFARI_TFs, SFARI_TFs_seqs, left_on = "gene-symbol", right_on = "GeneName", how = "left")
SFARI_TFs

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,id,ProteinSeq,GeneName
0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN Activity-dependent neurop...,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,ADNP
1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN Transcription factor Gib...,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,AHDC1
2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN AT-rich interactive doma...,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,ARID2
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN Aryl hydrocarbon recepto...,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,ARNT2
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN Homeobox protein ARX OS=Ho...,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,ARX
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN Zinc finger protein 711 ...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,ZNF711
123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN Zinc finger protein 713 ...,MPSQNAVFSQEGNMEEEEMNDGSQMVRSQESLTFQDVAVDFTREEW...,ZNF713
124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN Zinc finger protein 774 ...,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,ZNF774
125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN Zinc finger protein 804A...,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,ZNF804A


In [7]:
SFARI_TFs[SFARI_TFs["ProteinSeq"].isna()]
#HOXA1 has no verified uniprot entry

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,id,ProteinSeq,GeneName
35,9,HOXA1,homeobox A1,ENSG00000105991,7,"Rare Single Gene Mutation, Syndromic, Genetic ...",,1,,16,,,


1B. match TF sequences with all isoforms to get isoform ID

In [8]:
# Importing the sequences of all isoforms of the uniprotIDs from uniprot
TF_isoform_seqs = protfasta.read_fasta("../data/all_lambert_TF_isoforms.txt")
TF_isoform_seqs_df = pd.DataFrame({"id" : TF_isoform_seqs.keys(),
                                  "seq" : TF_isoform_seqs.values()})
TF_isoform_seqs_df["uniprotID"] = TF_isoform_seqs_df["id"].str.split("|").str[1]

TF_isoform_seqs_df

Unnamed: 0,id,seq,uniprotID
0,sp|A0AVK6|E2F8_HUMAN Transcription factor E2F8...,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,A0AVK6
1,sp|A0PJY2|FEZF1_HUMAN Fez family zinc finger p...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,A0PJY2
2,sp|A0PJY2-2|FEZF1_HUMAN Isoform 2 of Fez famil...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,A0PJY2-2
3,sp|A0PJY2-3|FEZF1_HUMAN Isoform 3 of Fez famil...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,A0PJY2-3
4,sp|A1A519|F170A_HUMAN Protein FAM170A OS=Homo ...,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,A1A519
...,...,...,...
3526,sp|Q9Y6X8|ZHX2_HUMAN Zinc fingers and homeobox...,MASKRKSTTPCMVRTSQVVEQDVPEEVDRAKEKGIGTPQPDVAKDS...,Q9Y6X8
3527,sp|Q9Y6Y1|CMTA1_HUMAN Calmodulin-binding trans...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1
3528,sp|Q9Y6Y1-2|CMTA1_HUMAN Isoform 2 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-2
3529,sp|Q9Y6Y1-3|CMTA1_HUMAN Isoform 3 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-3


In [9]:
SFARI_TFs_with_uniprotID = pd.merge(SFARI_TFs, TF_isoform_seqs_df, left_on = "ProteinSeq", right_on = "seq", how = "left")
SFARI_TFs_with_uniprotID

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,id_x,ProteinSeq,GeneName,id_y,seq,uniprotID
0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN Activity-dependent neurop...,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,ADNP,sp|Q9H2P0|ADNP_HUMAN Activity-dependent neurop...,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,Q9H2P0
1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN Transcription factor Gib...,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,AHDC1,sp|Q5TGY3|AHDC1_HUMAN Transcription factor Gib...,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,Q5TGY3
2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN AT-rich interactive doma...,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,ARID2,sp|Q68CP9|ARID2_HUMAN AT-rich interactive doma...,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9
3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN Aryl hydrocarbon recepto...,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,ARNT2,sp|Q9HBZ2|ARNT2_HUMAN Aryl hydrocarbon recepto...,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2
4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN Homeobox protein ARX OS=Ho...,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,ARX,sp|Q96QS3|ARX_HUMAN Homeobox protein ARX OS=Ho...,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN Zinc finger protein 711 ...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,ZNF711,sp|Q9Y462|ZN711_HUMAN Zinc finger protein 711 ...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN Zinc finger protein 713 ...,MPSQNAVFSQEGNMEEEEMNDGSQMVRSQESLTFQDVAVDFTREEW...,ZNF713,sp|Q8N859|ZN713_HUMAN Zinc finger protein 713 ...,MPSQNAVFSQEGNMEEEEMNDGSQMVRSQESLTFQDVAVDFTREEW...,Q8N859
124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN Zinc finger protein 774 ...,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,ZNF774,sp|Q6NX45|ZN774_HUMAN Zinc finger protein 774 ...,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,Q6NX45
125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN Zinc finger protein 804A...,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,ZNF804A,sp|Q7Z570|Z804A_HUMAN Zinc finger protein 804A...,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,Q7Z570


In [10]:
# Seems like everything matched to same entry
# Everything has canonical uniprotID
SFARI_TFs_with_uniprotID[SFARI_TFs_with_uniprotID["uniprotID"].isna()]

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,id_x,ProteinSeq,GeneName,id_y,seq,uniprotID
18,9,DLX3,distal-less homeobox 3,ENSG00000064195,17,Rare Single Gene Mutation,2.0,0,,4,sp|O60479|DLX3_HUMAN Homeobox protein DLX-3 OS...,MSGSFDRKLSSILTDISSSLSCHAGSKDSPTLPESSVTDLGYYSAP...,DLX3,,,
35,9,HOXA1,homeobox A1,ENSG00000105991,7,"Rare Single Gene Mutation, Syndromic, Genetic ...",,1,,16,,,,,,


In [11]:
SFARI_TFs_with_uniprotID[SFARI_TFs_with_uniprotID["id_x"] != SFARI_TFs_with_uniprotID["id_y"]]

Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,id_x,ProteinSeq,GeneName,id_y,seq,uniprotID
18,9,DLX3,distal-less homeobox 3,ENSG00000064195,17,Rare Single Gene Mutation,2.0,0,,4,sp|O60479|DLX3_HUMAN Homeobox protein DLX-3 OS...,MSGSFDRKLSSILTDISSSLSCHAGSKDSPTLPESSVTDLGYYSAP...,DLX3,,,
35,9,HOXA1,homeobox A1,ENSG00000105991,7,"Rare Single Gene Mutation, Syndromic, Genetic ...",,1,,16,,,,,,


1C. uniprot isoform id to ENST

In [12]:
# Loading in the table from uniprot 
SFARI_tf_ensembl_matches = pd.read_csv("../data/SFARI_tf_enst_codes.tsv", sep = "\t")
SFARI_tf_ensembl_matches

Unnamed: 0,From,Entry,Ensembl
0,Q9H2P0,Q9H2P0,ENST00000349014.8;ENST00000371602.9;ENST000003...
1,Q5TGY3,Q5TGY3,ENST00000247087.10;ENST00000374011.6;ENST00000...
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1];
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1];ENST00000527771.5...
4,Q96QS3,Q96QS3,ENST00000379044.5;
...,...,...,...
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1];ENST00000360700.4...
123,Q8N859,Q8N859,ENST00000429591.4;
124,Q6NX45,Q6NX45,ENST00000354377.8;
125,Q7Z570,Q7Z570,ENST00000302277.7;


In [13]:
# Looking at uniprotIDs that have multiple isoforms
multiple_isoforms = SFARI_tf_ensembl_matches[SFARI_tf_ensembl_matches["Ensembl"].str.contains("\[") == True]
multiple_isoforms

Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1];
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1];ENST00000527771.5...
5,P10275,P10275,ENST00000374690.9 [P10275-1];ENST00000504326.5...
6,Q9NR48,Q9NR48,ENST00000368346.7 [Q9NR48-1];ENST00000392403.8...
7,Q9UIF8,Q9UIF8,ENST00000392782.5 [Q9UIF8-5];ENST00000392783.7...
...,...,...,...
119,Q8NEK5,Q8NEK5,ENST00000336128.12 [Q8NEK5-2];ENST00000366197....
120,Q9BR84,Q9BR84,ENST00000317221.11 [Q9BR84-2];ENST00000393883....
121,Q68DY1,Q68DY1,ENST00000291750.6 [Q68DY1-3];ENST00000601440.6...
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1];ENST00000360700.4...


In [14]:
multiple_isoforms[multiple_isoforms["From"] == "P11308"]["Ensembl"].iloc[0]

'ENST00000288319.12 [P11308-4];ENST00000398897.5 [P11308-2];ENST00000398911.5 [P11308-1];ENST00000398919.6 [P11308-3];ENST00000417133.6 [P11308-3];ENST00000442448.5 [P11308-1];'

In [15]:
curr_enst_mapping[curr_enst_mapping["gene-symbol"] == "ERG"]
# Should be ENST00000288319.12

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
29,29,31,9,ERG,"ERG, ETS transcription factor",ENSG00000157554,21,Genetic Association,2.0,0,,1,sp|P11308|ERG_HUMAN,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...,P11308,ENST00000398919.6


In [16]:
# Looking at uniprotIDs that do not have multiple isoforms
non_isoforms = SFARI_tf_ensembl_matches[SFARI_tf_ensembl_matches["Ensembl"].str.contains("\[") == False]
non_isoforms['Ensembl'] = non_isoforms['Ensembl'].str.split(";").str[:-1]

# Arbitrarily choosing the first transcript to keep 
# since all of the transcripts correspond to the same protein
non_isoforms['ENST'] = non_isoforms['Ensembl'].str[0]

# Formatting
non_isoforms = non_isoforms.rename(columns = {"From" : "uniprotID"})
non_isoforms = non_isoforms[["uniprotID", "ENST"]]
non_isoforms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_isoforms['Ensembl'] = non_isoforms['Ensembl'].str.split(";").str[:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_isoforms['ENST'] = non_isoforms['Ensembl'].str[0]


Unnamed: 0,uniprotID,ENST
0,Q9H2P0,ENST00000349014.8
1,Q5TGY3,ENST00000247087.10
4,Q96QS3,ENST00000379044.5
12,Q96JM3,ENST00000361283.4
13,Q96RK0,ENST00000575354.6
17,O14529,ENST00000261726.11
23,P19622,ENST00000297375.4
27,P55316,ENST00000313071.7
30,Q8NBF1,ENST00000312233.4
33,P31629,ENST00000012134.7


In [17]:
# Now, need to figure out which isoform corresponds to the TF
multiple_isoforms["Ensembl"] = multiple_isoforms["Ensembl"].str.split(";").str[:-1]
multiple_isoforms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_isoforms["Ensembl"] = multiple_isoforms["Ensembl"].str.split(";").str[:-1]


Unnamed: 0,From,Entry,Ensembl
2,Q68CP9,Q68CP9,[ENST00000334344.11 [Q68CP9-1]]
3,Q9HBZ2,Q9HBZ2,"[ENST00000303329.9 [Q9HBZ2-1], ENST00000527771..."
5,P10275,P10275,"[ENST00000374690.9 [P10275-1], ENST00000504326..."
6,Q9NR48,Q9NR48,"[ENST00000368346.7 [Q9NR48-1], ENST00000392403..."
7,Q9UIF8,Q9UIF8,"[ENST00000392782.5 [Q9UIF8-5], ENST00000392783..."
...,...,...,...
119,Q8NEK5,Q8NEK5,"[ENST00000336128.12 [Q8NEK5-2], ENST0000036619..."
120,Q9BR84,Q9BR84,"[ENST00000317221.11 [Q9BR84-2], ENST0000039388..."
121,Q68DY1,Q68DY1,"[ENST00000291750.6 [Q68DY1-3], ENST00000601440..."
122,Q9Y462,Q9Y462,"[ENST00000276123.7 [Q9Y462-1], ENST00000360700..."


In [18]:
import requests

In [19]:
def get_canonical_uniprot_id(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        regex_matches = re.findall(r"'isoformIds': \['(.*?)'\]", str(data))
        return(regex_matches[0])
    else:
        return f"Error: {response.status_code}"

canonical_ids = []
for uniprotID in multiple_isoforms["From"]:
    canonical_ids.append(get_canonical_uniprot_id(uniprotID))
multiple_isoforms["canonical_uniprot_id"] = canonical_ids
multiple_isoforms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_isoforms["canonical_uniprot_id"] = canonical_ids


Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id
2,Q68CP9,Q68CP9,[ENST00000334344.11 [Q68CP9-1]],Q68CP9-1
3,Q9HBZ2,Q9HBZ2,"[ENST00000303329.9 [Q9HBZ2-1], ENST00000527771...",Q9HBZ2-1
5,P10275,P10275,"[ENST00000374690.9 [P10275-1], ENST00000504326...",P10275-1
6,Q9NR48,Q9NR48,"[ENST00000368346.7 [Q9NR48-1], ENST00000392403...",Q9NR48-1
7,Q9UIF8,Q9UIF8,"[ENST00000392782.5 [Q9UIF8-5], ENST00000392783...",Q9UIF8-1
...,...,...,...,...
119,Q8NEK5,Q8NEK5,"[ENST00000336128.12 [Q8NEK5-2], ENST0000036619...",Q8NEK5-1
120,Q9BR84,Q9BR84,"[ENST00000317221.11 [Q9BR84-2], ENST0000039388...",Q9BR84-1
121,Q68DY1,Q68DY1,"[ENST00000291750.6 [Q68DY1-3], ENST00000601440...",Q68DY1-1
122,Q9Y462,Q9Y462,"[ENST00000276123.7 [Q9Y462-1], ENST00000360700...",Q9Y462-1


In [20]:
multiple_isoforms["num"] = multiple_isoforms["canonical_uniprot_id"].str.split("-").str[1]
multiple_isoforms["num"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_isoforms["num"] = multiple_isoforms["canonical_uniprot_id"].str.split("-").str[1]


num
1    83
3     4
2     2
4     1
Name: count, dtype: int64

In [21]:
multiple_isoforms[multiple_isoforms["num"] == "3"]

Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,num
25,O95718,O95718,"[ENST00000380887.7 [O95718-1], ENST00000505752...",O95718-3,3
74,Q9ULL5,Q9ULL5,[ENST00000418929.7 [Q9ULL5-3]],Q9ULL5-3,3
77,Q2KHR2,Q2KHR2,"[ENST00000559447.8 [Q2KHR2-3], ENST00000673948...",Q2KHR2-3,3
100,Q15554,Q15554,"[ENST00000254942.8 [Q15554-3], ENST00000567296...",Q15554-3,3


In [22]:
multiple_isoforms = multiple_isoforms.drop(columns = "num")
multiple_isoforms

Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id
2,Q68CP9,Q68CP9,[ENST00000334344.11 [Q68CP9-1]],Q68CP9-1
3,Q9HBZ2,Q9HBZ2,"[ENST00000303329.9 [Q9HBZ2-1], ENST00000527771...",Q9HBZ2-1
5,P10275,P10275,"[ENST00000374690.9 [P10275-1], ENST00000504326...",P10275-1
6,Q9NR48,Q9NR48,"[ENST00000368346.7 [Q9NR48-1], ENST00000392403...",Q9NR48-1
7,Q9UIF8,Q9UIF8,"[ENST00000392782.5 [Q9UIF8-5], ENST00000392783...",Q9UIF8-1
...,...,...,...,...
119,Q8NEK5,Q8NEK5,"[ENST00000336128.12 [Q8NEK5-2], ENST0000036619...",Q8NEK5-1
120,Q9BR84,Q9BR84,"[ENST00000317221.11 [Q9BR84-2], ENST0000039388...",Q9BR84-1
121,Q68DY1,Q68DY1,"[ENST00000291750.6 [Q68DY1-3], ENST00000601440...",Q68DY1-1
122,Q9Y462,Q9Y462,"[ENST00000276123.7 [Q9Y462-1], ENST00000360700...",Q9Y462-1


In [23]:
multiple_isoforms_expanded = multiple_isoforms.explode("Ensembl")
multiple_isoforms_expanded

Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1
3,Q9HBZ2,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-1
3,Q9HBZ2,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2],Q9HBZ2-1
5,P10275,P10275,ENST00000374690.9 [P10275-1],P10275-1
...,...,...,...,...
122,Q9Y462,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1
122,Q9Y462,Q9Y462,ENST00000674551.1 [Q9Y462-3],Q9Y462-1
126,Q17R98,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-1
126,Q17R98,Q17R98,ENST00000508784.6 [Q17R98-1],Q17R98-1


In [24]:
multiple_isoforms_expanded["Ensembl_uniprot"] = multiple_isoforms_expanded["Ensembl"].str.split("[").str[1].str.split("]").str[0]
multiple_isoforms_expanded

Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1,Q68CP9-1
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1,Q9HBZ2-1
3,Q9HBZ2,Q9HBZ2,ENST00000527771.5 [Q9HBZ2-2],Q9HBZ2-1,Q9HBZ2-2
3,Q9HBZ2,Q9HBZ2,ENST00000533983.5 [Q9HBZ2-2],Q9HBZ2-1,Q9HBZ2-2
5,P10275,P10275,ENST00000374690.9 [P10275-1],P10275-1,P10275-1
...,...,...,...,...,...
122,Q9Y462,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1,Q9Y462-1
122,Q9Y462,Q9Y462,ENST00000674551.1 [Q9Y462-3],Q9Y462-1,Q9Y462-3
126,Q17R98,Q17R98,ENST00000379448.9 [Q17R98-2],Q17R98-1,Q17R98-2
126,Q17R98,Q17R98,ENST00000508784.6 [Q17R98-1],Q17R98-1,Q17R98-1


In [25]:
multiple_isoforms_matched = multiple_isoforms_expanded[multiple_isoforms_expanded["canonical_uniprot_id"] == multiple_isoforms_expanded["Ensembl_uniprot"]]
multiple_isoforms_matched["ENST"] = multiple_isoforms_matched["Ensembl"].str.split(" ").str[0]
multiple_isoforms_matched

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_isoforms_matched["ENST"] = multiple_isoforms_matched["Ensembl"].str.split(" ").str[0]


Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot,ENST
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1,Q68CP9-1,ENST00000334344.11
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1,Q9HBZ2-1,ENST00000303329.9
5,P10275,P10275,ENST00000374690.9 [P10275-1],P10275-1,P10275-1,ENST00000374690.9
5,P10275,P10275,ENST00000612452.5 [P10275-1],P10275-1,P10275-1,ENST00000612452.5
6,Q9NR48,Q9NR48,ENST00000368346.7 [Q9NR48-1],Q9NR48-1,Q9NR48-1,ENST00000368346.7
...,...,...,...,...,...,...
120,Q9BR84,Q9BR84,ENST00000603380.6 [Q9BR84-1],Q9BR84-1,Q9BR84-1,ENST00000603380.6
121,Q68DY1,Q68DY1,ENST00000601440.6 [Q68DY1-1],Q68DY1-1,Q68DY1-1,ENST00000601440.6
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1],Q9Y462-1,Q9Y462-1,ENST00000276123.7
122,Q9Y462,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1,Q9Y462-1,ENST00000373165.7


In [26]:
multiple_isoforms_matched[multiple_isoforms_matched["From"] == "P11308"]

Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot,ENST
31,P11308,P11308,ENST00000288319.12 [P11308-4],P11308-4,P11308-4,ENST00000288319.12


In [27]:
# Now, check whether current ENST is a match in this table. If not, flag.

for uniprotID, curr_ENST in zip(curr_enst_mapping["uniprotID"], curr_enst_mapping["ENST"]):
    if uniprotID in set(multiple_isoforms_matched["From"]):
        # print(uniprotID)
        # print(curr_ENST)
        
        new_mapping_rows = multiple_isoforms_matched[multiple_isoforms_matched["From"] == uniprotID]
        #display(new_mapping_rows)
        
        if curr_ENST not in set(new_mapping_rows["ENST"]):
            #print(curr_ENST)
            #print(new_mapping_rows["ENST"])
            print(uniprotID + " WRONG")
            display(new_mapping_rows)
            display(curr_enst_mapping[curr_enst_mapping["uniprotID"] == uniprotID])
            
        else:
            print(uniprotID + " ok")

Q68CP9 ok
Q9HBZ2 ok
P10275 ok
Q9NR48 ok
Q9UIF8 ok
Q9H165 ok
Q86V15 ok
Q6P1N0 ok
O94983 ok
P49711 ok
P39880 ok
Q13948 ok
O75398 ok
Q07687 ok
Q9H4W6 ok
Q06889 ok
Q92731 ok
Q8TBJ5 ok
Q9H334 ok
O15409 ok
P11308 WRONG


Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot,ENST
31,P11308,P11308,ENST00000288319.12 [P11308-4],P11308-4,P11308-4,ENST00000288319.12


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
29,29,31,9,ERG,"ERG, ETS transcription factor",ENSG00000157554,21,Genetic Association,2.0,0,,1,sp|P11308|ERG_HUMAN,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...,P11308,ENST00000398919.6


P78347 ok
Q5T1R4 ok
Q13422 ok
Q9Y2K7 ok
Q9UGL1 ok
O75840 ok
Q03164 ok
Q8NHM5 ok
O60663 ok
Q9UIS9 ok
O95983 ok
O95243 ok
P51608 ok
Q06413 ok
O14770 ok
Q9UL68 ok
Q15788 ok
Q12857 ok
O00712 ok
Q14938 ok
P08235 ok
P43354 ok
Q02548 ok
P26367 ok
P40424 ok
Q96BD5 ok
Q9ULL5 ok
P48380 ok
Q33E94 ok
Q2KHR2 WRONG


Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot,ENST
77,Q2KHR2,Q2KHR2,ENST00000559447.8 [Q2KHR2-3],Q2KHR2-3,Q2KHR2-3,ENST00000559447.8


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
74,74,77,9,RFX7,regulatory factor X7,ENSG00000181827,15,"Rare Single Gene Mutation, Syndromic",3.0,1,,4,sp|Q2KHR2|RFX7_HUMAN,MSSSRAQQMHAFSWIRNTLEEHPETSLPKQEVYDEYKSYCDNLGYH...,Q2KHR2,ENST00000673997.1


P35398 ok
Q92753 ok
Q01826 ok
Q9UPW6 ok
Q9Y6X0 ok
Q15047 ok
Q96T68 ok
O15266 ok
P18583 ok
P35711 ok
P35712 ok
Q6ZRS2 ok
Q16650 ok
O43435 ok
Q9Y458 ok
Q9UGU0 ok
P15884 ok
Q9NQB0 ok
Q15554 ok
Q6N021 ok
P19532 ok
P10827 ok
Q6ZSZ6 ok
Q05516 ok
Q9HC78 ok
P11473 ok
Q9ULJ3 ok
P17022 ok
O60281 ok
Q96JM2 ok
Q8NEK5 ok
Q9BR84 ok
Q68DY1 ok
Q9Y462 ok
Q17R98 ok


In [28]:
multiple_isoforms_matched

Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot,ENST
2,Q68CP9,Q68CP9,ENST00000334344.11 [Q68CP9-1],Q68CP9-1,Q68CP9-1,ENST00000334344.11
3,Q9HBZ2,Q9HBZ2,ENST00000303329.9 [Q9HBZ2-1],Q9HBZ2-1,Q9HBZ2-1,ENST00000303329.9
5,P10275,P10275,ENST00000374690.9 [P10275-1],P10275-1,P10275-1,ENST00000374690.9
5,P10275,P10275,ENST00000612452.5 [P10275-1],P10275-1,P10275-1,ENST00000612452.5
6,Q9NR48,Q9NR48,ENST00000368346.7 [Q9NR48-1],Q9NR48-1,Q9NR48-1,ENST00000368346.7
...,...,...,...,...,...,...
120,Q9BR84,Q9BR84,ENST00000603380.6 [Q9BR84-1],Q9BR84-1,Q9BR84-1,ENST00000603380.6
121,Q68DY1,Q68DY1,ENST00000601440.6 [Q68DY1-1],Q68DY1-1,Q68DY1-1,ENST00000601440.6
122,Q9Y462,Q9Y462,ENST00000276123.7 [Q9Y462-1],Q9Y462-1,Q9Y462-1,ENST00000276123.7
122,Q9Y462,Q9Y462,ENST00000373165.7 [Q9Y462-1],Q9Y462-1,Q9Y462-1,ENST00000373165.7


In [29]:
# Now, need to rebuild mapping table to correct mistakes. keep other ENSTs same as possible.

new_mappings = dict(zip(non_isoforms["uniprotID"], non_isoforms["ENST"]))

for uniprotID, curr_ENST in zip(curr_enst_mapping["uniprotID"], curr_enst_mapping["ENST"]):
    if uniprotID in set(multiple_isoforms_matched["From"]):
        new_mapping_rows = multiple_isoforms_matched[multiple_isoforms_matched["From"] == uniprotID]
        
        if curr_ENST not in set(new_mapping_rows["ENST"]):
            #print(curr_ENST)
            #print(new_mapping_rows["ENST"])
            print(uniprotID + " WRONG")
            display(new_mapping_rows)
            display(curr_enst_mapping[curr_enst_mapping["uniprotID"] == uniprotID])
            new_mappings[uniprotID] = new_mapping_rows["ENST"].iloc[0]
            
        else:
            new_mappings[uniprotID] = curr_ENST
        

P11308 WRONG


Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot,ENST
31,P11308,P11308,ENST00000288319.12 [P11308-4],P11308-4,P11308-4,ENST00000288319.12


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
29,29,31,9,ERG,"ERG, ETS transcription factor",ENSG00000157554,21,Genetic Association,2.0,0,,1,sp|P11308|ERG_HUMAN,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...,P11308,ENST00000398919.6


Q2KHR2 WRONG


Unnamed: 0,From,Entry,Ensembl,canonical_uniprot_id,Ensembl_uniprot,ENST
77,Q2KHR2,Q2KHR2,ENST00000559447.8 [Q2KHR2-3],Q2KHR2-3,Q2KHR2-3,ENST00000559447.8


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID,ENST
74,74,77,9,RFX7,regulatory factor X7,ENSG00000181827,15,"Rare Single Gene Mutation, Syndromic",3.0,1,,4,sp|Q2KHR2|RFX7_HUMAN,MSSSRAQQMHAFSWIRNTLEEHPETSLPKQEVYDEYKSYCDNLGYH...,Q2KHR2,ENST00000673997.1


In [30]:
new_mappings_df = pd.DataFrame({"uniprotID" : new_mappings.keys(), "ENST" : new_mappings.values()})
new_mappings_df

Unnamed: 0,uniprotID,ENST
0,Q9H2P0,ENST00000349014.8
1,Q5TGY3,ENST00000247087.10
2,Q96QS3,ENST00000379044.5
3,Q96JM3,ENST00000361283.4
4,Q96RK0,ENST00000575354.6
...,...,...
117,Q8NEK5,ENST00000366197.9
118,Q9BR84,ENST00000393883.6
119,Q68DY1,ENST00000601440.6
120,Q9Y462,ENST00000276123.7


In [31]:
curr_enst_mapping_test = curr_enst_mapping[["uniprotID", "ENST"]]
curr_enst_mapping_test

Unnamed: 0,uniprotID,ENST
0,Q9H2P0,ENST00000349014.8
1,Q5TGY3,ENST00000247087.10
2,Q68CP9,ENST00000334344.11
3,Q9HBZ2,ENST00000303329.9
4,Q96QS3,ENST00000379044.5
...,...,...
117,Q9Y462,ENST00000276123.7
118,Q8N859,ENST00000429591.4
119,Q6NX45,ENST00000354377.8
120,Q7Z570,ENST00000302277.7


In [32]:
overlaps = pd.merge(curr_enst_mapping_test, new_mappings_df)
overlaps

Unnamed: 0,uniprotID,ENST
0,Q9H2P0,ENST00000349014.8
1,Q5TGY3,ENST00000247087.10
2,Q68CP9,ENST00000334344.11
3,Q9HBZ2,ENST00000303329.9
4,Q96QS3,ENST00000379044.5
...,...,...
115,Q9Y462,ENST00000276123.7
116,Q8N859,ENST00000429591.4
117,Q6NX45,ENST00000354377.8
118,Q7Z570,ENST00000302277.7


In [33]:
new_mappings_df[~new_mappings_df["uniprotID"].isin(overlaps["uniprotID"])]

Unnamed: 0,uniprotID,ENST
56,P11308,ENST00000288319.12
86,Q2KHR2,ENST00000559447.8


In [34]:
curr_enst_mapping_test[~curr_enst_mapping_test["uniprotID"].isin(overlaps["uniprotID"])]

Unnamed: 0,uniprotID,ENST
29,P11308,ENST00000398919.6
74,Q2KHR2,ENST00000673997.1


In [35]:
# Everything looks corrected now for the two genes, including ERG
#new_mappings_df.to_csv("../data/SFARI_TFs_with_ENST_corrected.csv")

In [36]:
# Rebuild input table with ERG's correct ENST
# Rerun input sequences (only ERG's should change)