In [75]:
%load_ext autoreload

In [79]:
import pandas as pd
import protfasta

In [112]:
# SFARI TFs
SFARI_TFs = pd.read_csv("../../data/SFARI_TFs.csv")
SFARI_TFs = SFARI_TFs.drop(columns = "GeneName")
SFARI_TFs = SFARI_TFs.rename(columns = {"ProteinSeq":"Sequence",
                                      "gene-symbol":"GeneName"})
SFARI_TFs["Reference"] = ""
SFARI_TFs["Start"] = 1
SFARI_TFs["End"] = SFARI_TFs["Sequence"].str.len()
SFARI_TFs = SFARI_TFs[["GeneName", "Start", "End", "uniprotID", "Reference", "Sequence"]]
SFARI_TFs

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence
0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...
1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...
2,ARID2,1,1835,Q68CP9,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...
3,ARNT2,1,717,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...
4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...
...,...,...,...,...,...,...
122,ZNF711,1,761,Q9Y462,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...
123,ZNF713,1,430,Q8N859,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...
124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...
125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...


In [113]:
# Using uniprotIDs of SFARI TFs to get uniprot IDs
SFARI_TFs[["uniprotID"]].to_csv("../data/SFARI_TFs_uniprotIDs.txt", 
                                                 header=None, index=None, sep=' ', mode='a')

In [114]:
all_isoform_seqs = protfasta.read_fasta("../data/uniprot_SFARI_TF_seqs_isoforms.txt")
isoform_df = pd.DataFrame({"id":all_isoform_seqs.keys(),
             "seq":all_isoform_seqs.values()})
isoform_df["uniprotID"] = isoform_df["id"].str.split("|").str[1]
isoform_df

Unnamed: 0,id,seq,uniprotID
0,sp|O00712|NFIB_HUMAN Nuclear factor 1 B-type O...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712
1,sp|O00712-2|NFIB_HUMAN Isoform 3 of Nuclear fa...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712-2
2,sp|O00712-4|NFIB_HUMAN Isoform 4 of Nuclear fa...,MERIPVSVDFWVVCCAVLKCNPGIPMERIPVSVDFWVVCCAVLKCN...,O00712-4
3,sp|O00712-5|NFIB_HUMAN Isoform 5 of Nuclear fa...,MMYSPICLTQDEFHPFIEALLPHVRAIAYTWFNLQARKRKYFKKHE...,O00712-5
4,sp|O00712-6|NFIB_HUMAN Isoform 6 of Nuclear fa...,MNSGVNLQRSLSSPPSSKRPKTISIDENMEPSPTGDFYPSPSSPAA...,O00712-6
...,...,...,...
389,sp|Q9Y462-2|ZN711_HUMAN Isoform 2 of Zinc fing...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462-2
390,sp|Q9Y462-3|ZN711_HUMAN Isoform 3 of Zinc fing...,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462-3
391,sp|Q9Y4A8|NF2L3_HUMAN Nuclear factor erythroid...,MKHLKRWWSAGGGLLHLTLLLSLAGLRVDLDLYLLLPPPTLLQDEL...,Q9Y4A8
392,sp|Q9Y6X0|SETBP_HUMAN SET-binding protein OS=H...,MESRETLSSSRQRGGESDFLPVSSAKPPAAPGCAGEPLLSTPGPGK...,Q9Y6X0


In [115]:
# Iterate through all known ADs, use uniprotID to filter the isoform_df
# Then use start and end

def return_uniprotID_isoform_mappings(known_ADs, isoform_df):
    #isoform_match_dict = {}
    known_ADs["matching_isoforms"] = ""
    
    for i in known_ADs.index:
        uniprotID = known_ADs["uniprotID"].loc[i]
        matching_isoforms = []

        # Check that uniprotID does not already specify its isoform
        if ("-" not in uniprotID):
            obs_region_seqs = []
            
            AD_seq = known_ADs["Sequence"].loc[i]
            AD_start = known_ADs["Start"].loc[i]
            AD_end = known_ADs["End"].loc[i]

            uniprotID_isoforms = isoform_df[isoform_df["uniprotID"].str.contains(uniprotID)]
            for j in uniprotID_isoforms.index:
                obs_full_seq = uniprotID_isoforms["seq"].loc[j]
                if (len(obs_full_seq) > AD_start) and (len(obs_full_seq) >= AD_end):
                    obs_region_seq = obs_full_seq[int(AD_start) - 1:int(AD_end)]
                else:
                    obs_region_seq = ""

                if obs_region_seq == AD_seq:
                    matching_isoforms.append(uniprotID_isoforms["uniprotID"].loc[j])
                obs_region_seqs.append(obs_region_seq)
        else:
            # Already know isoform
            matching_isoforms.append(uniprotID)
        
        known_ADs["matching_isoforms"].loc[i] = matching_isoforms
        #isoform_match_dict[uniprotID] = matching_isoforms
        
        if len(matching_isoforms) == 0:
            print("no matching isoforms for:")
            print(uniprotID)
            print("Tested:")
            print(uniprotID_isoforms["uniprotID"])
            print(AD_start)
            print(AD_end)
            print("expected:")
            print(uniprotID)
            print(AD_seq)
            print("observed:")
            print(obs_region_seqs)
            print()
    
    return known_ADs

In [116]:
SFARI_TFs = return_uniprotID_isoform_mappings(SFARI_TFs, isoform_df)
SFARI_TFs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known_ADs["matching_isoforms"].loc[i] = matching_isoforms


no matching isoforms for:
O43151
Tested:
24      O43151
25    O43151-2
26    O43151-3
Name: uniprotID, dtype: object
1
1660
expected:
O43151
MDSGPVYHGDSRQLSASGVPVNGAREPAGPSLLGTGGPWRVDQKPDWEAAPGPAHTARLEDAHDLVAFSAVAEAVSSYGALSTRLYETFNREMSREAGNNSRGPRPGPEGCSAGSEDLDTLQTALALARHGMKPPNCNCDGPECPDYLEWLEGKIKSVVMEGGEERPRLPGPLPPGEAGLPAPSTRPLLSSEVPQISPQEGLPLSQSALSIAKEKNISLQTAIAIEALTQLSSALPQPSHSTPQASCPLPEALSPPAPFRSPQSYLRAPSWPVVPPEEHSSFAPDSSAFPPATPRTEFPEAWGTDTPPATPRSSWPMPRPSPDPMAELEQLLGSASDYIQSVFKRPEALPTKPKVKVEAPSSSPAPAPSPVLQREAPTPSSEPDTHQKAQTALQQHLHHKRSLFLEQVHDTSFPAPSEPSAPGWWPPPSSPVPRLPDRPPKEKKKKLPTPAGGPVGTEKAAPGIKPSVRKPIQIKKSRPREAQPLFPPVRQIVLEGLRSPASQEVQAHPPAPLPASQGSAVPLPPEPSLALFAPSPSRDSLLPPTQEMRSPSPMTALQPGSTGPLPPADDKLEELIRQFEAEFGDSFGLPGPPSVPIQDPENQQTCLPAPESPFATRSPKQIKIESSGAVTVLSTTCFHSEEGGQEATPTKAENPLTPTLSGFLESPLKYLDTPTKSLLDTPAKRAQAEFPTCDCVEQIVEKDEGPYYTHLGSGPTVASIRELMEERYGEKGKAIRIEKVIYTGKEGKSSRGCPIAKWVIRRHTLEEKLLCLVRHRAGHHCQNAVIVILILAWEGIPRSLGDTLYQELTDTLRKYGNPTSRRCGLNDDRTCACQGKDPNTCGASFSFGCSWSMYFNGCK

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms
0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,[Q9H2P0]
1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,[Q5TGY3]
2,ARID2,1,1835,Q68CP9,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,[Q68CP9]
3,ARNT2,1,717,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,[Q9HBZ2]
4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,[Q96QS3]
...,...,...,...,...,...,...,...
122,ZNF711,1,761,Q9Y462,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,[Q9Y462]
123,ZNF713,1,430,Q8N859,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[]
124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,[Q6NX45]
125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,[Q7Z570]


In [117]:
SFARI_TFs["isoforms"] = [len(_) for _ in SFARI_TFs["matching_isoforms"]]
SFARI_TFs

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms
0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,[Q9H2P0],1
1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,[Q5TGY3],1
2,ARID2,1,1835,Q68CP9,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,[Q68CP9],1
3,ARNT2,1,717,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,[Q9HBZ2],1
4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,[Q96QS3],1
...,...,...,...,...,...,...,...,...
122,ZNF711,1,761,Q9Y462,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,[Q9Y462],1
123,ZNF713,1,430,Q8N859,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0
124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,[Q6NX45],1
125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,[Q7Z570],1


In [118]:
SFARI_TFs[SFARI_TFs_isoforms["isoforms"] != 1]

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms
25,ESRRB,1,433,O95718,,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,"[O95718, O95718-1]",2
102,TET3,1,1660,O43151,,MDSGPVYHGDSRQLSASGVPVNGAREPAGPSLLGTGGPWRVDQKPD...,[],0
123,ZNF713,1,430,Q8N859,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0


In [119]:
SFARI_TFs["orig_uniprotID"] = SFARI_TFs["uniprotID"]
SFARI_TFs["uniprotID"] = [_[0]  if len(_) > 0 else "no match" for _ in SFARI_TFs["matching_isoforms"]]

In [120]:
SFARI_TFs

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID
0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,[Q9H2P0],1,Q9H2P0
1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,[Q5TGY3],1,Q5TGY3
2,ARID2,1,1835,Q68CP9,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,[Q68CP9],1,Q68CP9
3,ARNT2,1,717,Q9HBZ2,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,[Q9HBZ2],1,Q9HBZ2
4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,[Q96QS3],1,Q96QS3
...,...,...,...,...,...,...,...,...,...
122,ZNF711,1,761,Q9Y462,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,[Q9Y462],1,Q9Y462
123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859
124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,[Q6NX45],1,Q6NX45
125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,[Q7Z570],1,Q7Z570


In [121]:
uniprot_output = pd.read_csv("../data/SFARI_TFs_all_isoforms.txt", sep = "\t")
uniprot_output

Unnamed: 0,From,Entry,Alternative products (isoforms)
0,Q9H2P0,Q9H2P0,ALTERNATIVE PRODUCTS:
1,Q5TGY3,Q5TGY3,ALTERNATIVE PRODUCTS:
2,Q68CP9,Q68CP9,ALTERNATIVE PRODUCTS: Event=Alternative splic...
3,Q9HBZ2,Q9HBZ2,ALTERNATIVE PRODUCTS: Event=Alternative splic...
4,Q96QS3,Q96QS3,ALTERNATIVE PRODUCTS:
...,...,...,...
122,Q9Y462,Q9Y462,ALTERNATIVE PRODUCTS: Event=Alternative splic...
123,Q8N859,Q8N859,ALTERNATIVE PRODUCTS:
124,Q6NX45,Q6NX45,ALTERNATIVE PRODUCTS:
125,Q7Z570,Q7Z570,


In [122]:
uniprot_output["uniprotID"] = uniprot_output["Alternative products (isoforms)"].str.extract('IsoId=(.*?);')
uniprot_output

Unnamed: 0,From,Entry,Alternative products (isoforms),uniprotID
0,Q9H2P0,Q9H2P0,ALTERNATIVE PRODUCTS:,
1,Q5TGY3,Q5TGY3,ALTERNATIVE PRODUCTS:,
2,Q68CP9,Q68CP9,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q68CP9-1
3,Q9HBZ2,Q9HBZ2,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9HBZ2-1
4,Q96QS3,Q96QS3,ALTERNATIVE PRODUCTS:,
...,...,...,...,...
122,Q9Y462,Q9Y462,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9Y462-1
123,Q8N859,Q8N859,ALTERNATIVE PRODUCTS:,
124,Q6NX45,Q6NX45,ALTERNATIVE PRODUCTS:,
125,Q7Z570,Q7Z570,,


In [123]:
uniprot_output["Alternative products (isoforms)"].iloc[3]

'ALTERNATIVE PRODUCTS:  Event=Alternative splicing; Named isoforms=2; Name=1; IsoId=Q9HBZ2-1; Sequence=Displayed; Name=2; IsoId=Q9HBZ2-2; Sequence=VSP_022687;'

In [124]:
uniprot_output["uniprotID"] = uniprot_output["uniprotID"].fillna(uniprot_output["From"])
uniprot_output

Unnamed: 0,From,Entry,Alternative products (isoforms),uniprotID
0,Q9H2P0,Q9H2P0,ALTERNATIVE PRODUCTS:,Q9H2P0
1,Q5TGY3,Q5TGY3,ALTERNATIVE PRODUCTS:,Q5TGY3
2,Q68CP9,Q68CP9,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q68CP9-1
3,Q9HBZ2,Q9HBZ2,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9HBZ2-1
4,Q96QS3,Q96QS3,ALTERNATIVE PRODUCTS:,Q96QS3
...,...,...,...,...
122,Q9Y462,Q9Y462,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Q9Y462-1
123,Q8N859,Q8N859,ALTERNATIVE PRODUCTS:,Q8N859
124,Q6NX45,Q6NX45,ALTERNATIVE PRODUCTS:,Q6NX45
125,Q7Z570,Q7Z570,,Q7Z570


In [125]:
uniprot_output[["From", "uniprotID"]].to_csv("../data/SFARI_TFs_all_isoforms_mapped.csv")

In [126]:
uniprotID_mappings = {}
for key, val in zip(uniprot_output["From"], uniprot_output["uniprotID"]):
    uniprotID_mappings[key] = val

In [127]:
SFARI_TFs_copy = SFARI_TFs
SFARI_TFs_copy = SFARI_TFs_copy.replace({"uniprotID": uniprotID_mappings}) 
SFARI_TFs_copy

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID
0,ADNP,1,1102,Q9H2P0,,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,[Q9H2P0],1,Q9H2P0
1,AHDC1,1,1603,Q5TGY3,,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,[Q5TGY3],1,Q5TGY3
2,ARID2,1,1835,Q68CP9-1,,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,[Q68CP9],1,Q68CP9
3,ARNT2,1,717,Q9HBZ2-1,,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,[Q9HBZ2],1,Q9HBZ2
4,ARX,1,562,Q96QS3,,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,[Q96QS3],1,Q96QS3
...,...,...,...,...,...,...,...,...,...
122,ZNF711,1,761,Q9Y462-1,,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,[Q9Y462],1,Q9Y462
123,ZNF713,1,430,no match,,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,[],0,Q8N859
124,ZNF774,1,483,Q6NX45,,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,[Q6NX45],1,Q6NX45
125,ZNF804A,1,1209,Q7Z570,,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,[Q7Z570],1,Q7Z570


In [129]:
SFARI_TFs_copy.to_csv("../data/SFARI_TFs_with_isoform_id.csv")

In [128]:
SFARI_TFs_copy[SFARI_TFs_copy["GeneName"] == "ERG"]

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,matching_isoforms,isoforms,orig_uniprotID
31,ERG,1,486,P11308-3,,MIQTVPDPAAHIKEALSVVSEDQSLFECAYGTPHLAKTEMTASSSS...,[P11308-3],1,P11308
