In [1]:
%load_ext autoreload

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import protfasta 
import re

from Bio import pairwise2
from Bio.Seq import Seq 

%autoreload 2
%aimport AD_predictor_tools
%aimport AD_comparison_tools
%aimport PlottingTools



In [3]:
all_isoform_seqs = protfasta.read_fasta("../data/uniprot_known_AD_seqs_isoforms.txt")
isoform_df = pd.DataFrame({"id":all_isoform_seqs.keys(),
             "seq":all_isoform_seqs.values()})
isoform_df["uniprotID"] = isoform_df["id"].str.split("|").str[1]

# Adding the sequences of A0A669KBM4 and O75152 to isoform_df
# Uniprot says A0A669KBM4 is now obsolete
isoform_df.loc[len(isoform_df.index)] = ['>tr|A0A669KBM4|Release 2021_03/2021_03|02-Jun-2021',
                                         'MAEEQQQPPPQQPDAHQQLPPSAPNSGVALPALVPGLPGTEASALQHKIKNSICKTVQSKVDCILQEVEKFTDLEKLYLYLQLPSGLSNGEKSDQNAMSSSRAQQMHAFSWIRNTLEEHPETSLPKQEVYDEYKSYCDNLGYHPLSAADFGKIMKNVFPNMKARRLGTRGKSKYCYSGLRKKAFVHMPTLPNLDFHKTGDGLEGAEPSGQLQNIDEEVISSACRLVCEWAQKVLSQPFDTVLELARFLVKSHYIGTKSMAALTVMAAAPAGMKGITQPSAFIPTAESNSFQPQVKTLPSPIDAKQQLQRKIQKKQQEQKLQSPLPGESAAKKSESATSNGVTNLPNGNPSILSPQPIGIVVAAVPSPIPVQRTRQLVTSPSPMSSSDGKVLPLNVQVVTQHMQSVKQAPKTPQNVPASPGGDRSARHRYPQILPKPANTSALTIRSPTTVLFTSSPIKTAVVPASHMSSLNVVKMTTISLTPSNSNTPLKHSASVSSATGTTEESRSVPQIKNGSVVSLQSPGSRSSSAGGTSAVEVKVEPETSSDEHPVQCQENSDEAKAPQTPSALLGQKSNTDGALQKPSNEGVIEIKATKVCDQRTKCKSRCNEMLPGTSTGNNQSTITLSVASQNLTFTSSSSPPNGDSINKDPKLCTKSPRKRLSSTLQETQVPPVKKPIVEQLSAATIEGQKQGSVKKDQKVPHSGKTEGSTAGAQIPSKVSVNVSSHIGANQPLNSSALVISDSALEQQTTPSSSPDIKVKLEGSVFLLDSDSKSVGSFNPNGWQQITKDSEFISASCEQQQDISVMTIPEHSDINDLEKSVWELEGMPQDTYSQQLHSQIQESSLNQIQAHSSDQLPLQSELKEFEPSVSQTNESYFPFDDELTQDSIVEELVLMEQQMSMNNSHSYGNCLGMTLQSQSVTPGAPMSSHTSSTHFYHPIHSNGTPIHTPTPTPTPTPTPTPTPTPTSEMIAGSQSLSRESPCSRLAQTTPVDSALGSSRHTPIGTPHSNCSSSVPPSPVECRNPFAFTPISSSMAYHDASIVSSSPVKPMQRPMATHPDKTKLEWMNNGYSGVGNSSVSGHGILPSYQELVEDRFRKPHAFAVPGQSYQSQSRHHDTHFGRLTPVSPVQHQGATVNNTNKQEGFAVPAPLDNKGTNSSASSNFRCRSVSPAVHRQRNLSGSTLYPVSNIPRSNVTPFGSPVTPEVHVFTNVHTDACANNIAQRSQSVPLTVMMQTAFPNALQKQANSKKITNVLLSKLDSDNDDAVRGLGMNNLPSNYTARMNLTQILEPSTVFPSANPQNMIDSSTSVYEFQTPSYLTKSNSTGQINFSPGDNQAQSEIGEQQLDFNSTVKDLLSGDSLQTNQQLVGQGASDLTNTASDFSSDIRLSSELSGSINDLNTLDPNLLFDPGRQQGQDDEATLEELKNDPLFQQICSESMNSMTSSGFEWIESKDHPTVEMLG',
                                         "A0A669KBM4"] 
# O75152
isoform_df.loc[len(isoform_df.index)] = ['>sp|O75152|ZC11A_HUMAN Zinc finger CCCH domain-containing protein 11A OS=Homo sapiens OX=9606 GN=ZC3H11A PE=1 SV=3',
                                         'MPNQGEDCYFFFYSTCTKGDSCPFRHCEAAIGNETVCTLWQEGRCFRQVCRFRHMEIDKKRSEIPCYWENQPTGCQKLNCAFHHNRGRYVDGLFLPPSKTVLPTVPESPEEEVKASQLSVQQNKLSVQSNPSPQLRSVMKVESSENVPSPTHPPVVINAADDDEDDDDQFSEEGDETKTPTLQPTPEVHNGLRVTSVRKPAVNIKQGECLNFGIKTLEEIKSKKMKEKSKKQGEGSSGVSSLLLHPEPVPGPEKENVRTVVRTVTLSTKQGEEPLVRLSLTERLGKRKFSAGGDSDPPLKRSLAQRLGKKVEAPETNIDKTPKKAQVSKSLKERLGMSADPDNEDATDKVNKVGEIHVKTLEEILLERASQKRGELQTKLKTEGPSKTDDSTSGARSSSTIRIKTFSEVLAEKKHRQQEAERQKSKKDTTCIKLKIDSEIKKTVVLPPIVASRGQSEEPAGKTKSMQEVHIKTLEEIKLEKALRVQQSSESSTSSPSQHEATPGARRLLRITKRTGMKEEKNLQEGNEVDSQSSIRTEAKEASGETTGVDITKIQVKRCETMREKHMQKQQEREKSVLTPLRGDVASCNTQVAEKPVLTAVPGITRHLTKRLPTKSSQKVEVETSGIGDSLLNVKCAAQTLEKRGKAKPKVNVKPSVVKVVSSPKLAPKRKAVEMHAAVIAAVKPLSSSSVLQEPPAKKAAVAVVPLVSEDKSVTVPEAENPRDSLVLPPTQSSSDSSPPEVSGPSSSQMSMKTRRLSSASTGKPPLSVEDDFEKLIWEISGGKLEAEIDLDPGKDEDDLLLELSEMIDS',
                                         "O75152"] 
isoform_df

Unnamed: 0,id,seq,uniprotID
0,tr|A0A024R0Y4|A0A024R0Y4_HUMAN Transcriptional...,MDRLGSFSNDPSDKPPCRGCSSYLMEPYIKCAECGPPPFFLCLQCF...,A0A024R0Y4
1,tr|A0A087WXG3|A0A087WXG3_HUMAN Basic helix-loo...,MSIRPPGEPPSPGGAAMAELKSLSGDAYLALSHGYAAAAAGLAYGA...,A0A087WXG3
2,sp|A6NJG6|ARGFX_HUMAN Arginine-fifty homeobox ...,MRNRMAPENPQPDPFINRNYSNMKVIPPQDPASPSFTLLSKLECSG...,A6NJG6
3,sp|A8MTJ6|FOXI3_HUMAN Forkhead box protein I3 ...,MALYCGDNFGVYSQPGLPPPAATAAAPGAPPAARAPYGLADYAAPP...,A8MTJ6
4,sp|A8MW92|P20L1_HUMAN PHD finger protein 20-li...,MSKKPPNRPGITFEIGARLEALDYLQKWYPSRIEKIDYEEGKMLVH...,A8MW92
...,...,...,...
1607,sp|Q9Y6Y1-2|CMTA1_HUMAN Isoform 2 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-2
1608,sp|Q9Y6Y1-3|CMTA1_HUMAN Isoform 3 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-3
1609,sp|Q9Y6Y1-4|CMTA1_HUMAN Isoform 4 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-4
1610,>tr|A0A669KBM4|Release 2021_03/2021_03|02-Jun-...,MAEEQQQPPPQQPDAHQQLPPSAPNSGVALPALVPGLPGTEASALQ...,A0A669KBM4


In [4]:
ADs_with_isoforms = pd.read_csv("../output/known_ADs_considering_isoforms.csv")
ADs_with_isoforms

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
0,ABRAXAS1,121,200,Q6UWZ7,[['Q6UWZ7']],ENST00000321945,Q6UWZ7,DelRosso et al.,CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...,
1,AEBP1,1088,1158,Q8IUX7,[['Q8IUX7']],,Q8IUX7,Staller Activity Data,TF,EVVTEFGTEVEPEFGTKVEPEFETQLEPEFETQLEPEFEEEEEEEK...,
2,AHCTF1,1445,1698,Q8WYP5,"[['Q8WYP5'], ['Q8WYP5']]",nan / nan,Q8WYP5 / Q8WYP5,"PMID: 11952839, Soto / transcriptionalactivity...",TF,IRANDNKSMADVLGDGGNSSLTISEGPIVSERRLNQEVALNLKEDH...,
3,AHR,118,126,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,LLQALNGFV,
4,AHR,266,268,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,FAI,
...,...,...,...,...,...,...,...,...,...,...,...
739,ZSCAN20,262,341,P17040,"[['P17040', 'P17040-3']]",ENST00000361328,P17040,DelRosso et al.,TF,PSNTSEKEQGPEFWGLSLINSGKRSTADYSLDNEPAQALTWRDSRA...,
740,ZXDA,572,699,P98168,"[['P98168'], ['P98168']]",nan / nan,P98168 / P98168 / P98168,"PMID: 17493635, Soto / R4TA_regions.txt / acti...",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
741,ZXDB,576,703,P98169,[['P98169']],,P98169 / P98169,"R4TA_regions.txt / activation_regions.txt, GSL",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
742,ZXDC,579,688,Q2QGD7,"[['Q2QGD7', 'Q2QGD7-2'], ['Q2QGD7', 'Q2QGD7-2']]",nan / nan,Q2QGD7 / Q2QGD7 / Q2QGD7,"PMID: 16600381, Soto / R4TA_regions.txt / acti...",TF,DSPLVLGTAATVLQQGSFSVDDVQTVSAGALGCLVALPMKNLSDDP...,


In [5]:
# For each, check if they have a match in their canonical isoform

In [6]:
# Returns the coordinates of the domain in the canonical isofom, if found
# Must pass in an isoform

def return_canonical_coords(isoform_id, ADs):
    if "-" in isoform_id:
        canonical_isoform_id = isoform_id.split("-")[0]
        full_seq_canonical_seq = isoform_df[isoform_df["uniprotID"] == canonical_isoform_id]["seq"].iloc[0]
        isoform_domain_seq = ADs[ADs["uniprotID"] == isoform_id]["ProteinRegionSeq"].iloc[0]
        match=(re.search(isoform_domain_seq, full_seq_canonical_seq))
        if match:
            start, end = match.span()
            start += 1
            return canonical_isoform_id, start, end
        else:
            print("No canonical match for " + isoform_id)
            print_first_alignment(full_seq_canonical_seq, isoform_domain_seq)
            print()

In [7]:
def print_first_alignment(AA_seq1, AA_seq2):
    seq1 = Seq(AA_seq1) 
    seq2 = Seq(AA_seq2)
    alignments = pairwise2.align.globalxx(seq1, seq2)
    print("\tCanonical full protein sequence:")
    print(alignments[0][0]) 
    print("\tIsoform AD sequence:")
    print(alignments[0][1])

In [8]:
matches_found = 0

for i in ADs_with_isoforms.index:
    uniprotID = ADs_with_isoforms["uniprotID"].iloc[i]
    orig_start = ADs_with_isoforms["Start"].iloc[i]
    orig_end = ADs_with_isoforms["End"].iloc[i]

    canonical_coords = return_canonical_coords(uniprotID, ADs_with_isoforms)
    
    if canonical_coords:
        print("Changing: " + ADs_with_isoforms["Gene"].iloc[i])
        print("\tFrom: " + str((uniprotID, orig_start, orig_end)))
        print("\tTo:" + str(canonical_coords))
        print()
        canonical_isoform_id, start, end = canonical_coords
        ADs_with_isoforms.at[i, 'uniprotID'] = canonical_isoform_id
        ADs_with_isoforms.at[i, 'Start'] = start
        ADs_with_isoforms.at[i, 'End'] = end
        matches_found += 1

print(str(matches_found) + " matches found")

No canonical match for P17544-1
	Canonical full protein sequence:
MGDDRPFVCNAPGCGQRFTNEDHLAVHKHKHEMTLKFGPARTDSVIIADQTPTPTRFLKNCEEVGLFNELASSFEHEFKKAADEDEKKA-----A----A--GPLDMSLPSTPDIKIKEEEPVEVDSSPPDSPASSPCSPPLKEKEVTPKPVLISTPTPTIVRPGSLPLHLGYDPLHPTLPSPTSVITQAPPSNRQMGSPTGSLPLVMHLANGQTMPVLPGPPVQMPSVISLARPVSMVPNIPGIPGPPVNSSGSISPSGHPIPSEAKMRLKATLTHQVSSINGGCGMVVGTASTMVTARPEQSQILIQHPDAPSPAQPQVSPAQPTPSTGGRRRRTVDEDPDERRQRFLERNRAAASRCRQKRKLWVSSLEKKAEELTSQNIQLSNEVTLLRNEVAQLKQLLLAHKDCPVTALQKKTQGYLESPKESSEPTGSPAPVIQHSSATAPSNGLSVRSAAEAVATSVLTQMASQRTELSMPIQSHVIMTPQSQSAGR
	Isoform AD sequence:
MGDDRPFVCNAPGCGQRFTNEDHLAVHKHKHEMTLKFGPARTDSVIIADQTPTPTRFLKNCEEVGLFNELASSFEHEFKKAADEDEKKARSRTVAKKLVAAAGPLDMSLPSTPDIKIKEEEPVEVDSSPPDSPASSPCSPPLKEKEVTPKPVLISTPTPTIVRPGSLPLHLGYDPLHPTLPSPTSVITQAPPSNRQMGSPTGSLPLVMHLANGQTMPVLPGPPVQMPSVISLARPVSMVPNIPGIPGPPVNSSGSISPSGHPIPSEAKMRLKATLTHQVSSINGGC----G----M------------------------------------------------------------------------------------------------------------------V------

In [9]:
ADs_with_isoforms[ADs_with_isoforms["uniprotID"].str.contains("-")]

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
32,ATF7,1,296,P17544-1,[['P17544-1']],,P17544,"activation_regions.txt, GSL",TF,MGDDRPFVCNAPGCGQRFTNEDHLAVHKHKHEMTLKFGPARTDSVI...,
71,CREB1,1,253,P16220-1,[['P16220-1']],,P16220,"PMID: 8394325, Soto",TF,MTMESGAENQQSGDAAVTEAENQQMTVQAQPQIATLAQVSMPAAHA...,
102,EBF3,429,551,Q9H4W6-2,[['Q9H4W6-2']],,Q9H4W6-2,"PMID: 28017370, Soto",TF,NSFSSQLAVNVSETSQANDQVGYSRNTSSVSPRGYVPSSTPQQSNY...,
328,MEF2B,170,368,Q02080-2,[['Q02080-2']],,Q02080-2,"PMID: 8668199, Soto",TF,FRPAAPKAGPPGLVHPLFSPSHLTSKTPPPLYLPTEGRRSDLPGGL...,
332,MEIS2,340,470,O14770-4,[['O14770-4']],,O14770-4,"PMID: 20553494, Soto",TF,DQSNRAVSQGAAYSPEGQPMGSFVLDGQQHMGIRPAGLQSMPGDYV...,
338,MITF,212,301,O75030-2,[['O75030-2']],ENST00000352241,O75030,DelRosso et al.,TF,MNTHSRASCMQMDDVIDDIISLESSYNEEILGLMDPALQMANTLPV...,
407,NFYC,149,335,Q13952-2,[['Q13952-2']],,Q13952-2,"PMID: 8662945, Soto",TF,AQQPTAVQVQGQQQGQQTTSSTTTIQPGQIIIAQPQQGQTTPVTMQ...,
423,NR1H4,12,111,Q96RI1-2,[['Q96RI1-2']],ENST00000548884,Q96RI1,DelRosso et al.,TF,HLPTTDEFSFSENLFGVLTEQVAGPLGQNLEVEPYSQYSNVQFPQV...,
524,RARB,1,80,P10826-2,[['P10826-2']],,P10826-2,"PMID: 8389696, Soto",TF,MFDCMDVLSVSPGQILDFYTASPSSCMLQEKALKACFSGLTQTEWQ...,
525,RARB,147,448,P10826-2,[['P10826-2']],,P10826-2,"PMID: 8389696, Soto",TF,SKESVRNDRNKKKKETSKQECTESYEMTAELDDLTEKIRKAHQETF...,


In [10]:
len(ADs_with_isoforms[ADs_with_isoforms["uniprotID"].str.contains("-")])

16

In [11]:
ADs_with_isoforms = ADs_with_isoforms.rename(columns = {"ProteinRegionSeq" : "Sequence"})

In [12]:
ADs_with_isoforms

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,Sequence,Notes
0,ABRAXAS1,121,200,Q6UWZ7,[['Q6UWZ7']],ENST00000321945,Q6UWZ7,DelRosso et al.,CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...,
1,AEBP1,1088,1158,Q8IUX7,[['Q8IUX7']],,Q8IUX7,Staller Activity Data,TF,EVVTEFGTEVEPEFGTKVEPEFETQLEPEFETQLEPEFEEEEEEEK...,
2,AHCTF1,1445,1698,Q8WYP5,"[['Q8WYP5'], ['Q8WYP5']]",nan / nan,Q8WYP5 / Q8WYP5,"PMID: 11952839, Soto / transcriptionalactivity...",TF,IRANDNKSMADVLGDGGNSSLTISEGPIVSERRLNQEVALNLKEDH...,
3,AHR,118,126,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,LLQALNGFV,
4,AHR,266,268,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,FAI,
...,...,...,...,...,...,...,...,...,...,...,...
739,ZSCAN20,262,341,P17040,"[['P17040', 'P17040-3']]",ENST00000361328,P17040,DelRosso et al.,TF,PSNTSEKEQGPEFWGLSLINSGKRSTADYSLDNEPAQALTWRDSRA...,
740,ZXDA,572,699,P98168,"[['P98168'], ['P98168']]",nan / nan,P98168 / P98168 / P98168,"PMID: 17493635, Soto / R4TA_regions.txt / acti...",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
741,ZXDB,576,703,P98169,[['P98169']],,P98169 / P98169,"R4TA_regions.txt / activation_regions.txt, GSL",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
742,ZXDC,579,688,Q2QGD7,"[['Q2QGD7', 'Q2QGD7-2'], ['Q2QGD7', 'Q2QGD7-2']]",nan / nan,Q2QGD7 / Q2QGD7 / Q2QGD7,"PMID: 16600381, Soto / R4TA_regions.txt / acti...",TF,DSPLVLGTAATVLQQGSFSVDDVQTVSAGALGCLVALPMKNLSDDP...,


In [13]:
checking_isoforms = AD_comparison_tools.return_uniprotID_isoform_mappings(ADs_with_isoforms, isoform_df)
checking_isoforms

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,Sequence,Notes,matching_isoforms
0,ABRAXAS1,121,200,Q6UWZ7,[['Q6UWZ7']],ENST00000321945,Q6UWZ7,DelRosso et al.,CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...,,[Q6UWZ7]
1,AEBP1,1088,1158,Q8IUX7,[['Q8IUX7']],,Q8IUX7,Staller Activity Data,TF,EVVTEFGTEVEPEFGTKVEPEFETQLEPEFETQLEPEFEEEEEEEK...,,[Q8IUX7]
2,AHCTF1,1445,1698,Q8WYP5,"[['Q8WYP5'], ['Q8WYP5']]",nan / nan,Q8WYP5 / Q8WYP5,"PMID: 11952839, Soto / transcriptionalactivity...",TF,IRANDNKSMADVLGDGGNSSLTISEGPIVSERRLNQEVALNLKEDH...,,[Q8WYP5]
3,AHR,118,126,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,LLQALNGFV,,[P35869]
4,AHR,266,268,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,FAI,,[P35869]
...,...,...,...,...,...,...,...,...,...,...,...,...
739,ZSCAN20,262,341,P17040,"[['P17040', 'P17040-3']]",ENST00000361328,P17040,DelRosso et al.,TF,PSNTSEKEQGPEFWGLSLINSGKRSTADYSLDNEPAQALTWRDSRA...,,"[P17040, P17040-3]"
740,ZXDA,572,699,P98168,"[['P98168'], ['P98168']]",nan / nan,P98168 / P98168 / P98168,"PMID: 17493635, Soto / R4TA_regions.txt / acti...",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,,[P98168]
741,ZXDB,576,703,P98169,[['P98169']],,P98169 / P98169,"R4TA_regions.txt / activation_regions.txt, GSL",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,,[P98169]
742,ZXDC,579,688,Q2QGD7,"[['Q2QGD7', 'Q2QGD7-2'], ['Q2QGD7', 'Q2QGD7-2']]",nan / nan,Q2QGD7 / Q2QGD7 / Q2QGD7,"PMID: 16600381, Soto / R4TA_regions.txt / acti...",TF,DSPLVLGTAATVLQQGSFSVDDVQTVSAGALGCLVALPMKNLSDDP...,,"[Q2QGD7, Q2QGD7-2]"


In [14]:
ADs_with_isoforms = ADs_with_isoforms.drop(columns = "Notes")
checking_isoforms = checking_isoforms.drop(columns = "Notes")

In [15]:
ADs_with_isoforms.equals(checking_isoforms)

True

In [16]:
#ADs_with_isoforms.to_csv("../output/known_ADs_considering_isoforms_and_canonical.csv", index = False)

In [18]:
ADs_with_isoforms[ADs_with_isoforms["Gene"] == "ERG"]

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,Sequence,matching_isoforms
125,ERG,433,479,P11308,[['P11308-3']],,P11308,"PMID: 9681824, Soto",TF,PHPPALPVTSSSFFAAPNPYWNSPTGGIYPNTRLPTSHMPSHLGTYY,[P11308]
126,ERG,118,261,P11308,"[['P11308-1', 'P11308-3', 'P11308-5', 'P11308-...",nan / ENST00000288319,P11308 / P11308,"PMID: 14603248, Soto / DelRosso et al.",TF,MTTNERRVIVPADPTLWSTDHVRQWLEWAVKEYGLPDVNILLFQNI...,[P11308]


In [10]:
# Now, merging
def return_merged_row(uniprotID, df):
    # Only look at rows with the same uniprot ID
    same_uniprotID_rows = df[df["uniprotID"] == uniprotID]
    same_uniprotID_rows = same_uniprotID_rows.sort_values(by = "Start")
    
    # Final dataframe columns
    new_starts = []
    new_ends = []
    genes = []
    AD_names = []
    references = []
    matching_isoforms = []
    transcript_IDs = []
    orig_uniprotIDs = []
    TileTypes = []
    
    # Current row's values
    curr_start = -1
    curr_end = -1
    curr_genes = []
    curr_AD_names = []
    curr_references = []
    curr_matching_isoforms = []
    curr_transcript_IDs = []
    curr_orig_uniprotIDs = []
    curr_TileTypes = []
    
    for i in same_uniprotID_rows.index:
        # Merge current row with next row
        if curr_end >= same_uniprotID_rows.loc[i]["Start"]:
            curr_end = max(curr_end, same_uniprotID_rows.loc[i]["End"])
            curr_genes.append(same_uniprotID_rows.loc[i]["GeneName"])
            curr_references.append(same_uniprotID_rows.loc[i]["Reference"])
            curr_matching_isoforms.append(same_uniprotID_rows.loc[i]["matching_isoforms"])
            curr_transcript_IDs.append(same_uniprotID_rows.loc[i]["Canonical Transcript ID"])
            curr_orig_uniprotIDs.append(same_uniprotID_rows.loc[i]["orig_uniprotID"])
            curr_TileTypes.append(same_uniprotID_rows.loc[i]["TileType"])
        
        # Don't merge current row with next row
        else: 
            new_starts.append(curr_start)
            new_ends.append(curr_end)
            genes.append(" / ".join(set([c.strip() for c in curr_genes])))
            
            curr_AD_names = [str(c) for c in curr_AD_names]
            AD_names.append(" / ".join(curr_AD_names))
            
            curr_references = [str(c) for c in curr_references]
            references.append(" / ".join(curr_references))
            
            # curr_matching_isoforms = [c for c in curr_matching_isoforms]
            matching_isoforms.append(curr_matching_isoforms)
            
            curr_transcript_IDs = [str(c) for c in curr_transcript_IDs]
            transcript_IDs.append(" / ".join(curr_transcript_IDs))
            
            curr_orig_uniprotIDs = [str(c) for c in curr_orig_uniprotIDs]
            orig_uniprotIDs.append(" / ".join(curr_orig_uniprotIDs))
            
            curr_TileTypes = [str(c) for c in curr_TileTypes]
            TileTypes.append(" / ".join(curr_TileTypes))
            
            curr_start = same_uniprotID_rows.loc[i]["Start"]
            curr_end = same_uniprotID_rows.loc[i]["End"]
            
            curr_genes = [same_uniprotID_rows.loc[i]["GeneName"]]
            curr_references = [same_uniprotID_rows.loc[i]["Reference"]]
            curr_matching_isoforms = [same_uniprotID_rows.loc[i]["matching_isoforms"]]
            curr_transcript_IDs = [same_uniprotID_rows.loc[i]["Canonical Transcript ID"]]
            curr_orig_uniprotIDs = [same_uniprotID_rows.loc[i]["orig_uniprotID"]]
            curr_TileTypes = [same_uniprotID_rows.loc[i]["TileType"]]
    
    # Append the last values
    new_starts.append(curr_start)
    new_ends.append(curr_end)
    
    genes.append(" / ".join(set([c.strip() for c in curr_genes])))
    
    curr_AD_names = [str(c) for c in curr_AD_names]
    AD_names.append(" / ".join(curr_AD_names))
    
    curr_references = [str(c) for c in curr_references]
    references.append(" / ".join(curr_references))
    
    # curr_matching_isoforms = [c for c in curr_matching_isoforms]
    matching_isoforms.append(curr_matching_isoforms)
            
    curr_transcript_IDs = [str(c) for c in curr_transcript_IDs]
    transcript_IDs.append(" / ".join(curr_transcript_IDs))
            
    curr_orig_uniprotIDs = [str(c) for c in curr_orig_uniprotIDs]
    orig_uniprotIDs.append(" / ".join(curr_orig_uniprotIDs))
    
    curr_TileTypes = [str(c) for c in curr_TileTypes]
    TileTypes.append(" / ".join(curr_TileTypes))
    
    # Remove the first (because it is just -1 or "")
    new_starts = new_starts[1:]
    new_ends = new_ends[1:]
    genes = genes[1:]
    references = references[1:]
    matching_isoforms = matching_isoforms[1:]
    transcript_IDs = transcript_IDs[1:]
    orig_uniprotIDs = orig_uniprotIDs[1:]
    TileTypes = TileTypes[1:]
    
    return pd.DataFrame({"Gene": genes,
                         "Start": new_starts,
                        "End": new_ends,
                        "uniprotID": uniprotID,
                         "Matching Isoforms" : matching_isoforms, 
                         "Canonical Transcript ID" : transcript_IDs,
                         "orig_uniprotID" :orig_uniprotIDs,
                         "Reference": references,
                         "TileType" : TileTypes
                        })

In [16]:
ADs_with_isoforms = ADs_with_isoforms.rename(columns = {"Gene" : "GeneName", 
                                                       "Matching Isoforms" : "matching_isoforms"})

In [18]:
ADs_with_isoforms

Unnamed: 0,GeneName,Start,End,uniprotID,matching_isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
0,ABRAXAS1,121,200,Q6UWZ7,[['Q6UWZ7']],ENST00000321945,Q6UWZ7,DelRosso et al.,CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...,
1,AEBP1,1088,1158,Q8IUX7,[['Q8IUX7']],,Q8IUX7,Staller Activity Data,TF,EVVTEFGTEVEPEFGTKVEPEFETQLEPEFETQLEPEFEEEEEEEK...,
2,AHCTF1,1445,1698,Q8WYP5,"[['Q8WYP5'], ['Q8WYP5']]",nan / nan,Q8WYP5 / Q8WYP5,"PMID: 11952839, Soto / transcriptionalactivity...",TF,IRANDNKSMADVLGDGGNSSLTISEGPIVSERRLNQEVALNLKEDH...,
3,AHR,118,126,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,LLQALNGFV,
4,AHR,266,268,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,FAI,
...,...,...,...,...,...,...,...,...,...,...,...
755,ZXDB,576,703,P98169,[['P98169']],,P98169 / P98169,"R4TA_regions.txt / activation_regions.txt, GSL",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
756,ZXDC,579,688,Q2QGD7,"[['Q2QGD7', 'Q2QGD7-2'], ['Q2QGD7', 'Q2QGD7-2']]",nan / nan,Q2QGD7 / Q2QGD7 / Q2QGD7,"PMID: 16600381, Soto / R4TA_regions.txt / acti...",TF,DSPLVLGTAATVLQQGSFSVDDVQTVSAGALGCLVALPMKNLSDDP...,
757,c-ETS1,131,242,P14921,[['P14921-3']],,P14921,"Choi 2000 list, GSL",TF,ILQKEDVKPYQVNGVNPAYPESRYTSDYFISYGIEHAQCVPPSEFS...,
758,c-Jun/AP-1,238,257,P05412,[['P05412']],,P05412,"Choi 2000 list, GSL",TF,ETPPLSPIDMESQERIKAER,


In [17]:
dfs = []
i = 0
for uniprotID in ADs_with_isoforms["uniprotID"].unique():
    dfs.append(return_merged_row(uniprotID, ADs_with_isoforms))

merged_ADs_with_isoforms = pd.concat(dfs)
merged_ADs_with_isoforms = merged_ADs_with_isoforms.reset_index(drop = True)
merged_ADs_with_isoforms

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType
0,ABRAXAS1,121,200,Q6UWZ7,[[['Q6UWZ7']]],ENST00000321945,Q6UWZ7,DelRosso et al.,CR
1,AEBP1,1088,1158,Q8IUX7,[[['Q8IUX7']]],,Q8IUX7,Staller Activity Data,TF
2,AHCTF1,1445,1698,Q8WYP5,"[[['Q8WYP5'], ['Q8WYP5']]]",nan / nan,Q8WYP5 / Q8WYP5,"PMID: 11952839, Soto / transcriptionalactivity...",TF
3,AHR,118,126,P35869,[[['P35869']]],,P35869,"transcriptionalactivity_regions.txt, GSL",TF
4,AHR,266,268,P35869,[[['P35869']]],,P35869,"transcriptionalactivity_regions.txt, GSL",TF
...,...,...,...,...,...,...,...,...,...
739,ZSCAN20,262,341,P17040,"[[['P17040', 'P17040-3']]]",ENST00000361328,P17040,DelRosso et al.,TF
740,ZXDA,572,699,P98168,"[[['P98168'], ['P98168']]]",nan / nan,P98168 / P98168 / P98168,"PMID: 17493635, Soto / R4TA_regions.txt / acti...",TF
741,ZXDB,576,703,P98169,[[['P98169']]],,P98169 / P98169,"R4TA_regions.txt / activation_regions.txt, GSL",TF
742,ZXDC,579,688,Q2QGD7,"[[['Q2QGD7', 'Q2QGD7-2'], ['Q2QGD7', 'Q2QGD7-2...",nan / nan,Q2QGD7 / Q2QGD7 / Q2QGD7,"PMID: 16600381, Soto / R4TA_regions.txt / acti...",TF
