In [1]:
%load_ext autoreload

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import protfasta 
import re

from Bio import pairwise2
from Bio.Seq import Seq 
%autoreload 2
%aimport AD_predictor_tools
%aimport AD_comparison_tools
%aimport PlottingTools



# Adding Alerasool ADs to known ADs list

# 1. Loading in lists

In [3]:
# Loading in other lists
GSL = pd.read_csv("../output/GSL_with_isoforms.csv")#[["GeneName", "Start", "End", "uniprotID", "Reference", "Sequence"]]
GSL["Reference"] = GSL["Reference"] + ", GSL (" + GSL["uniprotID"] + ", " + GSL["Start"].astype(str) +"_" + GSL["End"].astype(str) + ")"
GSL["TileType"] = "TF"

Soto = pd.read_csv("../data/SotoEtAl_ADs.csv")
Soto["Reference"] = "PMID: " + Soto["Reference (PMID)"] + ", Soto (" + Soto["uniprotID"] + ", " + Soto["Start"].astype(str) +"_" + Soto["End"].astype(str) + ")"
Soto = Soto[["GeneName", "Start", "End", "uniprotID", "Reference", "Sequence"]]
Soto["TileType"] = "TF"

activity_data = pd.read_csv('../data/Staller2021/SupplementalDataSet4_ActivityData_PredictedADs_renorm20210708_uniprotIDs_added.csv', index_col = 0)
activity_data = activity_data[activity_data["RegionType"] == "Prediction"]
active = activity_data[activity_data["Activity_mean"] > 221]
active["Reference"] = "Staller Activity Data (" + activity_data["uniprotID"] + ", " + activity_data["Start"].astype(str) +"_" + activity_data["End"].astype(str) + ")"
active = active[["GeneName", "Start", "End", "uniprotID", "Reference", "ProteinRegionSeq"]]
active = active.rename(columns = {"ProteinRegionSeq" : "Sequence"})
active["GeneName"] = active["GeneName"].str.extract(r'\|.*\|(.*)_')
# Zero-indexing, so adjusting to make one-indexed like other lists
# [Start, End] -> [Start, End)
# It looks like the coordinates for Q8IZM8 that would match the sequence are actually starting at 680
# Based on BLAST and alignment
active["Start"] = active["Start"] + 1
active["End"] = active["End"]
active.at[89, "Start"] = 680
active.at[89, "End"] = 680 + (177-132) - 1
active["TileType"] = "TF"

In [4]:
active["orig_uniprotID"] = active["uniprotID"]
Soto["orig_uniprotID"] = Soto["uniprotID"]

In [5]:
Stanford_isoform_matches_finalized = pd.read_csv("../output/Stanford_isoform_matches_finalized.csv", index_col = 0)
Stanford_isoform_matches_finalized["uniprotID"] = Stanford_isoform_matches_finalized["uniprotID_match"]
Stanford_isoform_matches_finalized = Stanford_isoform_matches_finalized.rename(columns = {"Sublibrary" : "TileType"})
Stanford_isoform_matches_finalized = Stanford_isoform_matches_finalized[["GeneName", "Start", "End", "uniprotID", "Reference", "Sequence", "orig_uniprotID", "Canonical Transcript ID", "TileType"]]
Stanford_isoform_matches_finalized["Reference"] = "DelRosso et al. (" + Stanford_isoform_matches_finalized["uniprotID"] + ", "  + Stanford_isoform_matches_finalized["Start"].astype(str) +"_" + Stanford_isoform_matches_finalized["End"].astype(str) + ")"
Stanford_isoform_matches_finalized["TileType"] = Stanford_isoform_matches_finalized["TileType"].replace({"CRTiles" : "CR",
                                                                                                          "TFTiles" : "TF"})
Stanford_isoform_matches_finalized.head(3)

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,orig_uniprotID,Canonical Transcript ID,TileType
0,ABRAXAS1,121,200,Q6UWZ7,"DelRosso et al. (Q6UWZ7, 121_200)",LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...,Q6UWZ7,ENST00000321945,CR
1,AHR,532,641,P35869,"DelRosso et al. (P35869, 532_641)",QDSKNSDLYSIMKNLGIDFEDIRHMQNEKFFRNDFSGEVDFRDIDL...,P35869,ENST00000242057,TF
2,AHR,642,721,P35869,"DelRosso et al. (P35869, 642_721)",MKHMQVNGMFENWNSNQFVPFNCPQQDPQQYNVFTDLHGISQEFPY...,P35869,ENST00000242057,TF


In [6]:
alerasool = pd.read_csv("../data/alerasool_ADs_cleaned.csv", index_col = 0)
alerasool["TileType"] = "TF"
alerasool = alerasool.rename(columns = {"Gene" : "GeneName"})
alerasool.head(3)

Unnamed: 0,GeneName,uniprotID,Start,End,Sequence,Reference,new_AD,TileType
0,NEUROG3,Q9Y4Z2,81,140,RRSRRKKANDRERNRMHNLNSALDALRGVLPTFPDDAKLTKIETLR...,Alerasool et al,False,TF
1,ELF4,Q99607,1,80,MAITLQPSDLIFEFASNGMDDDIHQLEDPSVFPAVIVEQVPYPDLL...,Alerasool et al,False,TF
2,KLF7,O75840,1,80,MDVLASYSIFQELQLVHDTGYFSALPSLEETWQQTCLELERYLQTE...,Alerasool et al,False,TF


# 2. Obtaining all isoforms from uniprot

In [7]:
uniprotIDs = set(GSL["uniprotID"]) | set(Soto["uniprotID"]) | set(active["uniprotID"]) | set(Stanford_isoform_matches_finalized["uniprotID"]) | set(alerasool["uniprotID"])
len(uniprotIDs)

608

In [8]:
# saving uniprotIDs to upload
pd.DataFrame(uniprotIDs).to_csv("../data/all_AD_uniprotIDs.csv", header = None, index = None)

In [9]:
#all_isoform_seqs = protfasta.read_fasta("../data/uniprot_known_AD_seqs_isoforms.txt")
all_isoform_seqs = protfasta.read_fasta("../data/uniprot_known_AD_seqs_isoforms_alerasool_added.fasta")
isoform_df = pd.DataFrame({"id":all_isoform_seqs.keys(),
             "seq":all_isoform_seqs.values()})
isoform_df["uniprotID"] = isoform_df["id"].str.split("|").str[1]

# Adding the sequences of A0A669KBM4 and O75152 to isoform_df
# Uniprot says A0A669KBM4 is now obsolete
isoform_df.loc[len(isoform_df.index)] = ['>tr|A0A669KBM4|Release 2021_03/2021_03|02-Jun-2021',
                                         'MAEEQQQPPPQQPDAHQQLPPSAPNSGVALPALVPGLPGTEASALQHKIKNSICKTVQSKVDCILQEVEKFTDLEKLYLYLQLPSGLSNGEKSDQNAMSSSRAQQMHAFSWIRNTLEEHPETSLPKQEVYDEYKSYCDNLGYHPLSAADFGKIMKNVFPNMKARRLGTRGKSKYCYSGLRKKAFVHMPTLPNLDFHKTGDGLEGAEPSGQLQNIDEEVISSACRLVCEWAQKVLSQPFDTVLELARFLVKSHYIGTKSMAALTVMAAAPAGMKGITQPSAFIPTAESNSFQPQVKTLPSPIDAKQQLQRKIQKKQQEQKLQSPLPGESAAKKSESATSNGVTNLPNGNPSILSPQPIGIVVAAVPSPIPVQRTRQLVTSPSPMSSSDGKVLPLNVQVVTQHMQSVKQAPKTPQNVPASPGGDRSARHRYPQILPKPANTSALTIRSPTTVLFTSSPIKTAVVPASHMSSLNVVKMTTISLTPSNSNTPLKHSASVSSATGTTEESRSVPQIKNGSVVSLQSPGSRSSSAGGTSAVEVKVEPETSSDEHPVQCQENSDEAKAPQTPSALLGQKSNTDGALQKPSNEGVIEIKATKVCDQRTKCKSRCNEMLPGTSTGNNQSTITLSVASQNLTFTSSSSPPNGDSINKDPKLCTKSPRKRLSSTLQETQVPPVKKPIVEQLSAATIEGQKQGSVKKDQKVPHSGKTEGSTAGAQIPSKVSVNVSSHIGANQPLNSSALVISDSALEQQTTPSSSPDIKVKLEGSVFLLDSDSKSVGSFNPNGWQQITKDSEFISASCEQQQDISVMTIPEHSDINDLEKSVWELEGMPQDTYSQQLHSQIQESSLNQIQAHSSDQLPLQSELKEFEPSVSQTNESYFPFDDELTQDSIVEELVLMEQQMSMNNSHSYGNCLGMTLQSQSVTPGAPMSSHTSSTHFYHPIHSNGTPIHTPTPTPTPTPTPTPTPTPTSEMIAGSQSLSRESPCSRLAQTTPVDSALGSSRHTPIGTPHSNCSSSVPPSPVECRNPFAFTPISSSMAYHDASIVSSSPVKPMQRPMATHPDKTKLEWMNNGYSGVGNSSVSGHGILPSYQELVEDRFRKPHAFAVPGQSYQSQSRHHDTHFGRLTPVSPVQHQGATVNNTNKQEGFAVPAPLDNKGTNSSASSNFRCRSVSPAVHRQRNLSGSTLYPVSNIPRSNVTPFGSPVTPEVHVFTNVHTDACANNIAQRSQSVPLTVMMQTAFPNALQKQANSKKITNVLLSKLDSDNDDAVRGLGMNNLPSNYTARMNLTQILEPSTVFPSANPQNMIDSSTSVYEFQTPSYLTKSNSTGQINFSPGDNQAQSEIGEQQLDFNSTVKDLLSGDSLQTNQQLVGQGASDLTNTASDFSSDIRLSSELSGSINDLNTLDPNLLFDPGRQQGQDDEATLEELKNDPLFQQICSESMNSMTSSGFEWIESKDHPTVEMLG',
                                         "A0A669KBM4"] 
# O75152
isoform_df.loc[len(isoform_df.index)] = ['>sp|O75152|ZC11A_HUMAN Zinc finger CCCH domain-containing protein 11A OS=Homo sapiens OX=9606 GN=ZC3H11A PE=1 SV=3',
                                         'MPNQGEDCYFFFYSTCTKGDSCPFRHCEAAIGNETVCTLWQEGRCFRQVCRFRHMEIDKKRSEIPCYWENQPTGCQKLNCAFHHNRGRYVDGLFLPPSKTVLPTVPESPEEEVKASQLSVQQNKLSVQSNPSPQLRSVMKVESSENVPSPTHPPVVINAADDDEDDDDQFSEEGDETKTPTLQPTPEVHNGLRVTSVRKPAVNIKQGECLNFGIKTLEEIKSKKMKEKSKKQGEGSSGVSSLLLHPEPVPGPEKENVRTVVRTVTLSTKQGEEPLVRLSLTERLGKRKFSAGGDSDPPLKRSLAQRLGKKVEAPETNIDKTPKKAQVSKSLKERLGMSADPDNEDATDKVNKVGEIHVKTLEEILLERASQKRGELQTKLKTEGPSKTDDSTSGARSSSTIRIKTFSEVLAEKKHRQQEAERQKSKKDTTCIKLKIDSEIKKTVVLPPIVASRGQSEEPAGKTKSMQEVHIKTLEEIKLEKALRVQQSSESSTSSPSQHEATPGARRLLRITKRTGMKEEKNLQEGNEVDSQSSIRTEAKEASGETTGVDITKIQVKRCETMREKHMQKQQEREKSVLTPLRGDVASCNTQVAEKPVLTAVPGITRHLTKRLPTKSSQKVEVETSGIGDSLLNVKCAAQTLEKRGKAKPKVNVKPSVVKVVSSPKLAPKRKAVEMHAAVIAAVKPLSSSSVLQEPPAKKAAVAVVPLVSEDKSVTVPEAENPRDSLVLPPTQSSSDSSPPEVSGPSSSQMSMKTRRLSSASTGKPPLSVEDDFEKLIWEISGGKLEAEIDLDPGKDEDDLLLELSEMIDS',
                                         "O75152"] 
isoform_df

Unnamed: 0,id,seq,uniprotID
0,tr|A0A024R0Y4|A0A024R0Y4_HUMAN Transcriptional...,MDRLGSFSNDPSDKPPCRGCSSYLMEPYIKCAECGPPPFFLCLQCF...,A0A024R0Y4
1,tr|A0A087WXG3|A0A087WXG3_HUMAN Basic helix-loo...,MSIRPPGEPPSPGGAAMAELKSLSGDAYLALSHGYAAAAAGLAYGA...,A0A087WXG3
2,sp|A1L443|NTM2F_HUMAN NUT family member 2F OS=...,MASNGAYPVLGPGVTVNPGTSLSVFTALPFATPAPGPAHRPPLVTA...,A1L443
3,sp|A6NJG6|ARGFX_HUMAN Arginine-fifty homeobox ...,MRNRMAPENPQPDPFINRNYSNMKVIPPQDPASPSFTLLSKLECSG...,A6NJG6
4,sp|A6NLX3|SPDE4_HUMAN Speedy protein E4 OS=Hom...,MASGQARPPFEEESPQPSTTVRSPEVVVDDEVPGPSAPWIDPSPQP...,A6NLX3
...,...,...,...
1634,sp|Q9Y6Y1-2|CMTA1_HUMAN Isoform 2 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-2
1635,sp|Q9Y6Y1-3|CMTA1_HUMAN Isoform 3 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-3
1636,sp|Q9Y6Y1-4|CMTA1_HUMAN Isoform 4 of Calmoduli...,MWRAEGKWLPKTSRKSVSQSVFCGTSTYCVLNTVPPIEDDHGNSNS...,Q9Y6Y1-4
1637,>tr|A0A669KBM4|Release 2021_03/2021_03|02-Jun-...,MAEEQQQPPPQQPDAHQQLPPSAPNSGVALPALVPGLPGTEASALQ...,A0A669KBM4


# 3. Checking for any canonical isoform matches

In [10]:
# Returns the coordinates of the domain in the canonical isofom, if found
# Must pass in an isoform

def return_canonical_coords(isoform_id, ADs):
    if "-" in isoform_id:
        canonical_isoform_id = isoform_id.split("-")[0]
        full_seq_canonical_seq = isoform_df[isoform_df["uniprotID"] == canonical_isoform_id]["seq"].iloc[0]
        isoform_domain_seq = ADs[ADs["uniprotID"] == isoform_id]["Sequence"].iloc[0]
        match=(re.search(isoform_domain_seq, full_seq_canonical_seq))
        if match:
            start, end = match.span()
            start += 1
            return canonical_isoform_id, start, end
        else:
            print("No canonical match for " + isoform_id)
            print_first_alignment(full_seq_canonical_seq, isoform_domain_seq)
            print()

def print_first_alignment(AA_seq1, AA_seq2):
    seq1 = Seq(AA_seq1) 
    seq2 = Seq(AA_seq2)
    alignments = pairwise2.align.globalxx(seq1, seq2)
    print("\tCanonical full protein sequence:")
    print(alignments[0][0]) 
    print("\tIsoform AD sequence:")
    print(alignments[0][1])

In [11]:
Stanford_output = AD_comparison_tools.return_uniprotID_isoform_mappings(Stanford_isoform_matches_finalized, isoform_df)
active_output = AD_comparison_tools.return_uniprotID_isoform_mappings(active, isoform_df)
Soto_output = AD_comparison_tools.return_uniprotID_isoform_mappings(Soto, isoform_df)
GSL_output = AD_comparison_tools.return_uniprotID_isoform_mappings(GSL, isoform_df)
alerasool_output = AD_comparison_tools.return_uniprotID_isoform_mappings(alerasool, isoform_df)

no matching isoforms for:
Q6SJ96
Tested:
1171    Q6SJ96
Name: uniprotID, dtype: object
2
81
expected:
Q6SJ96
ASAPWPERVPRLLAPRLPSYPPPPPTVGLRSMEQEETYLELYLDQCAAQDGLAPPRSPLFSPVVPYDMYILNASNPDTAF
observed:
['EQEETYLELYLDQCAAQDGLAPPRSPLFSPVVPYDMYILNASNPDTAFNSNPEVKETSGDFSSVDLSFLPDEVTQENKDQ']

no matching isoforms for:
Q6SJ96
Tested:
1171    Q6SJ96
Name: uniprotID, dtype: object
22
101
expected:
Q6SJ96
PPPPPTVGLRSMEQEETYLELYLDQCAAQDGLAPPRSPLFSPVVPYDMYILNASNPDTAFNSNPEVKETSGDFSSVDLSF
observed:
['APPRSPLFSPVVPYDMYILNASNPDTAFNSNPEVKETSGDFSSVDLSFLPDEVTQENKDQPVISKHETEENSESQSPQSR']

no matching isoforms for:
H3BS19
Tested:
Series([], Name: uniprotID, dtype: object)
792
891
expected:
H3BS19
SHAKTFLLAGDAQAEGKDDPLRTGFLPSLAATPFPLPASDLDMEDDAKLDSLITEALNGMEYQSDNPEIDSSFIDVFADEEPSGPRGPSSGHPLKSKAGV
observed:
[]

no matching isoforms for:
H3BS19
Tested:
Series([], Name: uniprotID, dtype: object)
832
911
expected:
H3BS19
LDMEDDAKLDSLITEALNGMEYQSDNPEIDSSFIDVFADEEPSGPRGPSSGHPLKSKAGVTPESKAPPPLPAATPDPQTP
observed:
[]

In [12]:
# Problem with Soto - no match found for this
Soto_output[[len(_) ==0  for _ in Soto_output["matching_isoforms"]]]

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,TileType,orig_uniprotID,matching_isoforms
165,HNF1A,281,631,P20823,"PMID: 1656070, 8288579, Soto (P20823, 281_631)",LAMDTYSGPPPGPGPGPALPAHSSPGLPPPALSPSKVHGVRYGQPA...,TF,P20823,[]


In [13]:
Soto[Soto["GeneName"] == "HNF1A"]

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,TileType,orig_uniprotID,matching_isoforms
165,HNF1A,281,631,P20823,"PMID: 1656070, 8288579, Soto (P20823, 281_631)",LAMDTYSGPPPGPGPGPALPAHSSPGLPPPALSPSKVHGVRYGQPA...,TF,P20823,[]


In [14]:
# Correcting the sequence
print(isoform_df[isoform_df["uniprotID"] == "P20823"]["seq"].iloc[0])
Soto.at[165, "Sequence"] = 'LAMDTYSGPPPGPGPGPALPAHSSPGLPPPALSPSKVHGVRYGQPATSETAEVPSSSGGPLVTVSTPLHQVSPTGLEPSHSLLSTEAKLVSAAGGPLPPVSTLTALHSLEQTSPGLNQQPQNLIMASLPGVMTIGPGEPASLGPTFTNTGASTLVIGLASTQAQSVPVINSMGSSLTTLQPVQFSQPLHPSYQQPLMPPVQSHVTQSPFMATMAQLQSPHALYSHKPEVAQYTHTGLLPQTMLITDTTNLSALASLTPTKQVFTSDTEASSESGLHTPASQATTLHVPSQDPASIQHLQPAHRLSASPTVSSSSLVLYQSSDSSNGQSHLLPSNHSVIETFISTQMASSSQ'
display(Soto.iloc[[165]])

MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDKGESCGGGRGELAELPNGLGETRGSEDETDDDGEDFTPPILKELENLSPEEAAHQKAVVETLLQEDPWRVAKMVKSYLQQHNIPQREVVDTTGLNQSHLSQHLNKGTPMKTQKRAALYTWYVRKQREVAQQFTHAGQGGLIEEPTGDELPTKKGRRNRFKWGPASQQILFQAYERQKNPSKEERETLVEECNRAECIQRGVSPSQAQGLGSNLVTEVRVYNWFANRRKEEAFRHKLAMDTYSGPPPGPGPGPALPAHSSPGLPPPALSPSKVHGVRYGQPATSETAEVPSSSGGPLVTVSTPLHQVSPTGLEPSHSLLSTEAKLVSAAGGPLPPVSTLTALHSLEQTSPGLNQQPQNLIMASLPGVMTIGPGEPASLGPTFTNTGASTLVIGLASTQAQSVPVINSMGSSLTTLQPVQFSQPLHPSYQQPLMPPVQSHVTQSPFMATMAQLQSPHALYSHKPEVAQYTHTGLLPQTMLITDTTNLSALASLTPTKQVFTSDTEASSESGLHTPASQATTLHVPSQDPASIQHLQPAHRLSASPTVSSSSLVLYQSSDSSNGQSHLLPSNHSVIETFISTQMASSSQ


Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,TileType,orig_uniprotID,matching_isoforms
165,HNF1A,281,631,P20823,"PMID: 1656070, 8288579, Soto (P20823, 281_631)",LAMDTYSGPPPGPGPGPALPAHSSPGLPPPALSPSKVHGVRYGQPA...,TF,P20823,[]


In [15]:
# Same problem with GSL - no match found for this
GSL_output[[len(_) ==0  for _ in GSL_output["matching_isoforms"]]]

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Matching Isoforms,orig_uniprotID,Reference,Sequence,Length,TileType,matching_isoforms
57,57,HNF1/TCF1/LFB1,546,628,P20823,[['P20823']],P20823,"Choi 2000 list, GSL (P20823, 546_628)",DTEASSESGLHTPASQATTLHVPSQDPAGIQHLQPAHRLSASPTVS...,83,TF,[]


In [16]:
print(GSL_output[[len(_) ==0  for _ in GSL_output["matching_isoforms"]]]["Sequence"].iloc[0])

DTEASSESGLHTPASQATTLHVPSQDPAGIQHLQPAHRLSASPTVSSSSLVLYQSSDSSNGQSHLLPSNHSVIETFISTQMAS


In [17]:
print(isoform_df[isoform_df["uniprotID"] == "P20823"]["seq"].iloc[0])

MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDKGESCGGGRGELAELPNGLGETRGSEDETDDDGEDFTPPILKELENLSPEEAAHQKAVVETLLQEDPWRVAKMVKSYLQQHNIPQREVVDTTGLNQSHLSQHLNKGTPMKTQKRAALYTWYVRKQREVAQQFTHAGQGGLIEEPTGDELPTKKGRRNRFKWGPASQQILFQAYERQKNPSKEERETLVEECNRAECIQRGVSPSQAQGLGSNLVTEVRVYNWFANRRKEEAFRHKLAMDTYSGPPPGPGPGPALPAHSSPGLPPPALSPSKVHGVRYGQPATSETAEVPSSSGGPLVTVSTPLHQVSPTGLEPSHSLLSTEAKLVSAAGGPLPPVSTLTALHSLEQTSPGLNQQPQNLIMASLPGVMTIGPGEPASLGPTFTNTGASTLVIGLASTQAQSVPVINSMGSSLTTLQPVQFSQPLHPSYQQPLMPPVQSHVTQSPFMATMAQLQSPHALYSHKPEVAQYTHTGLLPQTMLITDTTNLSALASLTPTKQVFTSDTEASSESGLHTPASQATTLHVPSQDPASIQHLQPAHRLSASPTVSSSSLVLYQSSDSSNGQSHLLPSNHSVIETFISTQMASSSQ


In [18]:
GSL[GSL["uniprotID"] == "P20823"]

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Matching Isoforms,orig_uniprotID,Reference,Sequence,Length,TileType,matching_isoforms
55,55,HNF1/TCF1/LFB1,440,506,P20823,"[['P20823', 'P20823-7']]",P20823,"Choi 2000 list, GSL (P20823, 440_506)",STQAQSVPVINSMGSSLTTLQPVQFSQPLHPSYQQPLMPPVQSHVT...,67,TF,"[P20823, P20823-7]"
56,56,HNF1/TCF1/LFB1,281,318,P20823,"[['P20823', 'P20823-2', 'P20823-3', 'P20823-7']]",P20823,"Choi 2000 list, GSL (P20823, 281_318)",LAMDTYSGPPPGPGPGPALPAHSSPGLPPPALSPSKVH,38,TF,"[P20823, P20823-2, P20823-3, P20823-7]"
57,57,HNF1/TCF1/LFB1,546,628,P20823,[['P20823']],P20823,"Choi 2000 list, GSL (P20823, 546_628)",DTEASSESGLHTPASQATTLHVPSQDPAGIQHLQPAHRLSASPTVS...,83,TF,[]


In [19]:
# Correcting the sequence
GSL.at[57, "Sequence"] = 'DTEASSESGLHTPASQATTLHVPSQDPASIQHLQPAHRLSASPTVSSSSLVLYQSSDSSNGQSHLLPSNHSVIETFISTQMAS'

In [20]:
# Rerunning
Soto_output = AD_comparison_tools.return_uniprotID_isoform_mappings(Soto, isoform_df)
GSL_output = AD_comparison_tools.return_uniprotID_isoform_mappings(GSL, isoform_df)
alerasool_output = AD_comparison_tools.return_uniprotID_isoform_mappings(alerasool, isoform_df)

In [21]:
# Choosing first of all matches by default
active_output["uniprotID"] = [_[0] for _ in active_output["matching_isoforms"]]
Soto_output["uniprotID"] = [_[0] for _ in Soto_output["matching_isoforms"]]
GSL_output["uniprotID"] = [_[0] for _ in GSL_output["matching_isoforms"]]
alerasool_output["uniprotID"] = [_[0] for _ in alerasool_output["matching_isoforms"]]

In [22]:
# Looking more closely at matches being selected
active_output["num_matches"] = [len(_) for _ in active_output["matching_isoforms"]]
active_output[active_output["num_matches"] > 1]

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,TileType,orig_uniprotID,matching_isoforms,num_matches
17,ZN644,852.0,893.0,Q9H582,"Staller Activity Data (Q9H582, 851.0_893.0)",SYETEDESSWDNVELGDYTTQAIEDETYSDINQEHVNLFPLF,TF,Q9H582,"[Q9H582, Q9H582-2]",2
54,TIGD7,408.0,446.0,Q6NT04,"Staller Activity Data (Q6NT04, 407.0_446.0)",PEYDFQGLEHGDYREILEKCGELETKLDDDRVWLNGDEE,TF,Q6NT04,"[Q6NT04, Q6NT04-2]",2
74,SRBP2,2.0,56.0,Q12772,"Staller Activity Data (Q12772, 1.0_56.0)",DDSGELGGLETMETLTELGDELTLGDIDEMLQFVSNQVGEFPDLFS...,TF,Q12772,"[Q12772, Q12772-2]",2
75,CR3L1,12.0,86.0,Q96BA8,"Staller Activity Data (Q96BA8, 11.0_86.0)",RLFPGSSFLDLGDLNESDFLNNAHFPEHLDHFTENMEDFSNDLFSS...,TF,Q96BA8,"[Q96BA8, Q96BA8-2]",2
81,C2D1A,22.0,60.0,Q6P1N0,"Staller Activity Data (Q6P1N0, 21.0_60.0)",GLLVDLSPDGLMIPEDGANDEELEAEFLALVGGQPPALE,TF,Q6P1N0,"[Q6P1N0, Q6P1N0-2]",2
99,ZFHX4,1405.0,1443.0,Q86UP3,"Staller Activity Data (Q86UP3, 1404.0_1443.0)",ELSEAELQQLYASLPVNGELWAESETMSQDDHGLEQEME,TF,Q86UP3,"[Q86UP3, Q86UP3-4]",2
112,HIF1A,534.0,572.0,Q16665,"Staller Activity Data (Q16665, 533.0_572.0)",ELVEKLFAEDTEAKNPFSTQDTDLDLEMLAPYIPMDDDF,TF,Q16665,"[Q16665, Q16665-2]",2
118,SRBP1,3.0,53.0,P36956,"Staller Activity Data (P36956, 2.0_53.0)",EPPFSEAALEQALGEPCDLDAALLTDIEDMLQLINNQDSDFPGLFD...,TF,P36956,"[P36956, P36956-2, P36956-5]",3
141,CREB3,2.0,55.0,O43889,"Staller Activity Data (O43889, 1.0_55.0)",ELELDAGDQDLLAFLLEESGDLGTAPDEAVRAPLDWALPLSEVPSD...,TF,O43889,"[O43889, O43889-3]",2
143,ETV1,37.0,83.0,P50549,"Staller Activity Data (P50549, 36.0_83.0)",RDLAHDSEELFQDLSQLQETWLAEAQVPDNDEQFVPDYQAESLAFHG,TF,P50549,"[P50549, P50549-3]",2


In [23]:
Soto_output["num_matches"] = [len(_) for _ in Soto_output["matching_isoforms"]]
Soto_output["first_match"] = [_[0] for _ in Soto_output["matching_isoforms"]]
Soto_output["first_match_not_canonical"] = ["-" in _ for _ in Soto_output["first_match"]]
Soto_output[Soto_output["first_match_not_canonical"] & (Soto_output["num_matches"] > 1)]

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,TileType,orig_uniprotID,matching_isoforms,num_matches,first_match,first_match_not_canonical
72,ERG,125,209,P11308-1,"PMID: 14603248, Soto (P11308, 125_209)",MTTNERRVIVPADPTLWSTDHVRQWLEWAVKEYGLPDVNILLFQNI...,TF,P11308,"[P11308-1, P11308-3, P11308-5, P11308-6]",4,P11308-1,True


In [24]:
Soto_output[Soto_output["GeneName"] == "ERG"]

Unnamed: 0,GeneName,Start,End,uniprotID,Reference,Sequence,TileType,orig_uniprotID,matching_isoforms,num_matches,first_match,first_match_not_canonical
72,ERG,125,209,P11308-1,"PMID: 14603248, Soto (P11308, 125_209)",MTTNERRVIVPADPTLWSTDHVRQWLEWAVKEYGLPDVNILLFQNI...,TF,P11308,"[P11308-1, P11308-3, P11308-5, P11308-6]",4,P11308-1,True
73,ERG,440,486,P11308-3,"PMID: 9681824, Soto (P11308, 440_486)",PHPPALPVTSSSFFAAPNPYWNSPTGGIYPNTRLPTSHMPSHLGTYY,TF,P11308,[P11308-3],1,P11308-3,True


In [25]:
# All GSL entries w/ multip matches have at least one canonical match
GSL_output["num_matches"] = [len(_) for _ in GSL_output["matching_isoforms"]]
GSL_output[GSL_output["num_matches"] > 1]

Unnamed: 0.1,Unnamed: 0,GeneName,Start,End,uniprotID,Matching Isoforms,orig_uniprotID,Reference,Sequence,Length,TileType,matching_isoforms,num_matches
5,5,TP73,1,46,O15350,"[['O15350', 'O15350-13', 'O15350-2', 'O15350-3...",O15350,"activation_regions.txt, GSL (O15350, 1_46)",MAQSTATSPDGGTTFEHLWSSLEPDSTYFDLPQSSRGNNEVVGGTD,46,TF,"[O15350, O15350-13, O15350-2, O15350-3, O15350...",7
9,9,CREB3,1,92,O43889,"[['O43889', 'O43889-3']]",O43889,"activation_regions.txt, GSL (O43889, 1_92)",MELELDAGDQDLLAFLLEESGDLGTAPDEAVRAPLDWALPLSEVPS...,92,TF,"[O43889, O43889-3]",2
13,13,NFATC1,126,218,O95644,"[['O95644', 'O95644-10', 'O95644-2', 'O95644-4...",O95644 / O95644,"TAD_regions.txt / activation_regions.txt, GSL ...",LGLYHNNNQFFHDVEVEDVLPSSKRSPSTATLSLPSLEAYRDPSCL...,93,TF,"[O95644, O95644-10, O95644-2, O95644-4]",4
14,14,ESRRB,203,433,O95718,"[['O95718', 'O95718-1']]",O95718,"transcriptionalactivity_regions.txt, GSL (O957...",PPAKKPLTKIVSYLLVAEPDKLYAMPPPGMPEGDIKALTTLCDLAD...,231,TF,"[O95718, O95718-1]",2
18,18,ESR1,1,184,P03372,"[['P03372', 'P03372-2']]",P03372,"activation_regions.txt, GSL (P03372, 1_184)",MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDS...,184,TF,"[P03372, P03372-2]",2
20,20,NR3C1,98,115,P04150,"[['P04150', 'P04150-10', 'P04150-2', 'P04150-3...",P04150,"transcriptionalactivity_regions.txt, GSL (P041...",MGNDLGFPQQGQISLSSG,18,TF,"[P04150, P04150-10, P04150-2, P04150-3, P04150...",7
21,21,GlucocorticoidReceptor,526,556,P04150,"[['P04150', 'P04150-2']]",P04150,"Choi 2000 list, GSL (P04150, 526_556)",PQLTPTLVSLLEVIEPEVLYAGYDSSVPDST,31,TF,"[P04150, P04150-2]",2
22,22,p53 / TP53,1,61,P04637,"[['P04637', 'P04637-2', 'P04637-3'], ['P04637'...",P04637 / P04637 / P04637,"nan / activation_regions.txt / nan, GSL (P0463...",MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,61,TF,"[P04637, P04637-2, P04637-3]",3
24,24,AP-2,6,117,P05549,"[['P05549', 'P05549']]",Q96CW1,"Choi 2000 list, GSL (P05549, 6_117)",TDNIKYEDCEDRHDGTSNGTARLPQLGTVGQSPYTSAPPLSHTPNA...,112,TF,"[P05549, P05549-2]",2
25,25,AP-2,6,117,P05549,"[['P05549', 'P05549']]",Q96CW1,"Choi 2000 list, GSL (P05549, 6_117)",TDNIKYEDCEDRHDGTSNGTARLPQLGTVGQSPYTSAPPLSHTPNA...,112,TF,"[P05549, P05549-2]",2


In [26]:
active_output["Canonical Transcript ID"] = np.nan
Soto_output["Canonical Transcript ID"] = np.nan
GSL_output["Canonical Transcript ID"] = np.nan
alerasool_output["Canonical Transcript ID"] = np.nan

In [27]:
def return_merged_row(uniprotID, df):
    # Only look at rows with the same uniprot ID
    same_uniprotID_rows = df[df["uniprotID"] == uniprotID]
    same_uniprotID_rows = same_uniprotID_rows.sort_values(by = "Start")
    
    # Final dataframe columns
    new_starts = []
    new_ends = []
    genes = []
    AD_names = []
    references = []
    matching_isoforms = []
    transcript_IDs = []
    orig_uniprotIDs = []
    TileTypes = []
    
    # Current row's values
    curr_start = -1
    curr_end = -1
    curr_genes = []
    curr_AD_names = []
    curr_references = []
    curr_matching_isoforms = []
    curr_transcript_IDs = []
    curr_orig_uniprotIDs = []
    curr_TileTypes = []
    
    for i in same_uniprotID_rows.index:
        # Merge current row with next row
        if curr_end >= same_uniprotID_rows.loc[i]["Start"]:
            curr_end = max(curr_end, same_uniprotID_rows.loc[i]["End"])
            curr_genes.append(same_uniprotID_rows.loc[i]["GeneName"])
            curr_references.append(same_uniprotID_rows.loc[i]["Reference"])
            curr_matching_isoforms.append(same_uniprotID_rows.loc[i]["matching_isoforms"])
            curr_transcript_IDs.append(same_uniprotID_rows.loc[i]["Canonical Transcript ID"])
            curr_orig_uniprotIDs.append(same_uniprotID_rows.loc[i]["orig_uniprotID"])
            curr_TileTypes.append(same_uniprotID_rows.loc[i]["TileType"])
        
        # Don't merge current row with next row
        else: 
            new_starts.append(curr_start)
            new_ends.append(curr_end)
            genes.append(" / ".join(set([c.strip() for c in curr_genes])))
            
            curr_AD_names = [str(c) for c in curr_AD_names]
            AD_names.append(" / ".join(curr_AD_names))
            
            curr_references = [str(c) for c in curr_references]
            references.append(" // ".join(curr_references))
            
            # curr_matching_isoforms = [c for c in curr_matching_isoforms]
            matching_isoforms.append(curr_matching_isoforms)
            
            curr_transcript_IDs = [str(c) for c in curr_transcript_IDs]
            transcript_IDs.append(" / ".join(curr_transcript_IDs))
            
            curr_orig_uniprotIDs = [str(c) for c in curr_orig_uniprotIDs]
            orig_uniprotIDs.append(" / ".join(curr_orig_uniprotIDs))
            
            curr_TileTypes = [str(c) for c in curr_TileTypes]
            TileTypes.append(" / ".join(curr_TileTypes))
            
            curr_start = same_uniprotID_rows.loc[i]["Start"]
            curr_end = same_uniprotID_rows.loc[i]["End"]
            
            curr_genes = [same_uniprotID_rows.loc[i]["GeneName"]]
            curr_references = [same_uniprotID_rows.loc[i]["Reference"]]
            curr_matching_isoforms = [same_uniprotID_rows.loc[i]["matching_isoforms"]]
            curr_transcript_IDs = [same_uniprotID_rows.loc[i]["Canonical Transcript ID"]]
            curr_orig_uniprotIDs = [same_uniprotID_rows.loc[i]["orig_uniprotID"]]
            curr_TileTypes = [same_uniprotID_rows.loc[i]["TileType"]]
    
    # Append the last values
    new_starts.append(curr_start)
    new_ends.append(curr_end)
    
    genes.append(" / ".join(set([c.strip() for c in curr_genes])))
    
    curr_AD_names = [str(c) for c in curr_AD_names]
    AD_names.append(" / ".join(curr_AD_names))
    
    curr_references = [str(c) for c in curr_references]
    references.append(" // ".join(curr_references))
    
    # curr_matching_isoforms = [c for c in curr_matching_isoforms]
    matching_isoforms.append(curr_matching_isoforms)
            
    curr_transcript_IDs = [str(c) for c in curr_transcript_IDs]
    transcript_IDs.append(" / ".join(curr_transcript_IDs))
            
    curr_orig_uniprotIDs = [str(c) for c in curr_orig_uniprotIDs]
    orig_uniprotIDs.append(" / ".join(curr_orig_uniprotIDs))
    
    curr_TileTypes = [str(c) for c in curr_TileTypes]
    TileTypes.append(" / ".join(curr_TileTypes))
    
    # Remove the first (because it is just -1 or "")
    new_starts = new_starts[1:]
    new_ends = new_ends[1:]
    genes = genes[1:]
    references = references[1:]
    matching_isoforms = matching_isoforms[1:]
    transcript_IDs = transcript_IDs[1:]
    orig_uniprotIDs = orig_uniprotIDs[1:]
    TileTypes = TileTypes[1:]
    
    return pd.DataFrame({"Gene": genes,
                         "Start": new_starts,
                        "End": new_ends,
                        "uniprotID": uniprotID,
                         "Matching Isoforms" : matching_isoforms, 
                         "Canonical Transcript ID" : transcript_IDs,
                         "orig_uniprotID" :orig_uniprotIDs,
                         "Reference": references,
                         "TileType" : TileTypes
                        })

In [28]:
mapped_known_ADs = pd.concat([Stanford_output, active_output, Soto_output, GSL_output, alerasool_output])

In [29]:
matches_found = 0
mapped_known_ADs = mapped_known_ADs.reset_index(drop = True)

for i in mapped_known_ADs.index:
    uniprotID = mapped_known_ADs["uniprotID"].iloc[i]
    orig_start = mapped_known_ADs["Start"].iloc[i]
    orig_end = mapped_known_ADs["End"].iloc[i]

    canonical_coords = return_canonical_coords(uniprotID, mapped_known_ADs)

    if canonical_coords:
        print("Changing: " + mapped_known_ADs["GeneName"].iloc[i])
        print("\tFrom: " + str((uniprotID, orig_start, orig_end)))
        print("\tTo:" + str(canonical_coords))
        print()
        canonical_isoform_id, start, end = canonical_coords
        mapped_known_ADs.at[i, 'uniprotID'] = canonical_isoform_id
        mapped_known_ADs.at[i, 'Start'] = start
        mapped_known_ADs.at[i, 'End'] = end
        matches_found += 1

print(str(matches_found) + " matches found")

Changing: ARNTL
	From: ('O00327-8', 542.0, 625.0)
	To:('O00327', 543, 626)

Changing: EBF3
	From: ('Q9H4W6-2', 272.0, 351.0)
	To:('Q9H4W6', 281, 360)

Changing: EP400
	From: ('Q96L91-2', 2061.0, 2140.0)
	To:('Q96L91', 2097, 2176)

Changing: FOXM1
	From: ('Q08050-3', 722.0, 801.0)
	To:('Q08050', 684, 763)

Changing: MEN1
	From: ('O00255-2', 341.0, 420.0)
	To:('O00255', 341, 420)

No canonical match for O75030-2
	Canonical full protein sequence:
MQSESGIVPDFEVGEEFHEEPKTYYELKSQPLKSSSSAEHPGASKPPISSSSMTSRILLRQQLMREQMQEQERREQQQKLQAAQFMQQRVPVSQTPAINVSVPTTLPSATQVPMEVLKVQTHLENPTKYHIQQAQRQQVKQYLSTTLANKHANQVLSLPCPNQPGDHVMPPVPGSSAPNSPMAMLTLNSNCEKEGFYKFEEQNRAESECPGMNTHSRASCMQMDDVIDDIISLESSYNEEILGLMDPALQMANTLPVSGNLIDLYGNQGLPPPGLTISNSCPANLPNIKRELTACIFPTESEARALAKERQKKDNHNLIERRRRFNINDRIKELGTLIPKSNDPDMRWNKGTILKASVDYIRKLQREQQRAKELENRQKKLEHANRHLLLRIQELEMQARAHGLSLIPSTGLCSPDLVNRIIKQEPVLENCSQDLLQHHADLTCTTTLDLTDGTITFNNNLGTGTEANQAYSVPTKMGSKLEDILMDDTLSPVGVTDPLLSSVSPGASKTSSRRSSMSMEETEHTC
	Isoform AD sequence:
---

In [31]:
mapped_known_ADs[mapped_known_ADs["GeneName"].str.contains("MYT1L")]

Unnamed: 0.1,GeneName,Start,End,uniprotID,Reference,Sequence,orig_uniprotID,Canonical Transcript ID,TileType,matching_isoforms,num_matches,first_match,first_match_not_canonical,Unnamed: 0,Matching Isoforms,Length,new_AD
329,MYT1L,152.0,251.0,Q9UL68,"DelRosso et al. (Q9UL68-4, 152_251)",EEEEEEEEEEEEEEEENEDHQMNCHNTRIMQDTEKDDNNNDEYDNY...,Q9UL68,ENST00000399161,TF,[Q9UL68-4],,,,,,,
330,MYT1L,192.0,271.0,Q9UL68,"DelRosso et al. (Q9UL68-4, 192_271)",DEYDNYDELVAKSLLNLGKIAEDAAYRARTESEMNSNTSNSLEDDS...,Q9UL68,ENST00000399161,TF,[Q9UL68-4],,,,,,,
331,MYT1L,312.0,391.0,Q9UL68,"DelRosso et al. (Q9UL68-4, 312_391)",EKMVEESDEEVCLSSLECLRNQCFDLARKLSETNPQERNPQQNMNI...,Q9UL68,ENST00000399161,TF,[Q9UL68-4],,,,,,,
684,MYT1L,214.0,425.0,Q9UL68,"PMID: 29291346, Soto (Q9UL68, 214_425)",DAAYRARTESEMNSNTSNSLEDDSDKNENLGRKSELSLDLDSDVVR...,Q9UL68,,TF,"[Q9UL68, Q9UL68-4]",2.0,Q9UL68,False,,,,


In [32]:
mapped_known_ADs[mapped_known_ADs["uniprotID"] =="Q9UL68"]

Unnamed: 0.1,GeneName,Start,End,uniprotID,Reference,Sequence,orig_uniprotID,Canonical Transcript ID,TileType,matching_isoforms,num_matches,first_match,first_match_not_canonical,Unnamed: 0,Matching Isoforms,Length,new_AD
329,MYT1L,152.0,251.0,Q9UL68,"DelRosso et al. (Q9UL68-4, 152_251)",EEEEEEEEEEEEEEEENEDHQMNCHNTRIMQDTEKDDNNNDEYDNY...,Q9UL68,ENST00000399161,TF,[Q9UL68-4],,,,,,,
330,MYT1L,192.0,271.0,Q9UL68,"DelRosso et al. (Q9UL68-4, 192_271)",DEYDNYDELVAKSLLNLGKIAEDAAYRARTESEMNSNTSNSLEDDS...,Q9UL68,ENST00000399161,TF,[Q9UL68-4],,,,,,,
331,MYT1L,312.0,391.0,Q9UL68,"DelRosso et al. (Q9UL68-4, 312_391)",EKMVEESDEEVCLSSLECLRNQCFDLARKLSETNPQERNPQQNMNI...,Q9UL68,ENST00000399161,TF,[Q9UL68-4],,,,,,,
684,MYT1L,214.0,425.0,Q9UL68,"PMID: 29291346, Soto (Q9UL68, 214_425)",DAAYRARTESEMNSNTSNSLEDDSDKNENLGRKSELSLDLDSDVVR...,Q9UL68,,TF,"[Q9UL68, Q9UL68-4]",2.0,Q9UL68,False,,,,


In [33]:
# mapped_known_ADs.reset_index(inplace= True, drop=True)

dfs = []
i = 0
for uniprotID in mapped_known_ADs["uniprotID"].unique():
    dfs.append(return_merged_row(uniprotID, mapped_known_ADs))

merged_mapped_known_ADs = pd.concat(dfs)
merged_mapped_known_ADs = merged_mapped_known_ADs.reset_index(drop = True)
merged_mapped_known_ADs

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType
0,ABRAXAS1,121.0,200.0,Q6UWZ7,[[Q6UWZ7]],ENST00000321945,Q6UWZ7,"DelRosso et al. (Q6UWZ7, 121_200)",CR
1,AHR,118.0,126.0,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF
2,AHR,266.0,268.0,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF
3,AHR,532.0,848.0,P35869,"[[P35869], [P35869], [P35869]]",ENST00000242057 / nan / ENST00000242057,P35869 / P35869 / P35869,"DelRosso et al. (P35869, 532_641) // PMID: 879...",TF / TF / TF
4,AKAP8,2.0,81.0,O43823,[[O43823]],ENST00000269701,O43823,"DelRosso et al. (O43823, 2_81)",TF
...,...,...,...,...,...,...,...,...,...
756,ATXN7L3,1.0,120.0,Q14CW9,"[[Q14CW9, Q14CW9-2]]",,,Alerasool et al,TF
757,FAM22F,361.0,500.0,A1L443,[[A1L443]],,,Alerasool et al,TF
758,SS18L2,18.0,77.0,Q9UHA2,[[Q9UHA2]],,,Alerasool et al,TF
759,SERTAD2,201.0,260.0,Q14140,[[Q14140]],,,Alerasool et al,TF


In [34]:
merged_mapped_known_ADs[merged_mapped_known_ADs["Reference"].str.contains("Alerasool")]

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType
15,ATF6alpha / ATF6 / ATF6 / ATF6A,1.0,150.0,P18850,"[[P18850], [P18850], [P18850], [P18850], [P188...",nan / nan / nan / ENST00000367942 / nan / ENST...,P18850 / P18850 / P18850 / nan / P18850 / P188...,"PMID: 10958673, Soto (P18850, 1_150) // activa...",TF / TF / TF / TF / TF / TF
16,ATF6B,1.0,91.0,Q99941,"[[Q99941], [Q99941], [Q99941]]",nan / ENST00000375203 / nan,nan / Q99941 / Q99941,"Alerasool et al // DelRosso et al. (Q99941, 2_...",TF / TF / TF
17,ATMIN,432.0,823.0,O43313,"[[O43313], [O43313], [O43313], [O43313]]",nan / ENST00000299575 / ENST00000299575 / nan,O43313 / O43313 / O43313 / nan,"PMID: 22167198, Soto (O43313, 432_823) // DelR...",TF / TF / TF / TF
20,BRD8,368.0,480.0,Q9H0E9,"[[Q9H0E9], [Q9H0E9], [Q9H0E9]]",nan / ENST00000254900 / ENST00000254900,nan / Q9H0E9 / Q9H0E9,"Alerasool et al // DelRosso et al. (Q9H0E9, 37...",TF / CR / CR
38,CSRNP1,482.0,589.0,Q96S65,"[[Q96S65], [Q96S65], [Q96S65]]",ENST00000273153 / nan / nan,Q96S65 / Q96S65 / nan,"DelRosso et al. (Q96S65, 482_589) // PMID: 177...",TF / TF / TF
40,DDIT3,1.0,91.0,P35638,"[[P35638], [P35638], [P35638], [P35638]]",nan / nan / ENST00000346473 / nan,P35638 / nan / P35638 / P35638,"PMID: 15775988, Soto (P35638, 1_18) // Aleraso...",TF / TF / TF / TF
99,HOXA2,61.0,160.0,O43364,[[O43364]],,,Alerasool et al,TF
106,HSF1,362.0,529.0,Q00613,"[[Q00613], [Q00613, Q00613-2], [Q00613], [Q006...",ENST00000528838 / nan / nan / nan / nan,Q00613 / Q00613 / Q00613 / Q00613 / nan / Q00613,"DelRosso et al. (Q00613, 362_481) // PMID: 776...",TF / TF / TF / TF / TF
116,JAZF1,1.0,81.0,Q86VZ6,"[[Q86VZ6], [Q86VZ6]]",nan / ENST00000283928,nan / Q86VZ6,"Alerasool et al // DelRosso et al. (Q86VZ6, 2_81)",TF / TF
123,KLF15,72.0,201.0,Q9UIH9,"[[Q9UIH9], [Q9UIH9], [Q9UIH9]]",ENST00000296233 / nan / nan,Q9UIH9 / nan / Q9UIH9,"DelRosso et al. (Q9UIH9, 72_201) // Alerasool ...",TF / TF / TF


In [35]:
# Now, adding sequences using isoform_df
merged = pd.merge(merged_mapped_known_ADs, isoform_df, on = "uniprotID", how = "left")
merged

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,id,seq
0,ABRAXAS1,121.0,200.0,Q6UWZ7,[[Q6UWZ7]],ENST00000321945,Q6UWZ7,"DelRosso et al. (Q6UWZ7, 121_200)",CR,sp|Q6UWZ7|ABRX1_HUMAN BRCA1-A complex subunit ...,MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSIT...
1,AHR,118.0,126.0,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,sp|P35869|AHR_HUMAN Aryl hydrocarbon receptor ...,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
2,AHR,266.0,268.0,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,sp|P35869|AHR_HUMAN Aryl hydrocarbon receptor ...,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
3,AHR,532.0,848.0,P35869,"[[P35869], [P35869], [P35869]]",ENST00000242057 / nan / ENST00000242057,P35869 / P35869 / P35869,"DelRosso et al. (P35869, 532_641) // PMID: 879...",TF / TF / TF,sp|P35869|AHR_HUMAN Aryl hydrocarbon receptor ...,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
4,AKAP8,2.0,81.0,O43823,[[O43823]],ENST00000269701,O43823,"DelRosso et al. (O43823, 2_81)",TF,sp|O43823|AKAP8_HUMAN A-kinase anchor protein ...,MDQGYGGYGAWSAGPANTQGAYGTGVASWQGYENYNYYGAQNTSVT...
...,...,...,...,...,...,...,...,...,...,...,...
757,ATXN7L3,1.0,120.0,Q14CW9,"[[Q14CW9, Q14CW9-2]]",,,Alerasool et al,TF,sp|Q14CW9|AT7L3_HUMAN Ataxin-7-like protein 3 ...,MKMEEMSLSGLDNSKLEAIAQEIYADLVEDSCLGFCFEVHRAVKCG...
758,FAM22F,361.0,500.0,A1L443,[[A1L443]],,,Alerasool et al,TF,sp|A1L443|NTM2F_HUMAN NUT family member 2F OS=...,MASNGAYPVLGPGVTVNPGTSLSVFTALPFATPAPGPAHRPPLVTA...
759,SS18L2,18.0,77.0,Q9UHA2,[[Q9UHA2]],,,Alerasool et al,TF,sp|Q9UHA2|S18L2_HUMAN SS18-like protein 2 OS=H...,MSVAFVPDWLRGKAEVNQETIQRLLEENDQLIRCIVEYQNKGRGNE...
760,SERTAD2,201.0,260.0,Q14140,[[Q14140]],,,Alerasool et al,TF,sp|Q14140|SRTD2_HUMAN SERTA domain-containing ...,MLGKGGKRKFDEHEDGLEGKIVSPCDGPSKVSYTLQRQTIFNISLM...


In [40]:
isoform_df[isoform_df["id"].str.contains("ZNF469")]

Unnamed: 0,id,seq,uniprotID
1346,sp|Q96JG9|ZN469_HUMAN Zinc finger protein 469 ...,MPGERPRGAPPPTMTGDLQPRQVASSPGHPSQPPLEDNTPATRTTK...,Q96JG9


In [38]:
merged[merged["seq"].isna()]

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,id,seq
272,ZNF469,792,911,H3BS19,"[[], []]",ENST00000565624 / ENST00000565624,H3BS19 / H3BS19,"DelRosso et al. (H3BS19, 792_891) // DelRosso ...",TF / TF,,
273,ZNF469,2892,2971,H3BS19,[[]],ENST00000565624,H3BS19,"DelRosso et al. (H3BS19, 2892_2971)",TF,,
274,ZNF469,3002,3091,H3BS19,[[]],ENST00000565624,H3BS19,"DelRosso et al. (H3BS19, 3002_3091)",TF,,


In [41]:
# https://www.uniprot.org/uniprotkb/Q96JG9/entry
# H3BS19 has been merged into Q96JG9
# Replace H3BS19 with Q96JG9

In [49]:
merged_mapped_known_ADs[merged_mapped_known_ADs["Gene"] == "ZNF469"]

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType
271,ZNF469,792.0,911.0,H3BS19,"[[], []]",ENST00000565624 / ENST00000565624,H3BS19 / H3BS19,"DelRosso et al. (H3BS19, 792_891) // DelRosso ...",TF / TF
272,ZNF469,2892.0,2971.0,H3BS19,[[]],ENST00000565624,H3BS19,"DelRosso et al. (H3BS19, 2892_2971)",TF
273,ZNF469,3002.0,3091.0,H3BS19,[[]],ENST00000565624,H3BS19,"DelRosso et al. (H3BS19, 3002_3091)",TF


In [50]:
merged_mapped_known_ADs.at[271, "uniprotID"] = "Q96JG9"
merged_mapped_known_ADs.at[272, "uniprotID"] = "Q96JG9"
merged_mapped_known_ADs.at[273, "uniprotID"] = "Q96JG9"

In [51]:
# Trying again
merged = pd.merge(merged_mapped_known_ADs, isoform_df, on = "uniprotID", how = "left")
merged

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,id,seq
0,ABRAXAS1,121.0,200.0,Q6UWZ7,[[Q6UWZ7]],ENST00000321945,Q6UWZ7,"DelRosso et al. (Q6UWZ7, 121_200)",CR,sp|Q6UWZ7|ABRX1_HUMAN BRCA1-A complex subunit ...,MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSIT...
1,AHR,118.0,126.0,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,sp|P35869|AHR_HUMAN Aryl hydrocarbon receptor ...,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
2,AHR,266.0,268.0,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,sp|P35869|AHR_HUMAN Aryl hydrocarbon receptor ...,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
3,AHR,532.0,848.0,P35869,"[[P35869], [P35869], [P35869]]",ENST00000242057 / nan / ENST00000242057,P35869 / P35869 / P35869,"DelRosso et al. (P35869, 532_641) // PMID: 879...",TF / TF / TF,sp|P35869|AHR_HUMAN Aryl hydrocarbon receptor ...,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
4,AKAP8,2.0,81.0,O43823,[[O43823]],ENST00000269701,O43823,"DelRosso et al. (O43823, 2_81)",TF,sp|O43823|AKAP8_HUMAN A-kinase anchor protein ...,MDQGYGGYGAWSAGPANTQGAYGTGVASWQGYENYNYYGAQNTSVT...
...,...,...,...,...,...,...,...,...,...,...,...
757,ATXN7L3,1.0,120.0,Q14CW9,"[[Q14CW9, Q14CW9-2]]",,,Alerasool et al,TF,sp|Q14CW9|AT7L3_HUMAN Ataxin-7-like protein 3 ...,MKMEEMSLSGLDNSKLEAIAQEIYADLVEDSCLGFCFEVHRAVKCG...
758,FAM22F,361.0,500.0,A1L443,[[A1L443]],,,Alerasool et al,TF,sp|A1L443|NTM2F_HUMAN NUT family member 2F OS=...,MASNGAYPVLGPGVTVNPGTSLSVFTALPFATPAPGPAHRPPLVTA...
759,SS18L2,18.0,77.0,Q9UHA2,[[Q9UHA2]],,,Alerasool et al,TF,sp|Q9UHA2|S18L2_HUMAN SS18-like protein 2 OS=H...,MSVAFVPDWLRGKAEVNQETIQRLLEENDQLIRCIVEYQNKGRGNE...
760,SERTAD2,201.0,260.0,Q14140,[[Q14140]],,,Alerasool et al,TF,sp|Q14140|SRTD2_HUMAN SERTA domain-containing ...,MLGKGGKRKFDEHEDGLEGKIVSPCDGPSKVSYTLQRQTIFNISLM...


In [52]:
merged["Start"] = merged["Start"].astype(int)
merged["End"] = merged["End"].astype(int)

merged["ProteinRegionSeq"] = [seq[start - 1: end] for seq, start, end in zip(merged["seq"],
                                                                     merged["Start"],
                                                                     merged["End"])]
merged = merged.drop(columns = {"seq", "id"})
merged

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq
0,ABRAXAS1,121,200,Q6UWZ7,[[Q6UWZ7]],ENST00000321945,Q6UWZ7,"DelRosso et al. (Q6UWZ7, 121_200)",CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...
1,AHR,118,126,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,LLQALNGFV
2,AHR,266,268,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,FAI
3,AHR,532,848,P35869,"[[P35869], [P35869], [P35869]]",ENST00000242057 / nan / ENST00000242057,P35869 / P35869 / P35869,"DelRosso et al. (P35869, 532_641) // PMID: 879...",TF / TF / TF,QDSKNSDLYSIMKNLGIDFEDIRHMQNEKFFRNDFSGEVDFRDIDL...
4,AKAP8,2,81,O43823,[[O43823]],ENST00000269701,O43823,"DelRosso et al. (O43823, 2_81)",TF,DQGYGGYGAWSAGPANTQGAYGTGVASWQGYENYNYYGAQNTSVTT...
...,...,...,...,...,...,...,...,...,...,...
757,ATXN7L3,1,120,Q14CW9,"[[Q14CW9, Q14CW9-2]]",,,Alerasool et al,TF,MKMEEMSLSGLDNSKLEAIAQEIYADLVEDSCLGFCFEVHRAVKCG...
758,FAM22F,361,500,A1L443,[[A1L443]],,,Alerasool et al,TF,PRPQRPAETNAHLPPPRPQRPAETKVPEEIPPEVVQEYVDIMEELL...
759,SS18L2,18,77,Q9UHA2,[[Q9UHA2]],,,Alerasool et al,TF,QETIQRLLEENDQLIRCIVEYQNKGRGNECVQYQHVLHRNLIYLAT...
760,SERTAD2,201,260,Q14140,[[Q14140]],,,Alerasool et al,TF,SEAGTQKLDGPQESRADDSKLMDSLPGNFEITTSTGFLTDLTLDDI...


In [56]:
merged = merged.drop_duplicates(subset = ["Start", "End", "uniprotID"])
merged

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq
0,ABRAXAS1,121,200,Q6UWZ7,[[Q6UWZ7]],ENST00000321945,Q6UWZ7,"DelRosso et al. (Q6UWZ7, 121_200)",CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...
1,AHR,118,126,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,LLQALNGFV
2,AHR,266,268,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,FAI
3,AHR,532,848,P35869,"[[P35869], [P35869], [P35869]]",ENST00000242057 / nan / ENST00000242057,P35869 / P35869 / P35869,"DelRosso et al. (P35869, 532_641) // PMID: 879...",TF / TF / TF,QDSKNSDLYSIMKNLGIDFEDIRHMQNEKFFRNDFSGEVDFRDIDL...
4,AKAP8,2,81,O43823,[[O43823]],ENST00000269701,O43823,"DelRosso et al. (O43823, 2_81)",TF,DQGYGGYGAWSAGPANTQGAYGTGVASWQGYENYNYYGAQNTSVTT...
...,...,...,...,...,...,...,...,...,...,...
757,ATXN7L3,1,120,Q14CW9,"[[Q14CW9, Q14CW9-2]]",,,Alerasool et al,TF,MKMEEMSLSGLDNSKLEAIAQEIYADLVEDSCLGFCFEVHRAVKCG...
758,FAM22F,361,500,A1L443,[[A1L443]],,,Alerasool et al,TF,PRPQRPAETNAHLPPPRPQRPAETKVPEEIPPEVVQEYVDIMEELL...
759,SS18L2,18,77,Q9UHA2,[[Q9UHA2]],,,Alerasool et al,TF,QETIQRLLEENDQLIRCIVEYQNKGRGNECVQYQHVLHRNLIYLAT...
760,SERTAD2,201,260,Q14140,[[Q14140]],,,Alerasool et al,TF,SEAGTQKLDGPQESRADDSKLMDSLPGNFEITTSTGFLTDLTLDDI...


In [60]:
merged.to_csv("../output/known_ADs_considering_isoforms_and_canonical_with_alerasool.csv")

---
# Which are new?

In [61]:
old_known_ADs = pd.read_csv("../output/known_ADs_considering_isoforms_and_canonical.csv")
old_known_ADs

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
0,ABRAXAS1,121,200,Q6UWZ7,[['Q6UWZ7']],ENST00000321945,Q6UWZ7,DelRosso et al.,CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...,
1,AEBP1,1088,1158,Q8IUX7,[['Q8IUX7']],,Q8IUX7,Staller Activity Data,TF,EVVTEFGTEVEPEFGTKVEPEFETQLEPEFETQLEPEFEEEEEEEK...,
2,AHCTF1,1445,1698,Q8WYP5,"[['Q8WYP5'], ['Q8WYP5']]",nan / nan,Q8WYP5 / Q8WYP5,"PMID: 11952839, Soto / transcriptionalactivity...",TF,IRANDNKSMADVLGDGGNSSLTISEGPIVSERRLNQEVALNLKEDH...,
3,AHR,118,126,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,LLQALNGFV,
4,AHR,266,268,P35869,[['P35869']],,P35869,"transcriptionalactivity_regions.txt, GSL",TF,FAI,
...,...,...,...,...,...,...,...,...,...,...,...
739,ZSCAN20,262,341,P17040,"[['P17040', 'P17040-3']]",ENST00000361328,P17040,DelRosso et al.,TF,PSNTSEKEQGPEFWGLSLINSGKRSTADYSLDNEPAQALTWRDSRA...,
740,ZXDA,572,699,P98168,"[['P98168'], ['P98168']]",nan / nan,P98168 / P98168 / P98168,"PMID: 17493635, Soto / R4TA_regions.txt / acti...",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
741,ZXDB,576,703,P98169,[['P98169']],,P98169 / P98169,"R4TA_regions.txt / activation_regions.txt, GSL",TF,QDLLAQLEAANSLTPSSELTSQRQNDLSDAEIVSLFSDVPDSTSAA...,
742,ZXDC,579,688,Q2QGD7,"[['Q2QGD7', 'Q2QGD7-2'], ['Q2QGD7', 'Q2QGD7-2']]",nan / nan,Q2QGD7 / Q2QGD7 / Q2QGD7,"PMID: 16600381, Soto / R4TA_regions.txt / acti...",TF,DSPLVLGTAATVLQQGSFSVDDVQTVSAGALGCLVALPMKNLSDDP...,


In [63]:
merged = merged.reset_index(drop = True)
merged

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq
0,ABRAXAS1,121,200,Q6UWZ7,[[Q6UWZ7]],ENST00000321945,Q6UWZ7,"DelRosso et al. (Q6UWZ7, 121_200)",CR,LQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPL...
1,AHR,118,126,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,LLQALNGFV
2,AHR,266,268,P35869,[[P35869]],,P35869,"transcriptionalactivity_regions.txt, GSL (P358...",TF,FAI
3,AHR,532,848,P35869,"[[P35869], [P35869], [P35869]]",ENST00000242057 / nan / ENST00000242057,P35869 / P35869 / P35869,"DelRosso et al. (P35869, 532_641) // PMID: 879...",TF / TF / TF,QDSKNSDLYSIMKNLGIDFEDIRHMQNEKFFRNDFSGEVDFRDIDL...
4,AKAP8,2,81,O43823,[[O43823]],ENST00000269701,O43823,"DelRosso et al. (O43823, 2_81)",TF,DQGYGGYGAWSAGPANTQGAYGTGVASWQGYENYNYYGAQNTSVTT...
...,...,...,...,...,...,...,...,...,...,...
756,ATXN7L3,1,120,Q14CW9,"[[Q14CW9, Q14CW9-2]]",,,Alerasool et al,TF,MKMEEMSLSGLDNSKLEAIAQEIYADLVEDSCLGFCFEVHRAVKCG...
757,FAM22F,361,500,A1L443,[[A1L443]],,,Alerasool et al,TF,PRPQRPAETNAHLPPPRPQRPAETKVPEEIPPEVVQEYVDIMEELL...
758,SS18L2,18,77,Q9UHA2,[[Q9UHA2]],,,Alerasool et al,TF,QETIQRLLEENDQLIRCIVEYQNKGRGNECVQYQHVLHRNLIYLAT...
759,SERTAD2,201,260,Q14140,[[Q14140]],,,Alerasool et al,TF,SEAGTQKLDGPQESRADDSKLMDSLPGNFEITTSTGFLTDLTLDDI...


In [64]:
old_known_ADs[old_known_ADs["Gene"] == "ERG"]

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
125,ERG,433,479,P11308,[['P11308-3']],,P11308,"PMID: 9681824, Soto",TF,PHPPALPVTSSSFFAAPNPYWNSPTGGIYPNTRLPTSHMPSHLGTYY,
126,ERG,118,261,P11308,"[['P11308-1', 'P11308-3', 'P11308-5', 'P11308-...",nan / ENST00000288319,P11308 / P11308,"PMID: 14603248, Soto / DelRosso et al.",TF,MTTNERRVIVPADPTLWSTDHVRQWLEWAVKEYGLPDVNILLFQNI...,


In [66]:
merged[merged["Gene"] == "ERG"].iloc[0]

Gene                                                                     ERG
Start                                                                    118
End                                                                      261
uniprotID                                                             P11308
Matching Isoforms          [[P11308-1, P11308-3, P11308-5, P11308-6], [P1...
Canonical Transcript ID                                nan / ENST00000288319
orig_uniprotID                                               P11308 / P11308
Reference                  PMID: 14603248, Soto (P11308, 125_209) // DelR...
TileType                                                             TF / TF
ProteinRegionSeq           MTTNERRVIVPADPTLWSTDHVRQWLEWAVKEYGLPDVNILLFQNI...
Name: 60, dtype: object

In [87]:
# Iterate through new merged df
# OLD: Reference does not have Alerasool 
# COMPLETELY  NEW: Reference is only Alerasool
# If reference contains alerasool and others
    # WITHIN OLD: There is an entry with identical uniprotID and start/end
    # EXPANDED: There is an entry with expanded coordinates

old_genes = []
new_genes = []
within_old = []
expanded = []

for i in merged.index:
    row = merged.iloc[i]
    if row["Reference"] == "Alerasool et al":
        new_genes.append(row["Gene"])
    elif "Alerasool et al" in row["Reference"]:
        old_entries = old_known_ADs[(old_known_ADs["Gene"] == row["Gene"]) & (old_known_ADs["Start"] == row["Start"])& (old_known_ADs["End"] == row["End"])]
        if len(old_entries) > 0:
            within_old.append(row["Gene"])
        else:
            expanded.append(row["Gene"])
    else:
        old_genes.append(row["Gene"])

print(str(len(old_genes)) + " entries same as before")
print(str(len(within_old)) + " entries from " + str(len(set(within_old))) + " unique genes contain alerasool ADs and are the same as before")
print(str(len(new_genes)) + " new alerasool ADs from " + str(len(set(new_genes))) + " unique genes")
print(str(len(expanded)) + " expanded entries after adding alerasool ADs from " + str(len(set(expanded))) + " unique genes")


723 entries same as before
11 entries from 11 unique genes contain alerasool ADs and are the same as before
17 new alerasool ADs from 17 unique genes
10 expanded entries after adding alerasool ADs from 10 unique genes


In [71]:
len(old_genes)

723

In [72]:
len(new_genes)

17

In [92]:
for g in new_genes:
    print(g)

HOXA2
NEUROD1
MYCBP
C11orf74
ATOH1
RBPJ
LIN9
SERTAD1
YAF2
C3orf62
FAM90A1
SPDYE4
ATXN7L3
FAM22F
SS18L2
SERTAD2
BTBD18


In [74]:
len(within_old)

11

In [75]:
len(expanded)

10

In [91]:
for g in expanded:
    print(g)

ATF6B
BRD8
JAZF1
MYCL / MYCL1
NEUROG3
KLF7
p53 / TP53 / TP53
ZXDC
CITE1/MSG1 / CITED1
CITED2


In [76]:
723 + 17 + 11 + 10 # ADDS UP!

761

In [89]:
len(alerasool)

38