In [1]:
import pandas as pd
import numpy as np
import AD_comparison_tools
import AD_predictor_tools
import PlottingTools

In [2]:
uniprot_high_qual_ADs = pd.read_csv("../data/UniprotActivationDomains_HighqualitySet.csv")
uniprot_high_qual_ADs = uniprot_high_qual_ADs[["Gene ", "AD name", "Start", "End", "uniprotID", "Reference"]]
uniprot_high_qual_ADs

Unnamed: 0,Gene,AD name,Start,End,uniprotID,Reference
0,MEIS1,,335,390,O00470,R4TA_regions.txt
1,SOX12,,283,315,O15370,R4TA_regions.txt
2,SOX11,,409,441,P35716,R4TA_regions.txt
3,ZXDA,,572,699,P98168,R4TA_regions.txt
4,ZXDB,,576,703,P98169,R4TA_regions.txt
...,...,...,...,...,...,...
125,NR3C1,,98,115,P04150,transcriptionalactivity_regions.txt
126,SOX11,,409,441,P35716,transcriptionalactivity_regions.txt
127,AHR,,118,126,P35869,transcriptionalactivity_regions.txt
128,AHR,,266,268,P35869,transcriptionalactivity_regions.txt


In [3]:
human_ADs = pd.read_csv("../data/ActivationDomainsHuman.csv")
human_ADs = human_ADs[human_ADs['Start'].notna()]
human_ADs = human_ADs[human_ADs['End'].notna()]
human_ADs = human_ADs[["Gene ", "AD name", "Start", "End", "uniprotID", "Reference"]]
human_ADs

Unnamed: 0,Gene,AD name,Start,End,uniprotID,Reference
0,OTX2,Cterm,231,289.0,P32243,uniprot
1,CRX,Cterm,200,284.0,O43186,Chen 2002 pg 873
2,VP16,Full,411,490.0,P06492,
3,VP16,H1,411,456.0,P06492,
4,VP16,MVS N_C (acid blob library fragment,418,456.0,P06492,
...,...,...,...,...,...,...
83,CEBPB,,24,124.0,P17676,Choi 2000 list
84,PU.1/SPI1,,13,90.0,P17947,Choi 2000 list
85,CREB1,Qrich,151,238.0,P16220,Choi 2000 list
86,NFE2,,1,80.0,Q16621,Choi 2000 list


In [4]:
uniprot_high_qual_ADs['Start'] = uniprot_high_qual_ADs['Start'].astype(int)
uniprot_high_qual_ADs['End'] = uniprot_high_qual_ADs['End'].astype(int)

human_ADs['Start'] = human_ADs['Start'].astype(int)
human_ADs['End'] = human_ADs['End'].astype(int)

In [5]:
both_lists = pd.concat([human_ADs, uniprot_high_qual_ADs])
both_lists

Unnamed: 0,Gene,AD name,Start,End,uniprotID,Reference
0,OTX2,Cterm,231,289,P32243,uniprot
1,CRX,Cterm,200,284,O43186,Chen 2002 pg 873
2,VP16,Full,411,490,P06492,
3,VP16,H1,411,456,P06492,
4,VP16,MVS N_C (acid blob library fragment,418,456,P06492,
...,...,...,...,...,...,...
125,NR3C1,,98,115,P04150,transcriptionalactivity_regions.txt
126,SOX11,,409,441,P35716,transcriptionalactivity_regions.txt
127,AHR,,118,126,P35869,transcriptionalactivity_regions.txt
128,AHR,,266,268,P35869,transcriptionalactivity_regions.txt


In [6]:
def return_merged_row(uniprotID, df):
    # Only look at rows with the same uniprot ID
    same_uniprotID_rows = df[df["uniprotID"] == uniprotID]
    same_uniprotID_rows = same_uniprotID_rows.sort_values(by = "Start")
    
    # Final dataframe columns
    new_starts = []
    new_ends = []
    genes = []
    AD_names = []
    references = []
    
    # Current row's values
    curr_start = -1
    curr_end = -1
    curr_genes = []
    curr_AD_names = []
    curr_references = []
    
    for i in same_uniprotID_rows.index:
        # Merge current row with next row
        if curr_end >= same_uniprotID_rows.loc[i]["Start"]:
            curr_end = max(curr_end, same_uniprotID_rows.loc[i]["End"])
            curr_genes.append(same_uniprotID_rows.loc[i]["Gene "])
            curr_AD_names.append(same_uniprotID_rows.loc[i]["AD name"])
            curr_references.append(same_uniprotID_rows.loc[i]["Reference"])
        
        # Don't merge current row with next row
        else: 
            new_starts.append(curr_start)
            new_ends.append(curr_end)
            genes.append(" / ".join(set([c.strip() for c in curr_genes])))
            
            curr_AD_names = [str(c) for c in curr_AD_names]
            AD_names.append(" / ".join(curr_AD_names))
            
            curr_references = [str(c) for c in curr_references]
            references.append(" / ".join(curr_references))
            
            curr_start = same_uniprotID_rows.loc[i]["Start"]
            curr_end = same_uniprotID_rows.loc[i]["End"]
            
            curr_genes = [same_uniprotID_rows.loc[i]["Gene "]]
            curr_AD_names = [same_uniprotID_rows.loc[i]["AD name"]]
            curr_references = [same_uniprotID_rows.loc[i]["Reference"]]
    
    # Append the last values
    new_starts.append(curr_start)
    
    new_ends.append(curr_end)
    
    genes.append(" / ".join(set([c.strip() for c in curr_genes])))
    
    curr_AD_names = [str(c) for c in curr_AD_names]
    AD_names.append(" / ".join(curr_AD_names))
    
    curr_references = [str(c) for c in curr_references]
    references.append(" / ".join(curr_references))
    
    # Remove the first (because it is just -1 or "")
    new_starts = new_starts[1:]
    new_ends = new_ends[1:]
    genes = genes[1:]
    AD_names = AD_names[1:]
    references = references[1:]

    return pd.DataFrame({"Gene": genes,
                         "AD name": AD_names,
                         "Start": new_starts,
                        "End": new_ends,
                        "uniprotID": uniprotID,
                         "Reference": references
                        })

In [7]:
dfs = []
for uniprotID in both_lists["uniprotID"].unique():
    dfs.append(return_merged_row(uniprotID, both_lists))

In [8]:
new_GSL = pd.concat(dfs)
new_GSL.sort_values(by = "uniprotID")

Unnamed: 0,Gene,AD name,Start,End,uniprotID,Reference
0,MEIS1,nan / nan,335,390,O00470,R4TA_regions.txt / activation_regions.txt
0,E2F3,,391,465,O00716,activation_regions.txt
0,MEIS2,,340,477,O14770,activation_regions.txt
0,TFEC,,1,119,O14948,activation_regions.txt
1,TFEC,,271,347,O14948,activation_regions.txt
...,...,...,...,...,...,...
1,DMTF1,nan / nan,459,760,Q9Y222,R4TA_regions.txt / activation_regions.txt
0,DMTF1,nan / nan,87,170,Q9Y222,R4TA_regions.txt / activation_regions.txt
1,FOXA2/HNF3B / FOXA2,nan / nan,361,463,Q9Y261,activation_regions.txt / Choi 2000 list
0,FOXA2,,14,93,Q9Y261,activation_regions.txt


In [9]:
new_GSL[["uniprotID"]].to_csv("../data/newGSL_uniprotIDs.csv")

Inputted uniprotIDs at https://www.uniprot.org/id-mapping. Downloaded results including sequence.

In [10]:
uniprot_output = pd.read_csv("../data/uniprot-download_true_fields_accession_2Creviewed_2Cid_2Cprotein_nam-2022.09.23-18.22.08.93.tsv", sep="\t")

In [11]:
uniprot_output

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Sequence
0,P32243,P32243,reviewed,OTX2_HUMAN,Homeobox protein OTX2 (Orthodenticle homolog 2),OTX2,Homo sapiens (Human),MMSYLKQPPYAVNGLSLTTSGMDLLHPSVGYPATPRKQRRERTTFT...
1,O43186,O43186,reviewed,CRX_HUMAN,Cone-rod homeobox protein,CRX CORD2,Homo sapiens (Human),MMAYMNPGPHYSVNALALSGPSVDLMHQAVPYPSAPRKQRRERTTF...
2,P06492,P06492,reviewed,VP16_HHV11,Tegument protein VP16 (Alpha trans-inducing pr...,UL48,Human herpesvirus 1 (strain 17) (HHV-1) (Human...,MDLLVDELFADMNADGASPPPPRPAGGPKNTPAAPPLYATGRLSQA...
3,Q01860,Q01860,reviewed,PO5F1_HUMAN,"POU domain, class 5, transcription factor 1 (O...",POU5F1 OCT3 OCT4 OTF3,Homo sapiens (Human),MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...
4,P09086,P09086,reviewed,PO2F2_HUMAN,"POU domain, class 2, transcription factor 2 (L...",POU2F2 OCT2 OTF2,Homo sapiens (Human),MVHSSMGAPEIRMSKPLEAEKQGLDSPSEHTDTERNGPDTNHQNPQ...
...,...,...,...,...,...,...,...,...
130,Q9UBX2,Q9UBX2,reviewed,DUX4_HUMAN,Double homeobox protein 4 (Double homeobox pro...,DUX4 DUX10,Homo sapiens (Human),MALPTPSDSTLPAEARGRGRRRRLVWTPSQSEALRACFERNPYPGI...
131,Q9ULX6,Q9ULX6,reviewed,AKP8L_HUMAN,A-kinase anchor protein 8-like (AKAP8-like pro...,AKAP8L NAKAP NAKAP95 HRIHFB2018,Homo sapiens (Human),MSYTGFVQGSETTLQSTYSDTSAQPTCDYGYGTWNSGTNRGYEGYG...
132,O95718,O95718,reviewed,ERR2_HUMAN,Steroid hormone receptor ERR2 (ERR beta-2) (Es...,ESRRB ERRB2 ESRL2 NR3B2,Homo sapiens (Human),MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...
133,P35869,P35869,reviewed,AHR_HUMAN,Aryl hydrocarbon receptor (Ah receptor) (AhR) ...,AHR BHLHE76,Homo sapiens (Human),MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...


In [12]:
uniprot_seq = uniprot_output[["Entry", "Sequence"]]
uniprot_seq

Unnamed: 0,Entry,Sequence
0,P32243,MMSYLKQPPYAVNGLSLTTSGMDLLHPSVGYPATPRKQRRERTTFT...
1,O43186,MMAYMNPGPHYSVNALALSGPSVDLMHQAVPYPSAPRKQRRERTTF...
2,P06492,MDLLVDELFADMNADGASPPPPRPAGGPKNTPAAPPLYATGRLSQA...
3,Q01860,MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...
4,P09086,MVHSSMGAPEIRMSKPLEAEKQGLDSPSEHTDTERNGPDTNHQNPQ...
...,...,...
130,Q9UBX2,MALPTPSDSTLPAEARGRGRRRRLVWTPSQSEALRACFERNPYPGI...
131,Q9ULX6,MSYTGFVQGSETTLQSTYSDTSAQPTCDYGYGTWNSGTNRGYEGYG...
132,O95718,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...
133,P35869,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...


In [13]:
new_GSL = new_GSL.merge(uniprot_seq, left_on="uniprotID", right_on = "Entry")
new_GSL

Unnamed: 0,Gene,AD name,Start,End,uniprotID,Reference,Entry,Sequence
0,OTX2,Cterm,231,289,P32243,uniprot,P32243,MMSYLKQPPYAVNGLSLTTSGMDLLHPSVGYPATPRKQRRERTTFT...
1,CRX,Cterm,200,284,O43186,Chen 2002 pg 873,O43186,MMAYMNPGPHYSVNALALSGPSVDLMHQAVPYPSAPRKQRRERTTF...
2,VP16,Full / H1 / MVS N_C (acid blob library fragmen...,411,490,P06492,nan / nan / nan / nan,P06492,MDLLVDELFADMNADGASPPPPRPAGGPKNTPAAPPLYATGRLSQA...
3,Oct4,Nterm,1,137,Q01860,Brehm,Q01860,MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...
4,Oct4,Cterm,290,360,Q01860,Boija 2018,Q01860,MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...
...,...,...,...,...,...,...,...,...
163,AKAP8L,,1,268,Q9ULX6,activation_regions.txt,Q9ULX6,MSYTGFVQGSETTLQSTYSDTSAQPTCDYGYGTWNSGTNRGYEGYG...
164,ESRRB,,203,433,O95718,transcriptionalactivity_regions.txt,O95718,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...
165,AHR,,118,126,P35869,transcriptionalactivity_regions.txt,P35869,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
166,AHR,,266,268,P35869,transcriptionalactivity_regions.txt,P35869,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...


In [14]:
new_GSL = new_GSL.rename(columns = {"Sequence" : "full_seq"})
new_GSL

Unnamed: 0,Gene,AD name,Start,End,uniprotID,Reference,Entry,full_seq
0,OTX2,Cterm,231,289,P32243,uniprot,P32243,MMSYLKQPPYAVNGLSLTTSGMDLLHPSVGYPATPRKQRRERTTFT...
1,CRX,Cterm,200,284,O43186,Chen 2002 pg 873,O43186,MMAYMNPGPHYSVNALALSGPSVDLMHQAVPYPSAPRKQRRERTTF...
2,VP16,Full / H1 / MVS N_C (acid blob library fragmen...,411,490,P06492,nan / nan / nan / nan,P06492,MDLLVDELFADMNADGASPPPPRPAGGPKNTPAAPPLYATGRLSQA...
3,Oct4,Nterm,1,137,Q01860,Brehm,Q01860,MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...
4,Oct4,Cterm,290,360,Q01860,Boija 2018,Q01860,MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...
...,...,...,...,...,...,...,...,...
163,AKAP8L,,1,268,Q9ULX6,activation_regions.txt,Q9ULX6,MSYTGFVQGSETTLQSTYSDTSAQPTCDYGYGTWNSGTNRGYEGYG...
164,ESRRB,,203,433,O95718,transcriptionalactivity_regions.txt,O95718,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...
165,AHR,,118,126,P35869,transcriptionalactivity_regions.txt,P35869,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...
166,AHR,,266,268,P35869,transcriptionalactivity_regions.txt,P35869,MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTE...


In [15]:
new_GSL["Sequence"] = [seq[start - 1: end] for seq, start, end in zip(new_GSL["full_seq"],
                                                                     new_GSL["Start"],
                                                                     new_GSL["End"])]

In [16]:
new_GSL["Length"] = new_GSL["End"] - new_GSL["Start"] + 1

In [17]:
new_GSL = new_GSL.drop(columns = ["full_seq", "Entry"])
new_GSL = new_GSL.rename(columns = {"Gene" : "GeneName"})
new_GSL

Unnamed: 0,GeneName,AD name,Start,End,uniprotID,Reference,Sequence,Length
0,OTX2,Cterm,231,289,P32243,uniprot,LNQSPASLSTQGYGASSLGFNSTTDCLDYKDQTASWKLNFNADCLD...,59
1,CRX,Cterm,200,284,O43186,Chen 2002 pg 873,AFCSSPSAYGSPSSYFSGLDPYLSPMVPQLGGPALSPLSGPSVGPS...,85
2,VP16,Full / H1 / MVS N_C (acid blob library fragmen...,411,490,P06492,nan / nan / nan / nan,STAPPTDVSLGDELHLDGEDVAMAHADALDDFDLDMLGDGDSPGPG...,80
3,Oct4,Nterm,1,137,Q01860,Brehm,MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...,137
4,Oct4,Cterm,290,360,Q01860,Boija 2018,SDYAQREDFEAAGSPFSGGPVSFPLAPGPHFGTPGYGSPHFTALYS...,71
...,...,...,...,...,...,...,...,...
163,AKAP8L,,1,268,Q9ULX6,activation_regions.txt,MSYTGFVQGSETTLQSTYSDTSAQPTCDYGYGTWNSGTNRGYEGYG...,268
164,ESRRB,,203,433,O95718,transcriptionalactivity_regions.txt,PPAKKPLTKIVSYLLVAEPDKLYAMPPPGMPEGDIKALTTLCDLAD...,231
165,AHR,,118,126,P35869,transcriptionalactivity_regions.txt,LLQALNGFV,9
166,AHR,,266,268,P35869,transcriptionalactivity_regions.txt,FAI,3


In [18]:
new_GSL['Sequence'].replace(new_GSL[new_GSL["uniprotID"] == "P04637"]["Sequence"][13], np.nan, inplace=True)
new_GSL = new_GSL.dropna()

In [19]:
new_GSL.to_csv("../data/newGSL.csv")

---
Properties of the GSL

In [20]:
len(new_GSL.index)

167

In [21]:
def add_AA_count(aa, GSL):
    GSL[aa] = GSL["Sequence"].str.count(aa)
    
def return_AA_prop(aa, GSL):
    add_AA_count(aa, GSL)
    GSL[aa+"_prop"] = GSL[aa] / (GSL["End"] - GSL["Start"] + 1)
    print(str(sum(GSL[aa+"_prop"] > 0.15)) + " " + aa + " rich ADs")
    print(str(np.round(100 * sum(GSL[aa+"_prop"] > 0.15) / len(GSL.index), 3)) + "% "+ aa + " rich ADs") 

In [22]:
return_AA_prop("Q", new_GSL)

12 Q rich ADs
7.186% Q rich ADs


In [23]:
return_AA_prop("P", new_GSL)

30 P rich ADs
17.964% P rich ADs


In [24]:
return_AA_prop("S", new_GSL)

37 S rich ADs
22.156% S rich ADs


In [25]:
return_AA_prop("A", new_GSL)

7 A rich ADs
4.192% A rich ADs


In [26]:
add_AA_count("D", new_GSL)
add_AA_count("R", new_GSL)
add_AA_count("K", new_GSL)
add_AA_count("E", new_GSL)

In [27]:
PlottingTools.add_charge(new_GSL)

In [28]:
new_GSL[new_GSL["Charge"] < -3]

Unnamed: 0,GeneName,AD name,Start,End,uniprotID,Reference,Sequence,Length,Q,Q_prop,...,P_prop,S,S_prop,A,A_prop,D,R,K,E,Charge
2,VP16,Full / H1 / MVS N_C (acid blob library fragmen...,411,490,P06492,nan / nan / nan / nan,STAPPTDVSLGDELHLDGEDVAMAHADALDDFDLDMLGDGDSPGPG...,80,1,0.012500,...,0.075000,4,0.050000,9,0.112500,16,0,0,5,-21
3,Oct4,Nterm,1,137,Q01860,Brehm,MAGHLASDFAFSPPPGGGGDGPGGPEPGWVDPRTWLSFQGPPGGPG...,137,6,0.043796,...,0.175182,9,0.065693,7,0.051095,4,1,3,13,-13
4,Oct4,Cterm,290,360,Q01860,Boija 2018,SDYAQREDFEAAGSPFSGGPVSFPLAPGPHFGTPGYGSPHFTALYS...,71,1,0.014085,...,0.169014,10,0.140845,6,0.084507,2,1,0,4,-5
7,STAT1,TAD,710,750,P42224,"Wojciak, J., Martinez-Yamout, M., Dyson, H., W...",SEVHPSRLQTTDNLLPMSPEEFDEVSRIVGSVEFDSMMNTV,41,1,0.024390,...,0.073171,6,0.146341,0,0.000000,3,2,0,5,-6
8,STAT3,TAD,719,764,P40763,"Wojciak, J., Martinez-Yamout, M., Dyson, H., W...",SNTIDLPMSPRTLDSLMQFGNNGEGAEPSAGGQFESLTFDMELTSE,46,2,0.043478,...,0.065217,6,0.130435,2,0.043478,3,1,0,5,-7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,GRHL1,,1,91,Q9NZI5,activation_regions.txt,MTQEYDNKRPVLVLQNEALYPQRRSYTSEDEAWKSFLENPLTAATK...,91,3,0.032967,...,0.076923,8,0.087912,9,0.098901,6,6,5,10,-5
162,DUX4,nan / nan,327,424,Q9UBX2,activation_regions.txt / activation_regions.txt,AGAAPPPQPAPPDASASARQGQMQGIPAPSQALQEPAPWSALPCGL...,98,9,0.091837,...,0.153061,8,0.081633,19,0.193878,2,2,0,15,-15
163,AKAP8L,,1,268,Q9ULX6,activation_regions.txt,MSYTGFVQGSETTLQSTYSDTSAQPTCDYGYGTWNSGTNRGYEGYG...,268,13,0.048507,...,0.037313,30,0.111940,16,0.059701,17,21,2,13,-7
164,ESRRB,,203,433,O95718,transcriptionalactivity_regions.txt,PPAKKPLTKIVSYLLVAEPDKLYAMPPPGMPEGDIKALTTLCDLAD...,231,10,0.043290,...,0.056277,12,0.051948,19,0.082251,13,9,19,19,-4


In [29]:
len(new_GSL[new_GSL["Charge"] < -3].index) / len(new_GSL.index)

0.6287425149700598