# Generate ELM-Manual database

This notebook contains the code to generate the ELM-Manual database by combining the degron instances from three sources: ELM (The Eukaryotic Linear Motif resource for Functional Sites in Proteins), Mészáros et al. (2017) and Martínez-Jiménez et al. (2019).

## Import libraries

In [2]:
# to reload automatically the changes in the scripts.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import os
import sys
from Bio import SeqIO
import functools
import operator

## my modules ##
sys.path.append("../scripts/Utils/")    # modules folder
from fasta_utils import readFasta_gzip
from sequence_utils import *

## Define paths

In [4]:
# Project root (relative to this notebook) and the data folder inside it
base = "../"

data = "data/"


def _data_file(relative_path):
    """Return the path of a file located under the project's data folder."""
    return os.path.join(base, data, relative_path)


# Input tables, one per source of degron instances
elm_path = _data_file("external/elm_instances.tsv")
meszaros_path = _data_file("external/degrons_in_cancer.tsv")
mjimenez_path = _data_file("external/Martinez_degron_instances.tsv")

# Gzip-compressed FASTA with the reference proteome
proteome_path = _data_file("external/uniprot/uniprot_proteome_UP000005640.fasta.gz")

# Output table written at the end of the notebook
elm_manual_path = _data_file("elm_manual/elm_manual_instances.tsv")

## Load data

Load each dataset and keep columns: substrate, DEG_ligase_motif and start-end positions.


### ELM

- Download from http://elm.eu.org/downloads.html -> tsv table of instances selecting Homo Sapiens.
    - The column ELMIdentifier contains the identifier of the E3ligase and motif of the ligase (E3ligase is the second word after the first "_")
    - The column Primary_Acc contains the ID of the substrate
- Select from that table the entries whose ELMIdentifier starts with "DEG", which are the ones that correspond to degrons



In [12]:
# Load df (the first five lines of the file are skipped; presumably the
# comment preamble of the ELM download - TODO confirm against the file)
elm_complete = pd.read_csv(elm_path, sep = '\t', skiprows = [0, 1, 2, 3, 4],
                           usecols = ['ELMIdentifier','Primary_Acc','Start','End'])

# Filter degrons: keep the classes whose identifier starts with "DEG"
df_elm = elm_complete[elm_complete['ELMIdentifier'].str.startswith('DEG', na = False)].copy()

# Generalize column names
df_elm = df_elm[['ELMIdentifier','Primary_Acc','Start','End']]
df_elm.columns = ['Degron','Substrate','Start','End']

# Annotate source
df_elm['Database'] = 'ELM'

# Handle isoforms
## Substrates annotated on an isoform carry a "-<n>" suffix. Only the canonical
## isoform ("-1") is kept, without its suffix; any other isoform is dropped since
## its positions may differ from the canonical sequence.
## NOTE: the suffix is text, so it has to be compared with the string '1'.
## Comparing it with the integer 1 is always unequal and would discard the
## canonical "-1" entries together with the other isoforms.
isoform_suffix = df_elm['Substrate'].str.split('-').str[1]   # NaN when there is no '-'
keep_row = isoform_suffix.isna() | (isoform_suffix == '1')
df_elm = df_elm[keep_row].copy()
df_elm['Substrate'] = df_elm['Substrate'].str.split('-').str[0]

# Manually correct this accession, as the ID is different
df_elm['Substrate'] = df_elm['Substrate'].replace('A0A0B4J1T2', 'P10275')

# Drop duplicates once the identifiers are normalized, so that rows that only
# became identical after the corrections above are collapsed as well
df_elm = df_elm.drop_duplicates(ignore_index = True)

### "Degrons in cancer" review - Mészáros et al. (2017)

- Download from http://dosztanyi.web.elte.hu/CANCER/DEGRON/TP.html 
    - The column Degron contains the identifier of the E3ligase and motif of the ligase (E3ligase is the second word after the first "_")
    - The column Acc contains the ID of the substrate



In [13]:
# Load df
df_degrons_cancer = pd.read_csv(meszaros_path, sep = '\t', usecols = ['Degron','Acc','Start','End'])

# Generalize column names
df_degrons_cancer = df_degrons_cancer[['Degron','Acc','Start','End']]
df_degrons_cancer.columns = ['Degron','Substrate','Start','End']

# Annotate source
df_degrons_cancer['Database'] = 'Degrons_cancer'

# Formatting: eliminate the stray whitespace around both degrons and substrates
# (strip() leaves identifiers with inner spaces intact and tolerates empty strings)
for text_column in ['Degron', 'Substrate']:
    df_degrons_cancer[text_column] = df_degrons_cancer[text_column].str.strip()

# Handle isoforms
## Substrates annotated on an isoform carry a "-<n>" suffix. Only the canonical
## isoform ("-1") is kept, without its suffix; any other isoform is dropped since
## its positions may differ from the canonical sequence.
## NOTE: the suffix is text, so it has to be compared with the string '1'.
## Comparing it with the integer 1 is always unequal and would discard the
## canonical "-1" entries together with the other isoforms.
isoform_suffix = df_degrons_cancer['Substrate'].str.split('-').str[1]   # NaN when there is no '-'
keep_row = isoform_suffix.isna() | (isoform_suffix == '1')
df_degrons_cancer = df_degrons_cancer[keep_row].copy()
df_degrons_cancer['Substrate'] = df_degrons_cancer['Substrate'].str.split('-').str[0]

# Manually correct this accession, as the ID is different
df_degrons_cancer['Substrate'] = df_degrons_cancer['Substrate'].replace('A0A0B4J1T2', 'P10275')

# Drop duplicates once the text is cleaned and the identifiers are normalized,
# so that rows that only differed by whitespace or isoform suffix are collapsed
df_degrons_cancer = df_degrons_cancer.drop_duplicates(ignore_index = True)


### Martínez-Jiménez et al. (2019)

- TableS1-DegAnnInst from the paper
    - The column DEGRON contains the identifier of the E3ligase and motif of the ligase (E3ligase is the second word after the first "_")
    - The column Entry contains the ID of the substrate 

In [14]:
# Load df
df_martinez = pd.read_csv(mjimenez_path, sep = '\t', usecols = ['DEGRON','Entry','START','END'])

# Generalize column names
df_martinez = df_martinez[['DEGRON','Entry','START','END']]
df_martinez.columns = ['Degron','Substrate','Start','End']

# Annotate source
df_martinez['Database'] = 'Manual'

# Handle isoforms
## Substrates annotated on an isoform carry a "-<n>" suffix. Only the canonical
## isoform ("-1") is kept, without its suffix; any other isoform is dropped since
## its positions may differ from the canonical sequence.
## NOTE: the suffix is text, so it has to be compared with the string '1'.
## Comparing it with the integer 1 is always unequal and would discard the
## canonical "-1" entries together with the other isoforms.
isoform_suffix = df_martinez['Substrate'].str.split('-').str[1]   # NaN when there is no '-'
keep_row = isoform_suffix.isna() | (isoform_suffix == '1')
df_martinez = df_martinez[keep_row].copy()
df_martinez['Substrate'] = df_martinez['Substrate'].str.split('-').str[0]

# Manually correct this accession, as the ID is different
df_martinez['Substrate'] = df_martinez['Substrate'].replace('A0A0B4J1T2', 'P10275')

# Drop duplicates once the identifiers are normalized, so that rows that only
# became identical after the corrections above are collapsed as well
df_martinez = df_martinez.drop_duplicates(ignore_index = True)

In [15]:
# Report how many instances each source contributes right after loading
loaded_sets = [
    ('Instances from ELM: ', df_elm),
    ('Instances from Degrons cancer: ', df_degrons_cancer),
    ('Instances from Martinez: ', df_martinez),
]
print(''.join(label + str(len(frame)) + '\n' for label, frame in loaded_sets))

Instances from ELM: 96
Instances from Degrons cancer: 191
Instances from Martinez: 146



## Manual curation to check overlaps between the databases

In [16]:
# Manual curation of Martinez's instances against "Degrons in cancer":
# collect the Martinez rows that overlap an instance reported for the same substrate

index_to_delete_martinez = []

# Go through the Martinez set motif by motif
for degron in set(df_martinez['Degron']):
    motif_rows = df_martinez[df_martinez['Degron'] == degron]

    # ...and, within each motif, substrate by substrate
    for subs in set(motif_rows['Substrate']):
        martinez_rows = motif_rows[motif_rows['Substrate'] == subs]

        # Instances that "Degrons in cancer" reports for the same substrate
        cancer_rows = df_degrons_cancer[df_degrons_cancer['Substrate'] == subs]

        # Nothing to compare when the substrate is missing from the other dataset
        if len(cancer_rows) == 0:
            continue

        # check_overlap (project helper, presumably from sequence_utils) is applied
        # row by row and its result is used as a boolean mask over the Martinez rows
        overlaps = martinez_rows.apply(lambda entry: check_overlap(entry, cancer_rows), axis = 1)
        overlapping_rows = martinez_rows[overlaps]
        index_to_delete_martinez.extend(overlapping_rows.index.tolist())

        # Print the overlapping instances whose degron name is not used by the other dataset
        cancer_degrons = list(cancer_rows['Degron'])
        if any(name not in cancer_degrons for name in overlapping_rows['Degron']):
            print(overlapping_rows)
            print(cancer_rows)
            print('\n')

len(index_to_delete_martinez)

              Degron Substrate  Start  End Database
108  DEG_APCC_DBOX_1    Q96GD4    315  323   Manual
              Degron Substrate  Start  End        Database
6  DEG_APCC_KENBOX_2    Q96GD4      3    7  Degrons_cancer
7              Other    Q96GD4     26   29  Degrons_cancer
8              D-box    Q96GD4    315  321  Degrons_cancer


            Degron Substrate  Start  End Database
8  LIG_APCC_ABBA_1    O43683    527  532   Manual
               Degron Substrate  Start  End        Database
19  DEG_APCC_KENBOX_2    O43683    534  538  Degrons_cancer
20  DEG_APCC_KENBOX_2    O43683    624  628  Degrons_cancer
21               ABBA    O43683    527  532  Degrons_cancer


             Degron Substrate  Start  End Database
11  LIG_APCC_ABBA_1    O60566    272  277   Manual
12  LIG_APCC_ABBA_1    O60566    340  345   Manual
13  LIG_APCC_ABBA_1    O60566    528  533   Manual
               Degron Substrate  Start  End        Database
13  DEG_APCC_KENBOX_2    O60566     25   29  Degrons

106

In the cases where the degron names are not identical, the two entries are actually the same degron annotated under a different name.

In [17]:
# Remove from Martinez the instances already present in "Degrons in cancer"
# and renumber the remaining rows from 0
df_martinez = df_martinez.drop(index = index_to_delete_martinez).reset_index(drop = True)

In [18]:
# Manual curation of Martinez's instances against ELM:
# collect the Martinez rows that overlap an ELM instance on the same substrate

index_to_delete_martinez2 = []

# Go through the Martinez set motif by motif
for degron in set(df_martinez['Degron']):
    motif_rows = df_martinez[df_martinez['Degron'] == degron]

    # ...and, within each motif, substrate by substrate
    for subs in set(motif_rows['Substrate']):
        martinez_rows = motif_rows[motif_rows['Substrate'] == subs]

        # Instances that ELM reports for the same substrate
        elm_rows = df_elm[df_elm['Substrate'] == subs]

        # Nothing to compare when the substrate is missing from ELM
        if len(elm_rows) == 0:
            continue

        # check_overlap (project helper, presumably from sequence_utils) is applied
        # row by row and its result is used as a boolean mask over the Martinez rows
        overlaps = martinez_rows.apply(lambda entry: check_overlap(entry, elm_rows), axis = 1)
        overlapping_rows = martinez_rows[overlaps]
        index_to_delete_martinez2.extend(overlapping_rows.index.tolist())

        # Print the overlapping instances whose degron name is not used by ELM
        elm_degrons = list(elm_rows['Degron'])
        if any(name not in elm_degrons for name in overlapping_rows['Degron']):
            print(overlapping_rows)
            print(elm_rows)
            print('\n')

len(index_to_delete_martinez2)

17

In [19]:
# Remove from Martinez the instances already present in ELM
# and renumber the remaining rows from 0
df_martinez = df_martinez.drop(index = index_to_delete_martinez2).reset_index(drop = True)

In [20]:
# Manual curation of the "Degrons in cancer" instances against ELM:
# collect the rows that overlap an ELM instance on the same substrate

index_to_delete_degrons_cancer = []

# Go through the "Degrons in cancer" set motif by motif
for degron in set(df_degrons_cancer['Degron']):
    motif_rows = df_degrons_cancer[df_degrons_cancer['Degron'] == degron]

    # ...and, within each motif, substrate by substrate
    for subs in set(motif_rows['Substrate']):
        cancer_rows = motif_rows[motif_rows['Substrate'] == subs]

        # Instances that ELM reports for the same substrate
        elm_rows = df_elm[df_elm['Substrate'] == subs]

        # Nothing to compare when the substrate is missing from ELM
        if len(elm_rows) == 0:
            continue

        # check_overlap (project helper, presumably from sequence_utils) is applied
        # row by row and its result is used as a boolean mask over the current rows
        overlaps = cancer_rows.apply(lambda entry: check_overlap(entry, elm_rows), axis = 1)
        overlapping_rows = cancer_rows[overlaps]
        index_to_delete_degrons_cancer.extend(overlapping_rows.index.tolist())

        # Print the overlapping instances whose degron name is not used by ELM
        elm_degrons = list(elm_rows['Degron'])
        if any(name not in elm_degrons for name in overlapping_rows['Degron']):
            print(overlapping_rows)
            print(elm_rows)
            print('\n')

len(index_to_delete_degrons_cancer)

       Degron Substrate  Start  End        Database
30  SCF_FBXL2    P30281    279  292  Degrons_cancer
             Degron Substrate  Start  End Database
0  DEG_SCF_FBXO31_1    P30281    286  292      ELM


  Degron Substrate  Start  End        Database
8  D-box    Q96GD4    315  321  Degrons_cancer
               Degron Substrate  Start  End Database
51  DEG_APCC_KENBOX_2    Q96GD4      3    7      ELM
95    DEG_APCC_DBOX_1    Q96GD4    314  322      ELM


       Degron Substrate  Start  End        Database
50  CRL4_Cdt2    P49918    270  282  Degrons_cancer
                 Degron Substrate  Start  End Database
41  DEG_SCF_SKP2-CKS1_1    P49918    306  313      ELM
50      DEG_CRL4_CDT2_1    P49918    270  282      ELM




78

As in the previous comparison, the entries whose degron names differ are the same degron annotated under a different name.

In [21]:
# Remove from "Degrons in cancer" the instances already present in ELM
# and renumber the remaining rows from 0
df_degrons_cancer = df_degrons_cancer.drop(index = index_to_delete_degrons_cancer).reset_index(drop = True)


### Remove "Unknown degron" and others

In [22]:
# Visualization of two "Degrons in cancer" instances that sit next to an ELM
# instance on the same substrate, followed by their removal

## UNKNOWN very likely is DEG_SCF_FBXO31_1 of ELM
unknown_p30281 = (
    (df_degrons_cancer['Degron'] == 'UNKNOWN')
    & (df_degrons_cancer['Substrate'] == 'P30281')
    & (df_degrons_cancer['Start'] == 278)
    & (df_degrons_cancer['End'] == 290)
)
fbxo31_p30281 = (
    (df_elm['Degron'] == 'DEG_SCF_FBXO31_1')
    & (df_elm['Substrate'] == 'P30281')
    & (df_elm['Start'] == 286)
    & (df_elm['End'] == 292)
)
print(df_degrons_cancer[unknown_p30281])
print(df_elm[fbxo31_p30281])
print('\n')

## DEG_SCF_TRCP1_1 of ELM very likely SCF_beta-TrCP2 of Degrons cancer
trcp1_p16471 = (
    (df_elm['Degron'] == 'DEG_SCF_TRCP1_1')
    & (df_elm['Substrate'] == 'P16471')
)
trcp2_p16471 = (
    (df_degrons_cancer['Degron'] == 'SCF_beta-TrCP2')
    & (df_degrons_cancer['Substrate'] == 'P16471')
    & (df_degrons_cancer['Start'] == 347)
    & (df_degrons_cancer['End'] == 351)
)
print(df_elm[trcp1_p16471])
print(df_degrons_cancer[trcp2_p16471])
print('\n')

# Removal of these two instances from "Degrons in cancer", then renumber the rows
df_degrons_cancer = df_degrons_cancer[~(unknown_p30281 | trcp2_p16471)]
df_degrons_cancer = df_degrons_cancer.reset_index(drop = True)


     Degron Substrate  Start  End        Database
18  UNKNOWN    P30281    278  290  Degrons_cancer
             Degron Substrate  Start  End Database
0  DEG_SCF_FBXO31_1    P30281    286  292      ELM


             Degron Substrate  Start  End Database
58  DEG_SCF_TRCP1_1    P16471    348  353      ELM
            Degron Substrate  Start  End        Database
87  SCF_beta-TrCP2    P16471    347  351  Degrons_cancer




In [23]:
# Report how many instances remain in each source after the manual curation
curated_sets = [
    ('Instances from ELM: ', df_elm),
    ('Instances from Degrons cancer: ', df_degrons_cancer),
    ('Instances from Martinez: ', df_martinez),
]
report = 'After removing manually the duplicated sequences we have: \n'
report += ''.join(label + str(len(frame)) + '\n' for label, frame in curated_sets)
print(report)

After removing manually the duplicated sequences we have: 
Instances from ELM: 96
Instances from Degrons cancer: 111
Instances from Martinez: 23



## Generate ELM-Manual database (concatenate dfs)

Join the 3 dataframes into a single dataframe, keeping track of the origin of each instance.

In [24]:
# An instance is identified by its degron, substrate and position
instance_key = ['Degron','Substrate','Start','End']

# Stack ELM and Degrons cancer; duplicates are dropped as a safety measure
# (the first occurrence, i.e. the ELM one, is the row that is kept)
new_df = pd.concat([df_elm, df_degrons_cancer], axis = 0, ignore_index = True)
new_df = new_df.drop_duplicates(subset = instance_key, ignore_index = True)

# Add the Martinez instances on top, again dropping duplicates
new_df2 = pd.concat([new_df, df_martinez], axis = 0, ignore_index = True)
new_df2 = new_df2.drop_duplicates(subset = instance_key, ignore_index = True)

# Summary of the sizes of the individual and combined sets
summary_lines = [
    'Instances from ELM: ' + str(len(df_elm)),
    'Instances from Degrons cancer: ' + str(len(df_degrons_cancer)),
    'Instances from Martinez: ' + str(len(df_martinez)),
    'Instances unique from the ELM and Degrons cancer: ' + str(len(new_df)),
    'Instances unique from the 3 sets: ' + str(len(new_df2)),
]
print('\n'.join(summary_lines) + '\n')

Instances from ELM: 96
Instances from Degrons cancer: 111
Instances from Martinez: 23
Instances unique from the ELM and Degrons cancer: 207
Instances unique from the 3 sets: 230



### Add amino acid sequence per substrate

In [25]:
# Load proteome as dict
# readFasta_gzip is the project helper imported from fasta_utils; the cells below
# look substrates up with `in proteome` and slice proteome[substrate] by position,
# so it presumably maps accession -> amino acid sequence (TODO confirm in fasta_utils)
proteome = readFasta_gzip(proteome_path)

In [26]:
# Create the column up front (object dtype, all missing). Otherwise it would only
# exist once a first substrate is found in the proteome, and with no match at all
# the following cell would fail when reading row['Sequence'].
new_df2['Sequence'] = pd.Series(float('nan'), index = new_df2.index, dtype = object)

for idx, substrate, start, end in zip(new_df2.index, new_df2['Substrate'],
                                      new_df2['Start'], new_df2['End']):
    # Substrates that are absent from the proteome keep a missing Sequence
    if substrate in proteome:
        # Start/End are treated as 1-based inclusive positions, hence the start - 1
        new_df2.loc[idx, 'Sequence'] = proteome[substrate][start - 1:end]

### Add extended amino acid sequence (20 aa)

In [28]:
# Columns filled with the three values returned by completed_sequence
# (project helper, presumably from sequence_utils) for each instance
amplified_columns = ['Sequence_amplified','Start_amplified','End_amplified']

for idx, instance in new_df2.iterrows():
    new_df2.loc[idx, amplified_columns] = completed_sequence(
        instance['Substrate'], instance['Sequence'], instance['Start'], instance['End'],
        total_len = 20, dic_fasta_seqs = proteome)

### Save the dataframe

In [29]:
# Write the combined table as a tab-separated file, without the row index
new_df2.to_csv(elm_manual_path, sep = '\t', index = False)