# Analysis of the overlap of discovered degrons

This notebook contains the code to merge the overlapping degrons discovered for downstream functional validation.

Some degrons motifs are very degenerated and, thus, degron boundaries can be difficult to define. Steps:
1. Study the amount of degrons having this condition.
2. Pool degrons that overlap (even in a single position).

## Import libraries

In [3]:
import pandas as pd
import os
import sys

## my modules ##
sys.path.append("../scripts/Utils/")    # modules folder
from fasta_utils import readFasta_gzip

## Define variables and paths

In [4]:
base = "../"

results = "results/"

all_discov_degrons_path = os.path.join(base, results, "discovered_degrons/proteome_20k/")
all_discov_degrons_overlapool_path = os.path.join(base, results, "discovered_degrons/proteome_20k_overlapping_pooled/")

proteome_path = os.path.join(base, data, "external/biomart/biomart92_proteome_cantranscripts_uniq_prepro.fasta.gz")

In [5]:
# Load proteome 

proteome = readFasta_gzip(proteome_path)

Number of retrieved sequences: 19302



## 1. Find overlapping degrons in the raw tables of discovered degrons per motif

These are the tables directly generated from the scan files.

In [4]:
motifs = os.listdir(all_discov_degrons_path)

lengths = []  # to monitor max degron length obtain after pooling

# One by one motif analysis
for motif in motifs:
    
    print()
    print(f'Motif: {motif}')
    print()
    
    discov_degrons = pd.read_csv(all_discov_degrons_path+motif, sep = "\t", compression = "gzip")
    
    # Obtain proteins having the same degron discovered more than once
    discov_degrons_c = discov_degrons.groupby(["protein_id"]).size(
    ).reset_index().rename(columns = {0:'count'})
    discov_degrons_c_more1_id = discov_degrons_c.loc[discov_degrons_c["count"] > 1, "protein_id"].values
    
    # For each protein, obtain all the coordinates of the found degrons
    for id in discov_degrons_c_more1_id:
        
        degron_starts = discov_degrons.loc[discov_degrons.protein_id == id, "start"].values
        degron_ends = discov_degrons.loc[discov_degrons.protein_id == id, "end"].values
            
        first = True   # flag variable
        
        # List for new pooled intervals
        degrons_pooled_intervals = []
        
        for ds, de in zip(degron_starts, degron_ends):  # can be done this way because degrons are linearly ordered
                                                        # upon discovery
            # First degron interval in the protein
            if first:
                first = False
                degron_interval = (ds, de)
            
            # Pool degrons: overlap
            elif (degron_interval[0] < ds and degron_interval[1] > ds) or \
                (degron_interval[0] < de and degron_interval[1] > de):
                    degron_interval = (min([degron_interval[0], ds]), 
                                       max([degron_interval[1], de]))
            
            # New interval starting            
            else:
                degrons_pooled_intervals.append(degron_interval)
                degron_interval = (ds, de)
        
        # Final list of degron intervals for the specific protein
        degrons_pooled_intervals.append(degron_interval)
        
        # Per motif: print those proteins for which we have pooled overlapping degrons
        if len(degron_starts) > len(degrons_pooled_intervals):
            print(f'\tDegron discovered in protein {id}')
            print(f'\tInitial number of discovered degrons: {len(degron_starts)}')
            for ds, de in zip(degron_starts, degron_ends):
                print(f'\t\tLength of{ds, de}: {de-ds}')
            print(f'\tFinal number of discovered degrons: {len(degrons_pooled_intervals)}')
            print(f'\t\t{degrons_pooled_intervals}')
            for inter in degrons_pooled_intervals:
                print(f'\t\tLength of {inter}: {inter[1]-inter[0]}')
                lengths.append(inter[1]-inter[0])
            print()

# Print the maximum degron length we obtain after overlapping
print(f'Maximum degron length obtained after pooling all overlapping degrons: {max(lengths)}')
                
        
    
    
    


Motif: APC_ABBA.tsv.gz


Motif: APC_KENBOX.tsv.gz

	Degron discovered in protein ENST00000319653
	Initial number of discovered degrons: 2
		Length of(1602, 1611): 9
		Length of(1609, 1618): 9
	Final number of discovered degrons: 1
		[(1602, 1618)]
		Length of (1602, 1618): 16

	Degron discovered in protein ENST00000320005
	Initial number of discovered degrons: 2
		Length of(720, 729): 9
		Length of(727, 736): 9
	Final number of discovered degrons: 1
		[(720, 736)]
		Length of (720, 736): 16

	Degron discovered in protein ENST00000334352
	Initial number of discovered degrons: 2
		Length of(176, 185): 9
		Length of(181, 190): 9
	Final number of discovered degrons: 1
		[(176, 190)]
		Length of (176, 190): 14


Motif: BTRC.tsv.gz


Motif: CBLL1.tsv.gz


Motif: CBL_MET.tsv.gz

	Degron discovered in protein ENST00000260988
	Initial number of discovered degrons: 2
		Length of(133, 148): 15
		Length of(135, 150): 15
	Final number of discovered degrons: 1
		[(133, 150)]
		Length of (133, 150):

## 2. Pool those degrons overlapping

In [7]:
def pool_seqs(idx, start_coords, end_coords, entire_protein):
    
    first = True   # Flag

    new_start_coords = []
    new_end_coords = []
    new_seqs = []
    idxs = []
    
    for s, e in zip(start_coords, end_coords):
        if first:
            new_s = s
            new_e = e
            first = False
        
        elif (new_s < s and new_e > s) or \
        (new_s < e and new_e > e):   # the 2nd option should not happen as the scan moves forward, but just in case
            new_s = min(new_s, s)
            new_e = max(new_e, e)
        
        else:
            new_seqs.append(entire_protein[new_s-1: new_e])
            new_start_coords.append(new_s)
            new_end_coords.append(new_e)
            idxs.append(idx)
            
            new_s = s
            new_e = e
    
    new_seqs.append(entire_protein[new_s-1: new_e])
    new_start_coords.append(new_s)
    new_end_coords.append(new_e)
    idxs.append(idx)
    
    return idxs, new_seqs, new_start_coords, new_end_coords
            
            


In [10]:
motifs = os.listdir(all_discov_degrons_path)

# One by one motif analysis
for motif in motifs:
    
    print(f'Motif: {motif}')    
    discov_degrons = pd.read_csv(all_discov_degrons_path+motif, sep = "\t", compression = "gzip")
    print(f'\tInitial number of degrons: {len(discov_degrons)}')
    
    all_idxs = []
    all_new_seqs = []
    all_new_start_coords = []
    all_new_end_coords = []
    
    # Obtain proteins having the same degron discovered more than once
    discov_degrons_c = discov_degrons.groupby(["protein_id"]).size(
    ).reset_index().rename(columns = {0:'count'})
    discov_degrons_c_more1_id = discov_degrons_c.loc[discov_degrons_c["count"] > 1, "protein_id"].values
    
    # For each protein, obtain all the coordinates of the found degrons
    for idx in discov_degrons_c_more1_id:
        degron_starts = discov_degrons.loc[discov_degrons.protein_id == idx, "start"].values
        degron_ends = discov_degrons.loc[discov_degrons.protein_id == idx, "end"].values
        entire_protein = proteome[idx]    # to retrieve new pooled degron sequence
        
        # Remove old degrons found in a certain protein
        discov_degrons.drop(discov_degrons[discov_degrons.protein_id == idx].index, inplace = True)
        
        # Generate and add new degrons found in a certain protein (some could be the same if not pooled)
        idxs, new_seqs, new_start_coords, new_end_coords = pool_seqs(idx, degron_starts, degron_ends, entire_protein)
        all_idxs = all_idxs + idxs
        all_new_seqs = all_new_seqs + new_seqs
        all_new_start_coords = all_new_start_coords + new_start_coords
        all_new_end_coords = all_new_end_coords + new_end_coords
    
    new_degrons = pd.DataFrame(list(zip(all_idxs, all_new_seqs, 
                                     all_new_start_coords, 
                                     all_new_end_coords)),
                             columns = ["protein_id", "sequence",
                                        "start", "end"])
    
    discov_degrons = pd.concat([discov_degrons, new_degrons], ignore_index = True)
    print(f'\tFinal number of degrons: {len(discov_degrons)}') 
    
    discov_degrons.to_csv(all_discov_degrons_overlapool_path+motif, sep = "\t", index = False)
        
        


Motif: APC_ABBA.tsv.gz
	Initial number of degrons: 44
	Final number of degrons: 44
Motif: APC_KENBOX.tsv.gz
	Initial number of degrons: 978
	Final number of degrons: 975
Motif: BTRC.tsv.gz
	Initial number of degrons: 152
	Final number of degrons: 152
Motif: CBLL1.tsv.gz
	Initial number of degrons: 4
	Final number of degrons: 4
Motif: CBL_MET.tsv.gz
	Initial number of degrons: 777
	Final number of degrons: 755
Motif: CBL_PTK.tsv.gz
	Initial number of degrons: 1849
	Final number of degrons: 1847
Motif: DEG_APCC_TPR_1.tsv.gz
	Initial number of degrons: 7330
	Final number of degrons: 6315
Motif: DEG_Kelch_KLHL3_1.tsv.gz
	Initial number of degrons: 4
	Final number of degrons: 4
Motif: DEG_Nend_UBRbox_4.tsv.gz
	Initial number of degrons: 3
	Final number of degrons: 3
Motif: VHL.tsv.gz
	Initial number of degrons: 9
	Final number of degrons: 9
Motif: KEAP1.tsv.gz
	Initial number of degrons: 611
	Final number of degrons: 610
Motif: SPOP.tsv.gz
	Initial number of degrons: 13446
	Final number of 