# Find destabilizing motifs in yeast CDSs

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
np.random.seed(111)
from Bio.Seq import Seq
import pyfaidx
import regex as re
from scipy.signal import find_peaks

In [None]:
# Create the output directory tree. os.makedirs also creates the parent
# "../data", and exist_ok=True makes re-running the cell a no-op without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("../data/stallscores", exist_ok=True)

### Annotations
- generated in ```get_sgd_orf_annotations.ipynb```
  - note that Dubious and Uncharacterized ORFs were included

In [None]:
# Open the CDS FASTA through pyfaidx. The cells below look records up by
# sequence name (cdsseqs.keys()) and slice them by nucleotide position.
cdsseqs = pyfaidx.Fasta('../data/sgd/cds_coding.fasta')

### Motif type designations

In [None]:
# Amino-acid classes as lists of one-letter codes.
# `bulky`, `more_bulky` and `pos_charge` drive the posbulky scoring below;
# the remaining classes are not referenced by the cells visible here.
bulky = list('LIV')
more_bulky = list('WFY')
hydrophobic = list('FMWIVLPA')
charged = list('RDEHK')
pos_charge = list('RK')
not_bulky = list('AG')

## Score peptides for bulkiness and charge

### Score for presence of bulky/pos peptides in a gene
- if you encounter either a positively charged or bulky amino acid, give it a +1
- if you encounter a posbulky dipeptide, give it a +2
- use `np.convolve` to reduce noise/smooth out the signal, and `scipy.signal.find_peaks` to find local maxima

In [None]:
# Per-residue posbulky score track for every CDS.
#   W/F/Y (more_bulky) or R/K (pos_charge) -> 1.5
#   L/I/V (bulky)                          -> 1
#   anything else                          -> 0
# The last translated residue is not scored, so each track has one entry
# fewer than the number of codons.
# NOTE(review): the markdown above describes +1 per residue and +2 per
# posbulky dipeptide; this cell scores single residues only (1 / 1.5).
# Confirm which scheme is intended.
posbulky_scores = {}
for tx in cdsseqs.keys():
    tx_scores = np.zeros(int(len(cdsseqs[tx]) / 3) - 1)
    posbulky_scores[tx] = tx_scores
    aa_seq = str(Seq(str(cdsseqs[tx])).translate())  # peptide from DNA sequence
    for pos in range(len(aa_seq) - 1):
        residue = aa_seq[pos]
        if residue in more_bulky or residue in pos_charge:
            tx_scores[pos] = 1.5
        elif residue in bulky:
            tx_scores[pos] = 1

In [None]:
# Scan every posbulky score track with a sliding-window sum and record the
# sequence context around each local maximum and around the global minimum.
scores = []

window_size = 16
motif_size = 20 # actually 21 but keep it even

for tx in posbulky_scores:
    aa_seq = str(Seq(str(cdsseqs[tx])).translate())
    # No minimum-length filter is applied; every transcript is scanned.
    # NOTE(review): mode='valid' makes index i the sum of scores[i:i+window_size]
    # (i is the window start), while get_motifs_from_score uses mode='same'
    # (i is roughly the window centre). Confirm which alignment is intended.
    convolution = np.convolve(posbulky_scores[tx], np.ones(window_size), mode='valid')
    # Local maxima with a window sum of at least 10, at least 8 positions apart.
    peaks, _ = find_peaks(convolution, height=10, distance=8)
    # The single lowest-scoring window is reported alongside the peaks.
    mins = np.argmin(convolution)
    pos_2N = np.hstack((peaks, mins))
    for (loc, value) in zip(pos_2N, convolution[pos_2N]):
        loc = int(loc)
        # Clamp the start at 0: a negative slice start would wrap around to the
        # end of the sequence and return an empty or unrelated motif for hits
        # near the N-terminus.
        aa_start = max(loc - motif_size // 2 - 1, 0)
        aa_end = loc + motif_size // 2
        data = {'transcript_id': tx,
                'strength': value,
                'loc': loc,
                'motif': aa_seq[aa_start:aa_end],
                # Same window in nucleotide coordinates (3 nt per residue).
                'nt63': str(cdsseqs[tx][3 * aa_start:3 * aa_end]),
                }
        scores.append(data)

posbulky_score_df = pd.DataFrame(scores)
posbulky_score_df['type'] = 'posbulky'
posbulky_score_df.to_csv("../data/stallscores/posbulky_motif_scores_sgd_orfs.tsv", sep="\t", index=False)

## Scoring other motifs

In [None]:
def get_motifs_from_score(dicodon_scores, cdsseqs, window_size=16, motif_size=20,
                          peak_height=10, peak_distance=8):
    """Extract the sequence context around score peaks and the score minimum.

    Each transcript's score track is summed over a sliding window
    (``np.convolve`` with a ones kernel, ``mode='same'``). Every local maximum
    returned by ``find_peaks`` and the single global minimum of the smoothed
    track produce one output row.

    Args:
        dicodon_scores: Mapping of transcript id to a 1-D array of
            per-position scores.
        cdsseqs: Object indexed by transcript id that returns the nucleotide
            CDS; the value must support ``str()`` and slicing.
        window_size: Number of positions summed per window.
        motif_size: The reported motif spans ``motif_size // 2`` residues
            after ``loc`` and ``motif_size // 2 + 1`` residues before it.
        peak_height: Minimum smoothed score for a peak (``height`` argument
            of ``find_peaks``).
        peak_distance: Minimum spacing between peaks (``distance`` argument
            of ``find_peaks``).

    Returns:
        A DataFrame with one row per hit and columns ``transcript_id``,
        ``strength``, ``loc``, ``motif`` and ``nt63``. It is empty when
        ``dicodon_scores`` is empty.
    """
    half_motif = motif_size // 2
    kernel = np.ones(window_size)
    scores = []

    for tx, tx_scores in dicodon_scores.items():
        aa_seq = str(Seq(str(cdsseqs[tx])).translate())
        convolution = np.convolve(tx_scores, kernel, mode='same')
        peaks, _ = find_peaks(convolution, height=peak_height, distance=peak_distance)
        # The lowest-scoring position is reported alongside the peaks.
        mins = np.argmin(convolution)
        pos_2N = np.hstack((peaks, mins))
        for loc, value in zip(pos_2N, convolution[pos_2N]):
            loc = int(loc)
            # Clamp the start at 0: a negative slice start would wrap around
            # to the end of the sequence and return an empty or unrelated
            # motif for hits near the N-terminus.
            aa_start = max(loc - half_motif - 1, 0)
            aa_end = loc + half_motif
            scores.append({
                'transcript_id': tx,
                'strength': value,
                'loc': loc,
                'motif': aa_seq[aa_start:aa_end],
                # Same window in nucleotide coordinates (3 nt per residue).
                'nt63': str(cdsseqs[tx][3 * aa_start:3 * aa_end]),
            })

    return pd.DataFrame(scores)

### Score for specific stalling dipeptides

In [None]:
# Dipeptides that score 2, and single residues that score 1, in the next cell.
stall_dipeptides = 'FK KF'.split()
pep = list('FKYWR')

In [None]:
# Per-position score track for every CDS, using the current
# `stall_dipeptides` / `pep` settings:
#   residue pair starting at pos is a stall dipeptide -> 2
#   otherwise, residue at pos is in `pep`             -> 1
#   anything else                                     -> 0
# The last translated residue is not scored.
cds_dicodon_scores = {}
for tx in cdsseqs.keys():
    tx_scores = np.zeros(int(len(cdsseqs[tx]) / 3) - 1)
    cds_dicodon_scores[tx] = tx_scores
    aa_seq = str(Seq(str(cdsseqs[tx])).translate())
    for pos in range(len(aa_seq) - 1):
        if aa_seq[pos:pos + 2] in stall_dipeptides:
            tx_scores[pos] = 2
        elif aa_seq[pos] in pep:
            tx_scores[pos] = 1

In [None]:
# Motif table for the FK/KF score tracks, tagged with its motif type.
dipeptide_scores = get_motifs_from_score(cds_dicodon_scores, cdsseqs).assign(type='FK')
# Show the 20 strongest hits.
dipeptide_scores.sort_values(by='strength', ascending=False).head(20)

## Score for PG and PD

In [None]:
# Dipeptides that score 2, and single residues that score 1, in the next cell.
stall_dipeptides = 'PG GP PD DP'.split()
pep = list('PDG')

In [None]:
# Per-position score track for every CDS, using the current
# `stall_dipeptides` / `pep` settings:
#   residue pair starting at pos is a stall dipeptide -> 2
#   otherwise, residue at pos is in `pep`             -> 1
#   anything else                                     -> 0
# The last translated residue is not scored.
cds_dicodon_scores = {}
for tx in cdsseqs.keys():
    tx_scores = np.zeros(int(len(cdsseqs[tx]) / 3) - 1)
    cds_dicodon_scores[tx] = tx_scores
    aa_seq = str(Seq(str(cdsseqs[tx])).translate())
    for pos in range(len(aa_seq) - 1):
        if aa_seq[pos:pos + 2] in stall_dipeptides:
            tx_scores[pos] = 2
        elif aa_seq[pos] in pep:
            tx_scores[pos] = 1

In [None]:
# Motif table for the PG/GP/PD/DP score tracks, tagged with its motif type.
pdg_scores = get_motifs_from_score(cds_dicodon_scores, cdsseqs).assign(type='pd_pg')
# Show the 20 strongest hits.
pdg_scores.sort_values(by='strength', ascending=False).head(20)

In [None]:
# Stack both dipeptide motif tables, renumber the rows from 0, and save as TSV.
dipep_scores = pd.concat([dipeptide_scores, pdg_scores]).reset_index(drop=True)
dipep_scores.to_csv("../data/stallscores/dipeptide_scores_sgd_orfs.tsv", sep='\t', index=False)