# Utils

> Functions to preprocess sequences and prepare kinase–substrate datasets

## Setup

In [None]:
#| default_exp utils

In [None]:
#| export
import numpy as np, pandas as pd
from tqdm import tqdm
from katlas.data import *
from fastcore.meta import delegates
from pathlib import Path

# for alignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
import subprocess


In [None]:
#| hide
# Keep rendered DataFrame outputs compact: truncate the display beyond 5 rows
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', 100) # display up to 100 columns before truncating

```python
from katlas.utils import *
```

## Common funcs

In [None]:
#| export
def prepare_path(path):
    """Expand `~` in `path`, create any missing parent folders, and return the resulting `Path`.

    Only the parent directory is created; the file itself is not touched.
    """
    target = Path(path).expanduser()
    parent_dir = target.parent
    # parents=True builds the whole chain; exist_ok=True makes repeated calls harmless
    parent_dir.mkdir(parents=True, exist_ok=True)
    return target

```python
pssm_df = get_prob(df_k,'site_seq')
plot_logo_heatmap(pssm_df,title=f'{k} (n={len(df_k):,})',figsize=(17,10))
# check whether the directory exists; if not, create it, then return the full path
path1=prepare_path(f'fig/cddm/{k}/pssm_freq.svg')
save_show(path1,show_only=SHOW)
```

In [None]:
#| export
def get_diff(df1, df2, col1, col2=None):
    """Return the rows of each dataframe whose key value is absent from the other.

    `col1` is the key column in `df1`; `col2` is the key column in `df2` and
    falls back to `col1` when omitted. Returns `(df1_only, df2_only)`.
    """
    other_col = col1 if col2 is None else col2
    # membership masks: True where the key also occurs in the other frame
    seen_in_df2 = df1[col1].isin(df2[other_col])
    seen_in_df1 = df2[other_col].isin(df1[col1])
    return df1[~seen_in_df2], df2[~seen_in_df1]

In [None]:
# Two toy frames that share the genes B and C
df1 = pd.DataFrame({'gene': ['A', 'B', 'C']})
df2 = pd.DataFrame({'gene': ['B', 'C', 'D']})

# Only the non-shared rows should remain: A from df1 and D from df2
df1_unq, df2_unq = get_diff(df1, df2, 'gene')

In [None]:
df1_unq

Unnamed: 0,gene
0,A


In [None]:
df2_unq

Unnamed: 0,gene
2,D


## Checker

In many phosphorylation datasets, some amino acids in the site sequence are in lower case even though they are not s/t/y. In addition, uncommon amino acids such as U or O can appear in the sequence. Therefore, it is essential to normalize the sequence string before kinase ranking.

In [None]:
#| export
def check_seq(seq):
    """Normalize one site sequence.

    Lowercase s/t/y are kept as they are, every other character is uppercased,
    and anything that is not one of the allowed residue letters becomes '_'.
    The character at index `len(seq) // 2` must be s/t/y (either case),
    otherwise an AssertionError is raised.
    """
    mid = len(seq) // 2
    acceptor = seq[mid]
    assert acceptor.lower() in {'s', 't', 'y'}, f"{seq} has {acceptor} at position {len(seq) // 2}; need to have one of 's', 't', or 'y' in the center"

    allowed_chars = set("PGACSTVILMFYWHKRQNDEsty")
    cleaned = []
    for char in seq:
        if char in {'s', 't', 'y'}:
            # lowercase s/t/y are preserved untouched
            cleaned.append(char)
            continue
        upper = char.upper()
        cleaned.append(upper if upper in allowed_chars else '_')
    return "".join(cleaned)

In [None]:
# The center residue here is 'd', so check_seq's assertion fails;
# catch it and print the message so the notebook keeps running
try:
    check_seq('aaadaaa')
except Exception as e:
    print(e)

aaadaaa has d at position 3; need to have one of 's', 't', or 'y' in the center


In [None]:
check_seq('AAkUuPSFstTH') # 'k' is uppercased and 'U'/'u' become '_'; an error is raised when the center residue is not s/t/y (either case)

'AAK__PSFstTH'

In [None]:
#| export
def check_seqs(seqs:pd.Series):
    "Apply `check_seq` to every sequence in a Series after asserting that all sequences share one length"
    # every sequence must have the same length
    n_distinct_lengths = seqs.str.len().nunique()
    assert n_distinct_lengths==1, 'inconsistent sequence length detected'
    cleaned = seqs.apply(check_seq)
    return cleaned

In [None]:
#| export
def check_seq_df(df,col):
    """Normalize the site sequences stored in column `col` of `df`.

    Thin wrapper around `check_seqs`: asserts that all sequences in the column
    have the same length, then converts non-s/t/y characters to upper case and
    replaces characters outside the allowed set with underscores.
    Returns the cleaned column as a Series.
    """
    # Delegate so the length check and the cleaning rule live in one place
    return check_seqs(df[col])

In [None]:
df=Data.get_human_site()
df.head()

Unnamed: 0,substrate_uniprot,substrate_genes,site,source,AM_pathogenicity,substrate_sequence,substrate_species,sub_site,substrate_phosphoseq,position,site_seq
0,A0A024R4G9,C19orf48 MGC13170 hCG_2008493,S20,psp,,MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,Homo sapiens (Human),A0A024R4G9_S20,MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...,20,_MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1,A0A075B6Q4,,S24,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S24,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,24,QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2,A0A075B6Q4,,S35,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S35,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,35,EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3,A0A075B6Q4,,S57,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S57,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,57,EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4,A0A075B6Q4,,S68,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S68,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,68,RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE


In [None]:
check_seq_df(df.head(),'site_seq')

0    _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1    QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2    EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3    EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4    RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: object

In [None]:
#| export
def validate_site(site_info,
                  seq):
    """Return 1 when the residue letter of `site_info` (e.g. 'S610') is found at that 1-based position of `seq`, else 0.

    Positions outside the sequence give 0 instead of raising.
    """
    idx = int(site_info[1:]) - 1  # convert the 1-based site number to a 0-based index
    within_bounds = 0 <= idx < len(seq)
    # short-circuit keeps us from indexing outside the sequence
    return int(within_bounds and seq[idx] == site_info[0])

In [None]:
site='S610'
seq = 'MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVGPKDLRSAMGEGGGPEPGPANAKWLKEGQNQLRRAATAHRDQNRNVTLTLAEEASQEPEMAPLGPKGLIHLYSELELSAHNAANRGLRGPGLIISTQEQGPDEGEEKAAGEAEEEEEDDDDEEEEEDLSSPPGLPEPLESVEAPPRPQALTDGPREHSKSASLLFGMRNSAASDEDSSWATLSQGSPSYGSPEDTDSFWNPNAFETDSDLPAGWMRVQDTSGTYYWHIPTGTTQWEPPGRASPSQGSSPQEESQLTWTGFAHGEGFEDGEFWKDEPSDEAPMELGLKEPEEGTLTFPAQSLSPEPLPQEEEKLPPRNTNPGIKCFAVRSLGWVEMTEEELAPGRSSVAVNNCIRQLSYHKNNLHDPMSGGWGEGKDLLLQLEDETLKLVEPQSQALLHAQPIISIRVWGVGRDSGRERDFAYVARDKLTQMLKCHVFRCEAPAKNIATSLHEICSKIMAERRNARCLVNGLSLDHSKLVDVPFQVEFPAPKNELVQKFQVYYLGNVPVAKPVGVDVINGALESVLSSSSREQWTPSHVSVAPATLTILHQQTEAVLGECRVRFLSFLAVGRDVHTFAFIMAAGPASFCCHMFWCEPNAASLSEAVQAACMLRYQKCLDARSQASTSCLPAPPAESVARRVGWTVRRGVQSLWGSLKPKRLGAHTP'

In [None]:
validate_site(site,seq)

1

In [None]:
#| export
def validate_site_df(df, 
                     site_info_col,
                     protein_seq_col): 
    "Run `validate_site` on every row of a dataframe, pairing the site column with the protein sequence column."
    def _row_is_valid(row):
        # one row -> 1/0 flag from validate_site
        return validate_site(row[site_info_col], row[protein_seq_col])

    return df.apply(_row_is_valid, axis=1)

In [None]:
validate_site_df(df.head(),'site','substrate_sequence')

0    1
1    1
2    1
3    1
4    1
dtype: int64

## Phosphorylate protein seq

In [None]:
#| export
def phosphorylate_seq(seq, # full protein sequence
                      *sites, # site info, e.g., S140
                      ):
    """Phosphorylate protein sequence based on phosphosites (e.g.,S140).

    Each site is a residue letter followed by its 1-based position. The residue
    at every listed position is lowercased in the returned string.

    Raises `IndexError` when a position lies outside the sequence and
    `ValueError` when the residue in `seq` differs from the site's letter.
    """
    residues = list(seq)

    for site in sites:
        char = site[0]
        position = int(site[1:]) - 1 # subtract 1 as python index starts from 0

        if not 0 <= position < len(seq):
            raise IndexError(f"Position {position+1} out of range for sequence length {len(seq)}")

        # Validate against the untouched input rather than the list being edited,
        # so a site listed twice is not reported as a mismatch against its own
        # already-lowercased residue.
        if seq[position] != char:
            raise ValueError(f"Mismatch at position {position+1}: expected {char}, found {seq[position]}")

        residues[position] = char.lower()

    return ''.join(residues)

In [None]:
seq = 'MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDSQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDSVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
phosphorylate_seq(seq,*['S95', 'S22', 'T25', 'S6', 'S158'])

'MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'

In [None]:
#| export
def phosphorylate_seq_df(df,
                         id_col='substrate_uniprot', # column of sequence ID
                         seq_col='substrate_sequence', # column that contains protein sequence
                         site_col='site', # column that contains site info, e.g., S140
                        ):
    "Collapse the dataframe to one row per sequence ID and add a `phosphoseq` column with all of its sites lowercased"
    def _unique_sites(sites):
        # distinct sites reported for one protein
        return sites.unique()

    def _phosphorylate_row(row):
        return phosphorylate_seq(row[seq_col], *row[site_col])

    # per ID: gather the distinct sites and keep the first sequence seen
    aggregations = {site_col: _unique_sites, seq_col: 'first'}
    per_protein = df.groupby(id_col).agg(aggregations).reset_index()
    per_protein['phosphoseq'] = per_protein.apply(_phosphorylate_row, axis=1)
    return per_protein

In [None]:
df=Data.get_human_site()
df.head()

Unnamed: 0,substrate_uniprot,substrate_genes,site,source,AM_pathogenicity,substrate_sequence,substrate_species,sub_site,substrate_phosphoseq,position,site_seq
0,A0A024R4G9,C19orf48 MGC13170 hCG_2008493,S20,psp,,MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,Homo sapiens (Human),A0A024R4G9_S20,MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...,20,_MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1,A0A075B6Q4,,S24,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S24,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,24,QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2,A0A075B6Q4,,S35,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S35,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,35,EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3,A0A075B6Q4,,S57,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S57,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,57,EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4,A0A075B6Q4,,S68,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S68,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,68,RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE


In [None]:
phosphorylate_seq_df(df.head(100),'substrate_uniprot','substrate_sequence','site')

Unnamed: 0,substrate_uniprot,site,substrate_sequence,phosphoseq
0,A0A024R4G9,[S20],MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...
1,A0A075B6Q4,"[S24, S35, S57, S68, S71, S72]",MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...
...,...,...,...,...
22,A0A0A6YYL6,"[S5, Y139, S141, S142]",MVRYSLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK...,MVRYsLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK...
23,A0A0B4J1R7,"[T6, S43, S45, S46]",MMATGTPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGSRSS...,MMATGtPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGsRss...


## Extract site seq

In [None]:
#| export
def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
                     seq_col: str, # column name of protein sequence
                     site_col: str, # column name of site information (e.g., S10)
                     n=7, # length of surrounding sequence (default -7 to +7)
                    ):
    """Extract -n to +n site sequence from protein sequence

    The site value is a residue letter followed by a 1-based position. Parts of
    the window that fall outside the protein are filled with '_', so every
    returned string is exactly `2*n + 1` characters long, even when the
    position itself lies outside the protein. Returns a numpy array of strings
    in row order.
    """
    width = 2 * n + 1
    windows = []
    # iterate over the two needed columns directly instead of building a Series per row
    pairs = zip(df[seq_col], df[site_col])
    for protein, site in tqdm(pairs, total=len(df)):
        position = int(site[1:]) - 1  # 0-based index of the site
        start = position - n
        end = position + n + 1

        # Part of the window that overlaps the protein; empty when the window
        # lies fully before or after it (guards against negative slice bounds).
        lo, hi = max(0, start), min(len(protein), end)
        core = protein[lo:hi] if lo < hi else ""

        # Pad both sides so the total length is always `width`
        left_pad = min(max(0, -start), width)
        right_pad = width - left_pad - len(core)

        windows.append("_" * left_pad + core + "_" * right_pad)

    return np.array(windows)

Some datasets contain only the protein and the position of each phosphorylation site, but not the phosphorylation site sequence. In that case, we can retrieve the protein sequence and use this function to get the -7 to +7 phosphorylation site sequence (as a numpy array).

Remember to validate the phospho-acceptor at position 0 before extracting the site sequence, as there can be mismatches due to protein sequence database updates.

In [None]:
df.head()

Unnamed: 0,substrate_uniprot,substrate_genes,site,source,AM_pathogenicity,substrate_sequence,substrate_species,sub_site,substrate_phosphoseq,position,site_seq
0,A0A024R4G9,C19orf48 MGC13170 hCG_2008493,S20,psp,,MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,Homo sapiens (Human),A0A024R4G9_S20,MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...,20,_MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1,A0A075B6Q4,,S24,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S24,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,24,QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2,A0A075B6Q4,,S35,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S35,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,35,EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3,A0A075B6Q4,,S57,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S57,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,57,EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4,A0A075B6Q4,,S68,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S68,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,68,RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE


In [None]:
extract_site_seq(df.head(),
                 seq_col='substrate_sequence',
                 site_col='site',
                 n=30
                 )

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 9493.67it/s]


array(['___________MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSHTPRR',
       '_______MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHL',
       'KSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFT',
       'DYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERF',
       'DCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERFEKFYEQYDDDE'],
      dtype='<U61')

## Alignment

In [None]:
#| export
def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
    """Generate fasta file from sequences.

    Writes one record per row of `df`, using `seq_col` as the sequence and
    `id_col` as the record ID (both converted to `str`, with an empty
    description). Missing parent folders of `path` are created first.
    Prints the number of records written.
    """
    # Create the output folder when `path` is a filesystem path; anything else
    # (e.g. an already-open handle) is passed through to SeqIO.write untouched.
    if isinstance(path, (str, Path)):
        Path(path).parent.mkdir(parents=True, exist_ok=True)

    records = [
        SeqRecord(Seq(str(seq)), id=str(seq_id), description="")
        for seq, seq_id in zip(df[seq_col], df[id_col])
    ]
    SeqIO.write(records, path, "fasta")
    print(len(records))

```python
get_fasta(kd,seq_col='kd_seq',id_col='kd_ID',path='raw/kinase_domains.fasta')
```

The Clustal Omega (`clustalo`) alignment can be run either from the terminal or through the function below.

```bash
sudo apt-get update
sudo apt-get install clustalo
clustalo -i kinase_domains.fasta -o kinase_domains.aln --force --outfmt=clu
```

In [None]:
#| export
def run_clustalo(input_fasta,  # .fasta fname
                 output_aln, # .aln output fname
                 outfmt="clu"): # value passed to clustalo's --outfmt option
    """Run Clustal Omega to perform multiple sequence alignment.

    Creates the output folder if needed and calls the `clustalo` executable
    with `--force`, so an existing output file is overwritten. Because the
    command runs with `check=True`, a non-zero exit status raises
    `subprocess.CalledProcessError`.
    """
    # if the output directory does not exist, create one
    output_aln = Path(output_aln)
    output_aln.parent.mkdir(parents=True, exist_ok=True)

    # run clustalo; honor the requested output format instead of always using "clu"
    subprocess.run([
        "clustalo", "-i", str(input_fasta),
        "-o", str(output_aln),
        "--force", f"--outfmt={outfmt}"
    ], check=True)

```python
run_clustalo("kinase_domains.fasta", "raw/kinase_domains.aln")
```

In [None]:
#| export
def aln2df(fname):
    "Read a Clustal-format alignment file into a dataframe: one row per record (indexed by record id), one column per alignment position numbered from 1."
    msa = AlignIO.read(fname, "clustal")
    labels, rows = [], []
    for record in msa:
        labels.append(record.id)
        rows.append(list(str(record.seq)))  # one character per alignment column
    table = pd.DataFrame(rows, index=labels)
    # shift the default 0-based column labels to 1-based positions
    table.columns = table.columns + 1
    return table

```python
df = aln2df("raw/kinase_domains.aln")
```

In [None]:
#| export
def get_aln_freq(df):
    "Per-position frequency of each character in the `aln2df` output: rows are the characters seen, columns are positions, and each column sums to 1."
    # count each character per column; characters missing from a column get 0
    residue_counts = df.apply(pd.Series.value_counts).fillna(0)
    column_totals = residue_counts.sum(axis=0)
    # normalize every column by its own total
    return residue_counts.div(column_totals, axis='columns')

```python
freq_df = get_aln_freq(df)
```

## End

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()