In [1]:
"""
ptm_site_align.ipynb

"""
__author__ = "github.com/simhc0714"
__version__ = "0.1.0"

# import packages
import pandas as pd
import numpy as np

# Generate Logos data. Test ~10 rows
def logo_align(df_site, df_sequence, d:int=7, aa='K'):
    """
    logo_align(df_site, df_sequence) -> .csv

    Parameters
    ----------
    - df_site: Dataframe that contains UniprotAC and site, must be formed (UniprotAC_Site) (pandas.DataFrame).
    - df_sequence: Dataframe that contains UniprotAC and its sequence (pandas.DataFrame).
    - d: 7aa-X-7aa (default=7) (int).
    - aa: Amino acid one letter code (default='K').

    Notes
    -----
    Regex excess: delimiter is _ (underscore)
    1) left_ : Uniprot AC/ID
    2) _right : Kac site

    - left_ matches with df_ref_sequence (From)
    - _right access df_ref_sequence (Sequence)

    3) Cut forward and backward 7 amino acids.
    """
    # Initialize params
    aa = aa.upper()
    entries = []
    logo = []
    start = []
    end = []

    # 1. UniprotAC-ID_Site
    for (i, entry) in df_site.iterrows():
        entries.append(entry[0].replace('_', '_'+aa))
        ent_name, ent_site = entry[0].split('_')
        ent_site = int(ent_site)
        # 2. Match entry in reference_sequence.
        try :
            seq = df_sequence.loc[ent_name]['Sequence']
            # For ent_site on sequence residue is lysine (K) if not -> else
            if seq[ent_site-1] == aa:
                # 3. Cases.
                # 3.1. If site residue locates too close forward of sequence.
                if ent_site-d-1 < 0:
                    start.append(int(1))
                    space = ""
                    for i in range(d+1-ent_site):
                        space += " "
                    logo.append(space+seq[:ent_site+d])
                    end.append(ent_site+d)
                # 3.2. If site residue locates too close backward of sequence.
                elif ent_site+d > len(seq):
                    start.append(ent_site-d)
                    space = ""
                    for i in range(ent_site+d-len(seq)):
                        space+=" "
                    logo.append(seq[ent_site-d-1:]+space)
                    end.append(len(seq))
                # 3.3. Normal condition.
                else:
                    start.append(ent_site-d)
                    logo.append(seq[ent_site-d-1:ent_site+d])
                    end.append(ent_site+d)
            else:
                start.append("NaN")
                logo.append("")
                end.append("NaN")
        # ent_site on sequence is not lysine (K) it is other amino acid.
        except:
            start.append("NaN")
            logo.append("")
            end.append("NaN")
    # Organized DataFrame
    logos = pd.DataFrame(np.column_stack([entries, logo, start, end]), columns=['Entry', 'Logo', 'Start residue', 'End residue'])
    return logos

In [9]:
import pandas as pd

site = pd.read_csv(filepath_or_buffer='..\\example\\Phospho (ST)Sites.txt', delimiter='\\t', encoding='utf-8')

  site = pd.read_csv(filepath_or_buffer='..\\example\\Phospho (ST)Sites.txt', delimiter='\\t', encoding='utf-8')


In [7]:
# Load example data
site = pd.read_csv(filepath_or_buffer='..\\example\\Phospho (ST)Sites.txt', delimiter='\\t', encoding='utf-8')
ref_sequence = pd.read_csv(filepath_or_buffer='F:/_ProteomicData/(202303)Global_Kac_HFD/logos/ref_sequence.CSV', index_col=0, encoding='utf-8')

# Run
logos = logo_align(df_site=site, df_sequence=ref_sequence)
logos.to_csv(path_or_buf='../../output/temp.csv', index=False, encoding='utf-8')

In [6]:
if __name__ == "__main__":
    # Load example data
    site = pd.read_csv(filepath_or_buffer='../../example/Acetyl(K)Sites.CSV', encoding='utf-8')
    ref_sequence = pd.read_csv(filepath_or_buffer='../../example/reference_sequence.csv', index_col=0, encoding='utf-8')

    # Run
    logos = logo_align(df_site=site, df_sequence=ref_sequence)
    logos.to_csv(path_or_buf='../../output/logos.csv', index=False, encoding='utf-8')

    # Show the first 5 rows
    print(logos.head())

         Entry             Logo Start residue End residue
0  A8DUK4_K145  VAAALAHKYH                138         147
1   A8DUK4_K18  AVSGLWGKVNADEVG            11          25
2   A8DUK4_K67  KVKAHGKKVITAFND            60          74
3   A8DUK4_K96  LSELHCDKLHVDPEN            89         103
4  D3Z7X0_K535  RLGDDQLKVAKMELK           528         542


In [5]:
# Generate Logos data. Test ~10 rows
import pandas as pd
df_site = pd.read_csv(filepath_or_buffer='../../example/Acetyl(K)Sites.CSV', encoding='utf-8')
df_ref_sequence = pd.read_csv(filepath_or_buffer='../../example/reference_sequence.csv', index_col=0, encoding='utf-8')

for (i, entry) in df_site[:10].iterrows():
    ent_name, ent_site = entry[0].split('_')
    ent_site = int(ent_site)
    seq = df_ref_sequence.loc[ent_name]['Sequence']
    if ent_site-8 < 0:
        space = ""
        for i in range(8-ent_site):
            space += " "
        logo = space+seq[:ent_site+7]
    else:
        logo = seq[ent_site-8:ent_site+7]
    print(logo)

VAAALAHKYH
AVSGLWGKVNADEVG
KVKAHGKKVITAFND
LSELHCDKLHVDPEN
RLGDDQLKVAKMELK
GGGVGRGKDISTITG
WLLELSKKNIFPYHE
   MEHSKQIRILLL
NRFDYKDKDFLSLIG
NIKAAWGKIGGHGAE
