In [2]:
"""
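ptm_site_logo.ipynb

Builds fixed-width sequence windows centred on a modified residue (default: lysine, K)
from a site table and a reference-sequence table, and saves them to ../output/logos.csv.
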
"""
__author__ = "simga"
__version__ = "0.0.1"

# import packages
import pandas as pd
import numpy as np

# Load inputs: the site table and the reference sequences (first column used as the index).
df_site = pd.read_csv('../example/Acetyl(K)Sites.CSV', encoding='utf-8')
df_ref_sequence = pd.read_csv('../example/reference_sequence.csv', encoding='utf-8', index_col=0)

# Generate logo data for every site in df_site and save it as CSV.
def logo_align(df_site, df_sequence, d=7, aa='K', output_path='../output/logos.csv'):
    """
    Cut a window of up to 2*d + 1 residues around every modification site and save it as CSV.

    Parameters
    ----------
    - df_site: DataFrame whose first column holds entries formatted as
      "<UniprotAC>_<site>", <site> being the 1-based residue position (pandas.DataFrame).
    - df_sequence: DataFrame indexed by UniprotAC with a 'Sequence' column (pandas.DataFrame).
    - d: number of flanking residues kept on each side of the site (default=7) (int).
    - aa: Amino acid one letter code expected at the site (default='K'); it is upper-cased.
    - output_path: destination handed to DataFrame.to_csv (default='../output/logos.csv').
      When it is a filesystem path, missing parent directories are created.

    Returns
    -------
    None. The table is written to `output_path` with the columns
    'Entry', 'Logo', 'Start residue', 'End residue'.

    Raises
    ------
    ValueError: an entry has no '_' delimiter or its site part is not an integer.

    Notes
    -----
    - The entry is split on its LAST underscore, so identifiers that contain
      underscores themselves (e.g. HBA_HUMAN_8) are handled.
    - 'Entry' is the input entry with `aa` inserted in front of the site (P12345_8 -> P12345_K8).
    - 'Start residue' and 'End residue' are 1-based, inclusive and clamped to the sequence.
    - A window truncated at the N-terminus is left-padded with spaces so that the
      site stays at column d + 1; a window truncated at the C-terminus is not padded.
    - If the accession is unknown, the sequence is not a string, the site lies outside
      the sequence, or the residue at the site is not `aa`, the row gets an empty
      logo and the literal text "NaN" for both coordinates.
    """
    import os  # local import: only needed to prepare the output directory

    aa = aa.upper()
    entries, logos, starts, ends = [], [], [], []

    # Select the first column by position; integer keys on a label-indexed Series
    # (entry[0]) are deprecated in pandas.
    site_entries = df_site.iloc[:, 0] if df_site.shape[1] else []

    for raw_entry in site_entries:
        # rsplit keeps underscores that belong to the identifier itself.
        ent_name, site_text = raw_entry.rsplit('_', 1)
        ent_site = int(site_text)
        entries.append(ent_name + '_' + aa + site_text)

        try:
            seq = df_sequence.loc[ent_name, 'Sequence']
        except KeyError:
            # Unknown accession (or no 'Sequence' column): reported as a NaN row.
            seq = None

        # The explicit range check stops site <= 0 from wrapping around through
        # negative indexing and site > len(seq) from raising IndexError.
        site_ok = (
            isinstance(seq, str)
            and 1 <= ent_site <= len(seq)
            and seq[ent_site - 1] == aa
        )
        if not site_ok:
            logos.append("")
            starts.append("NaN")
            ends.append("NaN")
            continue

        first = max(1, ent_site - d)          # 1-based, inclusive
        last = min(len(seq), ent_site + d)    # 1-based, inclusive
        # Number of residues missing on the N-terminal side of the window.
        left_pad = " " * (d - (ent_site - first))
        logos.append(left_pad + seq[first - 1:last])
        starts.append(first)
        ends.append(last)

    # save file.
    logos_table = pd.DataFrame({
        'Entry': entries,
        'Logo': logos,
        'Start residue': starts,
        'End residue': ends,
    })
    if isinstance(output_path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(output_path))
        if parent:
            os.makedirs(parent, exist_ok=True)
    logos_table.to_csv(path_or_buf=output_path, index=False, encoding='utf-8')

# Build the logo table for all loaded sites (defaults: d=7, aa='K'); the result is
# written to ../output/logos.csv.
logo_align(df_site, df_ref_sequence)

In [5]:
# Preview the logo windows for the first rows of the site table.
# Uses the same windowing as logo_align with d=7, but prints the windows instead of
# saving them and does not check which residue sits at the site.
import pandas as pd

FLANK = 7           # residues shown on each side of the site
PREVIEW_ROWS = 10   # number of site entries to preview

df_site = pd.read_csv(filepath_or_buffer='../example/Acetyl(K)Sites.CSV', encoding='utf-8')
df_ref_sequence = pd.read_csv(filepath_or_buffer='../example/reference_sequence.csv', index_col=0, encoding='utf-8')

# .iloc selects the first column by position; integer keys on a label-indexed
# Series (entry[0]) are deprecated in pandas.
for site_entry in df_site.iloc[:PREVIEW_ROWS, 0]:
    # Split on the last underscore so identifiers containing '_' still parse.
    ent_name, ent_site = site_entry.rsplit('_', 1)
    ent_site = int(ent_site)
    seq = df_ref_sequence.loc[ent_name, 'Sequence']
    # 0-based index of the first residue in the window; negative when the site is
    # closer than FLANK residues to the N-terminus.
    window_start = ent_site - FLANK - 1
    # Left-pad truncated windows so the site stays in the same column.
    left_pad = " " * max(0, -window_start)
    logo = left_pad + seq[max(0, window_start):ent_site + FLANK]
    print(logo)

VAAALAHKYH
AVSGLWGKVNADEVG
KVKAHGKKVITAFND
LSELHCDKLHVDPEN
RLGDDQLKVAKMELK
GGGVGRGKDISTITG
WLLELSKKNIFPYHE
   MEHSKQIRILLL
NRFDYKDKDFLSLIG
NIKAAWGKIGGHGAE
