In [2]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import os
import re

In [3]:
sites = pd.read_csv("../../../../results/hijazi/01_processed_data/benchmark_data.csv")
proteome = SeqIO.parse('../../../../data/decryptm/uniprot_proteome_up000005640_03112020.fasta', 'fasta')

In [4]:
# process sites: split into protein, aa and position
sites_df = sites['ID'].str.split('|', expand=True)
sites_df.columns = ['full', 'symbol', 'aaposition']
sites_df['aa'] = sites_df['aaposition'].str.extract('([A-Z])')
sites_df['position'] = sites_df['aaposition'].str.extract('(\d+)')

ValueError: Length mismatch: Expected axis has 4 elements, new values have 3 elements

In [None]:
# Prepare proteome dictionary
proteome_dict = {}

for record in proteome:
    input_string = record.description
    match = re.search(r'GN=([\w]+)', input_string)    
    if match:
        symbol = match.group(1)
        
        # Check if the gene symbol is already in the dictionary
        if symbol in proteome_dict:
            # If yes, create a new key by appending isoform information
            isoform_number = sum(f'{symbol}_' in key for key in proteome_dict) + 1
            new_key = f'{symbol}_{isoform_number}'
        else:
            # If no, use the original symbol as the key
            new_key = symbol

        # Update the dictionary with the new key and the sequence
        proteome_dict[new_key] = record.seq
    else:
        symbol = 'NA'

In [None]:
n_surrounding = 7

seq_list = []
aa = []
prot = []
pos = []
for i, int_row in sites_df.iterrows():
    symb = int_row['symbol']
    filtered_dict = {key: value for key, value in proteome_dict.items() if key == symb or key.startswith(f'{symb}_')}
    for key_i in filtered_dict:
        try:
            seq = filtered_dict[key_i]
            aa_pos = int(int_row['position']) - 1
            assert seq[aa_pos] == int_row['aa']
            if aa_pos >= n_surrounding+1:
                site_seq = seq[aa_pos - n_surrounding:aa_pos] + seq[aa_pos] + '(ph)' + seq[aa_pos + 1:aa_pos + n_surrounding + 1]
            else:
                site_seq = seq[0:aa_pos] + seq[aa_pos] + '(ph)' + seq[aa_pos + 1:aa_pos + n_surrounding + 1]    
            seq_list.append(str(site_seq))
            aa.append(str(int_row['aa']))
            prot.append(str(key_i))
            pos.append(str(aa_pos + 1))
        except: 
            seq_list.append(np.nan)
            prot.append(str(key_i))
            pos.append(str(aa_pos + 1))
            aa.append(str(int_row['aa']))

df = pd.DataFrame({
    'Protein': prot,
    'Position': pos,
    'Aminoacid': aa,
    'Sequence': seq_list
})

In [None]:
filtered_df = (df
               .loc[lambda x: x['Sequence'].notna()]
               .reset_index(drop=True))
filtered_df['Protein'] = filtered_df['Protein'].apply(lambda x: x.split('_')[0])
distinct_df = filtered_df.drop_duplicates(keep='first')

In [None]:
distinct_df.to_csv('../../../../results/hijazi/01_processed_data/fifteenmer.csv', index=False)