In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from typing import List, Dict
from datasets import load_dataset 
# from sklearn.model_selection import train_test_split

In [52]:
def extract_fragment_ion_probabilities(data):
    """
    Gom tất cả các cột fragment ion probabilities vào một mảng
    
    Returns:
        List of arrays, mỗi array chứa probabilities của một peptide
    """
    all_keys = list(data.keys())
    print(f'All keys: {all_keys}')
    # Lọc ra các cột là fragment ion probabilities
    # Các cột này có format tuple: ('a'/'b'/'y', charge, position)
    prob_columns = [k for k in all_keys if isinstance(k, str) and k.startswith("(")]
    
    # Sort các cột theo thứ tự: ion_type, charge, position
    prob_columns = sorted(prob_columns)
    
    print(f"Tìm thấy {len(prob_columns)} cột fragment ion probabilities")
    print(f"Ví dụ các cột: {prob_columns[:5]}...")
    
    # Tạo danh sách để chứa probabilities cho mỗi peptide
    num_peptides = len(data['peptide'])
    fragment_ion_probabilities = []
    
    for i in range(num_peptides):
        # Lấy tất cả probabilities của peptide thứ i
        probs = []
        for col in prob_columns:
            value = data[col][i]
            probs.append(value)
        
        fragment_ion_probabilities.append(np.array(probs))
    
    return fragment_ion_probabilities, prob_columns

In [None]:
vocab = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
vocab_size = len(vocab) + 1
aa_to_id = {aa: i for i, aa in enumerate(vocab)}
pad_id = vocab_size - 1

num_samples = 1000
ds = load_dataset("bandeiralab/Pep2Prob")

train_data = ds['train'][:num_samples]
test_data = ds['test'][:num_samples]

peptides = train_data['peptide']
charges = train_data['charge']
probs, prob_columns = extract_fragment_ion_probabilities(train_data)

Tìm thấy 0 cột fragment ion probabilities
Ví dụ các cột: []...


All keys: ['precursor_index', 'peptide', 'charge', '#PSM', 'peptide_length', "('a', '1', '2')", "('b', '1', '1')", "('b', '1', '2')", "('b', '1', '3')", "('b', '1', '4')", "('b', '1', '5')", "('b', '1', '6')", "('b', '1', '7')", "('b', '1', '8')", "('b', '1', '9')", "('b', '1', '10')", "('b', '1', '11')", "('b', '1', '12')", "('b', '1', '13')", "('b', '1', '14')", "('b', '1', '15')", "('b', '1', '16')", "('b', '1', '17')", "('b', '1', '18')", "('b', '1', '19')", "('b', '1', '20')", "('b', '1', '21')", "('b', '1', '22')", "('b', '1', '23')", "('b', '1', '24')", "('b', '1', '25')", "('b', '1', '26')", "('b', '1', '27')", "('b', '1', '28')", "('b', '1', '29')", "('b', '1', '30')", "('b', '1', '31')", "('b', '1', '32')", "('b', '1', '33')", "('b', '1', '34')", "('b', '1', '35')", "('b', '1', '36')", "('b', '1', '37')", "('b', '1', '38')", "('b', '1', '39')", "('b', '2', '1')", "('b', '2', '2')", "('b', '2', '3')", "('b', '2', '4')", "('b', '2', '5')", "('b', '2', '6')", "('b', '2', '7')", 

In [54]:
probs

[array([ 0.980392,  0.      ,  0.039216, -1.      , -1.      , -1.      ,
        -1.      , -1.      , -1.      , -1.      , -1.      , -1.      ,
         1.      , -1.      , -1.      , -1.      , -1.      , -1.      ,
        -1.      , -1.      , -1.      , -1.      , -1.      ,  1.      ,
        -1.      , -1.      , -1.      , -1.      , -1.      , -1.      ,
        -1.      , -1.      , -1.      , -1.      ,  1.      ,  1.      ,
         0.931373,  0.803922,  0.323529,  0.068627,  0.      ,  0.078431,
        -1.      , -1.      , -1.      , -1.      , -1.      , -1.      ,
        -1.      , -1.      , -1.      ,  0.      , -1.      , -1.      ,
        -1.      , -1.      , -1.      , -1.      , -1.      , -1.      ,
        -1.      , -1.      ,  0.009804, -1.      , -1.      , -1.      ,
        -1.      , -1.      , -1.      , -1.      , -1.      , -1.      ,
        -1.      ,  0.205882,  0.      ,  0.068627,  0.      ,  0.098039,
         0.009804, -1.      , -1.     

In [None]:
def pep_to_prob(pep):
    indices = [aa_to_id.get(aa, pad_id) for aa in pep]
    padded = indices + [-1] * (235 - len(indices))
    return torch.tensor(padded).unsqueeze(0)

In [42]:
pep = "PEPTIDE"
pep_to_prob(pep)

tensor([12,  3, 12, 16,  7,  2,  3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, 

In [None]:
class Flow(nn.Module):
    def __init__(self, d_in=235, hidden=50):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_in + 1, hidden),
            nn.SiLU(),
            nn.Linear(hidden, hidden),
            nn.SiLU(),
            nn.Linear(hidden, hidden),
            nn.SiLU(),
            nn.Linear(hidden, d_in),
            nn.LayerNorm(d_in)
        )
        
    def forward(self, x_t, t):
        input = torch.cat([x_t, t], dim=1)
        self.net(input)
        return input
    

tensor([[12.,  3., 12., 16.,  7.,  2.,  3., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1.

In [45]:
ds = load_dataset("bandeiralab/Pep2Prob")
precursors = ds['train'][:100]


In [17]:
precursors["peptide"]

['AAAAAAAAAAR',
 'AAAAAAAAGAFAGR',
 'AAAAAAAAAR',
 'AAAAAAAK',
 'AAAAAADGPPAADGEDGQDPHSK',
 'AAAAAAGGAGGPGSGLAPLPGLPPSAAAH',
 'AAAAAAGGAGGPGSGLAPLPGLPPSAAAH',
 'AAAAAAGGGYTAMPAPMSVYSHPAHAEQYPGGMAR',
 'AAAAAAGGGYTAMPAPMSVYSHPAHAEQYPGGMAR',
 'AAAAADGPPAADGEDGQDPHSK',
 'AAAAAATPAVR',
 'AAAAAAVSGSAAAEAK',
 'AAAAAAVSGSAAAEAK',
 'AAAAADGPPAADGEDGQDPHSK',
 'AAAAADGPPAADGEDGQDPHSK',
 'AAAAAGGAAAAAAR',
 'AAAAAHHHHHHHHHPGAFFR',
 'AAAAAHHHHHHHHHPGAFFR',
 'AAAAATFSEQVGGGSGGAGR',
 'AAAAEVLGLILR',
 'AAAADGPPAADGEDGQDPHSK',
 'AAAADGPPAADGEDGQDPHSK',
 'AAAAELER',
 'AAAAELLAQTTHFLHDPK',
 'AAAAEVEAEGGGGGGGGK',
 'AAAAEPPVIELGAR',
 'AAAAFVLANENNIALFK',
 'AAAAGMLLLGLLQAGGSVLGQAMEK',
 'AAAAGPGAALSPR',
 'AAAAMAPIK',
 'AAAAPAESAAPAAGEEPSKEEGEPK',
 'AAAAPAESAAPAAGEEPSK',
 'AAAAPAYSPNMYPGANPTFQTGYTPGTPYK',
 'AAAASPPLLR',
 'AAAATFSEQVGGGSGGAGR',
 'AAAATFSEQVGGGSGGAGR',
 'AAAAVAPGGLQSTPGR',
 'AAAAVEPDVVVKR',
 'AAAAVGAGHGAGGPGAASSSGGAR',
 'AAAAVVAAAAR',
 'AAAAVSGSAAAEAK',
 'AAADGPPAADGEDGQDPHSK',
 'AAADGPPAADGEDGQ