In [1]:
%config Completer.use_jedi = False



In [2]:
from torch.utils.data import Dataset
import pandas as pd
from pathlib import Path
from typing import Union, List, Tuple, Any, Dict
from tape import TAPETokenizer
import torch
import numpy as np
from torch.utils.data import DataLoader
from prosittransformer.DataHandler import pad_sequences
from tape import ProteinBertForValuePredictionFragmentationProsit

In [3]:
class DataframeDataset(Dataset):
    """Dataset backed by the rows of a comma-separated file.

    The file is read once with ``pandas.read_csv`` and every row is kept in
    memory as a ``{column name: value}`` dict.

    Args:
        data_file (Union[str, Path]): Path to the CSV file.
        in_memory (bool, optional): Accepted but not used by this class; the
            whole file is always loaded. Default: False.

    Raises:
        FileNotFoundError: If ``data_file`` does not exist.
    """

    def __init__(self,
                 data_file: Union[str, Path],
                 in_memory: bool = False):
        csv_path = Path(data_file)
        if not csv_path.exists():
            raise FileNotFoundError(csv_path)

        # One dict per CSV row, keyed by column name.
        self._data = pd.read_csv(csv_path, sep=",").to_dict('records')
        self._num_examples = len(self._data)

    def __len__(self) -> int:
        """Number of rows read from the file."""
        return self._num_examples

    def __getitem__(self, index: int):
        """Return the row dict at ``index``; negative indices are rejected."""
        in_range = 0 <= index < self._num_examples
        if not in_range:
            raise IndexError(index)
        return self._data[index]

In [4]:
CHARGES = [1, 2, 3, 4, 5, 6]
def get_precursor_charge_onehot(charges):
    """One-hot encode precursor charges.

    Args:
        charges: Sequence of integer precursor charges; each must lie in
            ``1..max(CHARGES)``.

    Returns:
        Integer array of shape ``(len(charges), max(CHARGES))`` in which row
        ``i`` holds a single 1 in column ``charges[i] - 1``.

    Raises:
        IndexError: If a charge lies outside ``1..max(CHARGES)``.
    """
    n_states = max(CHARGES)
    array = np.zeros([len(charges), n_states], dtype=int)
    for i, precursor_charge in enumerate(charges):
        # Without this check a charge of 0 (or a negative one) would index
        # from the end of the row and be silently encoded as a high charge.
        if not 1 <= precursor_charge <= n_states:
            raise IndexError(
                f"precursor charge {precursor_charge!r} is outside 1..{n_states}"
            )
        array[i, precursor_charge - 1] = 1
    return array

In [5]:
class PrositInputDataset(Dataset):
    """Model inputs built from the rows of a CSV file.

    Each row is read through ``DataframeDataset`` and must provide the
    columns ``modified_sequence``, ``collision_energy`` and
    ``precursor_charge``.

    Args:
        data_path (Union[str, Path]): Path handed to ``DataframeDataset``.
        tokenizer (Union[str, TAPETokenizer], optional): Either a tokenizer
            instance, or a vocab name used to construct a ``TAPETokenizer``.
            Default: 'iupac'.
    """

    def __init__(self,
                 data_path: Union[str, Path],
                 tokenizer: Union[str, TAPETokenizer] = 'iupac'
                ):
        # A string is treated as a vocab name; anything else is used as-is.
        self.tokenizer = (
            TAPETokenizer(vocab=tokenizer) if isinstance(tokenizer, str) else tokenizer
        )
        self.data = DataframeDataset(Path(data_path))
        self.keys = ['modified_sequence', 'collision_energy', 'precursor_charge']

    def __len__(self) -> int:
        """Number of rows in the underlying dataset."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return ``(token_ids, input_mask, collision_energy, charge)`` for one row.

        ``input_mask`` is all ones with the shape of ``token_ids`` and the
        collision energy is the row value divided by 100.
        """
        record = self.data[index]
        token_ids = self.tokenizer.encode(record['modified_sequence'])
        return (
            token_ids,
            np.ones_like(token_ids),
            record['collision_energy'] / 100,
            record['precursor_charge'],
        )

    def collate_fn(self, batch: List[Tuple[Any, ...]]) -> Dict[str, torch.Tensor]:
        """Combine ``__getitem__`` tuples into a dict of batched tensors.

        Token ids and masks are padded with 0 via ``pad_sequences``; charges
        are one-hot encoded with ``get_precursor_charge_onehot``.
        """
        sequences, masks, energies, charges = tuple(zip(*batch))
        charge_onehot = get_precursor_charge_onehot(charges)
        energy_array = np.stack(energies)

        padded_ids = torch.from_numpy(pad_sequences(sequences, 0))
        padded_mask = torch.from_numpy(pad_sequences(masks, 0))

        return {
            'input_ids': padded_ids,
            'input_mask': padded_mask,
            'collision_energy': torch.FloatTensor(energy_array),
            'charge': torch.FloatTensor(charge_onehot),
        }

In [6]:
dataset1 = PrositInputDataset("/sdd/prosittransformer/data/prosit_input.csv")

In [7]:
dataloder1 = DataLoader(dataset1, num_workers=6,
                    collate_fn=dataset1.collate_fn,
                    batch_size=3)

In [8]:
b1 = next(iter(dataloder1))

b1 = {name: tensor.cuda(device=torch.device('cuda:0'), non_blocking=True)
                     for name, tensor in b1.items()}

In [9]:
model = ProteinBertForValuePredictionFragmentationProsit.from_pretrained("/sdd/berzelius/final_results/delta_0.15/")
model = model.to(torch.device('cuda:0'))

In [10]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    x1 = model(**b1)[0].cpu().detach().numpy()

In [None]:
import torch
from tape import TAPETokenizer

In [None]:
from tape import ProteinBertForValuePredictionFragmentationProsit

In [None]:
import pickle as pkl

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from tqdm import tqdm

In [None]:
from prosittransformer.DataHandler import pad_sequences

In [None]:
from prosittransformer.utils import hdf5Loader
from prosittransformer.utils import cleanTapeOutput

In [None]:
from tape.datasets import PrositFragmentationDataset
from torch.utils.data import DataLoader

In [None]:
from tape.datasets import PrositFragmentationDataset
from prosittransformer.prositUtils import sanitize

In [16]:
dataset = PrositFragmentationDataset("/sdd/PrositToTapeDataConverter/LMDB/", split="test")

In [17]:
loader = DataLoader(dataset, 
                    num_workers=6,
                    collate_fn=dataset.collate_fn,
                    batch_size=3)

In [18]:
b = next(iter(loader))

In [19]:
b = {name: tensor.cuda(device=torch.device('cuda:0'), non_blocking=True)
                     for name, tensor in b.items()}

In [20]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    x2 = model(**b)[1].cpu().detach().numpy()

In [21]:
np.allclose(x2[2],x1[2])

True

In [22]:
x2[0]

array([ 5.32360077e-02, -9.01104417e-03, -8.20380263e-03, -6.52382709e-03,
       -8.96532089e-03, -8.20204709e-03,  1.05591372e-01, -1.01910308e-02,
       -8.30774009e-03,  2.46345937e-01, -9.67752188e-03, -7.97981117e-03,
        1.16022319e-01, -9.50901583e-03, -8.15635268e-03, -2.01028213e-03,
       -9.89710446e-03, -8.71482491e-03,  2.18849808e-01,  9.43607930e-03,
       -8.53640959e-03,  4.24455851e-04, -7.14800833e-03, -8.31007212e-03,
        4.38355148e-01,  1.12281423e-02, -8.66800267e-03, -5.71816973e-03,
       -7.53244292e-03, -8.58819112e-03,  2.22736269e-01,  5.47584444e-02,
       -1.00560375e-02, -3.55597399e-03, -7.87417870e-03, -6.74730074e-03,
        6.34075254e-02,  6.80070639e-01, -2.50564306e-03, -9.92926024e-03,
       -7.86462240e-03, -7.83468969e-03,  4.08373401e-02,  3.62772644e-01,
       -3.57127911e-03, -9.26793925e-03, -7.70735368e-03, -8.01162422e-03,
       -1.77023020e-02,  3.71635854e-01, -5.36110066e-03, -1.01088379e-02,
       -1.07557513e-03, -

In [23]:
x1[0]

array([ 5.32360077e-02, -9.01104417e-03, -8.20380263e-03, -6.52382709e-03,
       -8.96532089e-03, -8.20204709e-03,  1.05591401e-01, -1.01910299e-02,
       -8.30774009e-03,  2.46346027e-01, -9.67752188e-03, -7.97981024e-03,
        1.16022319e-01, -9.50901583e-03, -8.15635268e-03, -2.01027095e-03,
       -9.89710353e-03, -8.71482585e-03,  2.18849838e-01,  9.43606254e-03,
       -8.53640959e-03,  4.24463302e-04, -7.14800786e-03, -8.31007212e-03,
        4.38355148e-01,  1.12281367e-02, -8.66800267e-03, -5.71816415e-03,
       -7.53244292e-03, -8.58819112e-03,  2.22736239e-01,  5.47584370e-02,
       -1.00560384e-02, -3.55597027e-03, -7.87417777e-03, -6.74730120e-03,
        6.34075254e-02,  6.80070639e-01, -2.50563584e-03, -9.92925465e-03,
       -7.86462240e-03, -7.83468969e-03,  4.08373177e-02,  3.62772644e-01,
       -3.57127818e-03, -9.26793646e-03, -7.70735415e-03, -8.01162422e-03,
       -1.77023020e-02,  3.71635944e-01, -5.36109880e-03, -1.01088360e-02,
       -1.07557885e-03, -

In [None]:
# NOTE(review): no cell visible in this notebook defines `OUT` (IPython's
# output cache is spelled `Out`), so this presumably raises NameError on a
# fresh kernel — confirm and drop the cell if it is a leftover.
OUT[1]

In [None]:
prosit_pp = hdf5Loader.from_hdf5("/sdd/berzelius/final_results/delta_0.15/torchResult.hdf5")

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items."""
    yield from (
        lst[start:min(start + n, len(lst))]
        for start in range(0, len(lst), n)
    )

In [None]:
collision_energy_aligned_normed = getVals(prosit_pp, "collision_energy_aligned_normed")
precursor_charge_onehot = getVals(prosit_pp, "precursor_charge_onehot")

In [None]:
sequence_integer = getVals(prosit_pp, "sequence_integer")

In [None]:
collision_energy_aligned_normed[0]

In [None]:
precursor_charge_onehot[0]

In [None]:
sequence_integer[0]

In [None]:
seq = sequence_integer[0]
seq = seq[np.nonzero(seq)]

In [None]:
peptide = "".join([ALPHABET_S[s] for s in seq])

In [None]:
model = ProteinBertForValuePredictionFragmentationProsit.from_pretrained("/sdd/berzelius/final_results/delta_0.15/")
model = model.to(torch.device('cuda:0'))

In [None]:
ix = 1

In [None]:
# Fetch the example once: every `dataset[ix]` is a separate __getitem__ call.
example = dataset[ix]
# Give each field a leading batch axis of size 1.
collision_energy = np.array([example[3]])
charge = example[4][None,:]
input_ids = example[0][None,:]
input_mask = example[1][None,:]

In [None]:
# Build the single-example batch on the CPU first ...
toy_data1 = dict(
    collision_energy=torch.FloatTensor(collision_energy.astype(np.float32)),
    charge=torch.FloatTensor(charge.astype(np.float32)),
    input_ids=torch.from_numpy(input_ids.astype(np.int64)),
    input_mask=torch.from_numpy(input_mask.astype(np.int64)),
)

# ... then move every tensor to the first CUDA device.
toy_data1 = {
    key: value.cuda(device=torch.device('cuda:0'), non_blocking=True)
    for key, value in toy_data1.items()
}

In [None]:
model(**toy_data1)[0].cpu().detach().numpy()

In [None]:
CHARGES = [1, 2, 3, 4, 5, 6]
def get_precursor_charge_onehot(charges):
    """One-hot encode precursor charges.

    Args:
        charges: Sequence of integer precursor charges; each must lie in
            ``1..max(CHARGES)``.

    Returns:
        Integer array of shape ``(len(charges), max(CHARGES))`` in which row
        ``i`` holds a single 1 in column ``charges[i] - 1``.

    Raises:
        IndexError: If a charge lies outside ``1..max(CHARGES)``.
    """
    n_states = max(CHARGES)
    array = np.zeros([len(charges), n_states], dtype=int)
    for i, precursor_charge in enumerate(charges):
        # Without this check a charge of 0 (or a negative one) would index
        # from the end of the row and be silently encoded as a high charge.
        if not 1 <= precursor_charge <= n_states:
            raise IndexError(
                f"precursor charge {precursor_charge!r} is outside 1..{n_states}"
            )
        array[i, precursor_charge - 1] = 1
    return array

In [None]:
df = pd.read_csv("/sdd/prosittransformer/data/prosit_input.csv", sep=",")

In [None]:
chunk = df.iloc[0,:].values

In [None]:
collision_energy1 = np.array([chunk[1] / 100.0])

In [None]:
charge1 = get_precursor_charge_onehot([chunk[2]])

In [None]:
peptide_sequences1 = chunk[0]

In [None]:
tokenizer = TAPETokenizer()
def TokenizePeptides(peptides):
    """Tokenize peptides into a padded id matrix and a matching mask.

    Args:
        peptides: Iterable of peptide strings.

    Returns:
        Tuple ``(input_ids, input_mask)``. The mask is 1 on token positions
        and 0 on padding, so peptides of different lengths can share a batch.
    """
    encoded = [tokenizer.encode(p) for p in peptides]
    if not encoded:
        # Nothing to pad; keep the empty-array result of the unpadded version.
        input_ids = np.array(encoded)
        return input_ids, np.ones_like(input_ids)
    # Stacking encodings of unequal length with np.array yields a ragged
    # array, so pad with 0 the same way PrositInputDataset.collate_fn does.
    # Assumes pad_sequences returns a (batch, max_len) numpy array, as its
    # use with torch.from_numpy in collate_fn suggests — TODO confirm.
    input_ids = pad_sequences(encoded, 0)
    input_mask = pad_sequences([np.ones_like(ids) for ids in encoded], 0)
    return input_ids, input_mask

In [None]:
input_ids1, input_mask1 = TokenizePeptides([peptide_sequences1])

In [None]:
# Build the single-example batch on the CPU first ...
toy_data2 = dict(
    collision_energy=torch.FloatTensor(collision_energy1.astype(np.float32)),
    charge=torch.FloatTensor(charge1.astype(np.float32)),
    input_ids=torch.from_numpy(input_ids1.astype(np.int64)),
    input_mask=torch.from_numpy(input_mask1.astype(np.int64)),
)

# ... then move every tensor to the first CUDA device.
toy_data2 = {
    key: value.cuda(device=torch.device('cuda:0'), non_blocking=True)
    for key, value in toy_data2.items()
}

In [None]:
toy_data2

In [None]:
toy_data1

In [None]:
model(**toy_data2)[0].cpu().detach().numpy()

In [None]:
# `chunk` holds the values of a single CSV row, so each field is wrapped in a
# one-element batch. Calling list() on the scalar energy/charge raised
# TypeError, and list() on the sequence split it into single characters.
collision_energy = np.array([chunk[1] / 100.0])
charge = get_precursor_charge_onehot([chunk[2]])
peptide_sequences = [chunk[0]]



In [None]:
# NOTE(review): `DATA` is not assigned in any cell visible here; presumably a
# leftover from an earlier session — confirm, since it would raise NameError
# on a fresh kernel.
DATA[0]

In [None]:
prosit_pp = hdf5Loader.from_hdf5("/sdd/berzelius/final_results/delta_0.15/torchResult.hdf5")
prosit_vanilla = hdf5Loader.from_hdf5("/sdd/PrositToTapeDataConverter/hdf5/hcd/HDF5/prediction_hcd_ho.hdf5")

In [None]:
# Residue -> integer code, numbered from 1 in the order listed; the oxidised
# methionine token "M(ox)" takes the code after the 20 single-letter residues.
ALPHABET = {
    residue: code
    for code, residue in enumerate([*"ACDEFGHIKLMNPQRSTVWY", "M(ox)"], start=1)
}
# Reverse lookup: integer code -> residue.
ALPHABET_S = dict(zip(ALPHABET.values(), ALPHABET.keys()))

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items."""
    yield from (
        lst[start:min(start + n, len(lst))]
        for start in range(0, len(lst), n)
    )

In [None]:
prosit_pp.keys()

In [None]:
def getVals(data, key, chunk_size=1000):
    """Read ``data[key]`` slice by slice and return all rows as one array.

    Args:
        data: Object indexable by ``key``; ``data[key]`` must support
            ``len()`` and slicing.
        key: Name of the entry to read.
        chunk_size: Number of rows read per slice. Default: 1000.

    Returns:
        ``np.ndarray`` built from every row of ``data[key]``.
    """
    M = data[key]
    # Ceiling division so a final partial slice is counted in the progress
    # total (plain int(len / size) undercounts it by one).
    n_chunks = -(-len(M) // chunk_size)
    rows = []
    for b in tqdm(chunks(M, chunk_size), total=n_chunks):
        rows.extend(b)
    return np.array(rows)

In [None]:
prosit_pp.keys()

In [None]:
collision_energy_aligned_normed = getVals(prosit_pp, "collision_energy_aligned_normed")
precursor_charge_onehot = getVals(prosit_pp, "precursor_charge_onehot")


In [None]:
sequence_integer = getVals(prosit_pp, "sequence_integer")

In [None]:
pp_intensities_pred = getVals(prosit_pp, "intensities_pred")
pp_intensities_raw = getVals(prosit_pp, "intensities_raw")

In [None]:
vanilla_intensities_pred = getVals(prosit_vanilla, "intensities_pred")
vanilla_intensities_raw = getVals(prosit_vanilla, "intensities_raw")

In [None]:
X = np.load("/sdd/prosittransformer/data/delta_0.15/predicted_spectra.npy")

In [None]:
X[1]

In [None]:
pp_intensities_pred[1]

In [None]:
charge = np.where(precursor_charge_onehot)[1]

In [None]:
# `charge` holds zero-based one-hot column indices, so this keeps indices 1
# and 2 only.
# NOTE(review): that is precursor charge 2 and 3 once the +1 offset used for
# `charge_val` is applied — confirm this is the intended selection.
mask = [bool(0 < c < 3) for c in charge]

In [None]:
# Keep sequences that do not contain token 21 (the "M(ox)" code in ALPHABET).
mask2 = [21 not in seq for seq in sequence_integer]

In [None]:
sum(mask2)

In [None]:
# Element-wise AND of the charge filter and the sequence filter.
MASK = [bool(m1 and m2) for m1, m2 in zip(mask, mask2)]

In [None]:
pp_sa = cleanTapeOutput.masked_spectral_distance(pp_intensities_raw[MASK], pp_intensities_pred[MASK])
1 - np.median(pp_sa)

In [None]:
vanilla_sa = cleanTapeOutput.masked_spectral_distance(vanilla_intensities_raw[MASK], vanilla_intensities_pred[MASK])
1 - np.median(vanilla_sa)

In [None]:
modified_sequence = ["".join([ALPHABET_S[s] for s in seq[np.nonzero(seq)]]) for seq in sequence_integer[MASK]]

In [None]:
charge_val = charge[MASK] + 1
    

In [None]:
col_energy = collision_energy_aligned_normed[MASK] * 100

In [None]:
len(col_energy)

In [None]:
len(charge_val)

In [None]:
len(modified_sequence)

In [None]:
# One row per peptide, with the fields zipped in the same order as the columns.
records = zip(modified_sequence, col_energy, charge_val)
df = pd.DataFrame(
    [list(record) for record in records],
    columns=["modified_sequence", "collision_energy", "precursor_charge"],
)

In [None]:
df.to_csv("/sdd/prosittransformer/data/prosit_input.csv", index=False, sep=",")

In [None]:
'collision_energy_aligned_normed', 'sequence_integer', 'precursor_charge_onehot', 'masses_pred', 'intensities_pred', 'iRT'

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted
# source. The `with` block closes the file handle once loading is done.
with open("/sdd/prosittransformer/data/all_result.pkl", "rb") as fh:
    all_results = pkl.load(fh)

In [None]:
all_results.keys()

In [None]:
# zip() stops at the shorter input, so compare the lengths first; otherwise a
# truncated list would still be reported as identical.
(
    len(modified_sequence) == len(all_results['modified_sequence'])
    and all(
        p1 == p2
        for p1, p2 in zip(modified_sequence, all_results['modified_sequence'])
    )
)

In [None]:
all_results['modified_sequence']

In [None]:
all_results['iRT']

In [None]:
0 < charge[0] and charge[0] <3

In [None]:
pp_sa = cleanTapeOutput.masked_spectral_distance(pp_intensities_raw, pp_intensities_pred)
1 - np.median(pp_sa)

In [None]:
vanilla_sa = cleanTapeOutput.masked_spectral_distance(vanilla_intensities_raw, vanilla_intensities_pred)
1 - np.median(vanilla_sa)

In [None]:
1 - np.median(pp_sa)

In [None]:
# NOTE(review): in the cells visible here `all_sa` only exists as a local
# inside getVals, so this presumably raises NameError on a fresh kernel —
# confirm which distance array was meant (e.g. pp_sa or vanilla_sa).
np.median(all_sa)