In [21]:
%config Completer.use_jedi = False

import numpy as np
from tqdm import tqdm

In [3]:
from tape.datasets import PrositFragmentationDataset

In [39]:
# Source dataset that is later split into per-collision-energy buckets.
# The two arguments are presumably the LMDB directory and the split name
# (TODO confirm against tape.datasets.PrositFragmentationDataset).
# NOTE(review): the path is absolute and machine-specific.
PrositData = PrositFragmentationDataset("/sdd/PrositToTapeDataConverter/LMDB", "test")

In [40]:
# One empty bucket per value of interest (used as the collision-energy keys
# handed to getCEdata); every key gets its own independent list.
x = {energy: [] for energy in (0.25, 0.3, 0.35, 0.4, 0.45, 0.5)}

In [43]:
def getCEdata(ceDataDict:dict, Dataset: PrositFragmentationDataset)->dict:
    """Sort every sample of ``Dataset`` into the bucket matching its element 3.

    Args:
        ceDataDict: Maps a numeric key (the collision energies of interest) to
            a list. Matching samples are appended to these lists in place.
        Dataset: Indexable, sized dataset. Element 3 of each sample is rounded
            to two decimals and compared against every key.

    Returns:
        The same ``ceDataDict`` object that was passed in. Samples whose
        rounded value equals none of the keys are not stored anywhere.
    """
    # Keys are compared as float32 scalars (presumably the precision of the
    # stored value - TODO confirm). Convert them once, not once per sample.
    bucket_keys = [(k, np.array(k, dtype=np.float32)) for k in ceDataDict.keys()]
    if not bucket_keys:
        # Nothing can match, so do not read the dataset at all.
        return ceDataDict

    for i in tqdm(range(len(Dataset))):
        # Index the dataset exactly once per sample; previously it was
        # re-indexed for every key and again when appending.
        sample = Dataset[i]
        rounded_value = np.round(sample[3], 2)
        for k, k_as_float32 in bucket_keys:
            if rounded_value == k_as_float32:
                ceDataDict[k].append(sample)
    return ceDataDict


In [59]:
from torch.utils.data import Dataset
from tape.tokenizers import TAPETokenizer
from typing import List, Tuple, Any, Dict
import torch

In [83]:
from torch.utils.data import DataLoader, RandomSampler, Dataset

In [85]:
from tape.utils._sampler import BucketBatchSampler

In [108]:
class PrositFragmentationDataset(Dataset):
    """In-memory dataset that replays stored samples at one fixed collision energy.

    Every stored sample is a tuple laid out as
    ``(input_ids, input_mask, intensities_raw, collision_energy, charge)``
    (the order ``collate_fn`` unpacks). ``__getitem__`` replaces element 3 with
    the ``ce`` given at construction and returns the rest unchanged.

    NOTE(review): this class reuses the name of the ``PrositFragmentationDataset``
    imported from ``tape.datasets`` earlier in the notebook, so that import is
    shadowed once this cell has run.
    """

    def __init__(self,
                 data: List[Tuple[Any, ...]],
                 ce: float):
        """Store the samples and the collision energy to substitute.

        Args:
            data: Sequence of sample tuples, indexed by integer position.
            ce: Collision energy reported for every sample instead of the
                stored one.
        """
        tokenizer = TAPETokenizer(vocab="iupac")
        self.tokenizer = tokenizer
        self.data = data
        self.ce = ce
        # Not read anywhere in this class; kept so external readers of the
        # attribute keep working.
        self.keys = [
                     'intensities_raw',
                     'collision_energy_aligned_normed',
                     'precursor_charge_onehot'
                     ]

    def __len__(self) -> int:
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, index: int) -> Tuple[Any, ...]:
        """Return sample ``index`` with element 3 replaced by ``self.ce``.

        The stored tuple is not modified; a new tuple is assembled from slices.
        """
        sample = self.data[index]
        # 0-d float32 array holding the substituted collision energy.
        ce_override = np.array(self.ce, dtype=np.float32)
        return sample[:3] + (ce_override,) + sample[4:]

    def collate_fn(self, batch: List[Tuple[Any, ...]]) -> Dict[str, torch.Tensor]:
        """Merge a list of samples into a dict of batched tensors.

        ``input_ids`` and ``input_mask`` are padded with 0 via ``pad_sequences``
        (imported from ``tape.datasets`` in a later notebook cell; it must be in
        scope before this method is called). The remaining fields are stacked
        along a new leading batch axis and converted to float32.

        Returns:
            Dict with keys ``input_ids``, ``input_mask``, ``targets``,
            ``collision_energy`` and ``charge``.
        """
        input_ids, input_mask, intensities_raw_true_value, collision_energy, charge = tuple(zip(*batch))

        input_ids = torch.from_numpy(pad_sequences(input_ids, 0))
        input_mask = torch.from_numpy(pad_sequences(input_mask, 0))

        # Stack into one ndarray before handing over to torch: building a
        # tensor straight from a tuple of ndarrays goes element by element.
        intensities_raw_true_value = torch.as_tensor(
            np.stack(intensities_raw_true_value), dtype=torch.float32)
        collision_energy_tensor = torch.as_tensor(
            np.stack(collision_energy), dtype=torch.float32)
        charge_tensor = torch.as_tensor(np.stack(charge), dtype=torch.float32)

        return {'input_ids': input_ids,
                'input_mask': input_mask,
                'targets': intensities_raw_true_value,
                'collision_energy': collision_energy_tensor,
                'charge': charge_tensor}

In [122]:
from tape.datasets import pad_sequences

In [115]:
import pickle
import multiprocessing
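# NOTE: the next cell reads `ceDataDict`, and these two commented-out lines are
# the only place in the visible notebook where it is built. Re-enable them (or
# load the pickle they write) before running the cells below on a fresh kernel.
#ceDataDict = getCEdata(x, PrositData)
#pickle.dump(ceDataDict, open("./data/ceDataDict.pkl", "wb"))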

In [110]:
# Take the samples stored under key 0.25 but report 0.3 as their collision
# energy: the dataset class replaces element 3 of every sample with its second
# constructor argument. The "25" in the variable name refers to the source bucket.
ce25Dataset = PrositFragmentationDataset(ceDataDict[0.25], 0.3)

In [116]:
ce25Dataset[0]

(array([ 2, 25, 22, 20, 23, 23, 26,  8, 22, 11, 10,  7,  5, 25, 17, 19, 14,
         3]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([ 0.01465508,  0.        , -1.        ,  0.        ,  0.        ,
        -1.        ,  0.49154934,  0.        , -1.        ,  0.01188552,
         0.        , -1.        ,  0.5985086 ,  0.        , -1.        ,
         0.09070827,  0.        , -1.        ,  0.09378117,  0.        ,
        -1.        ,  0.22752479,  0.        , -1.        ,  0.12325462,
         0.        , -1.        ,  0.14227246,  0.        , -1.        ,
         0.1938528 ,  0.        , -1.        ,  0.13698395,  0.        ,
        -1.        ,  0.11928167,  0.        , -1.        ,  0.15674177,
         0.        , -1.        ,  0.3543688 ,  0.        , -1.        ,
         0.05974996,  0.        , -1.        ,  0.5245358 ,  0.        ,
        -1.        ,  0.02567784,  0.        , -1.        ,  1.        ,
         0.        , -1.        ,  0.057880

In [117]:
# Index sampler over ce25Dataset; it feeds the bucket batch sampler built next.
sampler = RandomSampler(ce25Dataset)

In [118]:
# Group sampled indices into batches. Positional arguments: the index sampler,
# 64 (the batch-size slot, judging by the same call further down that passes
# `batch_size` here), False (presumably drop_last - TODO confirm), a key
# function returning the length of a sample's first element, and the dataset.
batch_sampler = BucketBatchSampler(sampler, 64, False, lambda x: len(x[0]), ce25Dataset)

In [123]:
# Batch composition comes from `batch_sampler`; the dataset's own collate_fn
# turns each list of samples into tensors. One CPU core is left free of workers.
loader = DataLoader(
    dataset=ce25Dataset,
    batch_sampler=batch_sampler,
    collate_fn=ce25Dataset.collate_fn,  # type: ignore
    num_workers=multiprocessing.cpu_count() - 1)

In [127]:
from tape import ProteinBertForValuePredictionFragmentationProsit

In [128]:
# Restore the model from its saved directory, then remember whether CUDA is
# usable and, if so, move the model onto the first GPU.
pytorch_model = ProteinBertForValuePredictionFragmentationProsit.from_pretrained("/sdd/berzelius/torch_model")
use_gpu = torch.cuda.is_available()
if use_gpu:
    pytorch_model = pytorch_model.to(torch.device('cuda:0'))

In [139]:
# Collect ground-truth targets and model predictions batch by batch as NumPy
# arrays (one array per batch in each list).
predictions = list()
targets = list()
# Inference only: without no_grad() the forward pass records an autograd graph
# that is thrown away immediately by .detach(), costing memory and time.
# NOTE(review): assumes from_pretrained left the model in eval mode - confirm.
with torch.no_grad():
    for batch in tqdm(loader):
        # Keep the targets on the CPU before the batch is (possibly) moved to the GPU.
        targets.append(batch["targets"].cpu().detach().numpy())
        if use_gpu:
            batch = {name: tensor.cuda(device=torch.device('cuda:0'), non_blocking=True)
                     for name, tensor in batch.items()}
        # Element 1 of the model output is used as the prediction (element 0 is
        # presumably the loss, since targets are passed in - TODO confirm).
        predictions.append(pytorch_model(**batch)[1].cpu().detach().numpy())
        # Only the first batch is processed; remove this break for a full run.
        break

  0%|          | 0/5317 [00:00<?, ?it/s]


In [140]:
targets

[array([[ 0.00916206,  0.        ,  0.        , ..., -1.        ,
         -1.        , -1.        ],
        [ 0.3706697 ,  0.        , -1.        , ..., -1.        ,
         -1.        , -1.        ],
        [ 0.03486167,  0.        ,  0.        , ..., -1.        ,
         -1.        , -1.        ],
        ...,
        [ 0.17605817,  0.        ,  0.        , ..., -1.        ,
         -1.        , -1.        ],
        [ 0.04416243,  0.        ,  0.        , ..., -1.        ,
         -1.        , -1.        ],
        [ 0.00637203,  0.        ,  0.        , ..., -1.        ,
         -1.        , -1.        ]], dtype=float32)]

In [142]:
predictions

[array([[ 0.06135187, -0.02176052, -0.02159712, ..., -0.02932831,
         -0.02166041, -0.0138475 ],
        [ 0.18359505, -0.0098374 , -0.01145688, ..., -0.00943738,
         -0.00545273,  0.05913763],
        [ 0.15024786, -0.02002238, -0.02081721, ..., -0.02579243,
         -0.01681008, -0.00653753],
        ...,
        [ 0.19604029, -0.01412448, -0.01485772, ..., -0.01979962,
         -0.01729139, -0.00649025],
        [ 0.10406944, -0.02179089, -0.02288099, ..., -0.0273771 ,
         -0.02491859, -0.02711992],
        [ 0.03087524, -0.02073514, -0.02004476, ..., -0.02566377,
         -0.010216  , -0.01909494]], dtype=float32)]

In [144]:
from torchToTF_tape.utils import cleanTapeOutput

ModuleNotFoundError: No module named 'torchToTF_tape'

In [None]:
# NOTE(review): this cell was never executed (In [None]) and does not parse as
# written: everything after the first line is indented as if it were still
# inside a function body. It also reads `model`, `lmdb`, `split`, `batch_size`
# and `getTorchDataLoader`, none of which are defined in the visible notebook.
# It looks like a pasted reference version of the full prediction loop
# (no `break`, results concatenated at the end).
pytorch_model = ProteinBertForValuePredictionFragmentationProsit.from_pretrained(model)
    if torch.cuda.is_available():
        use_gpu = True
        pytorch_model = pytorch_model.to(torch.device('cuda:0'))
    else:
        use_gpu = False
   
    loader = getTorchDataLoader(lmdb, split, batch_size = batch_size)
    predictions = list()
    for batch in tqdm(loader):  
        if use_gpu:
            batch = {name: tensor.cuda(device=torch.device('cuda:0'), non_blocking=True)
                     for name, tensor in batch.items()}
        predictions.append(pytorch_model(**batch)[1].cpu().detach().numpy())
    predictions = np.concatenate(predictions)

In [124]:
# Pull a single batch out of the loader for inspection: the loop exits on its
# first iteration, leaving that batch bound to `b`.
for b in loader:
    break

In [125]:
b

{'input_ids': tensor([[ 2, 15, 12, 20,  9, 19, 21, 20, 15, 20, 22,  8, 11, 14,  3],
         [ 2, 14,  9, 22,  9,  9, 25, 10,  8,  5, 15, 27, 15, 14,  3],
         [ 2, 11,  9,  9,  9, 17, 27, 27, 16, 21, 13,  5, 20, 14,  3],
         [ 2, 13, 20, 11, 13, 23, 14, 19,  5, 13, 21, 21, 15,  5,  3],
         [ 2, 25,  9, 22,  9, 13, 14, 25, 19,  8, 25,  9, 15, 14,  3],
         [ 2, 22, 19, 11,  5,  5, 19, 22, 22, 25, 22, 21, 11, 21,  3],
         [ 2, 19, 10,  9,  7, 13,  9,  7, 11, 14, 10, 10, 22, 21,  3],
         [ 2, 19,  5, 22, 22, 15, 11, 23, 11, 22, 19,  5,  5,  8,  3],
         [ 2, 19, 22, 20, 19, 10, 19, 14, 27, 23, 26, 16,  8, 22,  3],
         [ 2, 15, 23,  5,  9,  8,  8, 21, 11, 11, 22, 25, 13, 14,  3],
         [ 2, 15, 23,  9,  8, 15,  9, 28, 12, 12, 15, 15,  8, 14,  3],
         [ 2, 11, 15, 22,  5, 25, 15, 15, 23, 12, 15, 12, 22,  8,  3],
         [ 2,  5, 25, 22, 21, 19, 11, 21, 11,  9, 19, 12, 10, 13,  3],
         [ 2, 20, 13, 28,  5, 20, 10, 21, 11, 23, 25, 12,  9, 14

In [82]:

    # NOTE(review): fragment pasted from inside a function - it is indented at
    # cell level and reads `batch_size`, `dataset` and `num_workers`, which no
    # visible cell defines. The tuple printed below this cell does not match
    # what this code would display, so the output is presumably stale.
    batch_sampler = BucketBatchSampler(
        sampler, batch_size, False, lambda x: len(x[0]), dataset)

    loader = DataLoader(
        dataset,
        num_workers=num_workers,
        collate_fn=dataset.collate_fn,  # type: ignore
        batch_sampler=batch_sampler)

(array([ 2,  8,  5, 23, 12,  8,  9,  5, 25, 20,  5, 15, 14,  3]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([ 0.36813897,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.14776494,  0.        ,  0.        ,  0.23337084,
         0.        ,  0.        ,  0.34552184,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.8956807 ,  0.06529374,
         0.        ,  0.0603044 ,  0.        ,  0.        ,  0.41535375,
         0.14685172,  0.        ,  0.08375509,  0.        ,  0.        ,
         0.17214774,  0.        ,  0.        ,  0.33429173,  0.0717124 ,
         0.        ,  0.02780089,  0.        ,  0.        ,  0.928853  ,
         0.08630364,  0.        ,  0.19358736,  0.        ,  0.        ,
         0.09300248,  0.        ,  0.        ,  0.        ,  0.22067006,
         0.        ,  0.        ,  0.04551147,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
      

In [77]:
ceDataDict[0.25][0][:3] + tuple([np.array(0.3, dtype=np.float32)]) + ceDataDict[0.25][0][4:]

(array([ 2, 25, 22, 20, 23, 23, 26,  8, 22, 11, 10,  7,  5, 25, 17, 19, 14,
         3]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([ 0.01465508,  0.        , -1.        ,  0.        ,  0.        ,
        -1.        ,  0.49154934,  0.        , -1.        ,  0.01188552,
         0.        , -1.        ,  0.5985086 ,  0.        , -1.        ,
         0.09070827,  0.        , -1.        ,  0.09378117,  0.        ,
        -1.        ,  0.22752479,  0.        , -1.        ,  0.12325462,
         0.        , -1.        ,  0.14227246,  0.        , -1.        ,
         0.1938528 ,  0.        , -1.        ,  0.13698395,  0.        ,
        -1.        ,  0.11928167,  0.        , -1.        ,  0.15674177,
         0.        , -1.        ,  0.3543688 ,  0.        , -1.        ,
         0.05974996,  0.        , -1.        ,  0.5245358 ,  0.        ,
        -1.        ,  0.02567784,  0.        , -1.        ,  1.        ,
         0.        , -1.        ,  0.057880

In [72]:
ceDataDict[0.25][0][4:]

(array([0, 1, 0, 0, 0, 0], dtype=uint8),)

In [76]:
([np.array(0.3, dtype=np.float32)])

[array(0.3, dtype=float32)]

In [32]:
x == np.array(0.32, dtype=np.float32)

True

In [24]:
np.round(TEST[0][3], 2)

0.32