In [2]:
%config Completer.use_jedi = False

In [5]:
import pandas as pd
import numpy as np

from tape import ProteinBertForValuePredictionFragmentationProsit
import numpy as np
from tape import TAPETokenizer
from prosittransformer.DataHandler import pad_sequences
import torch

In [6]:
import tempfile

In [7]:
from prosittransformer.prositUtils.tensorize import csv
from prosittransformer.prositUtils import sanitize

In [8]:
from prosittransformer.prositUtils.converters import generic

In [9]:
CHARGES = [1, 2, 3, 4, 5, 6]
def get_precursor_charge_onehot(charges):
    """One-hot encode precursor charge states.

    Parameters
    ----------
    charges : sequence of int
        Precursor charges; each must lie in ``1..max(CHARGES)``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(charges), max(CHARGES))`` holding a
        single 1 per row, in column ``charge - 1``.

    Raises
    ------
    IndexError
        If a charge is smaller than 1 or larger than ``max(CHARGES)``.
    """
    n_states = max(CHARGES)
    array = np.zeros([len(charges), n_states], dtype=int)
    for i, precursor_charge in enumerate(charges):
        # Charges < 1 must be rejected explicitly: `charge - 1` would become a
        # negative index and silently wrap around to one of the last columns.
        if not 1 <= precursor_charge <= n_states:
            raise IndexError(
                "precursor charge %r is outside the supported range 1..%d"
                % (precursor_charge, n_states)
            )
        array[i, precursor_charge - 1] = 1
    return array

tokenizer = TAPETokenizer()
def TokenizePeptides(peptides):
    """Tokenize peptide strings into a padded id batch and its attention mask.

    Parameters
    ----------
    peptides : iterable of str
        Peptide sequences; each one is passed to ``tokenizer.encode``.

    Returns
    -------
    tuple
        ``(input_ids, input_mask)``, both produced by ``pad_sequences``.
        ``input_mask`` is 1 on real tokens; padded positions carry whatever
        fill value ``pad_sequences`` uses.
    """
    token_ids = [tokenizer.encode(p) for p in peptides]
    input_ids = pad_sequences(token_ids)
    # Build the mask per sequence *before* padding. Taking np.ones_like of the
    # already padded batch would flag the padding as real tokens whenever the
    # peptides in a batch differ in length.
    # NOTE(review): assumes pad_sequences fills with 0 by default — confirm.
    input_mask = pad_sequences([np.ones_like(ids) for ids in token_ids])
    return input_ids, input_mask

def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

def setData(chunk):
    """Convert one column-indexed batch into the tensors passed to the model.

    ``chunk[0]`` holds the peptide sequences, ``chunk[1]`` the collision
    energies (divided by 100 here) and ``chunk[2]`` the precursor charges.

    Returns a dict with the keys ``collision_energy`` and ``charge`` (float32
    tensors) and ``input_ids`` and ``input_mask`` (int64 tensors).
    """
    energies = np.hstack(list(chunk[1] / 100))
    charge_onehot = get_precursor_charge_onehot(list(chunk[2]))
    sequences = list(chunk[0])
    token_ids, token_mask = TokenizePeptides(sequences)

    return {
        'collision_energy': torch.FloatTensor(energies.astype(np.float32)),
        'charge': torch.FloatTensor(charge_onehot.astype(np.float32)),
        'input_ids': torch.from_numpy(token_ids.astype(np.int64)),
        'input_mask': torch.from_numpy(token_mask.astype(np.int64)),
    }

In [10]:
!ls ./../data/

2021-10-28-yeast-reviewed-UP000002311.fas.trypsin.z3_nce33.csv
2021-10-28-yeast-reviewed-UP000002311.fas.trypsin.z3_nce33_prsoit++.csv
all_result.pkl
delta_0.15
gamma_0
intensity.pkl
iRT.npy
predicted_spectra.npy
prosit_input.csv
prosit_input_processed.pkl
test_out.csv


In [11]:
# Load the comma-separated peptide table from the data directory.
df = pd.read_csv(
    "./../data/2021-10-28-yeast-reviewed-UP000002311.fas.trypsin.z3_nce33.csv",
    sep=",",
)

In [12]:
df.shape

(446585, 3)

In [13]:
# Keep only the first 500 rows so the rest of the notebook runs quickly.
df = df.iloc[:500]

In [14]:
# Convert the table with the project's `csv` tensorize helper. `x` is indexed
# by string keys further down (e.g. "collision_energy_aligned_normed").
# NOTE(review): predictions computed from `df` are attached to `x` row by row
# later, which presumes csv() keeps df's row order and count — confirm.
x = csv(df)

In [15]:
!ls "/sdd/berzelius/final_results/"

delta_0.0   delta_0.1	delta_0.19  delta_0.29
delta_0.05  delta_0.15	delta_0.24  delta_0.34


In [16]:
# Load the weights stored under delta_0.0 and place the model on the first
# CUDA device.
model = (
    ProteinBertForValuePredictionFragmentationProsit
    .from_pretrained("/sdd/berzelius/final_results/delta_0.0")
    .to(torch.device('cuda:0'))
)


In [17]:
# Run the model over `df` in mini-batches of 64 rows and collect the outputs.
# This is pure inference: put the model in eval mode (disables dropout and
# similar train-only behaviour) and skip autograd bookkeeping to save memory.
model.eval()
y_list = list()
with torch.no_grad():
    for chunk in chunker(df.values, 64):
        # setData indexes by column, hence the transpose.
        data = setData(chunk.T)
        data = {name: tensor.cuda(device=torch.device('cuda:0'), non_blocking=True)
                for name, tensor in data.items()}
        # The first element of the model output is used as the prediction.
        y = model(**data)[0].cpu().numpy()
        y_list.append(y)

In [18]:
# Stack the per-batch outputs back into one array, batch after batch.
preds = np.concatenate(y_list, axis=0)

In [19]:
# Attach the model output to `x` under the "intensities_pred" key that the
# following cells read.
x["intensities_pred"] = preds

In [20]:
x["intensities_pred"].shape

(500, 174)

In [21]:
x["collision_energy_aligned_normed"].shape

(500, 1)

In [22]:
# Post-process `x` with the project's sanitize.prediction helper.
# Note: this rebinds `data`, the same name the batching loop above used for
# its per-batch tensor dict.
data = sanitize.prediction(x)

In [23]:
data["intensities_pred"].shape

(500, 174)

In [24]:
# Dummy iRT values (0, 1, 2, ...), one per predicted spectrum. The length is
# taken from the predictions instead of a hard-coded 500 so it stays correct
# when the number of rows changes.
data["iRT"] = np.arange(len(data["intensities_pred"]))

In [25]:
# Make iRT a column vector. reshape(-1, 1) is used instead of [:, None] so
# that re-running this cell is a no-op rather than adding yet another axis.
data["iRT"] = data["iRT"].reshape(-1, 1)

In [29]:
data.keys()

dict_keys(['collision_energy_aligned_normed', 'sequence_integer', 'precursor_charge_onehot', 'masses_pred', 'intensities_pred', 'iRT'])

In [None]:
# Reserve a temporary file path for the converter output. Only the path
# (tmp_f.name) is used afterwards, so the handle is closed immediately;
# delete=False keeps the file on disk after closing.
with tempfile.NamedTemporaryFile(delete=False) as tmp_f:
    pass

In [None]:
# Set up the project's generic converter with the prediction dict and the
# temporary file path. NOTE(review): presumably the path is the output
# destination — confirm against generic.Converter.
c = generic.Converter(data, tmp_f.name)

In [None]:
# Run the conversion; the next cell reads the result back from tmp_f.name.
c.convert()

In [None]:
# Read the converter output back in. `sep` is passed by keyword because
# pandas >= 2.0 no longer accepts it positionally.
I = pd.read_csv(tmp_f.name, sep=",")

In [None]:
I

In [None]:
# Load the existing test_out.csv for comparison with the freshly converted
# output. `sep` is passed by keyword because pandas >= 2.0 no longer accepts
# it positionally.
X = pd.read_csv("/sdd/prosittransformer/data/test_out.csv", sep=",")

In [None]:
set(I.ModifiedPeptide)

In [None]:
X

In [None]:
X

In [None]:
!wc -l {tmp_f.name}

In [None]:
np.array([-1 for i in range(100)]).shape

In [None]:
# NOTE(review): the data.keys() output earlier in this notebook lists no
# "input_mask" key, so with `data` bound to the sanitize.prediction result
# this lookup raises KeyError. It only works when `data` is a batch dict
# returned by setData — confirm which one is intended or drop this cell.
data["input_mask"]

In [None]:
# Load the delta_0.34 checkpoint. The directory listing of
# /sdd/berzelius/final_results/ shown earlier in this notebook contains
# "delta_0.34", not the unrounded "delta_0.34177215189873417" used before.
model = ProteinBertForValuePredictionFragmentationProsit.from_pretrained("/sdd/berzelius/final_results/delta_0.34")


### Move model and inputs to the GPU

In [None]:
# `toy_data` exists only as a local inside setData, so on a fresh kernel this
# cell would fail with NameError. Build it here from the first mini-batch of
# `df` (setData indexes by column, hence the transpose).
toy_data = setData(next(chunker(df.values, 64)).T)

# Move both the model and every input tensor to the first CUDA device.
model = model.to(torch.device('cuda:0'))
toy_data = {name: tensor.cuda(device=torch.device('cuda:0'), non_blocking=True)
            for name, tensor in toy_data.items()}

### Predict

In [None]:
# Inference only: eval mode disables dropout and similar train-only
# behaviour, and no_grad avoids building an autograd graph for the forward
# pass.
model.eval()
with torch.no_grad():
    y = model(**toy_data)[0].cpu().numpy()

In [None]:
y.shape