In [1]:
import os
import torch
from tqdm import tqdm
from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import time
from datetime import timedelta

In [2]:
tok = RobertaTokenizerFast.from_pretrained("tok/")

In [3]:
# Read masked sequences and labels

masked_sequences = []
with open("data/sequences_masked_12M.txt", 'r') as masked_file:
    for i, line in enumerate(masked_file):
        seq = line.strip()
        masked_sequences.append(seq)

labels_from_file = []
with open("data/labels_12M.txt", "r") as labels_file:
    for i, line in enumerate(labels_file):
        lbl = line.strip()
        labels_from_file.append(lbl)

In [4]:
masked_sequences[2]

'MF<mask>FLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAISGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTXDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVXAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTYGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIDDTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSHRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPINFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILARLDKVEAEVQIDRLI

In [5]:
# Tokenize sequences and labels

data_dict = {'input_ids': [], 'attention_mask': [], 'labels': []}

start_time = time.time()

pbar = tqdm(total=len(masked_sequences), position=0)
for i in range(len(masked_sequences)):
    temp_tok_seq = tok.encode_plus(masked_sequences[i])
    data_dict['input_ids'].append(temp_tok_seq['input_ids'])
    data_dict['attention_mask'].append(temp_tok_seq['attention_mask'])
    temp_tok_label = tok.encode(labels_from_file[i])[1]
    temp_tok_label_ext = [-100 if t != tok.mask_token_id else temp_tok_label for t in temp_tok_seq['input_ids']]
    data_dict['labels'].append(temp_tok_label_ext)
    pbar.update(1)  # Update the progress bar for each processed sequence
pbar.close()

elapsed_time = time.time() - start_time
formatted_time = str(timedelta(seconds=elapsed_time))
print(f"Elapsed time: {formatted_time}")

100%|██████████| 12710183/12710183 [1:14:04<00:00, 2859.95it/s]

Elapsed time: 1:14:04.358525





In [6]:
data_dict

{'input_ids': [[0, 4, 142, 1951, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 16, 4, 51, 1951, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 2764, 4, 2893, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 5757, 4, 1951, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 522, 4, 729, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 768, 4, 2109, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 522, 62, 4, 40, 701, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 2301, 4, 19, 701, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 2301, 15, 4, 701, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 1212, 4, 25, 4296, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 1212, 15, 4, 4296, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 2134, 4, 86, 486, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 2134, 22, 4, 20, 486, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 2213, 4, 486, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 1894, 4, 46, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0, 2297, 4, 17, 1394, 877, 5032, 2466, 2762, 3065, 2],
  [0

In [7]:
# Convert your data to PyTorch tensors
input_ids = [torch.tensor(sample) for sample in data_dict['input_ids']]
attention_mask = [torch.tensor(sample) for sample in data_dict['attention_mask']]
labels = [torch.tensor(sample) for sample in data_dict['labels']]

In [8]:
# Pad sequences to the same length
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tok.pad_token_id)
attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
labels = pad_sequence(labels, batch_first=True, padding_value=-100)

In [9]:
# Create a custom dataset
tds = TensorDataset(input_ids, attention_mask, labels)

In [10]:
torch.save(tds, "data/tensor_dataset_12M.pth")