In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import numpy as np
import csv
from tokenizers import Tokenizer
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from typing import Union

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained('gpt2', add_prefix_space=True)

In [3]:
data = []
with open('data/IMDB Dataset.csv') as f:
    r = csv.reader(f, delimiter=',', quotechar='"')
    for row in r:
        data.append(row)
texts, sentiment = list(zip(*data[1:]))

In [4]:
texts = list(texts)

In [5]:
vocab_size = 16000
emb_dim = 512
context_length = 1024

In [6]:
tokenizer.pad_token

Using pad_token, but it is not set yet.


In [7]:
tokenizer.pad_token = tokenizer.eos_token

In [30]:
x = tokenizer(
    text = texts[0][:context_length],
    text_pair = texts[0][1:context_length+1],
    padding = True,
    return_tensors = 'pt'
)

In [31]:
x.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
text_chunks = []
text_chunk = ''
start_at = 0
for text in texts:
    remaining_text = text.copy()
    while len(remaining_text) > 0:
        end_at = context_length - len(text_chunk) + 1
        text_chunk = text_chunk + remaining_text[start_at:end_at]
        remaining_text = remaining_text[end_at:]
        start_at = end_at - 1
        if len(text_chunk) == context_length

    if len(text_chunk) == 1024:
        text_chunks.append(text_chunk)
        text_chunk = ''
        start_at = 0
        continue
    
    if len(text[end_at:]) > 0:
        text


In [17]:
class LM_DataSet(Dataset):

    def __init__(
        self,
        tokenizer: Union[str, Tokenizer],
        texts: iter,
        vocab_size: int,
        context_length: int,
        tokenizer_batch_size: int = 512
    ):
        self.vocab_size = vocab_size
        self.context_length = context_length
        
        if isinstance(tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        else:
            self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token

        if isinstance(texts, str):
            texts = [texts]
        self.tokenizer.train_new_from_iterator(texts, vocab_size=vocab_size)

        idx = 0
        idx_next = tokenizer_batch_size
        self.tokenized_texts = defaultdict(np.array)
        self.id_ar = np.array([], dtype=int)
        while idx <= len(texts):
            tokenized_batch = self.tokenizer(
                texts[idx:idx_next],
                padding = False,
                truncation = False,
                return_tensors = 'np'
            )
            for k,v in tokenized_batch.items():
                try:
                    self.tokenized_texts[k] = np.concatenate(self.tokenized_texts[k], v)
                except:
                    self.tokenized_texts[k] = v
            self.id_ar = np.append(self.id_ar, np.array([len(x) for x in tokenized_batch['input_ids']]))
            idx += tokenizer_batch_size
            idx_next += tokenizer_batch_size

    def __len__(self):
        return self.id_ar.sum()

    def __getitem__(self, idx):
        start_idx = max(idx-self.context_length, 0)
        end_idx = min(ar.shape+1, idx+1)
        ar = self.id_ar[start_idx, end_idx]
        text_id = ar[-2]
        ar[~text_id] = tokenizer.pad_token_id
        return ar[:-1], ar[1:]

In [18]:
ds = LM_DataSet(
    'gpt2',
    texts,
    vocab_size,
    context_length
)





Token indices sequence length is longer than the specified maximum sequence length for this model (1081 > 1024). Running this sequence through the model will result in indexing errors





In [19]:
ds.tokenized_texts['input_ids']

array([list([464, 717, 44153, 39168, 507, 11, 262, 2656, 11, 318, 616, 4004, 3807, 286, 477, 640, 13, 632, 373, 281, 4112, 30669, 13, 1406, 703, 319, 4534, 714, 484, 787, 257, 16304, 523, 31464, 2089, 13, 10490, 16738, 402, 14203, 373, 2818, 287, 262, 717, 3807, 13, 554, 428, 530, 11, 14235, 12620, 22523, 13, 1375, 318, 7818, 13, 843, 3521, 470, 484, 423, 1043, 257, 17943, 508, 1682, 3114, 588, 10490, 16738, 402, 14203, 30, 1629, 1551, 262, 976, 4190, 3124, 10185, 1312, 1612, 1282, 319, 13, 12325, 30833, 2125, 470, 355, 2089, 355, 12620, 11, 475, 339, 318, 5543, 7818, 618, 3688, 284, 6047, 11877, 3757, 13, 383, 26190, 287, 262, 717, 2646, 318, 390, 1442, 11, 37268, 913, 11, 290, 881, 517, 6181, 621, 262, 26190, 287, 262, 662, 31735, 13, 843, 644, 318, 510, 351, 262, 1621, 1627, 13, 632, 6209, 2925, 588, 428, 986, 27, 1671, 1220, 6927, 1671, 11037, 16, 12, 26190, 468, 257, 2089, 1128, 379, 465, 717, 1524, 11, 523, 262, 3807, 1139, 11, 3584, 340, 15802, 2147, 546, 683, 290, 465, 10691, 1

In [43]:
tokenizer.train_new_from_iterator(texts, vocab_size=vocab_size)






KeyboardInterrupt: 

In [84]:
tokenizer_batch_size = 512

In [89]:
tokenizer(texts[512:1024])

{'input_ids': [[1532, 314, 550, 1900, 428, 3807, 373, 18976, 287, 262, 41851, 803, 290, 8358, 589, 12, 48016, 8532, 1326, 6957, 3918, 11, 314, 561, 1239, 423, 26399, 340, 13, 15933, 11, 314, 1718, 257, 4850, 9862, 329, 262, 21547, 624, 1108, 290, 2921, 340, 257, 2823, 13, 314, 15436, 257, 845, 11, 845, 11, 845, 890, 16571, 2431, 878, 3501, 510, 13, 632, 338, 655, 14262, 11, 2181, 43787, 665, 37382, 29847, 1671, 1220, 6927, 1671, 11037, 464, 938, 4141, 3807, 314, 2497, 373, 366, 22834, 590, 1, 290, 340, 1165, 373, 2495, 39656, 11, 475, 379, 1551, 262, 4676, 373, 11831, 290, 407, 12704, 866, 262, 44184, 286, 262, 3435, 477, 262, 640, 13, 314, 716, 44432, 379, 262, 8282, 11533, 286, 8532, 1326, 6957, 11292, 1377, 340, 1183, 4929, 319, 287, 2253, 546, 262, 976, 640, 355, 262, 1306, 1263, 17645, 286, 443, 1676, 1837, 13, 357, 1026, 338, 1444, 8532, 1326, 6957, 780, 326, 338, 262, 2811, 1271, 286, 1661, 262, 10544, 389, 47320, 287, 262, 4151, 416, 262, 4676, 2014, 27, 1671, 1220, 6927, 1671,

In [86]:
batch_start = 0
ds_length = 0
for batch in tokenizer(texts[batch_start:tokenizer_batch_size]):
    for l in batch['input_ids']:
        ds_length += len(l)
    batch_start += tokenizer_batch_size

Token indices sequence length is longer than the specified maximum sequence length for this model (1081 > 1024). Running this sequence through the model will result in indexing errors


TypeError: string indices must be integers

In [76]:
batch

'input_ids'

In [71]:
batch = tokenizer(texts[:1024])

In [73]:
batch['input_ids'].shape

AttributeError: 'list' object has no attribute 'shape'

In [52]:
tokenizer.convert_tokens_to_ids(['<|endoftext|>'])

[50256]

In [50]:
token_emb = torch.nn.Embedding(vocab_size, emb_dim)

In [48]:
batch['input_ids']

[[3198,
  286,
  262,
  584,
  30702,
  468,
  4750,
  326,
  706,
  4964,
  655,
  352,
  18024,
  4471,
  345,
  1183,
  307,
  23373,
  13,
  1119,
  389,
  826,
  11,
  355,
  428,
  318,
  3446,
  644,
  3022,
  351,
  502,
  29847,
  1671,
  1220,
  6927,
  1671,
  11037,
  464,
  717,
  1517,
  326,
  7425,
  502,
  546,
  18024,
  373,
  663,
  24557,
  290,
  42880,
  8589,
  278,
  8188,
  286,
  3685,
  11,
  543,
  900,
  287,
  826,
  422,
  262,
  1573,
  10351,
  13,
  9870,
  502,
  11,
  428,
  318,
  407,
  257,
  905,
  329,
  262,
  18107,
  2612,
  276,
  393,
  44295,
  13,
  770,
  905,
  16194,
  645,
  25495,
  351,
  13957,
  284,
  5010,
  11,
  1714,
  393,
  3685,
  13,
  6363,
  318,
  22823,
  11,
  287,
  262,
  6833,
  779,
  286,
  262,
  1573,
  29847,
  1671,
  1220,
  6927,
  1671,
  11037,
  1026,
  318,
  1444,
  440,
  57,
  355,
  326,
  318,
  262,
  21814,
  1813,
  284,
  262,
  34374,
  22246,
  4765,
  1812,
  7507,
  48324,
  560,
  13,
  