

### Архитектуры моделей

Первая модель:

$\text{Embeddings} + \text{Absolute Positional Embeddings} \rightarrow \text{SDPA} \rightarrow \text{SDPA} \rightarrow \text{UnembeddedOutput}$

Вторая модель:

$\text{Embeddings} + \text{Absolute Positional Embeddings} \rightarrow \text{CausalConvolution} \rightarrow \text{SDPA} \rightarrow \text{UnembeddedOutput}$

Параметры `self-attention` блоков:


### Параметры обучения

В качестве оптимизатора был взят $AdamW$, начальный $lr=3e-4$

In [18]:
from transformers import GPT2Tokenizer

gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [19]:
gpt2_tokenizer.vocab_size

50257

In [20]:
from src.models.AttentionAttentionModel import AttentionAttentionModel

two_layer_transformer = AttentionAttentionModel(
    vocab_size=gpt2_tokenizer.vocab_size,
    emb_dim=512,
    hidden_dim=512 // 1, # Потому что количество голов = 1
    max_seq_length=1024
)

In [21]:
text = "Hello my dear friend"

encoded_text = gpt2_tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")

In [22]:
encoded_text.shape

torch.Size([1, 4])

In [23]:
logits = two_layer_transformer(encoded_text)

tensor([[15496,   616, 13674,  1545]])


In [24]:
logits

tensor([[[ 2.2730e-04,  1.5930e-04, -1.1034e-04,  ...,  1.2887e-05,
           6.0364e-04, -1.6978e-04],
         [ 2.2730e-04,  1.5930e-04, -1.1034e-04,  ...,  1.2887e-05,
           6.0364e-04, -1.6978e-04],
         [ 2.2730e-04,  1.5930e-04, -1.1034e-04,  ...,  1.2887e-05,
           6.0364e-04, -1.6978e-04],
         [ 2.2730e-04,  1.5930e-04, -1.1034e-04,  ...,  1.2887e-05,
           6.0364e-04, -1.6978e-04]]], grad_fn=<UnsafeViewBackward0>)

In [25]:
import torch.nn as nn

In [26]:
nn.Softmax(dim=1)(logits)

tensor([[[0.2500, 0.2500, 0.2500,  ..., 0.2500, 0.2500, 0.2500],
         [0.2500, 0.2500, 0.2500,  ..., 0.2500, 0.2500, 0.2500],
         [0.2500, 0.2500, 0.2500,  ..., 0.2500, 0.2500, 0.2500],
         [0.2500, 0.2500, 0.2500,  ..., 0.2500, 0.2500, 0.2500]]],
       grad_fn=<SoftmaxBackward0>)

In [30]:
gpt2_tokenizer("Once upon a time, in a big lake, there was a brown kayak. The brown kayak liked to roll in the water all day long. It was very happy when it could roll and splash in the lake.\n\nOne day, a little boy named Tim came to play with the brown kayak. Tim and the brown kayak rolled in the water together. They laughed and had a lot of fun. The sun was shining, and the water was warm.\n\nAfter a while, it was time for Tim to go home. He said goodbye to the brown kayak and gave it a big hug. The brown kayak was sad to see Tim go, but it knew they would play together again soon. So, the brown kayak kept rolling in the water, waiting for the next fun day with Tim.", return_tensors="pt")

{'input_ids': tensor([[ 7454,  2402,   257,   640,    11,   287,   257,  1263, 13546,    11,
           612,   373,   257,  7586, 34681,   461,    13,   383,  7586, 34681,
           461,  8288,   284,  4836,   287,   262,  1660,   477,  1110,   890,
            13,   632,   373,   845,  3772,   618,   340,   714,  4836,   290,
         22870,   287,   262, 13546,    13,   198,   198,  3198,  1110,    11,
           257,  1310,  2933,  3706,  5045,  1625,   284,   711,   351,   262,
          7586, 34681,   461,    13,  5045,   290,   262,  7586, 34681,   461,
         11686,   287,   262,  1660,  1978,    13,  1119, 13818,   290,   550,
           257,  1256,   286,  1257,    13,   383,  4252,   373, 22751,    11,
           290,   262,  1660,   373,  5814,    13,   198,   198,  3260,   257,
           981,    11,   340,   373,   640,   329,  5045,   284,   467,  1363,
            13,   679,   531, 24829,   284,   262,  7586, 34681,   461,   290,
          2921,   340,   257,  1263, 1

In [31]:
gpt2_tokenizer.tokenize("Once upon a time, there was a little white cat named Fluffy. Fluffy loved to play with her best friend, a small boy named Timmy. They played outside in the sun every day. Fluffy liked to chase Timmy, and Timmy liked to run.\n\nOne day, Timmy learned a new word at school. He wanted to teach Fluffy the word too. Timmy said, \"Fluffy, the word is 'repeat'. Can you say 'repeat'?\" Fluffy looked at Timmy and said, \"Meow.\" Timmy laughed and said, \"No, Fluffy, say 'repeat'.\"\n\nFluffy tried again and said, \"Meow-peat.\" Timmy clapped his hands and said, \"Good job, Fluffy! You said the word!\" Fluffy was very happy. She liked learning new words with Timmy. From that day on, Fluffy and Timmy played a game where they would teach each other new words. They had lots of fun together, and they lived happily ever after.")

['Once',
 'Ġupon',
 'Ġa',
 'Ġtime',
 ',',
 'Ġthere',
 'Ġwas',
 'Ġa',
 'Ġlittle',
 'Ġwhite',
 'Ġcat',
 'Ġnamed',
 'ĠFl',
 'uffy',
 '.',
 'ĠFl',
 'uffy',
 'Ġloved',
 'Ġto',
 'Ġplay',
 'Ġwith',
 'Ġher',
 'Ġbest',
 'Ġfriend',
 ',',
 'Ġa',
 'Ġsmall',
 'Ġboy',
 'Ġnamed',
 'ĠTim',
 'my',
 '.',
 'ĠThey',
 'Ġplayed',
 'Ġoutside',
 'Ġin',
 'Ġthe',
 'Ġsun',
 'Ġevery',
 'Ġday',
 '.',
 'ĠFl',
 'uffy',
 'Ġliked',
 'Ġto',
 'Ġchase',
 'ĠTim',
 'my',
 ',',
 'Ġand',
 'ĠTim',
 'my',
 'Ġliked',
 'Ġto',
 'Ġrun',
 '.',
 'Ċ',
 'Ċ',
 'One',
 'Ġday',
 ',',
 'ĠTim',
 'my',
 'Ġlearned',
 'Ġa',
 'Ġnew',
 'Ġword',
 'Ġat',
 'Ġschool',
 '.',
 'ĠHe',
 'Ġwanted',
 'Ġto',
 'Ġteach',
 'ĠFl',
 'uffy',
 'Ġthe',
 'Ġword',
 'Ġtoo',
 '.',
 'ĠTim',
 'my',
 'Ġsaid',
 ',',
 'Ġ"',
 'Fl',
 'uffy',
 ',',
 'Ġthe',
 'Ġword',
 'Ġis',
 "Ġ'",
 'repeat',
 "'.",
 'ĠCan',
 'Ġyou',
 'Ġsay',
 "Ġ'",
 'repeat',
 "'",
 '?"',
 'ĠFl',
 'uffy',
 'Ġlooked',
 'Ġat',
 'ĠTim',
 'my',
 'Ġand',
 'Ġsaid',
 ',',
 'Ġ"',
 'Me',
 'ow',
 '."',
 'ĠTim',
 'my'

In [34]:
from datasets import load_dataset

In [37]:
tiny_stories = load_dataset("roneneldan/TinyStories")

In [39]:
tiny_stories["train"]

Dataset({
    features: ['text'],
    num_rows: 2119719
})

In [1]:
from src.train.dataloader import get_dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_dataloder = get_dataloader(tokenizer_name="gpt2", dataset_name="roneneldan/TinyStories", split="train", batch_size=4)

Map:   0%|          | 0/2119719 [00:00<?, ? examples/s]


[[3198, 1110, 11, 257, 1310, 2576, 3706, 20037, 1043, 257, 17598, 287, 607, 2119, 13, 1375, 2993, 340, 373, 2408, 284, 711, 351, 340, 780, 340, 373, 7786, 13, 20037, 2227, 284, 2648, 262, 17598, 351, 607, 1995, 11, 523, 673, 714, 34249, 257, 4936, 319, 607, 10147, 13, 198, 198, 43, 813, 1816, 284, 607, 1995, 290, 531, 11, 366, 29252, 11, 314, 1043, 428, 17598, 13, 1680, 345, 2648, 340, 351, 502, 290, 34249, 616, 10147, 1701, 2332, 1995, 13541, 290, 531, 11, 366, 5297, 11, 20037, 11, 356, 460, 2648, 262, 17598, 290, 4259, 534, 10147, 526, 198, 198, 41631, 11, 484, 4888, 262, 17598, 290, 384, 19103, 262, 4936, 319, 20037, 338, 10147, 13, 632, 373, 407, 2408, 329, 606, 780, 484, 547, 7373], [7454, 2402, 257, 640, 11, 612, 373, 257, 1310, 1097, 3706, 1355, 538, 13, 1355, 538, 6151, 284, 467, 3049, 290, 711, 287, 262, 4252, 13, 1355, 538, 373, 257, 5448, 1097, 780, 339, 1464, 550, 922, 5252, 13, 4599, 5252, 925, 1355, 538, 3772, 290, 1913, 13, 198, 198, 3198, 1110, 11, 1355, 538, 373, 5059,

TypeError: list indices must be integers or slices, not tuple