In [1]:
# Install the third-party packages this notebook needs.
# `%pip` (rather than `!pip`) guarantees the packages land in the environment
# of the running kernel; one call lets pip resolve all four together.
%pip install -q datasets tokenizers torchmetrics transformers

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.1-py3-none-a

In [2]:
import torch.optim as optim
import torch.nn as nn
import numpy as np
import torch
import re

from collections import Counter
from os.path import exists

from BERT_Model import Transformer
from BERT_Dataset import SentencesDataset
from config import get_bert_config

In [3]:
def get_batch(loader, loader_iter):
    """Fetch the next batch, restarting iteration once the loader runs dry.

    Args:
        loader: Re-iterable batch source (anything accepted by ``iter``).
        loader_iter: Iterator over ``loader`` that is currently being consumed.

    Returns:
        A ``(batch, loader_iter)`` tuple. ``loader_iter`` is a fresh iterator
        when the one passed in was exhausted, otherwise the same object.
    """
    _exhausted = object()  # sentinel so that falsy batches are still returned
    batch = next(loader_iter, _exhausted)
    if batch is _exhausted:
        loader_iter = iter(loader)
        batch = next(loader_iter)
    return batch, loader_iter

def load_sentences(file_path):
    """Read a text file and return its lower-cased lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of strings obtained by lower-casing the whole file and splitting
        it on ``'\\n'`` (a trailing newline therefore yields a final ``''``).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding='utf-8') as file:
        return file.read().lower().split('\n')

def tokenize_sentences(sentences, special_chars):
    """Split sentences into tokens, treating each special character as its own token.

    Args:
        sentences: Iterable of strings.
        special_chars: String whose individual characters are isolated as
            separate tokens. May be empty, in which case sentences are only
            split on spaces.

    Returns:
        List of token lists, one per sentence. Tokens are separated on the
        space character only; empty tokens are dropped.
    """
    if special_chars:
        # Compile once instead of rebuilding the pattern for every sentence.
        pattern = re.compile(f'[{re.escape(special_chars)}]')
        # Raw string: '\g<0>' is a regex back-reference, not a string escape.
        spaced = (pattern.sub(r' \g<0> ', s) for s in sentences)
    else:
        # An empty character class is not a valid regex; nothing to isolate.
        spaced = sentences
    return [[w for w in s.split(' ') if w] for s in spaced]

def create_or_load_vocab(sentences, vocab_path, max_vocab_size):
    """Build a frequency-ordered vocabulary, or load it if it was already saved.

    Args:
        sentences: Iterable of token lists; only used when ``vocab_path``
            does not exist yet.
        vocab_path: File holding one vocabulary entry per line.
        max_vocab_size: Maximum number of entries kept when building the
            vocabulary. Ignored when an existing file is loaded.

    Returns:
        List of vocabulary words, most frequent first.
    """
    if not exists(vocab_path):
        counts = Counter(w for s in sentences for w in s)
        vocab = [word for word, _ in counts.most_common(max_vocab_size)]
        # Explicit UTF-8 keeps the file readable regardless of platform locale.
        with open(vocab_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(vocab))
    else:
        with open(vocab_path, encoding='utf-8') as file:
            # Skip blank lines so an empty file or a trailing newline does not
            # introduce a spurious '' entry.
            vocab = [w for w in file.read().split('\n') if w]
    return vocab

def train_model(model, data_loader, optimizer, loss_model, num_iterations, print_each):
    """Run a fixed number of optimisation steps over ``data_loader``.

    Args:
        model: Module called as ``model(batch['input'])``; it must expose
            ``model.embeddings.weight`` (used for the gradient read-out).
        data_loader: Re-iterable source of batches; each batch is a mapping
            with ``'input'`` and ``'target'`` tensors.
        optimizer: Optimizer stepping the model's parameters.
        loss_model: Callable ``loss_model(logits, targets)`` returning a
            scalar loss tensor.
        num_iterations: Number of optimisation steps to perform.
        print_each: Log every ``print_each`` steps; a falsy value disables
            logging.
    """
    model.train()
    # Send batches to whichever device the model already lives on, instead of
    # assuming CUDA.
    device = next(model.parameters()).device
    batch_iter = iter(data_loader)
    for it in range(num_iterations):
        # get batch (the iterator restarts when the loader is exhausted)
        batch, batch_iter = get_batch(data_loader, batch_iter)

        # infer
        masked_input = batch['input'].to(device, non_blocking=True)
        masked_target = batch['target'].to(device, non_blocking=True)
        output = model(masked_input)

        # compute the cross entropy loss over flattened positions
        output_v = output.view(-1, output.shape[-1])
        # view(-1) stays 1-D even when there is a single target element
        target_v = masked_target.view(-1)
        loss = loss_model(output_v, target_v)

        # compute gradients
        loss.backward()

        # apply gradients
        optimizer.step()

        # print step (must happen before the gradients are reset below)
        if print_each and it % print_each == 0:
            print('it:', it,
                  ' | loss', np.round(loss.item(), 2),
                  ' | Δw:', round(model.embeddings.weight.grad.abs().sum().item(), 3))

        # reset gradients
        optimizer.zero_grad()


def save_embeddings(model, dataset, num_embeddings, values_path, names_path):
    """Export the first embedding vectors and their vocabulary names.

    Args:
        model: Object exposing ``model.embeddings.weight`` (a tensor).
        dataset: Object exposing ``dataset.rvocab``, indexable by integer
            token id to obtain the token string.
        num_embeddings: Maximum number of leading rows to export.
        values_path: Output file for the vectors (tab-separated, 2 decimals).
        names_path: Output file for the token names, one per line.
    """
    weights = model.embeddings.weight.detach().cpu().numpy()
    # Clamp so the vector file and the name file always have the same number
    # of rows, even if more rows are requested than the table holds.
    N = max(0, min(num_embeddings, weights.shape[0]))
    np.savetxt(values_path, np.round(weights[:N], 2), delimiter='\t', fmt='%1.2f')
    names = [dataset.rvocab[i] for i in range(N)]
    # Explicit UTF-8: vocabulary entries may contain non-ASCII characters.
    with open(names_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(names))

def main():
    """Run the BERT pipeline end to end.

    Loads and tokenizes the training text, builds (or reloads) the vocabulary,
    creates the dataset and loader, instantiates the model on the GPU, trains
    it and finally exports the first 3000 embeddings to ``values.tsv`` /
    ``names.tsv``.
    """
    cfg = get_bert_config()

    print('loading text...')
    raw_sentences = load_sentences('/content/data/training.txt')

    print('tokenizing sentences...')
    token_lists = tokenize_sentences(raw_sentences, ',?;.:/*!+-()[]{}"\'&')

    print('creating/loading vocab...')
    vocab = create_or_load_vocab(token_lists, '/content/data/vocab.txt', cfg['n_vocab'])

    print('creating dataset...')
    dataset = SentencesDataset(token_lists, vocab, cfg['seq_len'])
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=cfg['batch_size'],
        shuffle=True,
        drop_last=True,
        pin_memory=True,
    )

    print('initializing model...')
    # Feed-forward width is four times the embedding width.
    inner_ff_size = cfg['embed_size'] * 4
    model = Transformer(
        cfg['n_code'],
        cfg['n_heads'],
        cfg['embed_size'],
        inner_ff_size,
        len(dataset.vocab),
        cfg['seq_len'],
        cfg['dropout'],
    ).cuda()
    print(model)
    print('initializing optimizer and loss...')
    optimizer = optim.Adam(model.parameters(), **cfg["optim_kwargs"])
    # Positions labelled with the dataset's IGNORE_IDX do not contribute to the loss.
    loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)

    print('training...')
    train_model(model, data_loader, optimizer, loss_model, 10000, 10)

    print('saving embeddings...')
    save_embeddings(model, dataset, 3000, 'values.tsv', 'names.tsv')

    print('end')


if __name__ == "__main__":
    main()

loading text...
tokenizing sentences...
creating/loading vocab...
creating dataset...
initializing model...
Transformer(
  (embeddings): Embedding(23948, 128)
  (pe): PositionalEmbedding()
  (encoders): ModuleList(
    (0-7): 8 x EncoderLayer(
      (mha): MultiHeadAttention(
        (linear): Linear(in_features=128, out_features=384, bias=True)
        (out): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine

In [4]:
import torch
from GPT_Model import Transformer
from transformers import AutoTokenizer
from GPT_Utils import encode, decode,get_batch,estimate_loss
from config import get_gpt_config

# Load the GPT hyper-parameter mapping from the project's config module.
config = get_gpt_config()
# Embedding width handed to the model: 128 times the configured NUM_HEAD value.
NUM_EMBED = config['NUM_HEAD'] * 128

In [6]:

def load_and_tokenize_data(path_to_data):
    """Read a UTF-8 text file, tokenize it and split it 90/10 into train/val.

    Args:
        path_to_data: Path of the raw text file.

    Returns:
        Tuple ``(train_data, val_data, vocab_size, tokenizer)`` where
        ``train_data`` is the first 90% of the encoded sequence, ``val_data``
        the remainder, ``vocab_size`` the tokenizer's ``vocab_size`` and
        ``tokenizer`` the loaded "bert-base-uncased" tokenizer.
    """
    # Context manager so the file handle is closed deterministically.
    with open(path_to_data, encoding="utf-8") as file:
        data_raw = file.read()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    vocab_size = tokenizer.vocab_size
    data = encode(text_seq=data_raw, tokenizer=tokenizer)
    # First 90% of the encoded sequence is for training, the rest for validation.
    n = int(0.9 * len(data))
    return data[:n], data[n:], vocab_size, tokenizer

def initialize_model(vocab_size,NUM_EMBED):
    """Build the GPT model on the configured device together with its optimizer.

    Args:
        vocab_size: Vocabulary size passed to the model.
        NUM_EMBED: Embedding width passed to the model.

    Returns:
        Tuple ``(model, optimizer)``: the model moved to ``config['DEVICE']``
        and an AdamW optimizer over its parameters using
        ``config['LEARNING_RATE']``.
    """
    hyper_params = dict(
        vocab_size=vocab_size,
        num_embed=NUM_EMBED,
        block_size=config['BLOCK_SIZE'],
        num_heads=config['NUM_HEAD'],
        num_layers=config['NUM_LAYER'],
        dropout=config['DROPOUT'],
    )
    gpt = Transformer(**hyper_params).to(config['DEVICE'])
    adamw = torch.optim.AdamW(gpt.parameters(), lr=config['LEARNING_RATE'])
    return gpt, adamw

def train_model(model, optimizer, train_data, val_data):
    """Train the GPT model for ``config['MAX_ITER']`` steps with periodic evaluation.

    Every ``config['EVAL_INTER']`` steps, and on the final step, the train and
    validation losses returned by ``estimate_loss`` are printed.

    Args:
        model: Module called as ``model(xb, yb)`` and returning
            ``(logits, loss)``.
        optimizer: Optimizer stepping the model's parameters.
        train_data: Encoded training sequence handed to ``get_batch`` /
            ``estimate_loss``.
        val_data: Encoded validation sequence handed to ``estimate_loss``.
    """
    max_iter = config['MAX_ITER']
    block_size = config['BLOCK_SIZE']
    batch_size = config['BATCH_SIZE']
    for step in range(max_iter):
        if step % config['EVAL_INTER'] == 0 or step == max_iter - 1:
            loss_train = estimate_loss(
                data=train_data, model=model, block_size=block_size, batch_size=batch_size
            )
            loss_val = estimate_loss(
                data=val_data, model=model, block_size=block_size, batch_size=batch_size
            )
            print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

        xb, yb = get_batch(data=train_data, block_size=block_size, batch_size=batch_size)
        # Call the module itself rather than .forward() so registered hooks run.
        _, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

def generate_output(model, tokenizer, context):
    """Generate 100 new tokens from ``context`` and decode them to text.

    Args:
        model: Object exposing ``generate(idx, max_new_tokens, block_size)``.
        tokenizer: Tokenizer forwarded to ``decode``.
        context: Starting token ids forwarded to ``model.generate`` as ``idx``.

    Returns:
        Whatever ``decode`` returns for the first generated sequence.
    """
    batch_out = model.generate(
        idx=context,
        max_new_tokens=100,
        block_size=config['BLOCK_SIZE'],
    )
    # Only the first sequence of the generated batch is decoded.
    return decode(enc_sec=batch_out[0], tokenizer=tokenizer)



In [7]:
def main():
    """Run the GPT pipeline: load data, build the model, train, print a sample."""
    train_data, val_data, vocab_size,tokenizer = load_and_tokenize_data("/content/data/english.txt")
    model, optimizer = initialize_model(vocab_size,NUM_EMBED)

    n_params = sum(p.numel() for p in model.parameters())
    print("Model with {:.2f}M parameters".format(n_params / 1e6))

    train_model(model, optimizer, train_data, val_data)

    # Generation is seeded with a single token of id 0 (shape (1, 1)).
    seed_tokens = torch.zeros((1, 1), dtype=torch.long, device=config['DEVICE'])
    print(generate_output(model, tokenizer, seed_tokens))


if __name__ == "__main__":
    main()

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


Model with 89.48M parameters
step          0 | train loss 10.7720 | val loss 10.8224
step        500 | train loss 0.5627 | val loss 8.2736
step       1000 | train loss 0.1711 | val loss 9.5480
step       1500 | train loss 0.1370 | val loss 10.0252
step       2000 | train loss 0.1287 | val loss 10.5643
step       2500 | train loss 0.1207 | val loss 10.6381
step       3000 | train loss 0.1213 | val loss 11.1373
step       3500 | train loss 0.1160 | val loss 10.7446
step       4000 | train loss 0.1083 | val loss 11.1762
step       4500 | train loss 0.1060 | val loss 11.5475
step       4999 | train loss 0.1065 | val loss 11.4910
[PAD], 2 step 1 we reduce momentum from higher to lower bound, and in step 2 we increase momentum from lower to higher bound. according to the paper, this cyclic momentum gives the same final results, but this saves time and effort of running multiple full cycles with different momentum values. with one cycle policy and cyclic momentum, i could replicate the result

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Switch the kernel's working directory to the dataset folder on Drive.
%cd /content/drive/MyDrive/pizza_steak_sushi

/content/drive/MyDrive/pizza_steak_sushi


In [6]:
# Copy the dataset from Drive to local disk.
# The target directory must exist first: otherwise `cp` would create
# /content/DATA as the copy itself (files at /content/DATA/train/...) instead
# of /content/DATA/pizza_steak_sushi/train/..., which later cells expect.
!mkdir -p /content/DATA
%cp -av /content/drive/MyDrive/pizza_steak_sushi /content/DATA

'/content/drive/MyDrive/pizza_steak_sushi' -> '/content/DATA/pizza_steak_sushi'
'/content/drive/MyDrive/pizza_steak_sushi/train' -> '/content/DATA/pizza_steak_sushi/train'
'/content/drive/MyDrive/pizza_steak_sushi/train/steak' -> '/content/DATA/pizza_steak_sushi/train/steak'
'/content/drive/MyDrive/pizza_steak_sushi/train/steak/225990.jpg' -> '/content/DATA/pizza_steak_sushi/train/steak/225990.jpg'
'/content/drive/MyDrive/pizza_steak_sushi/train/steak/922752.jpg' -> '/content/DATA/pizza_steak_sushi/train/steak/922752.jpg'
'/content/drive/MyDrive/pizza_steak_sushi/train/steak/2129685.jpg' -> '/content/DATA/pizza_steak_sushi/train/steak/2129685.jpg'
'/content/drive/MyDrive/pizza_steak_sushi/train/steak/2163079.jpg' -> '/content/DATA/pizza_steak_sushi/train/steak/2163079.jpg'
'/content/drive/MyDrive/pizza_steak_sushi/train/steak/3245622.jpg' -> '/content/DATA/pizza_steak_sushi/train/steak/3245622.jpg'
'/content/drive/MyDrive/pizza_steak_sushi/train/steak/461689.jpg' -> '/content/DATA/pizz

In [5]:
import torch
import os
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from VIT_DataLoader import create_dataloaders
from VIT_Model import ViT
import VIT_engine
# Number of DataLoader worker processes: one per CPU. os.cpu_count() can
# return None when the count is undetermined, so fall back to 0 (load data in
# the main process) rather than passing None on to DataLoader.
num_workers = os.cpu_count() or 0
from typing import Dict, List, Tuple
# Create image size (from Table 3 in the ViT paper)
IMG_SIZE = 224

# Create transform pipeline manually: resize every image to
# IMG_SIZE x IMG_SIZE, then convert it to a tensor.
manual_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])
print(f"Manually created transforms: {manual_transforms}")

Manually created transforms: Compose(
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=warn)
    ToTensor()
)


In [15]:


# Example usage: train the ViT on the pizza/steak/sushi image folders.
if __name__ == "__main__":
    image_path = "/content/DATA/pizza_steak_sushi"
    train_dir = image_path + "/train/"
    test_dir = image_path + "/test/"
    print(train_dir)

    # One sub-folder per class; the same transform is used for both splits.
    train_data = datasets.ImageFolder(train_dir, transform=manual_transforms)
    test_data = datasets.ImageFolder(test_dir, transform=manual_transforms)
    class_names = train_data.classes
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Only the training loader shuffles; evaluation order stays fixed.
    train_dataloader = DataLoader(
        train_data,
        batch_size=10,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
    )

    test_dataloader = DataLoader(
        test_data,
        batch_size=10,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    # Initialize the ViT model
    # NOTE(review): ViT() is built with its default arguments; confirm that its
    # output size matches len(class_names).
    vit = ViT()

    # Setup the optimizer
    optimizer = torch.optim.Adam(params=vit.parameters(),
                                 lr=3e-3,
                                 betas=(0.9, 0.999),
                                 weight_decay=0.3)

    # Setup the loss function
    loss_fn = torch.nn.CrossEntropyLoss()

    # Train the model
    results = VIT_engine.train(model=vit,
                               train_dataloader=train_dataloader,
                               test_dataloader=test_dataloader,
                               optimizer=optimizer,
                               loss_fn=loss_fn,
                               epochs=5,
                               device=device)



/content/DATA/pizza_steak_sushi/train/


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 2.0587 | train_acc: 0.2913 | test_loss: 1.3085 | test_acc: 0.3125
Epoch: 2 | train_loss: 1.2100 | train_acc: 0.3261 | test_loss: 1.3142 | test_acc: 0.2375
Epoch: 3 | train_loss: 1.2034 | train_acc: 0.3478 | test_loss: 1.4216 | test_acc: 0.2375
Epoch: 4 | train_loss: 1.2600 | train_acc: 0.3174 | test_loss: 1.3039 | test_acc: 0.4500
Epoch: 5 | train_loss: 1.2256 | train_acc: 0.3348 | test_loss: 1.2619 | test_acc: 0.3125
