# Finetune Pix2Struct model on Synthetic Bootstrap dataset

## Setup Envirnoment

In [3]:
pip install transformers==4.36.2

Defaulting to user installation because normal site-packages is not writeable
Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.2)
  Using cached tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
    Uninstalling tokenizers-0.21.0:
      Successfully uninstalled tokenizers-0.21.0
Successfully installed tokenizers-0.15.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/software-current/2023.06/x86_64/generic/software/Python/3.11.3-GCCcore-12.3.0/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# !pip install -q wandb

In [3]:
# pip install torchvision nltk wandb tqdm Pillow

## Import necessary libraries

In [1]:
# from google.colab import drive
import os
import zipfile
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import re
from transformers import Pix2StructForConditionalGeneration, AutoProcessor
import torch
from torch.nn import functional as F
from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup
from pathlib import Path
from nltk import edit_distance
import numpy as np
import wandb
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from torch.utils.data import random_split
import random

## Define variables and parameters

In [2]:
# G_DRIVE_FOLDER = '/content/drive/MyDrive/Datasets/'

# DATASET_NAME = 'synthBootstrap_mini'
# ZIP_NAME = DATASET_NAME + '.zip'
# DESTINATION_FOLDER= 'data/'
# DATASET_FOLDER = DESTINATION_FOLDER + DATASET_NAME

# HTML_FILES_FOLDER = DESTINATION_FOLDER + "html/"
# home/seyeon/data/synthBootstrap_mini/html

FOLDER_CHECKPOINTS = ''
DATASET_NAME = 'Design2Code/'
# ZIP_NAME = DATASET_NAME + '.zip'
DESTINATION_FOLDER= 'data/'
DATASET_FOLDER = DESTINATION_FOLDER + DATASET_NAME
HTML_FILES_FOLDER = DATASET_FOLDER + "html/"


EXPERIMENT_NAME = "Design2Code"

MAX_SENTENCE_LEN = 4096

CHUNK_LENGTH =  1024
CONTEXT_OVERLAP_LENGTH = 256

MAX_PATCHES = 1024 #1024

DEBUG = False
VERBOSE = True

BATCH_SIZE = 4
NUM_WARMUP_STEPS = 1000
MAX_EPOCHS = 200
LR = 1e-4
CHECK_VAL_EVERY_N_EPOCH = 5
GRADIENT_CLIP_VAL = 1.0
ACCUMULATE_GRAD_BATCHES = 8 / BATCH_SIZE

TRAIN_SET_PERCENTAGE = 0.88
VALID_SET_PERCENTAGE = 0.02 # Use 20 for validation
# TEST_SET_PERCENTAGE is 1 - TRAIN_SET_PERCENTAGE - VALID_SET_PERCENTAGE # Use 100 for test

RANDOM_SEED = 123

LOAD_FROM_CHECKPOINT = False
LAST_CHECKPOINT_NAME = ""

In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
MAX_N_CHUNKS_PER_SENTENCE = 1 + (MAX_SENTENCE_LEN - CHUNK_LENGTH) // (CHUNK_LENGTH - CONTEXT_OVERLAP_LENGTH)
print("MAX_N_CHUNKS_PER_SENTENCE", MAX_N_CHUNKS_PER_SENTENCE)

MAX_N_CHUNKS_PER_SENTENCE 5



## Load Synthetic Bootstrap Dataset

## Load Model and Processor

In [5]:
# pip install tokenizers==0.21.0

In [6]:

repo_id = "google/pix2struct-base"
processor = AutoProcessor.from_pretrained(repo_id)
model = Pix2StructForConditionalGeneration.from_pretrained(repo_id, is_encoder_decoder=True)

## Create Dataset class

### Preprocessing functions

In [7]:
def round_floats_in_text(text, precision=0):
    # match float numbers with 2 or more decimal places in the text
    pattern = r"\b\d+\.\d{2,}\b"

    def replace(match):
        float_number = float(match.group())
        return f"{float_number:.{precision}f}"

    text = re.sub(pattern, replace, text)
    return text

In [8]:
def remove_html_comments(text):
    # match html comments
    pattern = r"<!--.*?-->"

    text = re.sub(pattern, '', text, flags=re.DOTALL)
    return text

In [9]:
def preprocess_html_file(html_text):
    text_cleaned = html_text.replace('\n', ' ')
    text_cleaned_without_multiple_spaces = re.sub(r'\s+', ' ', text_cleaned)
    text_without_comments = remove_html_comments(text_cleaned_without_multiple_spaces)
    text_without_long_floats = round_floats_in_text(text_without_comments)
    return text_without_long_floats

### Find max sentence length and new unknown tokens

In [10]:
# import os

# def safe_read(filepath):
#     try:
#         with open(filepath, "r", encoding="utf-8") as f:
#             return f.read()
#     except UnicodeDecodeError:
#         try:
#             with open(filepath, "r", encoding="latin-1") as f:
#                 return f.read()
#         except Exception as e:
#             print(f"Skipping {filepath} due to decode error: {e}")
#             return None

# HTML_FILES_FOLDER = "data/new_data/html/"
# all_paths = [f for f in os.listdir(HTML_FILES_FOLDER) if f.endswith(".html")]

# max_length = 0
# tokens_to_add = set()

# for html_file in all_paths:
#     text = safe_read(os.path.join(HTML_FILES_FOLDER, html_file))
#     if text is None:
#         continue

#     processed_text = preprocess_html_file(text)
#     tokens = processor.tokenizer(processed_text).tokens()

#     tokens_to_add.update(tokens)
#     max_length = max(max_length, len(tokens))

# print(f"Max sentence length = {max_length}")

# newly_added_num = processor.tokenizer.add_tokens(list(tokens_to_add))
# print(f"Number of new tokens = {newly_added_num}")

# if newly_added_num > 0:
#     model.decoder.resize_token_embeddings(len(processor.tokenizer))


In [11]:
# import shutil
# import os

# checkpoint_path = "data/WebSight/html/.ipynb_checkpoints"
# if os.path.exists(checkpoint_path):
#     shutil.rmtree(checkpoint_path)
#     print("✅ Deleted:", checkpoint_path)
# else:
#     print("⚠️ No .ipynb_checkpoints folder found.")


In [12]:
# Find max length
all_paths = os.listdir(HTML_FILES_FOLDER)

max_length = 0

# Read text files and add new tokens to dictionary
tokens_to_add = set()
for html_file_path in all_paths:
    with open(HTML_FILES_FOLDER + html_file_path, "r") as reader:
        splitted_text = processor.tokenizer(preprocess_html_file(reader.read())).tokens()
        tokens_to_add = tokens_to_add.union(set(splitted_text))

        # Check if the current sentence has the largest number of tokens
        if len(splitted_text) > max_length:
            max_length = len(splitted_text)

print(f"Max sentence length = {max_length}")

newly_added_num = processor.tokenizer.add_tokens(list(tokens_to_add))
print(f"Number of new tokens = {newly_added_num}")

# Resize the model's token embeddings if there are new tokens
if newly_added_num > 0:
    model.decoder.resize_token_embeddings(len(processor.tokenizer))

Max sentence length = 102351
Number of new tokens = 23086


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


### Split files into training - validation - test sets

In [13]:
random.seed(RANDOM_SEED)
random.shuffle(sorted(all_paths))

train_len = int(TRAIN_SET_PERCENTAGE * len(all_paths))
valid_len = int(VALID_SET_PERCENTAGE * len(all_paths))

train_paths = all_paths[:train_len]
valid_paths = all_paths[train_len:train_len+valid_len]
test_paths = all_paths[train_len+valid_len:]

print(f"TRAIN_SET size = {len(train_paths)}")
print(f"VALID_SET size = {len(valid_paths)}")
print(f"TEST_SET size = {len(test_paths)}")

TRAIN_SET size = 425
VALID_SET size = 9
TEST_SET size = 50


In [14]:
class SythBootstrapTrainingDataset(Dataset):
    # This is a modification of the dataset used for validation and testing
    # In this one the sentences are already split into chunks, already having
    # the context from the previous chunk, empty chunks are discarded
    def __init__(self, root_dir, transform, text_files_paths):

        self.root_dir = root_dir
        self.transform = transform
        self.text_files_paths = text_files_paths

        self.max_patches = MAX_PATCHES
        self.max_length = MAX_SENTENCE_LEN
        self.ignore_id = -100

        self.data = []
        self.images_encoding = []

        for text_file in tqdm(text_files_paths):
            image_file = text_file.replace('.html', '.png')

            # Directly process the text files, and save them in the ram
            # Do the same also for images, if there is enough space in memory
            text_file_path = os.path.join(root_dir + "html/", text_file)
            image_file_path = os.path.join(root_dir + "images/", image_file)

            # Each data entry has the following structure
            # labels, image_encoding_idx, part

            # image_encoding_idx points to an entry of images_encoding, which contains attention_mask and flattened_patches for the image
            # Since a single image is used for multiple slices of the same text, this approach is used to save memory

            # Load image
            image = Image.open(image_file_path).convert('RGB')

            if DEBUG:
                image.show()

            if self.transform:
                image = self.transform(image)

            encoding = processor(images=image, max_patches=self.max_patches, return_tensors="pt")
            encoding = {k:v.squeeze() for k,v in encoding.items()}

            self.images_encoding.append(encoding)
            image_encoding_idx = len(self.images_encoding) - 1

            # Load text
            with open(text_file_path, 'r') as f:
                text = f.read()
                text_cleaned = preprocess_html_file(text)

            if DEBUG:
              print("text:")
              print(text)
              print("\n\n\ntext_cleaned:")
              print(text_cleaned)

            input_ids = processor.tokenizer(
                text_cleaned,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            ).input_ids

            input_ids_slices = []

            start_index = 0
            end_index = CHUNK_LENGTH
            while end_index <= MAX_SENTENCE_LEN:
                input_ids_slices.append(input_ids[:, start_index:end_index])
                start_index = end_index - CONTEXT_OVERLAP_LENGTH
                end_index = start_index + CHUNK_LENGTH

            for part, input_ids_slice in enumerate(input_ids_slices):
                labels = input_ids_slice.squeeze().clone()

                labels[labels == processor.tokenizer.pad_token_id] = self.ignore_id  # model doesn't need to predict pad token

                # Skip slices with only padding tokens, ignore context from the previous chunk
                if part != 0 and all(x == self.ignore_id for x in labels[CONTEXT_OVERLAP_LENGTH:]):
                    continue

                # labels, image_encoding_idx, part
                # Save them as int32 to save ram memory
                self.data.append((labels.to(torch.int32), image_encoding_idx, part))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        labels, image_encoding_idx, part = self.data[idx]
        encoding = self.images_encoding[image_encoding_idx]
        encoding["labels"] = labels.to(torch.int64)

        return encoding, part

In [15]:
class SythBootstrapDataset(Dataset):
    def __init__(self, root_dir, transform, text_files_paths):

        self.root_dir = root_dir
        self.transform = transform
        self.text_files_paths = text_files_paths

        self.max_patches = MAX_PATCHES
        self.max_length = MAX_SENTENCE_LEN
        self.ignore_id = -100

        self.encodings = []

        for text_file in tqdm(text_files_paths):
            image_file = text_file.replace('.html', '.png')

            # Directly process the text files, and save them in the ram
            # Do the same also for images, if there is enough space in memory
            text_file_path = os.path.join(root_dir + "html/", text_file)
            image_file_path = os.path.join(root_dir + "images/", image_file)

            # Load image
            image = Image.open(image_file_path).convert('RGB')

            if DEBUG:
                image.show()

            if self.transform:
                image = self.transform(image)

            encoding = processor(images=image, max_patches=self.max_patches, return_tensors="pt")
            encoding = {k:v.squeeze() for k,v in encoding.items()}

            # Load text
            with open(text_file_path, 'r') as f:
                text = f.read()
                text_cleaned = preprocess_html_file(text)

            if DEBUG:
              print("text:")
              print(text)
              print("\n\n\ntext_cleaned:")
              print(text_cleaned)

            input_ids = processor.tokenizer(
                text_cleaned,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            ).input_ids

            labels = input_ids.squeeze().clone()
            labels[labels == processor.tokenizer.pad_token_id] = self.ignore_id  # model doesn't need to predict pad token

            encoding["labels"] = labels.to(torch.int32)

            # For each sample save directly the encoding of both text and image
            self.encodings.append(encoding)

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        return self.encodings[idx]

In [16]:
# Transformations for the image
transform = transforms.Compose([
    transforms.ToTensor(),  # convert PIL Image to PyTorch Tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # normalize for pretrained models
])

# Instantiate the CustomDataset
train_dataset = SythBootstrapTrainingDataset(DATASET_FOLDER, transform, train_paths)
val_dataset = SythBootstrapDataset(DATASET_FOLDER, transform, valid_paths)

# Use DataLoader for batching and shuffling
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=10, shuffle=False) # Use 10 as batch for testing

100%|██████████| 425/425 [00:38<00:00, 11.08it/s]
100%|██████████| 9/9 [00:00<00:00, 14.59it/s]


In [17]:
print(f"train_dataloader size = {len(train_dataloader)}")
print(f"val_dataloader size = {len(val_dataloader)}")

train_dataloader size = 511
val_dataloader size = 1


## Training

In [18]:
START_TOKEN_ID = PAD_TOKEN_ID = processor.tokenizer.pad_token_id

### Utility functions

In [19]:
def move_to_device(data):
    if isinstance(data, (list,tuple)):
        return [move_to_device(x) for x in data]
    elif isinstance(data, dict):
        return {k: move_to_device(v) for k, v in data.items()}
    elif isinstance(data, torch.Tensor):
        return data.to(DEVICE)
    else:
        return data

In [20]:
def create_extended_attention_mask_for_decoder_with_context(input_shape, attention_mask, part):
    device = attention_mask.device
    batch_size, seq_length = input_shape
    seq_ids = torch.arange(seq_length, device=device)

    causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]

    # Expand part to have the same shape as the relevant slice of causal_mask
    part_expanded = part.unsqueeze(-1).unsqueeze(-1).expand(-1, seq_length, CONTEXT_OVERLAP_LENGTH)

    # Create a mask with ones where part is not zero
    context_mask = (part_expanded != 0).float()

    # Apply the context_mask to the corresponding part of causal_mask
    causal_mask[:, :, :CONTEXT_OVERLAP_LENGTH] = causal_mask[:, :, :CONTEXT_OVERLAP_LENGTH] * (1 - context_mask) + context_mask

    # in case past_key_values are used we need to add a prefix ones mask to the causal mask
    causal_mask = causal_mask.to(attention_mask.dtype)

    if causal_mask.shape[1] < attention_mask.shape[1]:
        print("!!should not enter here in my case!!")
        prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
        causal_mask = torch.cat(
            [
                torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
                causal_mask,
            ],
            axis=-1,
        )

    extended_attention_mask = causal_mask[:, :, :] * attention_mask[:, None, :]
    return extended_attention_mask


In [21]:
def get_attention_mask(decoder_input_ids, part):
    decoder_attention_mask = (decoder_input_ids.ne(PAD_TOKEN_ID).float())

    # always attend on first token
    decoder_attention_mask[:, 0] = 1

    # Expand part to have the same shape as the relevant slice of decoder_attention_mask
    part_expanded = part.unsqueeze(-1).expand(-1, CONTEXT_OVERLAP_LENGTH)

    # Create a mask with ones where part is not zero
    context_mask = (part_expanded != 0).float()

    # Apply the context_mask to the corresponding part of decoder_attention_mask
    decoder_attention_mask[:, 0:CONTEXT_OVERLAP_LENGTH] = decoder_attention_mask[:, 0:CONTEXT_OVERLAP_LENGTH] * (1 - context_mask) + context_mask

    return decoder_attention_mask

In [22]:
def shift_right_modified(input_ids, decoder_starting_token_idx):

    # shift inputs to the right
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
    shifted_input_ids[..., 0] = decoder_starting_token_idx

    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, PAD_TOKEN_ID)

    return shifted_input_ids

In [23]:
def get_decoder_input_ids(labels_chunk, start_id):
    return shift_right_modified(labels_chunk, start_id)

In [24]:
def get_decoder_input_ids_and_attention_mask(labels, part):
    decoder_input_ids = get_decoder_input_ids(labels, START_TOKEN_ID)
    decoder_attention_mask = get_attention_mask(decoder_input_ids, part)
    extended_decoder_attention_mask = create_extended_attention_mask_for_decoder_with_context(decoder_input_ids.shape, decoder_attention_mask, part)

    return decoder_input_ids, extended_decoder_attention_mask

### Main training function

In [36]:
def train_model(config, processor, model, train_dataloader, val_dataloader):
    # Extract configuration values
    lr = config.get("lr")
    max_epochs = config.get("max_epochs")
    num_warmup_steps = config.get("num_warmup_steps")

    model.to(DEVICE)

    optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, lr=lr, weight_decay=1e-05)

    # Use total steps (i.e., max_epochs * length_of_train_data)
    total_steps = max_epochs * len(train_dataloader)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_warmup_steps,
                                                num_training_steps=total_steps)

    global_step = 0  # to keep track of total steps
    epoch_start = 0

    if LOAD_FROM_CHECKPOINT:
        print("Loading model from checkpoint:", LAST_CHECKPOINT_NAME)
        checkpoint = torch.load(LAST_CHECKPOINT_NAME)
        # model.resize_token_embeddings(50244) ### retrain
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        epoch_start = checkpoint["epoch"] + 1
        global_step = checkpoint["global_step"] + 1
        wandb_run_id = checkpoint["wandb_run_id"]

        # Resume the WandB run
        wandb.init(project="Pix2Struct", name="run-" + EXPERIMENT_NAME, config=config,     resume="must", id="iaegqr6z")
    else:
        wandb.init(project="Pix2Struct", name="run-" + EXPERIMENT_NAME, config=config)

    epoch_last = epoch_start + max_epochs - 1
    for epoch in range(epoch_start, epoch_start + max_epochs):
        global_step, moving_avg_loss = training_loop(epoch, train_dataloader, model, config, optimizer, scheduler, global_step, epoch_last)

        if epoch == 0 + epoch_start or epoch == epoch_last or (epoch + 1) % config.get("check_val_every_n_epoch") == 0:
            avg_bleu_score = testing_loop(val_dataloader, model, processor, config, f"Epoch {epoch}/{epoch_last} - valid loop")

            # Save the model after each validation step
            save_checkpoint(model, optimizer, scheduler, epoch, global_step, wandb.run.id, avg_bleu_score, EXPERIMENT_NAME, FOLDER_CHECKPOINTS)

            if config.get("verbose", False):
                print(f"Moving Avg Loss: {moving_avg_loss:.3f}")
                print(f" Avg Bleu Score: {avg_bleu_score:.2f}")

            wandb.log({"moving_avg_loss": moving_avg_loss, "bleu": avg_bleu_score, **{f'lr_{i}': param_group['lr'] for i, param_group in enumerate(optimizer.param_groups)}})

    wandb.finish()

In [37]:
def training_loop(epoch, train_dataloader, model, config, optimizer, scheduler, global_step, epoch_last):


    model.train()
    train_loop = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch}/{epoch_last} - train loop")

    # Extract configuration values
    accumulate_grad_batches = config.get('accumulate_grad_batches', 1)
    gradient_clip_val = config.get("gradient_clip_val")

    moving_avg_loss = 0
    alpha = 0.1 # Smoothing factor

    for step, batch in train_loop:
        encoding, part = map(move_to_device, batch)
        labels, flattened_patches, attention_mask = encoding["labels"], encoding["flattened_patches"], encoding["attention_mask"]

        decoder_input_ids, decoder_attention_mask = get_decoder_input_ids_and_attention_mask(labels, part)
        
        outputs = model(labels=labels, flattened_patches=flattened_patches, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)#, decoder_attention_mask=decoder_attention_mask)
        loss = outputs.loss
        loss.backward()

        if global_step % accumulate_grad_batches == 0 or step == len(train_dataloader) - 1:
            if gradient_clip_val:
                torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip_val)
            optimizer.step()
            optimizer.zero_grad()

        # Update the progress bar
        train_loop.set_postfix({'loss': loss.item()}, refresh=True)

        scheduler.step()
        global_step += 1

        # Update the moving average loss
        moving_avg_loss = loss.item() if moving_avg_loss == 0 else alpha * loss.item() + (1 - alpha) * moving_avg_loss

        # Log Loss after each step
        wandb.log({"loss": loss.item()})

    return global_step, moving_avg_loss

In [38]:
# def testing_loop(testing_dataloader, model, processor, config, description):
#     model.eval()
#     bleu_scores = []

#     with torch.no_grad():
#         test_loop = tqdm(enumerate(testing_dataloader), total=len(testing_dataloader), desc=description)
#         for i, batch in test_loop:
#             encoding = move_to_device(batch)
#             labels, flattened_patches, attention_mask = encoding["labels"], encoding["flattened_patches"], encoding["attention_mask"]

#             # Initialize total_outputs with zeros
#             total_outputs = None
#             context_from_last = None

#             # Initialize a mask to track which sentences are finished
#             finished_sentences_mask = torch.zeros(flattened_patches.size(0), dtype=torch.bool, device=flattened_patches.device)

#             for iteration in range(MAX_N_CHUNKS_PER_SENTENCE):

#                 generate_args = {
#                     "flattened_patches": flattened_patches[~finished_sentences_mask],
#                     "attention_mask": attention_mask[~finished_sentences_mask],
#                     "max_new_tokens": CHUNK_LENGTH - (CONTEXT_OVERLAP_LENGTH if iteration else 0),
#                 }

#                 if iteration and context_from_last is not None:
#                     generate_args["decoder_input_ids"] = context_from_last[~finished_sentences_mask]

#                 outputs = model.generate(**generate_args)

#                 # Remove context overlap only from the second iteration onwards
#                 new_chunks = outputs if iteration == 0 else outputs[:, CONTEXT_OVERLAP_LENGTH:]

#                 if iteration == 0:
#                     total_outputs = new_chunks
#                 else:
#                     # Update total_outputs by concatenating new chunks
#                     new_chunks_with_padding_chunks = torch.full((flattened_patches.shape[0], new_chunks.shape[1]), PAD_TOKEN_ID, dtype=new_chunks.dtype, device=new_chunks.device)
#                     new_chunks_with_padding_chunks[~finished_sentences_mask] = new_chunks
#                     total_outputs = torch.cat((total_outputs, new_chunks_with_padding_chunks), dim=1)

#                 # Update the finished_sentences_mask
#                 finished_sentences_mask[~finished_sentences_mask] |= (outputs == processor.tokenizer.eos_token_id).any(dim=1)

#                 # If all sentences are finished, exit the loop
#                 if finished_sentences_mask.all():
#                     break

#                 if outputs.shape[1] < CHUNK_LENGTH:
#                     print("ERROR: !! should have already exited because all sentences reached the end!!")

#                 # -1 because it will put in front a START_TOKEN automatically
#                 context_from_last = total_outputs[:, -(CONTEXT_OVERLAP_LENGTH-1):]

#             predictions = processor.tokenizer.batch_decode(total_outputs, skip_special_tokens=True)

#             labels[labels == -100] = 0
#             answers = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

#             bleu_scores += [corpus_bleu([[answer]], [pred], smoothing_function=SmoothingFunction().method4) for pred, answer in zip(predictions, answers)]

#             avg_bleu_score = np.mean(bleu_scores)
#             test_loop.set_postfix(bleu_score=avg_bleu_score)

#             if config.get("verbose", False):
#                 for pred, answer, bleu_score in zip(predictions, answers, bleu_scores):
#                     tqdm.write(f"\nPrediction: {pred}\n    Answer: {answer}\n      Bleu: {bleu_score:.2f}")


#     return avg_bleu_score


In [39]:
def testing_loop(testing_dataloader, model, processor, config, description):
    model.eval()
    bleu_scores = []

    with torch.no_grad():
        test_loop = tqdm(enumerate(testing_dataloader), total=len(testing_dataloader), desc=description)
        for i, batch in test_loop:
            encoding = move_to_device(batch)
            labels, flattened_patches, attention_mask = encoding["labels"], encoding["flattened_patches"], encoding["attention_mask"]

            # Initialize total_outputs with zeros
            total_outputs = None
            context_from_last = None

            # Initialize a mask to track which sentences are finished
            finished_sentences_mask = torch.zeros(flattened_patches.size(0), dtype=torch.bool, device=flattened_patches.device)

            for iteration in range(MAX_N_CHUNKS_PER_SENTENCE):

                generate_args = {
                    "flattened_patches": flattened_patches[~finished_sentences_mask],
                    "attention_mask": attention_mask[~finished_sentences_mask],
                    "max_new_tokens": CHUNK_LENGTH - (CONTEXT_OVERLAP_LENGTH if iteration else 0),
                }

                if iteration and context_from_last is not None:
                    generate_args["decoder_input_ids"] = context_from_last[~finished_sentences_mask]

                outputs = model.generate(**generate_args)

                # Remove context overlap only from the second iteration onwards
                new_chunks = outputs if iteration == 0 else outputs[:, CONTEXT_OVERLAP_LENGTH:]

                if iteration == 0:
                    total_outputs = new_chunks
                else:
                    # Update total_outputs by concatenating new chunks
                    new_chunks_with_padding_chunks = torch.full((flattened_patches.shape[0], new_chunks.shape[1]), PAD_TOKEN_ID, dtype=new_chunks.dtype, device=new_chunks.device)
                    new_chunks_with_padding_chunks[~finished_sentences_mask] = new_chunks
                    total_outputs = torch.cat((total_outputs, new_chunks_with_padding_chunks), dim=1)

                # Update the finished_sentences_mask
                finished_sentences_mask[~finished_sentences_mask] |= (outputs == processor.tokenizer.eos_token_id).any(dim=1)

                # If all sentences are finished, exit the loop
                if finished_sentences_mask.all():
                    break

                if outputs.shape[1] < CHUNK_LENGTH:
                    print("ERROR: !! should have already exited because all sentences reached the end!!")

                # -1 because it will put in front a START_TOKEN automatically
                context_from_last = total_outputs[:, -(CONTEXT_OVERLAP_LENGTH-1):]

            predictions = processor.tokenizer.batch_decode(total_outputs, skip_special_tokens=True)

            labels[labels == -100] = 0
            answers = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

            bleu_scores += [corpus_bleu([[answer]], [pred], smoothing_function=SmoothingFunction().method4) for pred, answer in zip(predictions, answers)]

            avg_bleu_score = np.mean(bleu_scores)
            test_loop.set_postfix(bleu_score=avg_bleu_score)

            if config.get("verbose", False):
                for pred, answer, bleu_score in zip(predictions, answers, bleu_scores):
                    tqdm.write(f"\nPrediction: {pred}\n    Answer: {answer}\n      Bleu: {bleu_score:.2f}")


    return avg_bleu_score


In [40]:
def save_checkpoint(model, optimizer, scheduler, epoch, global_step, wandb_run_id, avg_bleu_score, experiment_name, folder_path):
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        "epoch": epoch,
        "global_step": global_step,
        'wandb_run_id': wandb_run_id
    }
    model_name = f"{experiment_name}_epoch[{epoch}]_bleu[{avg_bleu_score:.2f}].pth"
    torch.save(checkpoint, folder_path + model_name)


In [41]:
config = {
          "batch_size": BATCH_SIZE,
          "num_warmup_steps": NUM_WARMUP_STEPS,
          "max_epochs": MAX_EPOCHS,
          "lr": LR,
          "check_val_every_n_epoch": CHECK_VAL_EVERY_N_EPOCH,
          "gradient_clip_val": GRADIENT_CLIP_VAL,
          "accumulate_grad_batches": ACCUMULATE_GRAD_BATCHES,
          "verbose": VERBOSE,
}

In [42]:
# import os
# import glob

# # Define directories
# html_dir = "data/new_data/html"
# image_dir = "data/new_data/images"

# # Function to delete all files in a directory
# def delete_files_in_dir(directory):
#     files = glob.glob(os.path.join(directory, '*'))
#     for f in files:
#         if os.path.isfile(f):
#             os.remove(f)

# # Delete files
# delete_files_in_dir(html_dir)
# delete_files_in_dir(image_dir)

# print("All files deleted in html and images folders.")


In [43]:
def validate_config(config):
    # Check required keys
    required_keys = [
        "batch_size",
        "num_warmup_steps",
        "max_epochs",
        "lr",
        "check_val_every_n_epoch",
        "gradient_clip_val",
        "accumulate_grad_batches",
        "verbose"
    ]
    for key in required_keys:
        if key not in config:
            raise ValueError(f"Key '{key}' must be present in the configuration.")

    # Check that values are in expected ranges
    if config["batch_size"] <= 0:
        raise ValueError("batch_size must be positive.")
    if config["num_warmup_steps"] < 0:
        raise ValueError("num_warmup_steps must be non-negative.")
    if config["max_epochs"] <= 0:
        raise ValueError("max_epochs must be positive.")
    if config["lr"] <= 0:
        raise ValueError("Learning rate must be positive.")
    if config["check_val_every_n_epoch"] <= 0:
        raise ValueError("check_val_every_n_epoch must be positive.")
    if config["gradient_clip_val"] < 0:
        raise ValueError("gradient_clip_val must be non-negative.")
    if config["accumulate_grad_batches"] <= 0:
        raise ValueError("accumulate_grad_batches must be positive.")
    if not isinstance(config["verbose"], bool):
        raise ValueError("verbose must be a boolean value.")


In [44]:
validate_config(config)
print(config)

{'batch_size': 4, 'num_warmup_steps': 1000, 'max_epochs': 200, 'lr': 0.0001, 'check_val_every_n_epoch': 5, 'gradient_clip_val': 1.0, 'accumulate_grad_batches': 2.0, 'verbose': True}


In [45]:
# checkpoint = torch.load('checkpointsPix2Struct_SynthBootstrap_1000_Complete_epoch[19]_bleu[0.87].pth')
# print("Saved WandB run ID:", checkpoint['wandb_run_id'])


In [None]:
train_model(config, processor, model, train_dataloader, val_dataloader)

0,1
loss,▇▅▄▄▄▅▅█▅▄▃▅▅▄▄▁▃▂▅▆▄▄▅▂▅▃▆▄▅▂

0,1
loss,5.17825


Epoch 0/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.54it/s, loss=2.66]
Epoch 0/199 - valid loop: 100%|██████████| 1/1 [04:47<00:00, 287.31s/it, bleu_score=0.0161]



Prediction: <<<img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img img 

Epoch 1/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.54it/s, loss=1.69]
Epoch 2/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.42] 
Epoch 3/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.33] 
Epoch 4/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.54it/s, loss=1.13] 
Epoch 4/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.69s/it, bleu_score=0.0702]



Prediction: <!DOCTYPE html> <html lang="en"> <head> <style> /*! normalize.css v3.0.2 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-text-size-adjust:100%

Epoch 5/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.4]  
Epoch 6/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.4]  
Epoch 7/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.29] 
Epoch 8/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.83] 
Epoch 9/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.599]
Epoch 9/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.43s/it, bleu_score=0.261]



Prediction: <!DOCTYPE html> <html lang="en"> <head> <style> @font-face { font-family: 'Roboto'; font-style: normal; font-weight: 400; src: url(https://fonts.gstatic.com/s/roboto/v30/KFOmCnqEu92Fr1Mu51TjASc1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1Cs1C

Epoch 10/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.19] 
Epoch 11/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=1.6]  
Epoch 12/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.646]
Epoch 13/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.428]
Epoch 14/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.652]
Epoch 14/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.72s/it, bleu_score=0.346]



    Answer: <!DOCTYPE html> <html dir="ltr" lang="en"> <head> <style> </style> <meta content="width=device-width, initial-scale=1" name="viewport"> <title> Windshield Wipers </title> <meta content="text/html; charset=utf-8" http-equiv="Content-Type"> <meta content="#f7f7f7" name="theme-color"> <meta content="#f7f7f7" name="msapplication-navbutton-color"> <meta content="blogger" name="generator"> <meta content="" property="og:url"> <meta content="Windshield Wipers" property="og:title"> <meta content="Last weekend we tried to go bowling with some friends. We beat them to the bowling ally and so we waited in the car for a few minutes becaus..." property="og:description"> <style type="text/css"> @font-face{font-family:'Merriweather';font-style:italic;font-weight:300;src:url(//fonts.gstatic.com/s/merriweather/v30/u-4l0qyriQwlOrhSvowK_l5-eR7lXff1jvzRPA.woff2)format('woff2');unicode-range:U+0460-052F,U+1C80-1C88,U+20B4,U+2DE0-2DFF,U+A640-A69F,U+FE2E-FE2F;}@font-face{font-family:'Merriweather

Epoch 15/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.45] 
Epoch 16/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.903]
Epoch 17/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.585]
Epoch 18/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=1.14] 
Epoch 19/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.732]
Epoch 19/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.50s/it, bleu_score=0.215]



Prediction: <!DOCTYPE html> <html class="js flexbox flexboxlegacy canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers no-applicationcache svg inlinesvg smil svgclippaths" lang="en"> <head> <style> @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 400; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsjZ0B4gaVc.ttf) format('truetype'); } @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 700; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsg-1

Epoch 20/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.483]
Epoch 21/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=1.07] 
Epoch 22/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.902]
Epoch 23/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.766] 
Epoch 24/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=1.13] 
Epoch 24/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.72s/it, bleu_score=0.302]



    Answer: <!DOCTYPE html> <html dir="ltr" lang="en"> <head> <style> </style> <meta content="width=device-width, initial-scale=1" name="viewport"> <title> Windshield Wipers </title> <meta content="text/html; charset=utf-8" http-equiv="Content-Type"> <meta content="#f7f7f7" name="theme-color"> <meta content="#f7f7f7" name="msapplication-navbutton-color"> <meta content="blogger" name="generator"> <meta content="" property="og:url"> <meta content="Windshield Wipers" property="og:title"> <meta content="Last weekend we tried to go bowling with some friends. We beat them to the bowling ally and so we waited in the car for a few minutes becaus..." property="og:description"> <style type="text/css"> @font-face{font-family:'Merriweather';font-style:italic;font-weight:300;src:url(//fonts.gstatic.com/s/merriweather/v30/u-4l0qyriQwlOrhSvowK_l5-eR7lXff1jvzRPA.woff2)format('woff2');unicode-range:U+0460-052F,U+1C80-1C88,U+20B4,U+2DE0-2DFF,U+A640-A69F,U+FE2E-FE2F;}@font-face{font-family:'Merriweather

Epoch 25/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.785]
Epoch 26/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.656] 
Epoch 27/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.226] 
Epoch 28/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.54it/s, loss=0.944]
Epoch 29/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.356] 
Epoch 29/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.73s/it, bleu_score=0.219]



    Answer: <!DOCTYPE html> <html dir="ltr" lang="en"> <head> <style> </style> <meta content="width=device-width, initial-scale=1" name="viewport"> <title> Windshield Wipers </title> <meta content="text/html; charset=utf-8" http-equiv="Content-Type"> <meta content="#f7f7f7" name="theme-color"> <meta content="#f7f7f7" name="msapplication-navbutton-color"> <meta content="blogger" name="generator"> <meta content="" property="og:url"> <meta content="Windshield Wipers" property="og:title"> <meta content="Last weekend we tried to go bowling with some friends. We beat them to the bowling ally and so we waited in the car for a few minutes becaus..." property="og:description"> <style type="text/css"> @font-face{font-family:'Merriweather';font-style:italic;font-weight:300;src:url(//fonts.gstatic.com/s/merriweather/v30/u-4l0qyriQwlOrhSvowK_l5-eR7lXff1jvzRPA.woff2)format('woff2');unicode-range:U+0460-052F,U+1C80-1C88,U+20B4,U+2DE0-2DFF,U+A640-A69F,U+FE2E-FE2F;}@font-face{font-family:'Merriweather

Epoch 30/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.54it/s, loss=0.71]  
Epoch 31/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.65]  
Epoch 32/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.588] 
Epoch 33/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.535] 
Epoch 34/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.87]  
Epoch 34/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.52s/it, bleu_score=0.205]



Prediction: <!DOCTYPE html> <html class="js flexbox flexboxlegacy canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers no-applicationcache svg inlinesvg smil svgclippaths" lang="en"> <head> <style> @font-face { font-family: 'Open Sans'; font-style: italic; font-weight: 400; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsjZ0C4n.ttf) format('truetype'); } @font-face { font-family: 'Open Sans'; font-style: italic; font-weight: 700; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr

Epoch 35/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.307]
Epoch 36/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.626] 
Epoch 37/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.484]
Epoch 38/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.503]
Epoch 39/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.39]  
Epoch 39/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.75s/it, bleu_score=0.354]



Prediction: <!DOCTYPE html> <html lang="en"> <head> <style> @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 400; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsjZ0B4gaVc.ttf) format('truetype'); } @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 700; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsg-1x4gaVc.ttf) format('truetype'); } @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 900; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsg-1x4gaVc.ttf) format('truetype'); } @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 100; font-stretch: normal; font-display: swap; src: url(https://fonts.gstatic.com/s/opensans/v36/me

Epoch 40/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.414]
Epoch 41/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.794]
Epoch 42/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.453] 
Epoch 43/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.51] 
Epoch 44/199 - train loop: 100%|██████████| 511/511 [02:23<00:00,  3.55it/s, loss=0.277]
Epoch 44/199 - valid loop: 100%|██████████| 1/1 [04:29<00:00, 269.12s/it, bleu_score=0.188]



Prediction: <!DOCTYPE html> <html lang="en"> <head> <style> @font-face { font-family: 'Source Sans Pro'; font-style: normal; font-weight: 400; src: url(https://fonts.gstatic.com/s/sourcesanspro/v22/6xKwdSBYKcSV-LCoeQqfX1RYOo3qPZZMkido18S0xR41YDw.ttf) format('truetype'); } @font-face { font-family: 'Source Sans Pro'; font-style: normal; font-weight: 700; src: url(https://fonts.gstatic.com/s/sourcesanspro/v22/6xKwdSBYKcSV-LCoeQqfX1RYOo3qPZZMkidg18S0xR41YDw.ttf) format('truetype'); } @font-face { font-family: 'Source Sans Pro'; font-style: normal; font-weight: 900; src: url(https://fonts.gstatic.com/s/sourcesanspro/v22/6xKwdSBYKcSV-LCoeQqfX1RYOo3qPZZMkidg18S0xR41YDw.ttf) format('truetype'); } @font-face { font-family: 'Source Sans Pro'; font-style: normal; font-weight: 100; src: url(https://fonts.gstatic.com/s/sourcesanspro/v22/6xKwdSBYKcSV-LCoeQqfX1RYOo3qPZZMkidg18S0xR41YDw.ttf) format('truetype'); } @font-face { font-family: 'Source Sans Pro'; font-style: normal; font-weight: 100; src:

Epoch 45/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.468] 
Epoch 46/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.342] 
Epoch 47/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.189] 
Epoch 48/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.353] 
Epoch 49/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.405] 
Epoch 49/199 - valid loop: 100%|██████████| 1/1 [04:46<00:00, 286.60s/it, bleu_score=0.317]



    Answer: <!DOCTYPE html> <html dir="ltr" lang="en"> <head> <style> </style> <meta content="width=device-width, initial-scale=1" name="viewport"> <title> Windshield Wipers </title> <meta content="text/html; charset=utf-8" http-equiv="Content-Type"> <meta content="#f7f7f7" name="theme-color"> <meta content="#f7f7f7" name="msapplication-navbutton-color"> <meta content="blogger" name="generator"> <meta content="" property="og:url"> <meta content="Windshield Wipers" property="og:title"> <meta content="Last weekend we tried to go bowling with some friends. We beat them to the bowling ally and so we waited in the car for a few minutes becaus..." property="og:description"> <style type="text/css"> @font-face{font-family:'Merriweather';font-style:italic;font-weight:300;src:url(//fonts.gstatic.com/s/merriweather/v30/u-4l0qyriQwlOrhSvowK_l5-eR7lXff1jvzRPA.woff2)format('woff2');unicode-range:U+0460-052F,U+1C80-1C88,U+20B4,U+2DE0-2DFF,U+A640-A69F,U+FE2E-FE2F;}@font-face{font-family:'Merriweather

Epoch 50/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.617] 
Epoch 51/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.0903]
Epoch 52/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.498] 
Epoch 53/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.545] 
Epoch 54/199 - train loop: 100%|██████████| 511/511 [02:24<00:00,  3.55it/s, loss=0.142] 
Epoch 54/199 - valid loop:   0%|          | 0/1 [00:00<?, ?it/s]