### Are you using Google Colab instead of your own environment? Then uncomment the following lines and enter your login token before submitting.

In [1]:
# from huggingface_hub import notebook_login
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Order:
    - 1. Check the IOAI2024_CV_Problem_Intro.ipynb notebook for some theory
    - 2. Solve the task from this notebook
    - 3. [OPTIONAL] Finally, evaluate your solution in the IOAI2024_CV_Problem_EVAL.ipynb notebook

# Installing the required packages

In [None]:
import importlib
if importlib.util.find_spec('diffusers') is None:
    !pip install diffusers transformers accelerate datasets


if importlib.util.find_spec('huggingface_hub') is None:
    !pip install huggingface_hub

!pip install datasets

# Downloading the dataset and base model

In [None]:
from torch.utils.data import DataLoader
import math
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from datasets import load_dataset
from torchvision import transforms
from tqdm import tqdm
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import torch

# Identifier of the base pipeline and the dtype its weights are loaded in
base_model_name = "unibuc-cs/ROAITransportationCo_v2"
weight_dtype = torch.bfloat16

# Download the train split of the dataset
dataset = load_dataset(
    'unibuc-cs/ROAItransportationCo',
    split='train',
)  # , trust_remote_code=True)

# Download the base diffusion pipeline
pipe = DiffusionPipeline.from_pretrained(
    base_model_name,
    torch_dtype=weight_dtype,
)


In [None]:
from torch.utils.data import DataLoader
import math
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from datasets import load_dataset
from torchvision import transforms
from tqdm import tqdm
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import torch

weight_dtype = torch.bfloat16

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the notebook
# still runs end to end instead of failing on the first `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("WARNING: CUDA is not available - running on CPU, which will be very slow.")

# Load the base mini stable diffusion model (without the safety checker)
base_model_name = "unibuc-cs/ROAITransportationCo_v2"
pipe = DiffusionPipeline.from_pretrained(
    base_model_name,
    torch_dtype=weight_dtype,
    safety_checker=None,
    requires_safety_checker=False,
)
pipe = pipe.to(device)

# Setup some learning parameters
# TODO 1 (very low priority): You can play with these.
# If you do not have enough GPU memory, you can try to reduce the train_batch_size
# Important: Try to set a limited max_train_steps in the beginning, until you are convinced
# that some setup works and lowers the error (this is valid for the other TODOs as well)
learning_rate = 2e-05
resolution = 256  # side length (in pixels) the training images are resized/cropped to; might get interesting too.
max_train_steps = 101
train_batch_size = 8

# Extract the individual components and put them on the device with the chosen weight dtype
# Just informative, leave the code below as it is
#-----
vae = pipe.vae
text_encoder = pipe.text_encoder
tokenizer = pipe.tokenizer
unet = pipe.unet
noise_scheduler = pipe.scheduler

# Move text_encoder, vae and unet to the device and cast them to weight_dtype
text_encoder = text_encoder.to(device, dtype=weight_dtype)
vae = vae.to(device, dtype=weight_dtype)
unet = unet.to(device, dtype=weight_dtype)
#-----
#-----

## Dataset loading and preparation
The function below is a standalone function that returns a dataloader for training the components.

You might first need to filter the records of the dataset.

It is not required, but you might want to play with the `resolution` parameter (defined in a cell above), or add additional augmentations to the `train_transforms` sequence, in between the resize and `ToTensor` steps. You can take a look here: https://pytorch.org/vision/stable/transforms.html

In [7]:
def prepare_dataset_loader(class_keywords=None):
    """Build the training DataLoader for the 'unibuc-cs/ROAItransportationCo' train split.

    Reads the notebook-level settings `tokenizer`, `resolution` and `train_batch_size`.

    Args:
        class_keywords: optional iterable of words, e.g. ["bus", "truck", "train"].
            When given, only the examples whose caption contains at least one of the
            words are kept (case-insensitive *substring* match, so "train" also matches
            "training"). With the default (None) the whole split is used.

    Returns:
        A shuffled `torch.utils.data.DataLoader` whose batches are dicts with
        "pixel_values" (float tensor of the transformed images) and
        "input_ids" (caption token ids padded/truncated to `tokenizer.model_max_length`).

    Raises:
        ValueError: if `class_keywords` is given and no caption matches any of them.
    """
    def tokenize_captions(examples):
        # Tokenize the captions to fixed-length id tensors
        captions = examples['text']
        inputs = tokenizer(
            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        return inputs.input_ids

    def preprocess_train(examples):
        # Applied lazily (through with_transform) every time examples are read
        images = [image.convert("RGB") for image in examples['image']]
        examples["pixel_values"] = [train_transforms(image) for image in images]
        examples["input_ids"] = tokenize_captions(examples)
        return examples

    def collate_fn(examples):
        # Stack the per-example tensors into batch tensors
        pixel_values = torch.stack([example["pixel_values"] for example in examples])
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
        input_ids = torch.stack([example["input_ids"] for example in examples])
        return {"pixel_values": pixel_values, "input_ids": input_ids}

    # Preprocessing the datasets.
    train_transforms = transforms.Compose(
        [
            transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),

            # ----
            # TODO 3.5 (very low priority): You might add additional augmentation
            transforms.CenterCrop(resolution),
            transforms.RandomHorizontalFlip(),

            #----
            transforms.ToTensor(),
            # ToTensor yields values in [0, 1]; this maps them to [-1, 1]
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    # Load a dataset
    dataset = load_dataset('unibuc-cs/ROAItransportationCo', split='train', trust_remote_code=True)

    # TODO 3: filter the dataset to consider only the interesting classes, i.e.,
    # if the label (which is a text) for each image contains one of the mentioned classes
    # Hint: you might also balance a bit to select more items from the new labels added (vehicles) rather than the previous ones,
    # but at the same time keep some of the previous in. If you need more info how to process and filter the dataset, check here:
    # https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter , for API. I remember that the previous engineer was just
    # checking if words: bus truck and train,  are in the images labels.
    # A plain keyword filter is available through `class_keywords`; balancing is still up to you.
    if class_keywords:
        keywords = [keyword.lower() for keyword in class_keywords]

        def caption_matches(caption):
            caption = (caption or "").lower()
            return any(keyword in caption for keyword in keywords)

        # Filter on the raw dataset (before the transform is attached) and look only at the
        # caption column, so the predicate receives the caption string directly.
        dataset = dataset.filter(caption_matches, input_columns=["text"])
        if len(dataset) == 0:
            raise ValueError(f"No caption contains any of the keywords {list(class_keywords)!r}")

    train_dataset = dataset.with_transform(preprocess_train)

    # Data Loader final object used during training process
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=train_batch_size,
        num_workers=0,
    )

    return train_dataloader

train_dataloader = prepare_dataset_loader()


### Let's sample some items from the dataset to ensure that we didn't mess up the transformations or any other parameters!


In [None]:
# We just copy-paste some of the code from the INTRO notebook

###################
from PIL import Image

# Create a generator with a fixed seed for reproducibility. It is created on `device`
# (set in the configuration cell) rather than on a hard-coded "cuda", like generate_img() below.
generator = torch.Generator(device).manual_seed(0)

def image_grid(imgs, rows=2, cols=2):
    """Paste the images row by row into a single RGB canvas of `rows` x `cols` cells.

    Each cell has the size of the first image in `imgs`.
    """
    cell_w, cell_h = imgs[0].size
    canvas = Image.new("RGB", size=(cols * cell_w, rows * cell_h))

    for index, picture in enumerate(imgs):
        # index -> (row, column) position inside the grid
        row, col = divmod(index, cols)
        canvas.paste(picture, box=(col * cell_w, row * cell_h))
    return canvas

def generate_img(prompt, seed=42, size=(384, 384)):
    """Generate one image for `prompt` with the notebook's `pipe` and return it resized.

    Args:
        prompt: text prompt passed to the pipeline.
        seed: seed of the generator used for sampling (fixed by default for reproducibility).
        size: (width, height) of the returned image.

    Returns:
        The first generated image, resized to `size`.
    """
    generator = torch.Generator(device=device).manual_seed(seed)
    image = pipe(prompt=prompt, generator=generator, num_inference_steps=50).images[0]
    # resize() returns a new image and leaves the original untouched, so its result must be
    # the one that is returned (it was previously discarded).
    return image.resize(size)



### Now let's sample a batch of data to see how it works
sample_batch = next(iter(train_dataloader))
# Undo Normalize([0.5], [0.5]) to bring the pixel values back to the [0, 1] range
sample_pixels = (sample_batch['pixel_values'] * 0.5) + 0.5
sample_input_ids = sample_batch['input_ids']

# Decode the captions and let the tokenizer drop its own special (start/end/padding) tokens,
# instead of comparing against hard-coded token ids.
sample_texts_decoded = tokenizer.batch_decode(sample_input_ids, skip_special_tokens=True)

transf_to_img = transforms.ToPILImage()
sample_images = [transf_to_img(x) for x in sample_pixels]
# One row with as many columns as images actually present in the batch
# (a batch can hold fewer than train_batch_size items).
display(image_grid(sample_images, 1, len(sample_images)))

for textt in sample_texts_decoded:
    print(textt)

## Choose what you train from the diffusion pipeline

- remember that the text encoder (together with its tokenizer) stays frozen; it is good enough as it is
- you can unfreeze one of vae, unet, or both components at the same time!
- remember that each of them has a different number of parameters and can affect the training results in various ways. However, our previous engineer was training only the UNET!

In [9]:
# By default we do not train any component, but if we don't train anything we don't get improvements.
# However, our previous engineer was training only the UNET!
# TODO 2: You can set any of them on True, or even both!
train_UNET = False
train_VAE = False


# Please let it freeze :)
text_encoder.requires_grad_(False)

# Set the gradient flag AND the train/eval mode explicitly from the switches, so that
# re-running this cell after flipping a switch fully undoes the previous choice.
vae.requires_grad_(train_VAE)
vae.train(train_VAE)

unet.requires_grad_(train_UNET)
unet.train(train_UNET)

# The optimizer must own the parameters of every component that is trained; otherwise
# that component would receive gradients but never be updated.
trainable_params = []
if train_UNET:
    trainable_params.extend(unet.parameters())
if train_VAE:
    trainable_params.extend(vae.parameters())
if not trainable_params:
    # Nothing is trained: AdamW refuses an empty parameter list, so hand it the frozen UNet
    # parameters. They never get gradients, hence optimizer.step() leaves them unchanged.
    trainable_params = list(unet.parameters())

optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)

## The training loop - your main playtoy!

In [None]:
# len(train_dataloader) is the number of BATCHES, i.e. the optimization steps in one epoch
steps_per_epoch = len(train_dataloader)
num_train_epochs = math.ceil(max_train_steps / steps_per_epoch)
print("***** Running training *****")
print(f"  Num examples = {len(train_dataloader.dataset)}")
print(f"  Num Epochs = {num_train_epochs}")
print(f"  Instantaneous batch size per device = {train_batch_size}")
print(f"  Total optimization steps = {max_train_steps}")

global_step = 0
initial_global_step = 0
loss_window = 20  # number of most recent steps averaged for the displayed loss

progress_bar = tqdm(
    range(0, max_train_steps),
    initial=initial_global_step,
    desc="Steps",
    position=0,
    leave=True
)

losses = []
for epoch in range(num_train_epochs):
    for step, batch in enumerate(train_dataloader):

        # Convert images to latent space
        latents = vae.encode(batch["pixel_values"].to(weight_dtype).to(device)).latent_dist.sample()

        # Scale, do not worry about it
        latents = latents * vae.config.scaling_factor

        # Sample noise that we'll add to the latents
        noise = torch.randn_like(latents)
        batch_size = latents.shape[0]

        # Sample a random timestep for each image in the batch
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,), device=latents.device)
        timesteps = timesteps.long()

        # Add noise to the latents according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # Get the text embedding for conditioning
        encoder_hidden_states = text_encoder(batch["input_ids"].to(device), return_dict=False)[0]

        # Predict the noise residual and compute loss
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
        loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")

        # Backpropagate and apply gradients - only when something is actually being trained
        if train_UNET or train_VAE:
            loss.backward()

            # TODO 4: many times, the gradients have high variance, values and this creates instability. You might want to clip the gradients
            # for the models that you train, e.g., unet, vae, or both.
            # You might check the https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html.
            # The previous engineer was setting the max_norm to 1, but it's up to you. You can start with that value too.

            optimizer.step()
            optimizer.zero_grad()

        ###############################################################

        losses.append(loss.item())
        progress_bar.update(1)
        global_step += 1

        ### Logging and correctness check
        # TODO 5: Show the average loss over time, as a mean of the last `loss_window` steps.
        # Displaying only the last loss might confuse you
        recent_losses = losses[-loss_window:]
        avg_losses = sum(recent_losses) / len(recent_losses)

        progress_bar.set_postfix(average_loss=avg_losses, step=global_step)

        # TODO 6:
        # In the process of training diffusers it is very important to display after a certain number of steps the results and evaluate them with your eyes.
        # Just try some prompts as in the intro notebook to see if you are really going in the correct direction.
        # Many times in this case the loss is not as significant as the quality itself. You can use some of the intro code to display a selection of results here

        # TODO 7: Remember that training could go...crazy. Try to save your best checkpoint, considering the loss and visual quality at some points.
        # If that happens, remember to restart training. You can do something programmatically, try to reload the model from start

        if global_step >= max_train_steps:
            break

    # `break` above only leaves the inner loop; stop the epoch loop as well, otherwise
    # every remaining epoch would run one extra step past max_train_steps.
    if global_step >= max_train_steps:
        break

progress_bar.close()

In [None]:
# Generate one 256x256 image with the current pipeline and show it enlarged to 512x512
prompt = "A plane on the runway"
generation = pipe(prompt, width=256, height=256)
image = generation.images[0]
display(image.resize((512, 512)))


# Write your explanation first, and then submit the model

# 300 words or less!!!!!!!
My model is just playing around for now. I need to work on it to win the contest!

# Final submission code to HF hub

In [None]:

import os

# Do not paste the access token into the notebook: it would be saved (and shared) with the file.
# Export it as the HF_TOKEN environment variable instead. When the variable is unset, None is
# passed - NOTE(review): this is expected to fall back to the credentials stored by
# notebook_login(); confirm against the diffusers push_to_hub documentation.
hf_token = os.environ.get("HF_TOKEN")
pipe.push_to_hub("your_hf_id/your_model_name", token=hf_token)