#Installing Necessary Libraries

In [None]:
# Install third-party packages into the runtime via the shell (no versions are pinned).
!pip install transformers torch datasets Pillow

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

#Dataset Class to import and store image and description pairs

In [None]:
# Mount Google Drive at /content/drive; later cells read the captions file and
# images from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset
from transformers import CLIPProcessor
class CustomImageDataset(Dataset):
    """Image/description pairs read from a comma-separated annotations file.

    Every non-blank line of ``annotations_file`` has the form
    ``<image name>,<description>``. Only the first comma separates the two
    fields, so a description may itself contain commas.
    """

    def __init__(self, annotations_file, img_dir, processor):
        """Parse the annotations file and remember where the images live.

        Args:
            annotations_file: Path to the text file described on the class.
            img_dir: Directory that image names are resolved against.
            processor: Callable used in ``__getitem__``; it is invoked with
                ``text=``, ``images=``, ``return_tensors=``, ``padding=`` and
                ``truncation=`` keyword arguments.

        Raises:
            ValueError: If a non-blank line contains no comma.
        """
        self.img_labels = []
        with open(annotations_file, 'r', encoding='utf-8') as f:
            for line_number, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at end of file)
                    # carry no pair; skip them instead of failing to unpack.
                    continue
                # Split on the first comma only so commas inside the
                # description are preserved.
                image, separator, description = line.partition(',')
                if not separator:
                    raise ValueError(
                        f"{annotations_file}:{line_number}: expected "
                        f"'<image>,<description>' but found no comma"
                    )
                self.img_labels.append({'image': image, 'description': description.strip()})

        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        """Return the number of image/description pairs."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load one pair and return the processor output without its batch axis.

        Returns:
            dict mapping each key produced by the processor to its tensor
            with the leading dimension squeezed away.
        """
        record = self.img_labels[idx]
        img_path = os.path.join(self.img_dir, record['image'])
        # convert() returns an independent image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")
        text = record['description']

        # Use the processor to prepare inputs
        inputs = self.processor(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        return inputs


# Checkpoint name and data locations are named once so they are easy to spot and change.
clip_checkpoint = "openai/clip-vit-large-patch14"
captions_path = "/content/drive/MyDrive/flick8/captions1061.txt"
images_path = "/content/drive/MyDrive/flick8/Images"

processor = CLIPProcessor.from_pretrained(clip_checkpoint)
dataset = CustomImageDataset(
    annotations_file=captions_path,
    img_dir=images_path,
    processor=processor,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

#Custom collate function to bring all tensors in same shape

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Combine per-sample dicts into one batch dict of equally shaped tensors.

    Args:
        batch: Sequence of dicts, each holding 'input_ids', 'pixel_values'
            and 'attention_mask' tensors for one sample.

    Returns:
        dict with the same three keys. 'input_ids' and 'attention_mask' are
        right-padded with zeros to the longest sequence in the batch;
        'pixel_values' is stacked along a new leading axis.
    """
    # Imported locally so the function works on its own: no earlier cell binds
    # the name `torch` (they only import names *from* torch submodules).
    import torch

    # Extract all 'input_ids', 'pixel_values', and 'attention_mask' tensors from the batch
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    pixel_values = [item['pixel_values'].squeeze(0) for item in batch]
    attention_mask = [item['attention_mask'].squeeze(0) for item in batch]

    # Pad the sequences so that they are all the same length. Padded positions
    # get mask value 0, i.e. they are marked as "not a real token".
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)

    # Stack pixel values without needing padding
    pixel_values_stacked = torch.stack(pixel_values)

    # Return the batch as a dictionary
    return {
        'input_ids': input_ids_padded,
        'pixel_values': pixel_values_stacked,
        'attention_mask': attention_mask_padded,
    }

In [None]:
# Only a cell's last expression is displayed, so the size is printed explicitly.
print(f"Dataset size: {len(dataset)}")

# View the first 3 items; their text tensors may differ in length, which is
# why batching needs a padding collate function.
for i in range(3):
    item = dataset[i]
    print(f"Sample {i+1}:")
    for key, value in item.items():
        print(f"{key}: {value.shape}")
    print("\n")

Sample 1:
input_ids: torch.Size([20])
attention_mask: torch.Size([20])
pixel_values: torch.Size([3, 224, 224])


Sample 2:
input_ids: torch.Size([10])
attention_mask: torch.Size([10])
pixel_values: torch.Size([3, 224, 224])


Sample 3:
input_ids: torch.Size([11])
attention_mask: torch.Size([11])
pixel_values: torch.Size([3, 224, 224])




#Load custom Data and import CLIP model for finetuning

In [None]:
from torch.utils.data import DataLoader
# Create the DataLoader using the custom collate function.
# shuffle=True draws batches in a new random order every epoch. The training
# loop treats every other sample in a batch as a negative, so batches should not
# simply follow file order.
# NOTE(review): if the captions file lists several captions of the same image on
# consecutive lines, unshuffled batches would pair an image with itself as a
# "negative" — confirm against the data.
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [None]:
from transformers import CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
# Set model to training mode. train() returns the module itself; the trailing
# semicolon stops the notebook from echoing its full multi-hundred-line repr.
model.train();


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [None]:
# Number of batches the DataLoader yields per pass over the dataset.
len(dataloader)

254

#Fine-tuning CLIP Component

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Fine-tune CLIP with a symmetric contrastive objective: within a batch, the
# i-th image should match the i-th text and vice versa.

# Train on the GPU when one is available; otherwise fall back to the CPU.
# The model is moved before the optimizer is built so the optimizer tracks
# the parameters on their final device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function
loss_fn = CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=5e-6)

for epoch in range(2):  # Adjust the number of epochs as needed
    for batch in dataloader:
        # Inputs must live on the same device as the model.
        batch = {key: value.to(device) for key, value in batch.items()}

        optimizer.zero_grad()

        # Get model outputs
        outputs = model(**batch)

        # Get logits
        logits_per_image = outputs.logits_per_image  # shape [batch_size, batch_size]
        logits_per_text = outputs.logits_per_text  # shape [batch_size, batch_size]

        # Create labels: the matching pair sits on the diagonal, so the
        # target class for row i is i -> [0, 1, 2, ..., batch_size-1]
        labels = torch.arange(logits_per_image.size(0), device=logits_per_image.device)

        # Compute loss in both directions (image->text and text->image)
        loss_image = loss_fn(logits_per_image, labels)
        loss_text = loss_fn(logits_per_text, labels)
        loss = (loss_image + loss_text) / 2  # Average the loss

        # Backpropagation
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 2.293484687805176
Epoch 1, Loss: 1.078918218612671
Epoch 1, Loss: 1.0336787700653076
Epoch 1, Loss: 2.407337188720703
Epoch 1, Loss: 2.040100336074829
Epoch 1, Loss: 6.314553260803223
Epoch 1, Loss: 1.4143588542938232
Epoch 1, Loss: 0.8775413036346436
Epoch 1, Loss: 1.3854382038116455
Epoch 1, Loss: 1.525776982307434
Epoch 1, Loss: 2.1553571224212646
Epoch 1, Loss: 1.047343373298645
Epoch 1, Loss: 1.006317377090454
Epoch 1, Loss: 0.8459956049919128
Epoch 1, Loss: 2.0064141750335693
Epoch 1, Loss: 1.9524552822113037
Epoch 1, Loss: 0.8792784214019775
Epoch 1, Loss: 1.4790772199630737
Epoch 1, Loss: 1.6425843238830566
Epoch 1, Loss: 1.9328067302703857
Epoch 1, Loss: 2.2020530700683594
Epoch 1, Loss: 0.8957343697547913
Epoch 1, Loss: 1.2347745895385742
Epoch 1, Loss: 1.3132097721099854
Epoch 1, Loss: 1.9004249572753906
Epoch 1, Loss: 1.760294795036316
Epoch 1, Loss: 0.9435906410217285
Epoch 1, Loss: 0.7944098114967346
Epoch 1, Loss: 1.193881869316101
Epoch 1, Loss: 1.9910678

#Save the finetuned CLIP model

In [None]:
# Write the model and the processor into the same local directory so both can
# later be loaded from that single path.
model.save_pretrained("flick8-fine-tuned-clip-model")
processor.save_pretrained("flick8-fine-tuned-clip-model")

[]

#Fine Tuning VAE and UNET Components

In [None]:
# Install diffusers (imported by the next cell) and torch via the shell; no versions are pinned.
!pip install diffusers
!pip install torch

In [None]:
from diffusers import StableDiffusionPipeline
from transformers import CLIPTextModel, CLIPTokenizer

# Checkpoints are named once so they are easy to spot and change.
pipeline_checkpoint = "CompVis/stable-diffusion-v1-4"
clip_checkpoint = "openai/clip-vit-large-patch14"

# NOTE: from here on `model` refers to the Stable Diffusion pipeline, no longer
# to the CLIP model bound to the same name in earlier cells.
model = StableDiffusionPipeline.from_pretrained(pipeline_checkpoint)

# Enable gradients on the two components that will be fine-tuned (VAE and UNet).
for trainable_component in (model.vae, model.unet):
    trainable_component.requires_grad_(True)

tokenizer = CLIPTokenizer.from_pretrained(clip_checkpoint)
text_encoder = CLIPTextModel.from_pretrained(clip_checkpoint)


In [None]:
import torch

class CustomImageDataset(Dataset):
    """Image/description pairs read from a JSON Lines file.

    Every non-blank line of ``descriptions_file`` is a JSON object with an
    "image" key (file name inside ``img_dir``) and a "description" key.

    NOTE: this definition replaces the class of the same name defined earlier
    in the notebook; instances created before this cell keep the old behavior.
    """

    def __init__(self, descriptions_file, img_dir, processor):
        """Load all records from ``descriptions_file``.

        Args:
            descriptions_file: Path to the JSON Lines file described on the class.
            img_dir: Directory that image names are resolved against.
            processor: Callable used in ``__getitem__``; it is invoked with
                ``text=``, ``images=`` and ``return_tensors=`` keyword arguments.
        """
        # Local import: no cell of the notebook imports json at module level.
        import json

        self.img_dir = img_dir
        self.processor = processor
        with open(descriptions_file, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            self.data = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(pixel_values, input_ids)`` for record ``idx``.

        Both tensors have the processor's leading batch dimension removed.
        """
        record = self.data[idx]
        img_name = record["image"]
        description = record["description"]
        img_path = os.path.join(self.img_dir, img_name)
        # convert() returns an independent image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")
        inputs = self.processor(text=description, images=image, return_tensors="pt")

        pixel_values = inputs["pixel_values"].squeeze(0)  # Remove batch dimension
        input_ids = inputs["input_ids"].squeeze(0)  # Remove batch dimension

        return pixel_values, input_ids


#Custom collate function to pre process tensors data and shape

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms

def collate_fn_ldm(batch):
    """Collate samples into a batch for VAE/UNet fine-tuning.

    Args:
        batch: Sequence of samples. Each sample is either a dict with
            'pixel_values' and 'input_ids' tensors, or a
            ``(pixel_values, input_ids)`` tuple. Both forms are accepted
            because the two dataset classes in this notebook return different
            shapes of item.

    Returns:
        dict with 'pixel_values' (images resized to 224x224 and stacked) and
        'input_ids' (right-padded with ``tokenizer.pad_token_id``).
    """
    pixel_values = []
    input_ids = []
    for item in batch:
        if isinstance(item, dict):
            pixel_values.append(item['pixel_values'])
            input_ids.append(item['input_ids'])
        else:
            image_tensor, token_ids = item
            pixel_values.append(image_tensor)
            input_ids.append(token_ids)

    # Pad input_ids to the same length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)

    # Resize images to a common size
    resize_transform = transforms.Resize((224, 224))  # Choose a suitable size
    pixel_values = [resize_transform(img) for img in pixel_values]

    # Convert list of pixel values to a PyTorch tensor
    pixel_values = torch.stack(pixel_values)

    return {'pixel_values': pixel_values, 'input_ids': input_ids}

##Load custom Data for finetuning VAE and UNET

In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch

# Fine-tune the VAE and UNet of the Stable Diffusion pipeline with the
# noise-prediction objective: add noise to the image latents at a random
# timestep and train the UNet to predict that noise, conditioned on the text.
params = list(model.vae.parameters()) + list(model.unet.parameters())
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn_ldm)
optimizer = AdamW(params, lr=1e-5)

num_epochs = 5

# The pipeline's own scheduler supplies the forward (noising) process.
noise_scheduler = model.scheduler
num_train_timesteps = noise_scheduler.config.num_train_timesteps

for epoch in range(num_epochs):
    # collate_fn_ldm returns a dict; iterating it directly would yield only
    # its key strings, so the tensors are looked up by name.
    for batch in dataloader:
        pixel_values = batch['pixel_values']
        input_ids = batch['input_ids']

        optimizer.zero_grad()

        # Encode images into the latent space the UNet operates in.
        # NOTE(review): pixel_values come from the CLIP processor's
        # normalization; confirm this matches the value range the VAE expects.
        latents = model.vae.encode(pixel_values).latent_dist.sample()  # Latent representation
        latents = latents * model.vae.config.scaling_factor

        # Noise must have the latents' shape (not the images'), and has to be
        # mixed into the latents for the prediction target to be meaningful.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0, num_train_timesteps, (latents.shape[0],), device=latents.device
        ).long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # The UNet is conditioned on text-encoder hidden states, not raw token
        # ids. The text encoder is not being optimized, so no gradients are needed.
        with torch.no_grad():
            encoder_hidden_states = model.text_encoder(input_ids)[0]

        noise_pred = model.unet(noisy_latents, timesteps, encoder_hidden_states).sample

        # Compute loss and backpropagate
        loss = ((noise_pred - noise) ** 2).mean()
        loss.backward()

        optimizer.step()

    # Reports the loss of the last batch of the epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")


#Push new trained models to huggingface

In [None]:
pip install huggingface_hub



In [None]:
!huggingface-cli login
#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


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) [REDACTED: an access token was pasted here by mistake; revoke and rotate it]
Invalid input. Must be one of ('y', 'yes', '1', 'n', 'no', '0', '')
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `skin-stable-diffusion` has been saved to /root/.c

In [None]:
from huggingface_hub import HfApi, HfFolder

# Initialize HfApi
api = HfApi()

# Your model repository name
repo_name = "saiabhishek-itta/flick8-finetuned-clip"

# Create a repository on the Hugging Face Hub
api.create_repo(repo_name, exist_ok=True)

# When the notebook is run top to bottom, `model` has been rebound to the
# Stable Diffusion pipeline by this point, so pushing it would upload the wrong
# object to the CLIP repository. Reload the fine-tuned CLIP weights from the
# directory they were saved to and push those instead.
clip_model = CLIPModel.from_pretrained("flick8-fine-tuned-clip-model")

# Push the model and processor to the repository
clip_model.push_to_hub(repo_name)
processor.push_to_hub(repo_name)


model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saiabhishek-itta/flick8-finetuned-clip/commit/d6d000895acd2ea1ec81984260e0cbd5aeb0566d', commit_message='Upload processor', commit_description='', oid='d6d000895acd2ea1ec81984260e0cbd5aeb0566d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/saiabhishek-itta/flick8-finetuned-clip', endpoint='https://huggingface.co', repo_type='model', repo_id='saiabhishek-itta/flick8-finetuned-clip'), pr_revision=None, pr_num=None)

#Running streamlit app.py on LocalTunnel
Upload the `app.py` file to this Colab session before running the cells below.

In [None]:
# Install what the app cell below invokes: diffusers and streamlit via pip,
# localtunnel via npm. No versions are pinned.
!pip install diffusers
!npm install localtunnel
!pip install streamlit

In [None]:
# Start the Streamlit app and a localtunnel on port 8501 as background jobs,
# then print this machine's public IPv4 address with curl.
# NOTE(review): presumably the printed IP is what the localtunnel landing page
# asks for as its access password — verify.
!streamlit run app.py & npx localtunnel --port 8501 & curl ipv4.icanhazip.com