#Installing Necessary Libraries

In [None]:
!pip install transformers torch datasets Pillow

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

#Dataset Class to import and store image and description pairs

In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset
from transformers import CLIPProcessor

class CustomImageDataset(Dataset):
    """Image/description pairs run through a processor for CLIP fine-tuning.

    The annotations file is read line by line. Every non-empty line must be a
    literal mapping that has an ``'image'`` entry (file name joined onto
    ``img_dir``) and a ``'description'`` entry (the caption text).
    """

    def __init__(self, annotations_file, img_dir, processor):
        """Parse the annotations file and remember where the images live.

        Args:
            annotations_file: path to the text file with one mapping per line.
            img_dir: directory that the ``'image'`` entries are relative to.
            processor: callable invoked as
                ``processor(text=..., images=..., return_tensors="pt", ...)``.

        Raises:
            ValueError, SyntaxError: if a line is not a plain Python literal.
        """
        # Function-scope import keeps this cell runnable on its own.
        import ast

        with open(annotations_file, 'r') as f:
            # literal_eval accepts only literals (dicts, strings, numbers, ...),
            # so a crafted annotations line cannot execute code the way eval() would.
            # Blank lines are skipped instead of raising.
            self.img_labels = [
                ast.literal_eval(line.strip()) for line in f if line.strip()
            ]
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return the processor output as a dict.

        The leading dimension of every returned tensor is squeezed away so
        that a collate function can batch the samples itself.
        """
        record = self.img_labels[idx]
        img_path = os.path.join(self.img_dir, record['image'])
        # The context manager closes the underlying file once the RGB copy exists.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        text = record['description']

        inputs = self.processor(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        return inputs

# Load the CLIP processor (applied to both the caption and the image of every
# sample) and wrap the Drive-hosted image/description pairs in the dataset class.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
dataset = CustomImageDataset(
    annotations_file="/content/drive/MyDrive/SkinRash_training/descriptions.txt",
    img_dir="/content/drive/MyDrive/SkinRash_training/",
    processor=processor,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

#Custom collate function to bring all tensors in same shape

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge per-sample CLIP inputs into a single padded batch.

    Args:
        batch: list of dicts, each holding 'input_ids', 'pixel_values' and
            'attention_mask' tensors for one sample.

    Returns:
        dict with the same three keys. Token tensors are right-padded with
        zeros to the longest sequence in the batch; pixel tensors are stacked
        along a new leading dimension.
    """
    # `torch` itself is only imported in a later cell of the notebook, so bring
    # it into scope here rather than relying on cell execution order.
    import torch

    # squeeze(0) drops a leading size-1 dimension if a sample still carries one
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    pixel_values = [item['pixel_values'].squeeze(0) for item in batch]
    attention_mask = [item['attention_mask'].squeeze(0) for item in batch]

    # Pad the sequences so that they are all the same length
    input_ids_padded = pad_sequence(input_ids, batch_first=True)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True)

    # Stack pixel values without needing padding
    pixel_values_stacked = torch.stack(pixel_values)

    # Return the batch as a dictionary
    return {
        'input_ids': input_ids_padded,
        'pixel_values': pixel_values_stacked,
        'attention_mask': attention_mask_padded,
    }




In [None]:
len(dataset)
# Print the tensor shape stored under each key for the first 3 samples,
# to check whether the shapes agree between samples.
for sample_no in (1, 2, 3):
    item = dataset[sample_no - 1]
    print(f"Sample {sample_no}:")
    for field, tensor in item.items():
        print(f"{field}: {tensor.shape}")
    print("\n")

Sample 1:
input_ids: torch.Size([17])
attention_mask: torch.Size([17])
pixel_values: torch.Size([3, 224, 224])


Sample 2:
input_ids: torch.Size([17])
attention_mask: torch.Size([17])
pixel_values: torch.Size([3, 224, 224])


Sample 3:
input_ids: torch.Size([12])
attention_mask: torch.Size([12])
pixel_values: torch.Size([3, 224, 224])




#Load custom Data and import CLIP model for finetuning

In [None]:
from torch.utils.data import DataLoader
# Batch the dataset 32 samples at a time; the custom collate function pads
# the token tensors of each batch to a common length.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    collate_fn=collate_fn,
)

In [None]:
from transformers import CLIPModel

# Load the pretrained CLIP checkpoint that the following cells fine-tune.
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
# As the last expression of the cell, this call's return value is what the
# notebook echoes below the cell.
model.train()  # Set model to training mode


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

#Fine-tuning CLIP Component

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Define loss function
# Cross-entropy is applied to both logit matrices (image->text and text->image)
loss_fn = CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=5e-6)

for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in dataloader:
        optimizer.zero_grad()

        # Forward pass over the whole batch
        outputs = model(**batch)
        logits_per_image = outputs.logits_per_image  # shape [batch_size, batch_size]
        logits_per_text = outputs.logits_per_text  # shape [batch_size, batch_size]

        # Row i is labelled with class i: targets are [0, 1, ..., batch_size-1]
        labels = torch.arange(logits_per_image.shape[0]).to(logits_per_image.device)

        # Mean of the two directional losses
        loss_image = loss_fn(logits_per_image, labels)
        loss_text = loss_fn(logits_per_text, labels)
        loss = (loss_image + loss_text) / 2

        # Gradient step
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 0.7010190486907959
Epoch 1, Loss: 0.6185771226882935
Epoch 1, Loss: 1.10859215259552
Epoch 1, Loss: 2.905186414718628
Epoch 2, Loss: 0.5374573469161987
Epoch 2, Loss: 0.6744922399520874
Epoch 2, Loss: 0.8340348601341248
Epoch 2, Loss: 1.1946301460266113
Epoch 3, Loss: 0.19791071116924286
Epoch 3, Loss: 0.4304355978965759
Epoch 3, Loss: 0.6413471102714539
Epoch 3, Loss: 0.7564416527748108


#Save the finetuned CLIP model

In [None]:
# Write the model and the processor into the same local directory,
# "fine-tuned-clip-model".
model.save_pretrained("fine-tuned-clip-model")
processor.save_pretrained("fine-tuned-clip-model")

[]

#Fine Tuning VAE and UNET Components

In [None]:
!pip install diffusers
!pip install torch

Collecting diffusers
  Downloading diffusers-0.30.0-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.30.0-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffusers
Successfully installed diffusers-0.30.0


In [None]:
from diffusers import StableDiffusionPipeline
from transformers import CLIPTextModel, CLIPTokenizer

# NOTE(review): this rebinds `model`, the name the earlier cells use for the CLIP
# model being fine-tuned; every later cell that reads `model` now gets the
# Stable Diffusion pipeline instead.
model = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
model.vae.requires_grad_(True)  # Fine-tune VAE
model.unet.requires_grad_(True)  # Fine-tune UNet

# Both are loaded from the base OpenAI checkpoint, not from the
# "fine-tuned-clip-model" directory saved earlier - TODO confirm this is intended.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")


model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

safety_checker/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

(…)kpoints/scheduler_config-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

In [None]:
import torch

class CustomImageDataset(Dataset):
    """Image/description pairs returned as ``(pixel_values, input_ids)`` tuples.

    The descriptions file holds one JSON object per line, each with an
    ``"image"`` entry (file name joined onto ``img_dir``) and a
    ``"description"`` entry (the caption text).
    """

    def __init__(self, descriptions_file, img_dir, processor):
        """Read the JSON-lines descriptions file.

        Args:
            descriptions_file: path to the file with one JSON object per line.
            img_dir: directory that the ``"image"`` entries are relative to.
            processor: callable invoked as
                ``processor(text=..., images=..., return_tensors="pt", ...)``.
        """
        # `json` is not imported anywhere else in the notebook; without this the
        # constructor raises NameError.
        import json

        self.img_dir = img_dir
        self.processor = processor
        with open(descriptions_file, 'r') as f:
            # Skip blank lines (e.g. a trailing newline) instead of failing on them.
            self.data = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(pixel_values, input_ids)`` for sample ``idx``, batch dim removed."""
        img_name = self.data[idx]["image"]
        description = self.data[idx]["description"]
        img_path = os.path.join(self.img_dir, img_name)
        # The context manager closes the underlying file once the RGB copy exists.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        # truncation=True matches the CLIP dataset class earlier in the notebook
        # and keeps over-long captions within the tokenizer's maximum length.
        inputs = self.processor(text=description, images=image, return_tensors="pt", truncation=True)

        pixel_values = inputs["pixel_values"].squeeze(0)  # Remove batch dimension
        input_ids = inputs["input_ids"].squeeze(0)  # Remove batch dimension

        return pixel_values, input_ids


#Custom collate function to preprocess tensor data and shapes

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms

def collate_fn_ldm(batch):
    """Collate samples into a ``{'pixel_values', 'input_ids'}`` batch dict.

    Args:
        batch: list of samples. Each sample is either a dict with
            'pixel_values' and 'input_ids' tensors, or a
            ``(pixel_values, input_ids)`` tuple as returned by the dataset
            class defined alongside this function.

    Returns:
        dict with 'pixel_values' (images resized to 224x224 and stacked) and
        'input_ids' (padded to the longest sequence with the tokenizer's pad id).
    """
    pixel_values, input_ids = [], []
    for item in batch:
        if isinstance(item, dict):
            pixel_values.append(item['pixel_values'])
            input_ids.append(item['input_ids'])
        else:
            # Tuple-style sample; indexing it by key would raise TypeError.
            sample_pixels, sample_ids = item
            pixel_values.append(sample_pixels)
            input_ids.append(sample_ids)

    # Pad input_ids to the same length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)

    # Resize images to a common size
    resize_transform = transforms.Resize((224, 224))  # Choose a suitable size
    pixel_values = [resize_transform(img) for img in pixel_values]

    # Convert list of pixel values to a PyTorch tensor
    pixel_values = torch.stack(pixel_values)

    return {'pixel_values': pixel_values, 'input_ids': input_ids}

##Load custom Data for finetuning VAE and UNET

In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch

# DDPMScheduler provides add_noise() for the forward diffusion process; it is
# built from the pipeline's own scheduler config so the noise schedule matches.
from diffusers import DDPMScheduler

params = list(model.vae.parameters()) + list(model.unet.parameters())
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn_ldm)
optimizer = AdamW(params, lr=1e-5)

num_epochs = 5

noise_scheduler = DDPMScheduler.from_config(model.scheduler.config)
device = model.unet.device

# Put the two trained sub-models into training mode.
model.vae.train()
model.unet.train()

for epoch in range(num_epochs):
    # collate_fn_ldm yields a dict; tuple-unpacking it would bind the key
    # strings rather than the tensors, so index it explicitly.
    for batch in dataloader:
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids']

        optimizer.zero_grad()

        # Encode images into the latent space the UNet operates in
        latents = model.vae.encode(pixel_values).latent_dist.sample()  # Latent representation
        latents = latents * model.vae.config.scaling_factor

        # Noise must have the latent shape (not the pixel shape) so that it can
        # be added to the latents and compared against the UNet prediction.
        noise = torch.randn_like(latents)

        # One random diffusion timestep per sample
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (latents.shape[0],),
            device=latents.device,
        )
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # The UNet is conditioned on text embeddings, not raw token ids.
        # The text encoder is not in `params`, so no gradients are needed for it.
        with torch.no_grad():
            encoder_hidden_states = text_encoder(input_ids.to(text_encoder.device))[0]
        encoder_hidden_states = encoder_hidden_states.to(device)

        noise_pred = model.unet(noisy_latents, timesteps, encoder_hidden_states).sample

        # Compute loss (MSE between predicted and true noise) and backpropagate
        loss = ((noise_pred - noise) ** 2).mean()
        loss.backward()

        optimizer.step()

    # Reports the loss of the last batch of the epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")


#Push new trained models to huggingface

In [None]:
pip install huggingface_hub



In [None]:
!huggingface-cli login
# Enter your access token at the prompt. Never store a token in the notebook; revoke any token that was previously committed here.


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [None]:
from huggingface_hub import HfApi, HfFolder

# Initialize HfApi
# Initialize HfApi
api = HfApi()

# Your model repository name
repo_name = "saiabhishek-itta/fine-tuned-clip"

# Create a repository on the Hugging Face Hub
api.create_repo(repo_name, exist_ok=True)

# Upload the directory written by the save_pretrained() cell (model weights and
# processor files). `model` cannot be pushed directly here: when the notebook is
# run top to bottom it has been rebound to the StableDiffusionPipeline, so
# model.push_to_hub() would publish the wrong artefact to this CLIP repository.
api.upload_folder(
    folder_path="fine-tuned-clip-model",
    repo_id=repo_name,
    commit_message="Upload fine-tuned CLIP model and processor",
)


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saiabhishek-itta/fine-tuned-clip/commit/11536a30a70c8d3617bb769b636b68e17601013b', commit_message='Upload processor', commit_description='', oid='11536a30a70c8d3617bb769b636b68e17601013b', pr_url=None, pr_revision=None, pr_num=None)

#Running streamlit app.py on LocalTunnel
Note: the app.py file must be uploaded into this Colab session before running the cells below.

In [None]:
!pip install diffusers
!npm install localtunnel
!pip install streamlit

Collecting diffusers
  Downloading diffusers-0.30.0-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.30.0-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffusers
Successfully installed diffusers-0.30.0
[K[?25h
added 22 packages, and audited 23 packages in 1s

3 packages are looking for funding
  run `npm fund` for details

2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues, run:
  npm audit fix

Run `npm audit` for details.
Collecting streamlit
  Downloading streamlit-1.37.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py

In [None]:
!streamlit run app.py & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

34.75.205.183

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.75.205.183:8501[0m
[0m
your url is: https://dry-sheep-sniff.loca.lt
2024-08-14 23:18:49.019975: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-14 23:18:49.042907: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-14 23:18:49.049964: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when