In [None]:
!pip install transformers torch datasets Pillow

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

In [None]:
import ast
import os

import torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import CLIPProcessor

class CustomImageDataset(Dataset):
    """Image/description pairs preprocessed for a CLIP-style processor.

    Every non-blank line of the annotations file must be a Python dict
    literal containing at least the keys ``'image'`` (file name relative to
    ``img_dir``) and ``'description'`` (the text paired with that image).
    """

    def __init__(self, annotations_file, img_dir, processor):
        """Load the annotation records.

        Args:
            annotations_file: Path of the text file with one dict literal per line.
            img_dir: Directory that the ``'image'`` entries are joined onto.
            processor: Callable invoked as
                ``processor(text=..., images=..., return_tensors="pt", ...)``
                (presumably a ``CLIPProcessor``).

        Raises:
            ValueError: If a line is valid Python but not a plain literal.
            SyntaxError: If a line is not valid Python syntax.
        """
        with open(annotations_file, 'r') as f:
            # literal_eval only accepts literals, so a crafted annotation line
            # cannot run arbitrary code the way eval() would. Blank lines
            # (e.g. a trailing newline at end of file) are skipped.
            self.img_labels = [
                ast.literal_eval(stripped)
                for stripped in (line.strip() for line in f)
                if stripped
            ]
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotation records."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return the processor output for record ``idx`` without its batch axis.

        Returns:
            dict mapping each key produced by the processor to its tensor with
            the leading dimension squeezed away.
        """
        record = self.img_labels[idx]
        img_path = os.path.join(self.img_dir, record['image'])
        # The context manager releases the file handle as soon as the pixels
        # have been decoded into the converted copy.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        text = record['description']

        inputs = self.processor(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)
        # The processor returns a batch of one; drop that leading axis so the
        # DataLoader's collate function can build the real batch.
        return {key: val.squeeze(0) for key, val in inputs.items()}

# Build the CLIP preprocessor, then wrap the annotated image folder in a dataset
# that applies it to every sample.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
dataset = CustomImageDataset(
    annotations_file="descriptions.txt",
    img_dir="/content/drive/MyDrive/SkinRash_training/",
    processor=processor,
)


In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge per-sample dicts into one batch dict of tensors.

    Args:
        batch: Sequence of dicts, each holding ``'input_ids'``,
            ``'attention_mask'`` and ``'pixel_values'`` tensors. A leading
            size-1 batch axis on any of them is tolerated and removed.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` right-padded with
        zeros to the longest sequence in the batch (batch-first), and
        ``'pixel_values'`` stacked along a new leading axis.
    """
    def _drop_batch_dim(tensor, sample_rank):
        # Only squeeze when there really is one axis more than a single sample
        # has. An unconditional squeeze(0) would collapse a length-1 token
        # sequence to a 0-d tensor or strip the channel axis of a
        # single-channel image.
        return tensor.squeeze(0) if tensor.dim() > sample_rank else tensor

    input_ids = [_drop_batch_dim(item['input_ids'], 1) for item in batch]
    attention_mask = [_drop_batch_dim(item['attention_mask'], 1) for item in batch]
    pixel_values = [_drop_batch_dim(item['pixel_values'], 3) for item in batch]

    # Pad the sequences so that they are all the same length; padded positions
    # get 0 in both the ids and the mask.
    input_ids_padded = pad_sequence(input_ids, batch_first=True)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True)

    # Stack pixel values without needing padding
    pixel_values_stacked = torch.stack(pixel_values)

    return {
        'input_ids': input_ids_padded,
        'pixel_values': pixel_values_stacked,
        'attention_mask': attention_mask_padded,
    }




In [None]:
# Sanity check: print the tensor shapes of the first few samples.
# Bounded by the dataset size so a dataset with fewer than 3 items does not
# raise IndexError.
preview_count = min(3, len(dataset))
for i in range(preview_count):
    item = dataset[i]
    print(f"Sample {i+1}:")
    for key, value in item.items():
        print(f"{key}: {value.shape}")
    print("\n")

Sample 1:
input_ids: torch.Size([13])
attention_mask: torch.Size([13])
pixel_values: torch.Size([3, 224, 224])


Sample 2:
input_ids: torch.Size([12])
attention_mask: torch.Size([12])
pixel_values: torch.Size([3, 224, 224])


Sample 3:
input_ids: torch.Size([12])
attention_mask: torch.Size([12])
pixel_values: torch.Size([3, 224, 224])




In [None]:
from torch.utils.data import DataLoader

# Create the training DataLoader using the custom collate function.
# shuffle=True reorders the samples every epoch, so each batch (and therefore
# the set of in-batch negatives used by the contrastive loss) differs between
# epochs.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)



In [None]:
# Print the token ids produced from each sample's description
# ('input_ids' holds the tokenized text, not the raw string).
for item in dataset:
    token_ids = item['input_ids']
    print(token_ids)

tensor([49406,  1774,   893,   539,   320,  2849,  3575,  2533,   593,   839,
          905,  8388, 49407])
tensor([49406,  1710,   539,   320,  2849,  3575,  2533,   593,   839,   905,
         8388, 49407])
tensor([49406,  1710,   539,   320,  2849,  3575,  2533,   593,   839,   905,
         8388, 49407])


In [None]:
from transformers import CLIPModel

# Load the pretrained CLIP weights and switch to training mode in one step;
# train() returns the module itself, so `model` is the same object either way.
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").train()
model


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Train on the GPU when one is available. The model and every batch must live
# on the same device, so the model is moved before the optimizer is created
# and each batch is moved inside the loop.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function
loss_fn = CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=5e-6)

for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()

        # Get model outputs
        outputs = model(**batch)

        # Image-to-text and text-to-image similarity logits
        logits_per_image = outputs.logits_per_image  # shape [batch_size, batch_size]
        logits_per_text = outputs.logits_per_text  # shape [batch_size, batch_size]

        # The i-th image matches the i-th text, so the targets are the
        # diagonal indices [0, 1, ..., batch_size-1], created directly on the
        # logits' device.
        labels = torch.arange(logits_per_image.size(0), device=logits_per_image.device)

        # Symmetric contrastive loss: average of both matching directions
        loss_image = loss_fn(logits_per_image, labels)
        loss_text = loss_fn(logits_per_text, labels)
        loss = (loss_image + loss_text) / 2

        # Backpropagation
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 0.5881126523017883
Epoch 2, Loss: 2.0498573780059814
Epoch 3, Loss: 0.5765641331672668


In [None]:
# Write the fine-tuned weights and the matching processor files into the same
# directory.
output_dir = "fine-tuned-clip-model"
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)


[]

In [None]:
pip install huggingface_hub




In [None]:
!huggingface-cli login
#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


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [None]:
from huggingface_hub import HfApi, HfFolder

# Target repository on the Hugging Face Hub
repo_name = "saiabhishek-itta/fine-tuned-clip"

# Make sure the repository exists; exist_ok=True is passed so re-running the
# cell against an existing repository is accepted.
api = HfApi()
api.create_repo(repo_id=repo_name, exist_ok=True)

# Upload the fine-tuned weights first, then the processor files
model.push_to_hub(repo_name)
processor.push_to_hub(repo_name)


model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saiabhishek-itta/fine-tuned-clip/commit/3da38f4a09d114e8e4d3a561a48463343ca538e3', commit_message='Upload processor', commit_description='', oid='3da38f4a09d114e8e4d3a561a48463343ca538e3', pr_url=None, pr_revision=None, pr_num=None)