<a href="https://colab.research.google.com/github/sanjanb/Internship-log/blob/main/SynerSense_Finetuning_clip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Trying a modular approach**

### Loading and Slicing the Vision Encoder

In [None]:
from transformers import CLIPModel, CLIPProcessor
import torch
from PIL import Image

# The model weights and the matching preprocessor come from the same checkpoint
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

# Keep a handle on the vision sub-model only
vision_encoder = model.vision_model


### Passing an Image and Getting its Embeddings

In [None]:
# Load a sample image. Opening it in a context manager closes the file handle
# deterministically; convert() returns a loaded copy that stays usable after
# the file is closed, and makes the 3-channel RGB mode explicit.
with Image.open("image.png") as img:
    image = img.convert("RGB")

# Preprocess the image into model-ready PyTorch tensors
inputs = processor(images=image, return_tensors="pt")

# Extract visual features; gradients are not tracked for the encoder pass
with torch.no_grad():
    vision_outputs = vision_encoder(**inputs)
    pooled_output = vision_outputs.pooler_output  # (1, 768)

### Connecting to Dense (MLP) Network

In [None]:
import torch.nn as nn

class VLMtoMLP(nn.Module):
    """Two-layer MLP head applied on top of a feature vector.

    Args:
        input_dim: Size of each incoming feature vector.
        hidden_dim: Width of the single hidden layer.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim=768, hidden_dim=256, output_dim=2):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        # Unpacked into a Sequential so parameters stay named mlp.0.* / mlp.2.*
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through Linear -> ReLU -> Linear and return the logits."""
        logits = self.mlp(x)
        return logits

# Instantiate the MLP head and feed it the pooled image embedding
mlp_head = VLMtoMLP()
output = mlp_head(pooled_output)

### Adding a Loss and Running a Training Step

In [None]:
# Cross-entropy over the head's logits; only the head's parameters are optimized
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_head.parameters(), lr=1e-4)

# Dummy class-index label for the single example
labels = torch.tensor([1])

# Clear any previously accumulated gradients before computing new ones
optimizer.zero_grad()
loss = criterion(output, labels)
loss.backward()
optimizer.step()


## **Testing for a single image**

In [None]:
from transformers import CLIPModel, CLIPProcessor
import torch
import torch.nn as nn
from PIL import Image

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the CLIP preprocessor and model, then move the model to the device
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)

# Take the vision sub-model and freeze every one of its parameters
vision_encoder = model.vision_model
vision_encoder.requires_grad_(False)

# Load the image. The context manager closes the file handle deterministically;
# convert() returns a loaded copy that stays usable after the file is closed,
# and makes the 3-channel RGB mode explicit.
with Image.open("image.png") as img:
    image = img.convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(device)

# Forward pass through the frozen encoder; gradients are not tracked
with torch.no_grad():
    vision_outputs = vision_encoder(**inputs)
    pooled_output = vision_outputs.pooler_output  # shape: [1, 768]

# Custom MLP head
class VLMtoMLP(nn.Module):
    """Small classification head: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of each incoming feature vector.
        hidden_dim: Width of the single hidden layer.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim=768, hidden_dim=256, output_dim=2):
        super().__init__()
        stages = (
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )
        # Unpacked into a Sequential so parameters stay named mlp.0.* / mlp.2.*
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Return the logits produced by the MLP for input ``x``."""
        logits = self.mlp(x)
        return logits

# Train step: build the head, its loss, and an optimizer over the head only
mlp_head = VLMtoMLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_head.parameters(), lr=1e-4)

# Dummy class-index label, created directly on the target device
labels = torch.tensor([1], dtype=torch.long, device=device)

# Clear gradients before the backward pass so repeated steps never accumulate
optimizer.zero_grad()
output = mlp_head(pooled_output)
loss = criterion(output, labels)
loss.backward()
optimizer.step()

print("Loss:", loss.item())


Loss: 0.8742982149124146


## **Code for Dataset**
Since the dataset is still being prepared, the model has not been trained on it yet.

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os

# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained CLIP preprocessor and model (model moved to the device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

# Take the vision sub-model and freeze every one of its parameters
vision_encoder = model.vision_model
vision_encoder.requires_grad_(False)

# Custom MLP head
class VLMtoMLP(nn.Module):
    """MLP head with one hidden layer, mapping features to class logits.

    Args:
        input_dim: Size of each incoming feature vector.
        hidden_dim: Width of the single hidden layer.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim=768, hidden_dim=256, output_dim=2):
        super().__init__()
        blocks = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        # Unpacked into a Sequential so parameters stay named mlp.0.* / mlp.2.*
        self.mlp = nn.Sequential(*blocks)

    def forward(self, x):
        """Apply Linear -> ReLU -> Linear to ``x`` and return the result."""
        logits = self.mlp(x)
        return logits

# Trainable head, placed on the same device as the encoder
mlp_head = VLMtoMLP().to(device)

# Loss and optimizer; only the head's parameters are handed to Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=mlp_head.parameters(), lr=1e-4)

# Per-channel mean / std used for CLIP normalization
clip_mean = (0.4815, 0.4578, 0.4082)
clip_std = (0.2686, 0.2613, 0.2758)

# Transform (CLIP expects 224x224 RGB images normalized)
transform = Compose([
    Resize((224, 224)),
    ToTensor(),
    Normalize(mean=clip_mean, std=clip_std),
])

# Load dataset: one sub-folder per class under "dataset", shuffled batches of 8
dataset = datasets.ImageFolder(root="dataset", transform=transform)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

# Training loop: the frozen encoder produces embeddings, only the MLP head learns
for epoch in range(5):
    total_loss = 0
    for images, labels in dataloader:
        # The dataset transform already yields resized, normalized tensors, so
        # they are passed to the encoder directly as pixel_values. Converting
        # them back to PIL and re-running the processor would normalize the
        # data a second time (and relied on a `transforms` name that is never
        # imported in this file).
        pixel_values = images.to(device)
        labels = labels.to(device)

        # Get visual embeddings; gradients are not tracked for the encoder pass
        with torch.no_grad():
            vision_outputs = vision_encoder(pixel_values=pixel_values)
            pooled_output = vision_outputs.pooler_output  # (B, 768)

        # Forward pass through the trainable head
        logits = mlp_head(pooled_output)
        loss = criterion(logits, labels)

        # Backward pass: reset gradients, backpropagate, update the head
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss; the epoch report is this running sum
        total_loss += loss.item()

    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")