In [2]:
!git clone https://github.com/shashnkvats/Indofashionclip.git

Cloning into 'Indofashionclip'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 17 (delta 6), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (17/17), 6.30 KiB | 3.15 MiB/s, done.
Resolving deltas: 100% (6/6), done.


In [3]:
import os
os.chdir('/content/Indofashionclip')

In [None]:
!pip install -r requirements.txt

In [5]:
from google.colab import drive
drive.mount("/content/mydrive")

Mounted at /content/mydrive


In [6]:
# Dataset reference: https://www.kaggle.com/datasets/validmodel/indo-fashion-dataset
!unzip -qq '/content/mydrive/MyDrive/Colab Notebooks/COSE474/archive.zip'

In [None]:
!pip install tensorflow-gpu==2.8.0

In [8]:
import json
from PIL import Image

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import clip
from transformers import CLIPProcessor, CLIPModel

In [9]:
# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [10]:
# Choose computation device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load pre-trained CLIP model
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 130MiB/s]


In [11]:
# Define a custom dataset
class image_title_dataset():
    def __init__(self, list_image_path,list_txt):
        # Initialize image paths and corresponding texts
        self.image_path = list_image_path
        # Tokenize text using CLIP's tokenizer
        self.title  = clip.tokenize(list_txt)

    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        # Preprocess image using CLIP's preprocessing function
        image = preprocess(Image.open(self.image_path[idx]))
        title = self.title[idx]
        return image, title

In [40]:
# Create train dataloader
json_path = '/content/Indofashionclip/train_data.json'
image_path = '/content/Indofashionclip/images/train/'

with open(json_path, 'r') as f:
    input_data = []
    for line in f:
        obj = json.loads(line)
        input_data.append(obj)

list_image_path = []
list_txt = []
for item in input_data:
  img_path = image_path + item['image_path'].split('/')[-1]
  caption = item['product_title'][:40]
  list_image_path.append(img_path)
  list_txt.append(caption)

dataset = image_title_dataset(list_image_path, list_txt)
train_dataloader = DataLoader(dataset, batch_size=256, shuffle=True) # Define train dataloader

In [14]:
# Function to convert model's parameters to FP32 format
def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        p.grad.data = p.grad.data.float()


if device == "cpu":
  model.float()

In [None]:
!pip install wandb

In [None]:
import os
import wandb

# Initialize WandB
wandb.init(project="CLIP-2", name="training_run")

# Define variables to keep track of the best model and its corresponding loss
best_loss = float('inf')
best_model_path = 'best_model(1).pth'
epoch_losses = []

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2)
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

# Train the model
num_epochs = 30
for epoch in range(1, num_epochs):
    pbar = tqdm(train_dataloader, total=len(train_dataloader))
    epoch_loss = 0.0

    for batch in pbar:
        optimizer.zero_grad()

        images, texts = batch
        images = images.to(device)
        texts = texts.to(device)

        # Forward pass
        logits_per_image, logits_per_text = model(images, texts)

        # Compute loss
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        total_loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        # Backward pass
        total_loss.backward()
        if device == "cpu":
            optimizer.step()
        else:
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)

        epoch_loss += total_loss.item()
        pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {total_loss.item():.4f}")

    # Calculate average loss for the epoch
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    epoch_losses.append(avg_epoch_loss)

    # Save the best model
    if avg_epoch_loss < best_loss:
        best_loss = avg_epoch_loss
        torch.save(model.state_dict(), best_model_path)

    # Log metrics to WandB
    wandb.log({"epoch": epoch, "loss": avg_epoch_loss, "learning_rate": optimizer.param_groups[0]['lr']})

# Print and save the best loss
print(f"Best Loss: {best_loss:.4f}")
with open('epoch_losses.txt', 'w') as f:
    for epoch, loss in enumerate(epoch_losses):
        f.write(f"Epoch {epoch}/{num_epochs}, Loss: {loss:.4f}\n")

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Epoch 1/30, Loss: 3.4004: 100%|██████████| 357/357 [08:21<00:00,  1.40s/it]
Epoch 2/30, Loss: 3.4004: 100%|██████████| 357/357 [08:22<00:00,  1.41s/it]
Epoch 3/30, Loss: 5.5469:  16%|█▌        | 56/357 [01:18<06:54,  1.38s/it]

In [19]:
def evaluate(model, dataloader, loss_img, loss_txt, device):
    model.eval()
    total_loss_img = 0.0
    total_loss_txt = 0.0
    correct_img = 0
    correct_txt = 0
    total_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            images, texts = batch
            images = images.to(device)
            texts = texts.to(device)

            # Forward pass
            logits_per_image, logits_per_text = model(images, texts)

            # Compute loss
            ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
            loss_img_val = loss_img(logits_per_image, ground_truth)
            loss_txt_val = loss_txt(logits_per_text, ground_truth)

            # Accumulate loss
            total_loss_img += loss_img_val.item()
            total_loss_txt += loss_txt_val.item()

            # Accuracy calculation
            predicted_img = torch.argmax(logits_per_image, 1)
            predicted_txt = torch.argmax(logits_per_text, 1)
            correct_img += (predicted_img == ground_truth).sum().item()
            correct_txt += (predicted_txt == ground_truth).sum().item()

            total_samples += len(images)

    avg_loss_img = total_loss_img / len(dataloader)
    avg_loss_txt = total_loss_txt / len(dataloader)
    accuracy_img = correct_img / total_samples
    accuracy_txt = correct_txt / total_samples

    return avg_loss_img, avg_loss_txt, accuracy_img, accuracy_txt

In [23]:
# Create valid dataloader
json_path = '/content/Indofashionclip/val_data.json'
image_path = '/content/Indofashionclip/images/val/'

with open(json_path, 'r') as f:
    input_data = []
    for line in f:
        obj = json.loads(line)
        input_data.append(obj)

list_image_path = []
list_txt = []
for item in input_data:
  img_path = image_path + item['image_path'].split('/')[-1]
  caption = item['product_title'][:40]
  list_image_path.append(img_path)
  list_txt.append(caption)

dataset = image_title_dataset(list_image_path, list_txt)
val_dataloader = DataLoader(dataset, batch_size=256, shuffle=True) # Define valid dataloader

In [29]:
# Validation
val_loss_img, val_loss_txt, val_acc_img, val_acc_txt = evaluate(model, val_dataloader, loss_img, loss_txt, device)
print(f"Validation Loss - Image: {val_loss_img:.4f}, Text: {val_loss_txt:.4f}")
print(f"Validation Accuracy - Image: {val_acc_img:.4f}, Text: {val_acc_txt:.4f}")

Validation Loss - Image: 3.5857, Text: 3.6437
Validation Accuracy - Image: 0.1857, Text: 0.1735


In [30]:
# Create test dataloader
json_path = '/content/Indofashionclip/test_data.json'
image_path = '/content/Indofashionclip/images/test/'

with open(json_path, 'r') as f:
    input_data = []
    for line in f:
        obj = json.loads(line)
        input_data.append(obj)

list_image_path = []
list_txt = []
for item in input_data:
  img_path = image_path + item['image_path'].split('/')[-1]
  caption = item['product_title'][:40]
  list_image_path.append(img_path)
  list_txt.append(caption)

dataset = image_title_dataset(list_image_path, list_txt)
test_dataloader = DataLoader(dataset, batch_size=256, shuffle=True) # Define test dataloader

In [31]:
# Validation
test_loss_img, test_loss_txt, test_acc_img, test_acc_txt = evaluate(model, test_dataloader, loss_img, loss_txt, device)
print(f"Validation Loss - Image: {test_loss_img:.4f}, Text: {test_loss_txt:.4f}")
print(f"Validation Accuracy - Image: {test_acc_img:.4f}, Text: {test_acc_txt:.4f}")

Validation Loss - Image: 3.5899, Text: 3.6337
Validation Accuracy - Image: 0.1817, Text: 0.1661


In [44]:
import torch, gc
gc.collect()
torch.cuda.empty_cache() # To empty cuda memory