#### Download the zip files

In [1]:
!pip install gdown

import gdown

!wget -O images.zip "https://drive.google.com/u/3/uc?id=1U2PljA7NE57jcSSzPs21ZurdIPXdYZtN&export=download&confirm=yes"

url = 'https://drive.google.com/uc?id=11WoM5ZFwWpVjrIvZajW0g8EmQCNKMAWH'
output = 'labels.zip'
gdown.download(url, output, quiet=True)

!unzip 'images.zip'
!unzip 'labels.zip'


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: images/WOMEN-Tees_Tanks-id_00004559-04_7_additional.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-06_1_front.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-06_2_side.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-06_3_back.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-06_7_additional.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-07_1_front.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-07_2_side.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-07_3_back.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-07_4_full.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-07_7_additional.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-08_1_front.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-08_3_back.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-08_7_additional.jpg  
  inflating: images/WOMEN-Tees_Tanks-id_00004560-09_1_front.jpg

#### GPU resources

In [4]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Dec  1 18:24:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    26W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#### Resize all images

In [2]:
import os
import sys

import cv2

# Resize every downloaded image once, up front, and store the result in a
# separate folder that the training dataset reads from.
SRC_DIR = 'images'
DST_DIR = 'images_224x329'
# cv2.resize takes (width, height): the output has 329 rows and 224 columns.
TARGET_SIZE = (224, 329)

img_names = os.listdir(SRC_DIR)

os.makedirs(DST_DIR, exist_ok=True)
count = 0
unreadable = []  # files cv2 could not decode

for img_name in img_names:
    count += 1
    if count % 1000 == 0:
        print(count)
    img = cv2.imread(os.path.join(SRC_DIR, img_name))
    if img is None:
        # cv2.imread returns None instead of raising; without this guard the
        # cv2.resize call below would abort the whole run part-way through.
        unreadable.append(img_name)
        continue
    img = cv2.resize(img, TARGET_SIZE, interpolation=cv2.INTER_AREA)
    cv2.imwrite(os.path.join(DST_DIR, img_name), img)

if unreadable:
    print('Skipped {} unreadable file(s), e.g.: {}'.format(len(unreadable), unreadable[:10]))


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000


#### Torch utilities

In [1]:
import torch, torchvision
import copy
from torchsummary import summary

!pip install -U albumentations

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### DataLoader 

In [12]:
import os
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from PIL import Image 
import pandas as pd
import time

import albumentations as A
from albumentations.pytorch import ToTensorV2

import cv2

#Define Custom Dataset Class
class FashionDataset(Dataset):
    '''
    Dataset yielding (image, labels) pairs for multi-attribute classification.

    Image file names are taken from the first column of the shape annotation
    file. The label vector for a sample is the concatenation of the remaining
    columns of the shape, fabric and pattern annotation files at the same row
    position, so the three files are assumed to be row-aligned.

    args:
        shape_file - path to the space-separated shape annotation file
        fabric_file - path to the space-separated fabric annotation file
        pattern_file - path to the space-separated pattern annotation file
        root_dir - directory containing the image files

    keyword args:
        transform - stored on the instance but NOT applied; the albumentations
                    pipeline returned by transform_data() is used instead

    attributes:
        mode - 'train' selects the augmenting pipeline, any other value selects
               the normalise-only pipeline. The class itself never changes it.
    '''
    def __init__(self, shape_file, fabric_file, pattern_file, root_dir, transform=None):
        super().__init__()
        # NOTE(review): read_csv is called without header=None, so the first
        # line of each file is consumed as column names rather than as a sample.
        # Confirm the annotation files really start with a header row.
        self.shape_annotations = pd.read_csv(shape_file, sep=' ')
        self.fabric_annotations = pd.read_csv(fabric_file, sep=' ')
        self.pattern_annotations = pd.read_csv(pattern_file, sep=' ')

        self.root_dir = root_dir
        self.transform = transform

        self.mode = 'train'

    def __len__(self):
        '''Number of samples, i.e. number of rows in the shape annotations.'''
        return len(self.shape_annotations)

    @staticmethod
    def _label_row(annotations, idx):
        '''Return the label columns (all but the first) of row `idx` as a tensor.'''
        # Slice the label columns first so the row keeps their numeric dtype,
        # and convert through numpy rather than handing torch a pandas Series
        # (which relies on deprecated positional Series indexing).
        return torch.tensor(annotations.iloc[:, 1:].iloc[idx].to_numpy())

    def __getitem__(self, idx):
        '''
        Load sample `idx`.

        returns: (image, labels) where image is the output of the current
                 transform pipeline and labels is the 1-D concatenation of the
                 shape, fabric and pattern labels.

        raises: FileNotFoundError if the image cannot be read.
        '''
        path = os.path.join(self.root_dir, self.shape_annotations.iloc[idx, 0]) # idx: row | 0: column(image name)
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise FileNotFoundError('Could not read image: {}'.format(path))
        if img.ndim == 2:
            # Single-channel file: BGR2RGB would fail, expand to three channels.
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        else:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)    #convert image from BGR to RGB format

        y1 = torch.cat((
            self._label_row(self.shape_annotations, idx),
            self._label_row(self.fabric_annotations, idx),
            self._label_row(self.pattern_annotations, idx),
        ))

        apply_transform = self.transform_data()   #apply augmentations to the data
        image = apply_transform(image = img)['image']

        return image, y1

    def transform_data(self):
        '''
        Build the albumentations pipeline for the current mode.

        In 'train' mode: random horizontal flip, small shift/rotation and
        colour jitter, followed by normalisation and tensor conversion.
        In any other mode: normalisation and tensor conversion only.
        No resizing is done here; the Resize steps are disabled.
        '''
        if self.mode == 'train':
            transform_data = A.Compose(
                [
                    A.HorizontalFlip(p=0.4),
                    A.ShiftScaleRotate(shift_limit=0.025, scale_limit=0, rotate_limit=15, p=0.5),
                    #randomly change brightness, contrast, and saturation of the image 50% of the time
                    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue = 0, p=0.5),
                    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, p=1),
                    ToTensorV2(p=1),
                ])
        else:     #augmentations during validation and testing
            transform_data = A.Compose(
                [
                    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, p=1),
                    ToTensorV2(p=1),
                ])

        return transform_data

In [13]:
# Loading Data
import shutil
import pandas as pd
import os
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

#from utils.customDataset import FashionDataset

shape_file = './labels/shape/shape_anno_all.txt'
fabric_file = './labels/texture/fabric_ann.txt'
pattern_file = './labels/texture/pattern_ann.txt'

# Define Image Augmentations
# NOTE: FashionDataset stores this object but never applies it; the
# augmentations actually used come from FashionDataset.transform_data().
transform = transforms.Compose([
    transforms.Resize((329, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

dataset = FashionDataset(shape_file, fabric_file, pattern_file, './images_224x329', transform)

# Split into train, val sets.
# The train size is derived from the dataset length so the split keeps working
# if the number of annotated samples changes; the generator makes it repeatable.
VAL_SIZE = 2000
SPLIT_SEED = 0
train, val = torch.utils.data.random_split(
    dataset,
    [len(dataset) - VAL_SIZE, VAL_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Both subsets reference the same FashionDataset object, whose mode is 'train',
# so validation samples would be randomly augmented as well. Point the
# validation subset at a shallow copy (annotation tables are shared) that uses
# the non-augmenting pipeline.
import copy
val.dataset = copy.copy(dataset)
val.dataset.mode = 'val'

train_loader = DataLoader(dataset = train, batch_size = 64, shuffle = True)#, pin_memory = True, num_workers = 8)
val_loader = DataLoader(dataset = val, batch_size = 64, shuffle = False)

# Sanity check: shapes of one training batch
image, label = next(iter(train_loader))
print(image.shape)
print(label.shape)

torch.Size([64, 3, 329, 224])
torch.Size([64, 18])


#### Transformer backbone

In [9]:
import torchvision
# Initialize the Swin-T (tiny) transformer backbone with the torchvision
# 'IMAGENET1K_V1' pretrained weights (downloaded on first use).
backbone = torchvision.models.swin_t(weights='IMAGENET1K_V1')


# Remove the classifier head: replacing it with Identity makes the backbone
# return the features that would otherwise be fed to the head.
backbone.head = torch.nn.Identity()
# Alternative that was tried (dropping the last child module), kept for reference:
#layers = list(backbone.children())
#backbone = nn.Sequential(*list(backbone.children())[:-1])

Downloading: "https://download.pytorch.org/models/swin_t-704ceda3.pth" to /root/.cache/torch/hub/checkpoints/swin_t-704ceda3.pth


  0%|          | 0.00/108M [00:00<?, ?B/s]

In [10]:
# Classifier with one fork per attribute category (18 by default)
class AttributeClassifier(torch.nn.Module):
    '''
    Multi-head linear classifier: one independent Linear layer ("fork") per
    attribute, all fed with the same feature vector.

    keyword args:
        in_features - size of the incoming feature vector (default=768)
        attribute_classes - iterable with the number of classes of each
                            attribute, in output order
                            (default=DEFAULT_ATTRIBUTE_CLASSES)
    '''
    # Number of classes per attribute used when none are given.
    DEFAULT_ATTRIBUTE_CLASSES = (
        6, 5, 4, 3, 5, 3, 3, 3, 5, 8, 3, 3, #Shape Attributes
        8, 8, 8, #Fabric Attributes
        8, 8, 8 #Color Attributes
    )

    def __init__(self, in_features=768, attribute_classes=None) -> None:
        super().__init__()
        if attribute_classes is None:
            attribute_classes = self.DEFAULT_ATTRIBUTE_CLASSES
        # Forks are created in attribute order, so parameter names stay
        # 'forks.<i>.weight' / 'forks.<i>.bias'.
        self.forks = torch.nn.ModuleList([
            torch.nn.Linear(in_features=in_features, out_features=class_count)
            for class_count in attribute_classes
        ])

    def forward(self, x):
        '''Return a list with the output of every fork applied to x, in attribute order.'''
        return [fork(x) for fork in self.forks]

# Model definition
class ClassifierModel(torch.nn.Module):
    '''Full model: the module-level `backbone` followed by an AttributeClassifier.'''

    def __init__(self) -> None:
        super().__init__()
        # Feature extractor taken from the notebook-level `backbone` variable
        self.backbone = backbone
        # Per-attribute classification heads
        self.classifier = AttributeClassifier()

    def forward(self, x):
        '''Run x through the backbone and return the classifier's output for those features.'''
        features = self.backbone(x)
        return self.classifier(features)

model = ClassifierModel()
model.to(device)
summary(model, (3, 329, 224))

# Freeze every parameter of the model ...
model.requires_grad_(False)

# ... then make only the classifier heads trainable again
model.classifier.requires_grad_(True)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 96, 82, 56]           4,704
           Permute-2           [-1, 82, 56, 96]               0
         LayerNorm-3           [-1, 82, 56, 96]             192
         LayerNorm-4           [-1, 82, 56, 96]             192
ShiftedWindowAttention-5           [-1, 82, 56, 96]               0
   StochasticDepth-6           [-1, 82, 56, 96]               0
         LayerNorm-7           [-1, 82, 56, 96]             192
            Linear-8          [-1, 82, 56, 384]          37,248
              GELU-9          [-1, 82, 56, 384]               0
          Dropout-10          [-1, 82, 56, 384]               0
           Linear-11           [-1, 82, 56, 96]          36,960
          Dropout-12           [-1, 82, 56, 96]               0
  StochasticDepth-13           [-1, 82, 56, 96]               0
SwinTransformerBlock-14           [

#### Transformer training code

In [7]:
# Will contain utility functions used for training the model(s)
import torch
import copy
from tqdm import tqdm
from time import sleep

import time

#Training Function
def fit_classifier(model, train_loader, val_loader, optimizer, loss_func, attributes, epochs=10, device='cpu'):
    '''
    fit() function to train a multi-attribute classifier model.

    Each epoch runs a training phase followed by a validation phase. The
    weights of the epoch with the best validation accuracy are copied and
    loaded back into the model before returning.

    args:
        model - the model to be trained; must return a list with one output per attribute
        train_loader - torch.utils.data.Dataloader() for train set
        val_loader - torch.utils.data.Dataloader() for val set
        optimizer - optimization algorithm for weight updates
        loss_func - per-attribute loss, called as loss_func(output, target_column)
        attributes - list with the number of classes of each attribute (forwarded to classifier_loss)

    keyword args:
        epochs - Number of training epochs (default=10)
        device - the device for training (default='cpu')

    returns: (train_loss_history, train_acc_history, val_loss_history, val_acc_history)
        Four lists of Python floats with one entry per epoch. Loss is the mean
        summed loss per sample; accuracy is the fraction of correctly
        predicted attribute labels (between 0 and 1).

    '''

    model = model.to(device)

    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []
    best_acc = 0.
    best_model_wts = None
    epoch_duration = None   # stays None when epochs == 0

    #Iterate epochs
    for epoch in range(epochs):
        print('Training epoch {}/{}...:'.format(epoch+1, epochs))
        epoch_start_time = time.time()
        #Each epoch has a training phase and validation phase
        for phase in ['train','val']:
            is_train = phase == 'train'
            if is_train:
                #Set train mode
                model.train()
                data_loader = train_loader
            else:
                #Set Eval mode
                model.eval()
                data_loader = val_loader

            running_loss = 0.
            running_corrects = 0    # correctly predicted labels, over all attributes
            running_labels = 0      # labels seen (samples x attributes)
            running_samples = 0     # samples seen
            with tqdm(data_loader, unit="batch") as tepoch:
                #Iterate batches
                for inputs, labels in tepoch:
                    tepoch.set_description(f"Epoch {epoch} {phase}")
                    inputs = inputs.to(device)
                    labels = labels.long().to(device)
                    #Set gradient calc on only for training phase
                    with torch.set_grad_enabled(is_train):
                        #one output per attribute
                        outputs = model(inputs)
                        loss = classifier_loss(outputs, labels, loss_func, attributes)
                        preds = classifier_preds(outputs, shape=(inputs.shape[0],labels.shape[1]), device=device)
                        #Do backprop if phase = train
                        if is_train:
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()
                    batch_loss = loss.item() * inputs.size(0)
                    running_loss += batch_loss
                    running_corrects += torch.sum(preds == labels).item()
                    running_labels += labels.numel()
                    running_samples += inputs.size(0)
                    tepoch.set_postfix(loss=batch_loss)

            # max(..., 1) only guards against an empty loader
            epoch_loss = running_loss / max(running_samples, 1)
            # Every sample contributes one label per attribute, so normalise by
            # the label count; dividing by the sample count would give values
            # up to the number of attributes instead of a fraction.
            epoch_acc = running_corrects / max(running_labels, 1)
            if is_train:
                train_loss_history.append(epoch_loss)
                train_acc_history.append(epoch_acc)
            else:
                val_loss_history.append(epoch_loss)
                val_acc_history.append(epoch_acc)

                #Saving best model
                if best_model_wts is None or epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

        print('-'*20)
        epoch_duration = time.time() - epoch_start_time

    # Leave the model with its best validation weights rather than the last epoch's
    if best_model_wts is not None:
        model.load_state_dict(best_model_wts)

    print('Best val acc: {}'.format(best_acc))
    if epoch_duration is not None:
        # duration of the last epoch
        print(f"Time taken for an epoch: {epoch_duration}")
    return (train_loss_history, train_acc_history, val_loss_history, val_acc_history)

#Loss function for classifier
def classifier_loss(outputs, targets, loss_func, attributes):
    '''
    Loss function that applies loss_func to each output and sums the results.

    args:
        outputs - a list of outputs where each output corresponds to a vector of predictions
        targets - a tensor of targets where column i holds the class indices for outputs[i]
        loss_func - callable returning the loss for one (output, target column) pair
        attributes - list of class counts per attribute; kept for interface
                     compatibility, the number of terms is given by `outputs`

    returns: a 0-dim tensor with the summed loss, on the device of the individual losses

    '''
    # Collect the per-attribute losses and sum them where they live. Writing
    # them into a pre-allocated CPU buffer sized by `attributes` would copy
    # every term across devices and would add uninitialised entries whenever
    # there are fewer outputs than attributes.
    losses = [loss_func(output, targets[:, index]) for index, output in enumerate(outputs)]
    if not losses:
        return torch.zeros(())
    return torch.stack(losses).sum()

#Utility method to get predictions
def classifier_preds(outputs, shape, device):
    '''
    Utility function that returns predictions for a list of outputs

    args:
        outputs - a list of outputs where each output corresponds to a vector of predictions
        shape - shape of the predictions to return, (batch size, number of attributes)
        device - device on which the prediction tensor is created

    returns: a floating-point tensor of the given shape where column i holds the
             argmax class index of outputs[i]. Columns without a matching
             output are filled with -1, which never equals a class index.
    '''
    # Allocate directly on the target device with a defined fill value instead
    # of creating an uninitialised CPU tensor and moving it afterwards.
    preds = torch.full(shape, -1.0, device=device)
    for index, output in enumerate(outputs):
        preds[:, index] = torch.argmax(output, dim=1)
    return preds



In [14]:
# Training the model
#from utils.train_funcs import fit_classifier, classifier_loss

epochs = 25
learning_rate = 1e-3
# Not passed to fit_classifier (it calls classifier_loss itself); kept so the
# name stays defined for any later cell that refers to it.
loss_func = classifier_loss
# Only the classifier heads are optimised.
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=learning_rate)
# Per-attribute loss that classifier_loss applies to every head.
ce_loss = torch.nn.CrossEntropyLoss()
attribute_classes = [
    6, 5, 4, 3, 5, 3, 3, 3, 5, 8, 3, 3, #Shape Attributes
    8, 8, 8, #Fabric Attributes
    8, 8, 8 #Color Attributes
]

# Call fit_classifier exactly once and keep the histories it returns. Wrapping
# the call in a second fit_classifier(...) would pass the returned tuple as the
# model of another call and fail once training has finished.
train_loss_history, train_acc_history, val_loss_history, val_acc_history = fit_classifier(
    model,
    train_loader = train_loader,
    val_loader = val_loader,
    optimizer = optimizer,
    loss_func = ce_loss,
    attributes = attribute_classes,
    epochs = epochs,
    device = device
)


Training epoch 1/25...:


Epoch 0 train:  24%|██▎       | 149/634 [01:50<05:58,  1.35batch/s, loss=874]


KeyboardInterrupt: ignored