# **Baseline Model Starter Code**


This notebook provides some basic starter code to train on the Herbarium 2022 Kaggle Competition from the competition hosts. 
Specifically, this code provides data loaders for PyTorch and does standard normalization and augmentations on the training data. 
The code then trains for 4 epochs starting from a pretrained ResNet-101 architecture, using cross-entropy as the loss function and SGD with a ReduceLROnPlateau learning-rate scheduler. 
Additionally, training uses gradient accumulation (a "proxy batch size") to increase the effective batch size of each optimization step while using minimal GPU memory. 

This notebook also gives simple code to use a saved model to generate a submission for this competition which obtains ~27 percent accuracy on the public data set. 

Imports

In [None]:
import os
import json
import torch
import pickle
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from torch import optim
from datetime import datetime
from torch.optim import lr_scheduler
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter
from torch.optim.lr_scheduler import MultiStepLR, CosineAnnealingLR,ReduceLROnPlateau
import torchvision
from PIL import Image
from torch.utils import data
import torchvision.transforms as transforms

Define some data util functions and loaders

In [None]:

def generate_img_pth_class(data_dir,split="train"):
    """Turn a split's metadata JSON into two parallel numpy arrays.

    Reads ``<data_dir>/<split>_metadata.json``.

    For ``split == "train"`` the JSON is indexed by ``"annotations"`` and
    ``"images"``; the result is ``(image_paths, category_ids)``.
    For any other split the JSON is treated as a sequence of records with
    ``"image_id"`` and ``"file_name"``; the result is ``(image_paths, image_ids)``
    with paths under ``<data_dir>/test_images``.
    """
    metadata_path = os.path.join(data_dir, "{}_metadata.json".format(split))
    with open(metadata_path) as f:
        metadata = json.load(f)

    print("Generating {} file and class list".format(split))

    # Test-style split: a flat sequence of records, no labels available.
    if split != "train":
        image_paths, image_ids = [], []
        for entry_idx in tqdm(range(len(metadata))):
            entry = metadata[entry_idx]
            image_ids.append(entry['image_id'])
            image_paths.append(os.path.join(data_dir, "test_images", entry["file_name"]))
        return np.array(image_paths), np.array(image_ids)

    # Train split: annotations and images are walked in lockstep by position.
    annotations = metadata["annotations"]
    image_paths, labels = [], []
    for entry_idx in tqdm(range(len(annotations))):
        annotation = annotations[entry_idx]
        image = metadata["images"][entry_idx]
        # Both lists must describe the same picture at the same position.
        assert annotation["image_id"] == image["image_id"]
        image_paths.append(os.path.join(data_dir, "train_images", image["file_name"]))
        labels.append(annotation["category_id"])
    return np.array(image_paths), np.array(labels)




def get_loaders(data_dir,batch_size=32,num_workers=2):
    """Build the shuffled training DataLoader and report the label-space size.

    Returns ``(train_loader, num_classes)`` where ``num_classes`` is the
    largest category id found in the training metadata plus one, so that a
    category id can be used directly as an output index.
    """
    # Paths and labels come straight from the train metadata.
    image_paths, labels = generate_img_pth_class(data_dir)
    dataset = HerbariumDataLoader(image_paths, labels)
    loader_options = dict(
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=num_workers,
    )
    train_loader = torch.utils.data.DataLoader(dataset, **loader_options)

    # np.unique returns sorted values, so the last entry is the largest id.
    largest_label = np.unique(labels)[-1]
    return train_loader, largest_label + 1

def get_transforms(train):
    """Return the torchvision preprocessing pipeline for one split.

    Both modes end with ``ToTensor`` followed by per-channel ``Normalize``.
    When ``train`` is truthy, a random horizontal flip and a random rotation
    of up to 15 degrees are applied first.
    """
    # Per-channel statistics; presumably the usual ImageNet values, matching
    # the ImageNet-pretrained backbone used elsewhere in this notebook.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = []
    if train:
        # Augmentations act on the PIL image, before tensor conversion.
        steps.append(transforms.RandomHorizontalFlip())
        steps.append(transforms.RandomRotation(15))
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(channel_mean, channel_std))
    return transforms.Compose(steps)

class HerbariumDataLoader(data.Dataset):
    """Map-style dataset yielding ``(image_tensor, class_label)`` pairs.

    Despite the name this is a ``Dataset``; wrap it in a
    ``torch.utils.data.DataLoader`` to obtain batches.

    Args:
        file_pths: numpy array of image file paths (``.shape[0]`` is used as
            the dataset length).
        class_list: per-image class ids, indexed like ``file_pths``; only read
            when ``train`` is True.
        train: selects the augmenting transform pipeline and whether labels
            are returned.
    """

    def __init__(self, file_pths, class_list, train=True):
        self.is_train_set = train
        self.file_paths = file_pths
        self.classes = class_list
        self.transforms = get_transforms(train)
        self.total_img_count = self.file_paths.shape[0]
        # Every image is resized to image_size x image_size before transforms.
        self.image_size = 380

    def __len__(self):
        """Return the number of images in the dataset."""
        return self.total_img_count

    def __getitem__(self,idx):
        """Load, resize and transform image ``idx``; return it with its label.

        The label is an ``np.int64`` in train mode and ``None`` otherwise.
        NOTE(review): the default DataLoader collate function cannot batch
        ``None``; a non-train instance presumably needs a custom ``collate_fn``.
        """
        # 1. Get class
        class_label = np.int64(self.classes[idx]) if self.is_train_set else None

        # 2. Load image. Force 3-channel RGB so grayscale/RGBA/CMYK files do
        #    not break the 3-channel Normalize step; the context manager
        #    closes the file handle once the pixel data has been copied.
        with Image.open(self.file_paths[idx]) as raw_img:
            img = raw_img.convert("RGB")
        img = img.resize((self.image_size,self.image_size))
        img = self.transforms(img)

        # 3. Return image and class
        return img, class_label


Setup logging for tensorboard, and F1 score

In [None]:
def log_metrics(metric_dict,epoch,title):
    """Write every metric in ``metric_dict`` to the module-level ``logger``.

    Each value is logged with ``add_scalar`` under the tag ``"<title>/<key>"``
    at step ``epoch``.
    """
    for metric_name, metric_value in metric_dict.items():
        tag = "{}/{}".format(title, metric_name)
        logger.add_scalar(tag, metric_value, epoch)

def calculate_f1(preds,labs):
    """Return the support-weighted F1 score of ``preds`` against ``labs``.

    Args:
        preds: tensor of predicted class ids (detached and moved to CPU here).
        labs: tensor of ground-truth class ids.

    Returns:
        The value of ``sklearn.metrics.f1_score`` with ``average="weighted"``.
    """
    preds = preds.detach().cpu().numpy()
    labs = labs.detach().cpu().numpy()

    # scikit-learn expects (y_true, y_pred). With average="weighted" the
    # per-class scores are weighted by the support of the *true* labels, so
    # the argument order matters.
    f1_vals = f1_score(labs,preds,average="weighted")
    return f1_vals


Making train method for single epoch

In [None]:
def train():
    """Run one training epoch with gradient accumulation.

    Uses the module-level ``net``, ``train_data_loader``, ``optimizer``,
    ``loss_fn``, ``device``, ``proxy_batch_size`` and ``epoch``.

    Gradients are accumulated over ``int(proxy_batch_size)`` mini-batches
    before each optimizer step. Gradients from a trailing, incomplete
    accumulation window are applied at the end of the epoch rather than
    leaking into the next one.

    Returns:
        dict with ``"train_avg_loss"`` (mean unscaled loss per mini-batch) and
        ``"class_acc"`` (fraction of correctly classified samples).

    Raises:
        ValueError: if the training loader yields no batches.
    """
    # 1. Set network to be in training mode
    net.train()
    # 2. Setup loggers, and progress bar
    accum_steps = int(proxy_batch_size)
    correct_class = 0
    total_loss = 0.0
    total_num = 0
    num_batches = 0
    pbar = tqdm(enumerate(train_data_loader), total=len(train_data_loader), desc='Train epoch {}'.format(epoch))
    # 3. Iterate through
    for batch_idx, (data, class_lab) in pbar:
        # 3a. Set to correct device
        data, class_lab = data.to(device),class_lab.to(device)
        # 3b. Forward propagate
        class_output  = net(data)
        # 3c. Calculate loss; only the copy used for backprop is scaled, so
        #     the accumulated gradient is the average over the proxy batch
        #     while the logged loss stays a true per-batch value.
        loss_val = loss_fn(class_output, class_lab)
        (loss_val / accum_steps).backward()
        # 3d. Step once a full proxy batch has been accumulated
        if (batch_idx+1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        # 3e. Update loggers
        predicted = class_output.detach().argmax(dim=1)
        correct_class += (predicted == class_lab).sum().item()
        total_loss += loss_val.item()
        total_num += data.shape[0]
        num_batches = batch_idx + 1
        # 3f. Update progress bar
        pbar.set_postfix(avg_acc=float(correct_class)/total_num)

    if num_batches == 0:
        raise ValueError("train_data_loader produced no batches")

    # 4. Apply gradients left over from an incomplete accumulation window
    if num_batches % accum_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

    # 5. Calculate average loss and accuracy and return
    class_acc = float(correct_class)/total_num
    avg_loss = total_loss/num_batches # Average per batch

    return {"train_avg_loss": avg_loss, "class_acc":class_acc}


Method to create pretrained feature network, with untrained classification head

In [None]:
def build_net():
    """Create a ResNet-101 with pretrained features and a fresh classifier head.

    The network is built with the module-level ``num_classes`` outputs. All
    pretrained weights are copied in except those whose name contains
    ``"fc"``, so the final fully connected layer keeps its fresh
    initialisation.
    """
    source_weights = torchvision.models.resnet101(pretrained=True).state_dict()
    net = torchvision.models.resnet101(pretrained=False,num_classes=num_classes)

    # Filter out unnecessary keys: drop the classifier head, whose shape
    # depends on the number of classes the pretrained model was built with.
    backbone_weights = source_weights.copy()
    for weight_name in list(backbone_weights):
        if "fc" in weight_name:
            backbone_weights.pop(weight_name)

    # strict=False tolerates the keys that were just removed.
    net.load_state_dict(backbone_weights,strict=False)
    return net


Train Model

In [None]:
# Training driver: seeds RNGs, builds the data loader and network, trains for
# ``num_epochs`` epochs and checkpoints the weights whenever the training
# accuracy improves. Metrics go to tensorboard and to a pickle file.

# 1. Seed all processes
RANDOM_SEED = 1994
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

# 2. Setup device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device: {}".format(device))
if device == "cuda":
    torch.backends.cudnn.benchmark = True

# 3. Setup logging (one timestamped directory per run; checkpoints are saved
#    into the same directory below)
dt_string = datetime.now().strftime("%d%m%Y%H%M%S")
dir_name = "baseline_run/run_id_{}".format(dt_string)
logger = SummaryWriter(log_dir=dir_name)

# 4. Load data
batch_size = 16
# Number of mini-batches accumulated per optimizer step, giving an effective
# batch size of 256 samples.
proxy_batch_size = (256.0/batch_size)
data_dir = "/kaggle/input/herbarium-2022-fgvc9" # PATH TO DATA

train_data_loader, num_classes = get_loaders(data_dir,batch_size=batch_size)

# 5. Load model, using pretrained imagenet weights, and put on device
net = build_net()
net.to(device)

# 6.  Setup optimization, learning rate scheduler, and loss function
num_epochs = 4
optimizer = optim.SGD(net.parameters(), lr=.6,momentum=.9,weight_decay=0.0001,nesterov=True)
lr_schedule = ReduceLROnPlateau(optimizer)
loss_fn = nn.CrossEntropyLoss()


# 7. Start train loop
best_acc = 0
logging_dict = {}
for epoch in range(num_epochs):
    # Train one epoch, get loss and acc
    train_dict = train()
    # Increment scheduler (driven by the training loss; no validation split here)
    lr_schedule.step(train_dict["train_avg_loss"])
    # Log all to tensorboard and logging dict
    log_metrics(train_dict,epoch,"full_exp")
    logging_dict["{}".format(epoch)] = train_dict
    # Checkpoint name encodes the epoch and its training accuracy
    model_name = "epoch_{}_ca_{:.3f}.pth".format(epoch,train_dict["class_acc"])
    print(model_name)
    # Save the weights only when this epoch has the best training accuracy so far
    if train_dict["class_acc"] > best_acc:
        best_model_pth = os.path.join(dir_name,model_name)
        best_acc = train_dict["class_acc"]
        torch.save(net.state_dict(),best_model_pth)





# Close logger, and save logging dictionary; the context manager closes the
# file even if pickling fails.
logger.close()
with open(os.path.join(dir_name,"full_logs.pkl"), 'wb') as output:
    pickle.dump(logging_dict, output)

Now using saved model make submission CSV

In [None]:
def get_test_loaders(data_dir,batch_size=32,num_workers=2):
    """Return a DataLoader over the test split yielding ``(image, image_id)`` batches."""
    # 1. Get test file paths and ids, make data set, and make data loader
    test_file_pths, test_ids = generate_img_pth_class(data_dir,split="test")
    test_data_set = HerbariumTestDataLoader(test_file_pths, test_ids)
    # No shuffling at inference: each id travels with its image, and a fixed
    # order makes the generated submission file reproducible.
    test_loader = torch.utils.data.DataLoader(test_data_set,batch_size=batch_size,shuffle=False,pin_memory=True,num_workers=num_workers)

    return test_loader

class HerbariumTestDataLoader(data.Dataset):
    """Map-style dataset yielding ``(image_tensor, image_id)`` pairs for inference.

    Despite the name this is a ``Dataset``; wrap it in a
    ``torch.utils.data.DataLoader`` to obtain batches.

    Args:
        file_pths: numpy array of image file paths (``.shape[0]`` is used as
            the dataset length).
        id_list: per-image ids, indexed like ``file_pths``; each is converted
            with ``np.int64``.
    """

    def __init__(self, file_pths, id_list):
        self.file_paths = file_pths
        self.ids = id_list
        # Evaluation pipeline: no random augmentation.
        self.transforms = get_transforms(False)
        self.total_img_count = self.file_paths.shape[0]
        # Every image is resized to image_size x image_size before transforms.
        self.image_size = 380

    def __len__(self):
        """Return the number of images in the dataset."""
        return self.total_img_count

    def __getitem__(self,idx):
        """Load, resize and transform image ``idx``; return it with its id."""
        # 1. Get id
        id_label = np.int64(self.ids[idx])

        # 2. Load image. Force 3-channel RGB so grayscale/RGBA/CMYK files do
        #    not break the 3-channel Normalize step; the context manager
        #    closes the file handle once the pixel data has been copied.
        with Image.open(self.file_paths[idx]) as raw_img:
            img = raw_img.convert("RGB")
        img = img.resize((self.image_size,self.image_size))
        img = self.transforms(img)

        # 3. Return image and id
        return img, id_label


Now produce prediction CSV

In [None]:
import csv
# Inference driver: rebuilds the network, loads the best checkpoint from
# training and writes one "Id,Predicted" row per test image.

# 4. Load data
batch_size = 32
data_dir = "/kaggle/input/herbarium-2022-fgvc9" # PATH TO DATA


test_data_loader = get_test_loaders(data_dir,batch_size=batch_size)
num_classes = 15505 # Actually there are only 15501 classes -- but to keep it a simpler 1 to 1 matching consider 4 extra classes
# 5. Build the model, load the trained weights, and put on device
net = build_net()
net.to(device)
MODEL_PTH = best_model_pth # PATH TO SAVED MODEL
# map_location lets the checkpoint load on whichever device is available now.
net.load_state_dict(torch.load(MODEL_PTH, map_location=device))
net.eval()

def write_results(preds,id_lab):
    """Append one ``[id, prediction]`` row per sample to the module-level ``csv_writer``.

    ``preds`` and ``id_lab`` are parallel arrays; ``preds.shape[0]`` rows are written.
    """
    for i in range(preds.shape[0]):
        csv_writer.writerow(['{:d}'.format(int(id_lab[i])),'{:d}'.format(int(preds[i]))])

# newline='' is what the csv module asks for (it avoids blank rows on
# Windows); the context manager closes the file even if inference fails.
with open("sample_submission.csv", 'w', newline='') as output_file:
    csv_writer = csv.writer(output_file, delimiter=',')
    csv_writer.writerow(["Id","Predicted"])
    # Iterate through
    pbar = tqdm(enumerate(test_data_loader), total=len(test_data_loader), desc='Inference')
    # 3. Iterate through
    with torch.no_grad():
        for batch_idx, (data, id_lab) in pbar:
            # put to device
            data = data.to(device)
            # 3b. Forward propagate
            class_output  = net(data)
            # 3c. Get prediction (index of the largest logit per sample)
            predicted = class_output.argmax(dim=1)
            # 3d. write to file
            write_results(predicted.cpu().numpy(),id_lab.numpy())


# Future work
There are many ways to improve this starter code -- we provide some ideas below. 
## Data
The data can be augmented in more robust ways or ways that leverage the natural structure of the problem, and the image size could be increased. 
Additionally, you could leverage the hierarchical structure of the data, for which we provide some starter code below. 
## Model 
The model could be improved by trying out different and bigger CNN and transformer models. Additionally, you could try things like using ArcFace to encourage stronger embeddings for each class/ stronger class separation. Additionally, for this dataset, the use of different types of learning rate schedulers may significantly affect accuracy. 
## Post-processing 
There are many ways to post-process the output of your models using ensemble methods and multiple augmented predictions per image and ways to combine all those predictions creatively given the data set's constraints.




In [None]:
import torch
from PIL import Image
import json
import pandas as pd
import torchvision
import numpy as np
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler

class HerbariumDataset(torch.utils.data.Dataset):
    """Herbarium dataset with optional hierarchical (family, genus, category) labels.

    Args:
        basepath: directory holding ``train_metadata.json`` / ``test_metadata.json``
            and the ``train_images`` / ``test_images`` folders. A trailing
            separator is optional.
        train: read the training split (with labels) when True, otherwise the
            test split.
        hier: when True, ``__getitem__`` returns a 3-element array
            ``[family_id, genus_id, category_id]`` instead of a scalar label.
        transform: callable applied to the loaded PIL image, or ``None``.

    In test mode no labels exist, so label values are ``0`` placeholders
    (a scalar, or three zeros when ``hier`` is True).
    """

    def __init__(self, basepath, train=True, hier=False, transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor()])):
        self.train=train
        self.hier=hier
        self.transform=transform

        split = 'train' if self.train else 'test'
        # os.path.join tolerates a basepath with or without a trailing
        # separator; the empty last component keeps the trailing separator on
        # self.basepath, which is later prepended to file names.
        metadata_path = os.path.join(basepath, split + '_metadata.json')
        self.basepath = os.path.join(basepath, split + '_images', '')

        with open(metadata_path) as f:
            metadata = json.load(f)

        if self.train:
            self.df_images = pd.DataFrame(metadata['images'])
            self.df_annotations = pd.DataFrame(metadata['annotations'])
            # Index images by image_id so annotations can look them up directly.
            self.df_images.index = self.df_images.image_id
            self.df_categories = pd.DataFrame(metadata['categories'])
        else:
            # Test metadata is loaded as one flat table of image records.
            self.df_images = pd.DataFrame(metadata)

        # Category metadata only exists for the training split, so the
        # hierarchy tables are built only then.
        if self.hier and self.train:
            genuses = sorted(set(self.df_categories.genus.values))
            families = sorted(set(self.df_categories.family.values))
            # Sorted names give ids that are stable from run to run.
            genuses2id = {item: i for i, item in enumerate(genuses)}
            families2id = {item: i for i, item in enumerate(families)}
            self.df_categories['genus_id'] = self.df_categories['genus'].map(genuses2id)
            self.df_categories['family_id'] = self.df_categories['family'].map(families2id)
            self.df_categories.index = self.df_categories['category_id']

    def __len__(self):
        """Return the number of rows in the image table."""
        return len(self.df_images)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample at ``index``.

        ``label`` is the category id, or ``np.array([family, genus, category])``
        when ``hier`` is True. In test mode all label values are 0.
        """
        # Placeholders used in test mode, where no annotations exist.
        label = 0
        family = 0
        genus = 0

        if self.train:
            annotation = self.df_annotations.loc[index]
            img_path = self.basepath + self.df_images.loc[annotation.image_id].file_name
            label = annotation.category_id

            if self.hier:
                category = self.df_categories.loc[label]
                family = category.family_id
                genus = category.genus_id
        else:
            img_path = self.basepath + self.df_images.loc[index].file_name

        img = self.image_reader(img_path)
        if self.transform is not None:
            img = self.transform(img)

        if self.hier:
            return img, np.array([family, genus, label])
        return img, label

    def image_reader(self,img_path):
        """Open ``img_path`` and return it as an RGB PIL image."""
        # convert() returns a new, fully loaded image, so the file handle can
        # be closed right away.
        with Image.open(img_path) as raw_img:
            return raw_img.convert('RGB')

    def select_indices(self,indices):
        """Restrict the dataset in place to the annotation rows in ``indices``.

        Reads ``self.df_annotations``, which is only created in training mode.
        """
        self.df_annotations=self.df_annotations.loc[indices]
        self.df_images=self.df_images.loc[self.df_annotations.image_id]
        # Renumber rows from 0 so positional lookups in __getitem__ keep
        # working; the previous index is retained as a column.
        self.df_annotations.reset_index(inplace=True)