In [None]:
# Single experiment configuration consumed by the data, model, training,
# evaluation, logging and debug helpers defined in the cells below.
cfg = {
    # dataset location/statistics and dataloader settings
    'data': {
        'dataset': {
            # alternative location: '/datasets/homeworks/cv segmentation/'
            'root_path': '/kaggle/input/coco-2017-dataset/coco2017',
            'nb_train_images': 562331,
            'nb_val_images': 36334,
            'nb_classes': 80,
        },
        'dataloader': {
            'batch_size': 16,
        },
    },

    # optimisation hyper-parameters and training length
    'train': {
        'optimizer': {
            'lr': 1e-3,
            'weight_decay': 1e-4,
        },
        'epochs': 100,
    },

    # which evaluations to run and when
    'eval': {
        'evaluate_on_train_data': False,
        'evaluate_before_training': False,
    },

    # metric logging and checkpoint saving/restoring
    'logging': {
        'log_metrics': True,
        'experiment_name': 'coco_classification',
        'checkpoints_dir': 'checkpoints/',
        'save_model': True,
        'load_model': False,
        'epoch_to_load': 20,
        'save_frequency': 1,
    },

    # sanity checks that can be run before the real training
    'debug': {
        # dump a few batches to disk to inspect them visually
        'save_batch': {
            'enable': False,
            'nrof_batches_to_save': 5,
            'path_to_save': 'batches_images/',
        },
        # try to reach 100% accuracy on a single batch
        'overfit_on_batch': {
            'enable': False,
            'nb_iters': 1000,
        },
    },
}

In [None]:
""" data/coco_80_dataset.py """

from torch.utils import data
from PIL import Image
from pycocotools.coco import COCO
from torchvision import transforms
from pprint import pprint
from torch.utils.data import DataLoader
import os


class CocoClsDataset(data.Dataset):
    """
    Classification dataset made of per-object crops of COCO images.

    Every usable annotation becomes one sample: its bounding box is cropped
    out of the image and labelled with the annotation category, remapped to a
    contiguous index following the order of the categories in the annotation file.

    :param root_dir: dataset root containing the annotation file and the image directory
    :param ann_file: COCO annotation json, relative to ``root_dir``
    :param img_dir: image directory, relative to ``root_dir``
    :param phase: 'train' adds a random horizontal flip; any other value uses
        the deterministic resize/normalize transform only
    :param less_sample: if True and ``phase == 'train'``, keep at most
        ``max_per_class`` annotations per category
    :param max_per_class: per-category cap used when ``less_sample`` is active
    """

    def __init__(self, root_dir, ann_file, img_dir, phase, less_sample=False, max_per_class=20000):
        self.ann_file = os.path.join(root_dir, ann_file)
        self.img_dir = os.path.join(root_dir, img_dir)
        self.coco = COCO(self.ann_file)
        self.dataset_type = phase
        self.max_per_class = max_per_class

        # Both phases share resize + normalisation; only training is augmented.
        steps = [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]
        if phase == 'train':
            steps.insert(0, transforms.RandomHorizontalFlip())
        self.transform = transforms.Compose(steps)

        categories = self.coco.dataset['categories']
        self.id2cat = {category['id']: category['name'] for category in categories}
        # COCO category ids are mapped to contiguous labels in file order.
        self.id2label = {category['id']: label for label, category in enumerate(categories)}
        self.label2id = {v: k for k, v in self.id2label.items()}

        self.ann_ids = [
            ann_id for ann_id in self.coco.getAnnIds()
            if self._is_valid_ann(self.coco.loadAnns([ann_id])[0])
        ]

        self._cal_num_dict()

        if phase == 'train' and less_sample:
            self.ann_ids = self._mining_sample()

        print('total_length of dataset:', len(self))

    @staticmethod
    def _is_valid_ann(ann):
        """Return True for non-crowd annotations with positive area and a box of at least 1x1 px."""
        _, _, w, h = ann['bbox']
        return bool(ann['area'] > 0 and int(w) >= 1 and int(h) >= 1 and not ann['iscrowd'])

    @staticmethod
    def _crop_box(bbox):
        """
        Convert an ``[x, y, w, h]`` box into the ``(left, upper, right, lower)`` box used by PIL.

        PIL treats ``right``/``lower`` as exclusive, so they are ``x + w`` and
        ``y + h``; subtracting one would drop a pixel row/column and produce an
        empty crop for boxes that are one pixel wide or high.
        """
        x, y, w, h = (int(v) for v in bbox)
        return x, y, x + w, y + h

    def _cal_num_dict(self):
        """Store in ``self.num_dict`` the number of kept annotations per category name."""
        self.num_dict = {}
        for ann_id in self.ann_ids:
            ann = self.coco.loadAnns([ann_id])[0]
            cat = self.id2cat[ann['category_id']]
            self.num_dict[cat] = self.num_dict.get(cat, 0) + 1

    def _mining_sample(self):
        """
        Cap the number of annotations per category at ``self.max_per_class``.

        Annotations are kept in their current order; ``self.num_dict`` is
        recomputed to reflect the capped counts.

        :return: list of annotation ids that survive the cap
        """
        self.num_dict = {}
        kept_ann_ids = []
        for ann_id in self.ann_ids:
            ann = self.coco.loadAnns([ann_id])[0]
            cat = self.id2cat[ann['category_id']]
            num = self.num_dict.get(cat, 0)
            if num >= self.max_per_class:
                continue
            self.num_dict[cat] = num + 1
            kept_ann_ids.append(ann_id)
        return kept_ann_ids

    def _load_bg_anns(self):
        """
        Parse background boxes from ``self.bg_bboxes_file``.

        NOTE(review): ``bg_bboxes_file`` is not assigned anywhere in this class,
        so the attribute has to be set by the caller before using this method.

        The file is read as records of a ``<file_name> <num>`` header line
        followed by ``num`` lines of box coordinates; parsing stops at the
        first blank line. The third and fourth values of each box are replaced
        by ``value - first + 1`` and ``value - second + 1`` respectively
        (presumably corner coordinates turned into width/height — confirm
        against the file producer).

        :return: list of dicts with ``file_name`` and ``bbox`` keys
        :raises FileNotFoundError: if the file does not exist
        """
        if not os.path.exists(self.bg_bboxes_file):
            raise FileNotFoundError(self.bg_bboxes_file)
        bg_anns = []
        with open(self.bg_bboxes_file, 'r') as f:
            line = f.readline()
            while line:
                if line.strip() == '':
                    break
                file_name, num = line.strip().split()
                for _ in range(int(num)):
                    bbox = [float(i) for i in f.readline().strip().split()]
                    w = bbox[2] - bbox[0] + 1
                    h = bbox[3] - bbox[1] + 1
                    bbox[2], bbox[3] = w, h
                    bg_anns.append(dict(file_name=file_name, bbox=bbox))
                line = f.readline()
        return bg_anns

    def __len__(self):
        """Number of kept annotations (one sample per annotation)."""
        return len(self.ann_ids)

    def __getitem__(self, idx):
        """
        Load the crop for the ``idx``-th kept annotation.

        :param idx: index into the kept annotation ids
        :return: ``(img, label)`` where ``img`` is the output of ``self.transform``
            and ``label`` is the contiguous category index
        :raises RuntimeError: if the transform fails on the crop
        """
        ann = self.coco.loadAnns([self.ann_ids[idx]])[0]
        label = self.id2label[ann['category_id']]

        img_meta = self.coco.loadImgs(ann['image_id'])[0]
        img_path = os.path.join(self.img_dir, img_meta['file_name'])

        # convert() returns a new, fully loaded image, so the file can be closed right away.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert('RGB')
        box = self._crop_box(ann['bbox'])
        img = img.crop(box)

        try:
            img = self.transform(img)
        except Exception as err:
            # Surface the offending sample instead of terminating the process
            # with a success exit code.
            raise RuntimeError(
                f'failed to transform crop {box} of {img_path} '
                f'(mode={img.mode}, size={img.size})'
            ) from err
        return img, label


def get_data(cfg):
    """
    Builds the COCO crop datasets and wraps them into dataloaders.
    The train loader is shuffled, the test loader (val2017 split) is not.
    :param cfg: cfg['data'] part of config
    :return: train, test dataloaders
    """
    splits = (
        # (title, annotation file, image dir, phase, shuffle)
        ('train', 'annotations/instances_train2017.json', 'train2017', 'train', True),
        ('test', 'annotations/instances_val2017.json', 'val2017', 'val', False),
    )
    loaders = []
    for title, ann_file, img_dir, phase, shuffle in splits:
        print(f'Getting {title} set...')
        dataset = CocoClsDataset(root_dir=cfg['dataset']['root_path'],
                                 ann_file=ann_file,
                                 img_dir=img_dir,
                                 phase=phase,
                                 less_sample=True)
        print('length: ', len(dataset))
        pprint(dataset.num_dict)
        loaders.append(DataLoader(dataset,
                                  batch_size=cfg['dataloader']['batch_size'],
                                  shuffle=shuffle))

    train_dl, test_dl = loaders
    return train_dl, test_dl


In [None]:
""" models/resnet50.py """

import torchvision
import torch


def get_model(cfg):
    """
    Builds a resnet-50 whose classifier outputs cfg['data']['dataset']['nb_classes'] logits.
    The returned module gets a ``features`` trunk, a ``features_pooling`` layer,
    a new ``fc`` head and a replaced ``forward`` chaining these three.
    :param cfg: config
    :return: pretrained on ImageNet resnet-50 model
    """
    print('Getting model...')
    net = torchvision.models.resnet50(pretrained=True)

    # convolutional trunk: everything up to (and including) the last residual stage
    trunk = (net.conv1, net.bn1, net.relu, net.maxpool,
             net.layer1, net.layer2, net.layer3, net.layer4)
    net.features = torch.nn.Sequential(*trunk)
    net.sz_features_output = 2048
    net.features_pooling = torch.nn.AvgPool2d(7, stride=1, padding=0, ceil_mode=True, count_include_pad=True)

    nb_classes = cfg['data']['dataset']['nb_classes']
    net.fc = torch.nn.Linear(net.sz_features_output, nb_classes)

    def forward(x):
        pooled = net.features_pooling(net.features(x))
        flat = pooled.view(pooled.size(0), -1)
        return net.fc(flat)

    # the instance attribute shadows the class-level forward
    net.forward = forward
    return net

In [None]:
""" utils/debug_utils.py """

import torchvision
import torch
import numpy as np


def overfit_on_batch(cfg_overfit_on_batch, cfg_train, train_dl, model, optimizer, criterion):
    """
    Overfits on one batch: repeatedly trains on the first batch of ``train_dl``
    and stops early once accuracy has been 100% for 5 consecutive iterations.
    Runs on CUDA when available and on CPU otherwise.
    :param cfg_overfit_on_batch: cfg['debug']['overfit_on_batch'] part of config
    :param cfg_train: cfg['train'] part of config (not used here)
    :param train_dl: train dataloader
    :param model: resnet50 model
    :param optimizer: optimizer
    :param criterion: criterion
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    images, labels = next(iter(train_dl))
    # the batch never changes, so it is moved to the device only once
    images = images.to(device)
    model = model.to(device)
    accuracies = []

    for iter_ in range(cfg_overfit_on_batch['nb_iters']):
        optimizer.zero_grad()
        # loss and accuracy are computed on CPU, where the labels live
        logits = model(images).cpu()
        loss = criterion(logits, labels)
        _, predicted = torch.max(logits.detach(), 1)
        accuracy = torch.sum(predicted == labels).item() / labels.size(0) * 100
        print(f'iter: {iter_}, acc: {accuracy}, loss: {loss.item()}')

        accuracies.append(accuracy)
        if len(accuracies) >= 5 and min(accuracies[-5:]) == 100:
            break

        loss.backward()
        optimizer.step()
    print('Overfitting on batch is finished.')


def save_batch_images(cfg, train_dl, valid_dl):
    """
    Saves several batches of images as .png file
    Files are named ``<path_to_save><dataset_type>_batch_<i>.png``; the target
    directory is created if it does not exist. If a dataloader yields fewer
    batches than requested, only the available ones are saved.
    :param cfg: cfg['debug']['save_batch'] part of config
    :param train_dl: train dataloader to saves batches from
    :param valid_dl: valid dataloader to saves batches from
    """
    import itertools
    import os

    # save_image does not create missing directories
    out_dir = os.path.dirname(cfg['path_to_save'])
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for dl in [train_dl, valid_dl]:
        dataset_type = dl.dataset.dataset_type
        print(dataset_type)
        # islice stops quietly when the loader runs out of batches
        first_batches = itertools.islice(dl, cfg['nrof_batches_to_save'])
        for i, (images, labels) in enumerate(first_batches):
            print(f'batch {i} labels: {labels}')
            torchvision.utils.save_image(images, cfg['path_to_save'] + f'{dataset_type}_batch_{i}.png')

In [None]:
""" utils/eval_utils.py """

import time
import numpy as np
import torch

# from utils.log_utils import log_metrics


def evaluate(cfg_train, cfg_logging, model, dl, epoch, dataset_type, criterion):
    """
    Evaluates on train/valid data: logs the mean batch loss and the accuracy (in percent).
    Runs on CUDA when available and on CPU otherwise. The model mode is not
    changed here; the caller is expected to switch to eval mode beforehand.
    :param cfg_train: cfg['train'] part of config (not used here)
    :param cfg_logging: cfg['logging'] part of config
    :param model: resnet-50 model
    :param dl: train/valid dataloader
    :param epoch: epoch for logging
    :param dataset_type: type of current data ('train' or 'valid')
    :param criterion: loss function applied to (logits, labels)
    """
    print(f'Evaluating on {dataset_type} data...')
    eval_start_time = time.time()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    correct, total = 0, 0
    losses = []
    model = model.to(device)

    dl_len = len(dl)
    with torch.no_grad():
        for i, (images, labels) in enumerate(dl):
            images, labels = images.to(device), labels.to(device)

            if i % 50 == 0:
                print(f'iter: {i}/{dl_len}')

            logits = model(images)
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            # accumulate plain ints so that an empty dataloader is handled too
            correct += torch.sum(predicted == labels).item()

            # calculate losses
            losses.append(criterion(logits, labels).item())

    if total == 0:
        # nothing to average: avoid logging NaN and dividing by zero
        print(f'No samples in {dataset_type} data, evaluation skipped.')
        return

    log_metrics([f'{dataset_type}_eval/total_loss'], [np.mean(losses)], epoch, cfg_logging)

    accuracy = 100 * correct / total
    print(f'Accuracy on {dataset_type} data: {accuracy}')

    log_metrics([f'{dataset_type}_eval/accuracy'], [accuracy], epoch, cfg_logging)
    print(f'Evaluating time: {round((time.time() - eval_start_time) / 60, 3)} min')

In [None]:
""" utils/log_utils.py """


import mlflow


def start_logging(cfg, experiment_name=None):
    """
    Starts mlflow logging (no-op when cfg['log_metrics'] is falsy)
    :param cfg: cfg['logging'] part of config
    :param experiment_name: experiment name for mlflow visualization;
        defaults to cfg['experiment_name'] when None
    """
    if cfg['log_metrics']:
        # cfg is already the logging section, which holds 'experiment_name' itself
        experiment_name = cfg['experiment_name'] if experiment_name is None else experiment_name
        mlflow.start_run(run_name=experiment_name)


def end_logging(cfg):
    """
    Closes the active mlflow run; does nothing when logging is disabled
    :param cfg: cfg['logging'] part of config
    """
    if not cfg['log_metrics']:
        return
    mlflow.end_run()


def log_metrics(names, metrics, step, cfg):
    """
    Sends each (name, metric) pair to mlflow at the given step;
    does nothing when logging is disabled
    :param names: list of names of given metrics
    :param metrics: list of given metrics
    :param step: step to log
    :param cfg: cfg['logging'] part of config
    """
    if not cfg['log_metrics']:
        return
    for metric_name, metric_value in zip(names, metrics):
        mlflow.log_metric(metric_name, metric_value, step)


def log_params(cfg):
    """
    Stores the given config as a single mlflow param named 'cfg';
    does nothing when logging is disabled
    :param cfg: cfg['logging'] part of config
    """
    if not cfg['log_metrics']:
        return
    mlflow.log_param('cfg', cfg)

In [None]:
""" utils/train_utils.py """

import torch


def get_optimizer(cfg, model):
    """
    Creates an Adam optimizer over all model parameters
    :param cfg: cfg['train']['optimizer'] part of config
    :param model: resnet-50 model
    :return: optimizer
    """
    print('Getting optimizer...')
    hyperparams = {'lr': cfg['lr'], 'weight_decay': cfg['weight_decay']}
    return torch.optim.Adam(model.parameters(), **hyperparams)


def get_criterion():
    """
    Creates the cross-entropy loss used for training and evaluation
    :return: loss function
    """
    print('Getting criterion...')
    return torch.nn.CrossEntropyLoss()


def make_training_step(cfg_train, batch, model, criterion, optimizer):
    """
    Makes single parameters updating step.
    Runs on CUDA when available and on CPU otherwise.
    :param cfg_train: cfg['train'] part of config (not used here)
    :param batch: current batch, an (images, labels) pair
    :param model: resnet50 model
    :param criterion: criterion
    :param optimizer: optimizer
    :return: current loss value
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    images, labels = batch
    images, labels = images.to(device), labels.to(device)
    # moving the module is done in place and is a no-op once it is on the device
    model = model.to(device)

    optimizer.zero_grad()
    logits = model(images)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
""" trainer/main.py """

import torch
import time
import numpy as np

# from models.resnet50 import get_model
# from data.coco_80_dataset import get_data
# from utils.train_utils import get_optimizer, get_criterion, make_training_step
# from utils.eval_utils import evaluate
# from utils.debug_utils import save_batch_images, overfit_on_batch
# from utils.log_utils import start_logging, end_logging, log_metrics, log_params
# from configs.config import cfg


def _evaluate_splits(cfg, model, train_dl, test_dl, epoch, criterion):
    """Evaluate in eval mode (train data only if enabled in cfg['eval']), then restore train mode."""
    model.eval()
    if cfg['eval']['evaluate_on_train_data']:
        evaluate(cfg['train'], cfg['logging'], model, train_dl, epoch, 'train', criterion)
    evaluate(cfg['train'], cfg['logging'], model, test_dl, epoch, 'valid', criterion)
    model.train()


def _restore_checkpoint(cfg_logging, model, optimizer):
    """Load model (and optimizer, if stored) state; return (start_epoch, global_step)."""
    epoch_to_load = cfg_logging['epoch_to_load']
    print(f'Trying to load checkpoint from epoch {epoch_to_load}...')
    # The model is moved first so that the restored optimizer state ends up on
    # the same device as the parameters it belongs to.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    checkpoint = torch.load(cfg_logging['checkpoints_dir'] + f'checkpoint_{epoch_to_load}.pth',
                            map_location=device)
    model.load_state_dict(checkpoint['model'])
    # checkpoints store the optimizer under 'opt'; restore it so that the
    # optimizer statistics are not reset on resume
    if 'opt' in checkpoint:
        optimizer.load_state_dict(checkpoint['opt'])
    print(f'Successfully loaded checkpoint from epoch {epoch_to_load}.')
    return checkpoint['epoch'] + 1, checkpoint['global_step'] + 1


def train(cfg, train_dl, test_dl, model, optimizer, criterion):
    """
    Runs the whole training procedure: optional debug checks, optional
    checkpoint restoring, optional evaluation before training, then for every
    epoch: training, checkpoint saving and evaluation.
    :param cfg: full config
    :param train_dl: train dataloader
    :param test_dl: test dataloader
    :param model: resnet50 model
    :param optimizer: optimizer
    :param criterion: criterion
    """
    import os

    # check data before training
    if cfg['debug']['save_batch']['enable']:
        save_batch_images(cfg['debug']['save_batch'], train_dl, test_dl)

    # check training procedure before training
    if cfg['debug']['overfit_on_batch']['enable']:
        overfit_on_batch(cfg['debug']['overfit_on_batch'], cfg['train'], train_dl, model, optimizer, criterion)

    # save experiment name and experiment params to mlflow
    start_logging(cfg['logging'], experiment_name='baseline')
    try:
        log_params(cfg['logging'])

        global_step, start_epoch = 0, 0
        if cfg['logging']['load_model']:
            start_epoch, global_step = _restore_checkpoint(cfg['logging'], model, optimizer)

        # evaluate on train and test data before training
        if cfg['eval']['evaluate_before_training']:
            _evaluate_splits(cfg, model, train_dl, test_dl, -1, criterion)

        # training loop
        for epoch in range(start_epoch, cfg['train']['epochs']):
            losses = []
            epoch_start_time = time.time()
            print(f'Epoch: {epoch}')
            for iter_, batch in enumerate(train_dl):
                loss = make_training_step(cfg['train'], batch, model, criterion, optimizer)
                losses.append(loss)
                global_step += 1

                log_metrics(['train/loss'], [loss], global_step, cfg['logging'])

                if global_step % 100 == 0:
                    # running mean over the most recent (up to 20) steps
                    mean_loss = np.mean(losses[-20:])
                    print(f'step: {global_step}, total_loss: {mean_loss}')

            # log mean loss per epoch (skipped when the dataloader yielded nothing)
            if losses:
                log_metrics(['train/mean_loss'], [np.mean(losses)], epoch, cfg['logging'])
            print(f'Epoch time: {round((time.time() - epoch_start_time) / 60, 3)} min')

            # save model
            if cfg['logging']['save_model'] and epoch % cfg['logging']['save_frequency'] == 0:
                print('Saving current model...')
                state = {
                    'model': model.state_dict(),
                    'epoch': epoch,
                    'global_step': global_step,
                    'opt': optimizer.state_dict(),
                }
                checkpoint_path = cfg['logging']['checkpoints_dir'] + f'checkpoint_{epoch}.pth'
                # torch.save does not create missing directories
                checkpoint_dir = os.path.dirname(checkpoint_path)
                if checkpoint_dir:
                    os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(state, checkpoint_path)

            # evaluate on train and test data
            _evaluate_splits(cfg, model, train_dl, test_dl, epoch, criterion)
    finally:
        # close the mlflow run even if training is interrupted by an error
        end_logging(cfg['logging'])


def run(cfg):
    """
    Builds dataloaders, model, optimizer and criterion from the config and starts training
    :param cfg: full config
    """
    train_loader, test_loader = get_data(cfg['data'])
    net = get_model(cfg)
    opt = get_optimizer(cfg['train']['optimizer'], net)
    loss_fn = get_criterion()

    train(cfg, train_loader, test_loader, net, opt, loss_fn)


if __name__ == '__main__':
    # run the whole pipeline and report the wall-clock duration in minutes
    start_time = time.time()
    run(cfg)
    elapsed_min = round((time.time() - start_time) / 60, 3)
    print(f'Total time: {elapsed_min} min')