In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-1.17.0-py3-none-any.whl (14.2 MB)
[K     |████████████████████████████████| 14.2 MB 316 kB/s eta 0:00:01    |███████████████████▌            | 8.6 MB 5.4 MB/s eta 0:00:02
Collecting querystring-parser
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gunicorn
  Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 4.6 MB/s  eta 0:00:01
Collecting databricks-cli>=0.8.7
  Downloading databricks-cli-0.14.3.tar.gz (54 kB)
[K     |████████████████████████████████| 54 kB 1.9 MB/s  eta 0:00:01
Collecting alembic<=1.4.1
  Downloading alembic-1.4.1.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 10.5 MB/s eta 0:00:01
Collecting prometheus-flask-exporter
  Downloading prometheus_flask_exporter-0.18.2.tar.gz (22 kB)
Building wheels for collected packages: alembic, databricks-cli, prometheus-flask-exporter
  Building wheel for alembic (setup.py) ... [?25ldone
[

In [2]:
# !pip install neptune
# import neptune
# run = neptune.init(project='railyavaliullina/DL-SOP-classification')

In [3]:
""" config """

cfg = {
    # parameters for dataset and dataloader
    "data":
        {
            "dataset_path": '/kaggle/input/',
            "nb_train_images": 71940,
            "nb_valid_images": 24045,
            "nb_classes": 12,
            "dataloader": {
                "nb_epochs": 20,
                "shuffle": {
                    "train": True,
                    "valid": False
                },
                "batch_size": {
                    "train": 128,
                    "valid": 256
                },
            },
            "augmentation":
                {
                    "sz_crop": 224,
                    "sz_resize": 256,
                    "mean": [0.485, 0.456, 0.406],
                    "std": [0.229, 0.224, 0.225],
                    "contrast": 0.4,
                    "saturation": 0.4,
                    "brightness": 0.4,
                }
        },

    # parameters for ResNet-50 model parts
    "model":
        {
            'pretrained':
                {
                    'load_pretrained': False,
                    'url': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
                    'progress': True
                },

            'FirstConv':
                {
                    'in_channels': 3,
                    'out_channels': 64,
                    'kernel_size': 7,
                    'stride': 2,
                    'padding': 3,
                    'bias': False
                },
            'MaxPool':
                {
                    'kernel_size': 3,
                    'stride': 2,
                    'padding': 1
                },
            'LayersGroup':
                {
                    # BottleNeck class common parameters
                    'BottleNeck':
                        {
                            'kernel_size': [1, 3, 1],
                            'padding': [0, 1, 0],
                            'bias': False,
                            'downsample':
                                {
                                    'kernel_size': 1,
                                    'bias': False
                                }
                        },
                    # layer group specific parameters
                    'layer1': {
                        'in_channels': 64,
                        'out_channels': 64,
                        'nb_layers': 3,
                        'stride': 1
                    },
                    'layer2': {
                        'in_channels': 256,
                        'out_channels': 128,
                        'nb_layers': 4,
                        'stride': 2
                    },
                    'layer3': {
                        'in_channels': 512,
                        'out_channels': 256,
                        'nb_layers': 6,
                        'stride': 2
                    },
                    'layer4': {
                        'in_channels': 1024,
                        'out_channels': 512,
                        'nb_layers': 3,
                        'stride': 2
                    }
                },
            'AvgPool':
                {
                    'output_size': (1, 1)
                },
            'Linear':
                {
                    'in_features': 2048,
                    'out_features': 12,
                    'bias': True
                }
        },

    # parameters for setting up training parameters
    "train":
        {
            # training stage common parameters
            'epochs': 100,

            # optimizer parameters
            'opt':
                {
                    'optim_type': 'SGD',
                    'learning_rate': 0.01,
                    'momentum': 0.9,
                    'weight_decay': 5e-4,
                    'nesterov': True
                }
        },

    # parameters for model evaluation
    "eval":
        {
            'evaluate_on_train_data': False,
            'evaluate_before_training': True,
        },

    # parameters for logging training process, saving/restoring model
    "logging":
        {
            'log_metrics': True,
            'experiment_name': 'baseline',
            'checkpoints_dir': '/kaggle/input/checkpoints/',
            'save_model': True,
            'load_model': False,
            'epoch_to_load': 20,
            'save_frequency': 1,
        },

    # parameters to debug training and check if everything is ok
    "debug":
        {
            # to check batches before training
            "save_batch":
                {
                    "enable": False,
                    "nrof_batches_to_save": 5,
                    "path_to_save": '',
                },
            "overfit_on_batch":
                {
                    "enable": False,
                    "nb_iters": 1000,
                }
        },
}

In [4]:
""" data/dataset.py """

import torch
from torchvision import transforms as transforms_
from torchvision.datasets import ImageFolder
from collections import Counter


class SOPDataset(torch.utils.data.Dataset):
    def __init__(self, cfg, dataset_type):
        """
        Class for getting SOP dataset
        :param cfg: cfg['data'] part of config
        :param dataset_type: type of data ('train' or 'valid')
        """
        cfg_aug = cfg['augmentation']
        self.sz_crop = cfg_aug['sz_crop']
        self.sz_resize = cfg_aug['sz_resize']
        self.mean = cfg_aug['mean']
        self.std = cfg_aug['std']
        self.contrast = cfg_aug['contrast']
        self.saturation = cfg_aug['saturation']
        self.brightness = cfg_aug['brightness']

        self.nb_classes = cfg['nb_classes']
        self.dataset_type = dataset_type
        # directory with all images
        self.dataset_path = cfg['dataset_path'] + "sop-" + dataset_type + "/" + dataset_type + "/"

        if dataset_type == 'train':
            transforms = transforms_.Compose([
                transforms_.RandomResizedCrop(self.sz_crop),
                transforms_.RandomHorizontalFlip(),
                transforms_.ColorJitter(contrast=self.contrast, saturation=self.saturation, brightness=self.brightness),
                transforms_.ToTensor(),
                transforms_.Normalize(
                    mean=self.mean,
                    std=self.std,
                )
            ])
        elif dataset_type == 'valid':
            transforms = transforms_.Compose([
                    transforms_.Resize(self.sz_resize),
                    transforms_.CenterCrop(self.sz_crop),
                    transforms_.ToTensor(),
                    transforms_.Normalize(
                        mean=self.mean,
                        std=self.std,
                    )
                ])
        else:
            raise Exception

        print(f'Creating ImageFolder for {dataset_type} set...')
        self.image_folder = ImageFolder(self.dataset_path, transforms)
        self.image_folder.dataset_type = dataset_type
        self.image_folder.nb_classes = cfg['nb_classes']
        self.image_folder.labels = [sample[1] for sample in self.image_folder.samples]
        self.image_folder.nb_images_per_class = Counter(np.asarray(self.image_folder.labels))

        assert len(self.image_folder) == cfg[f'nb_{dataset_type}_images'], \
            f'Incorrect number of images in {dataset_type} set.'

In [5]:
""" data/dataloader.py """

from torch.utils.data import DataLoader
# from data.dataset import SOPDataset


def get_dataloader(cfg, dataset_type):
    """
    Get dataloader within dataset
    :param cfg: cfg['data'] part of config
    :param dataset: dataset to get dataloader from
    :return: dataLoader
    """
    dataset = SOPDataset(cfg, dataset_type)
    dl = DataLoader(dataset.image_folder,
                    batch_size=cfg['dataloader']['batch_size'][dataset_type],
                    shuffle=cfg['dataloader']['shuffle'][dataset_type])
    return dl

In [6]:
""" models/resnet_model.py """

import torch
import torch.nn as nn
from torch.hub import load_state_dict_from_url


class FirstConv(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        """
        :param cfg: cfg['model']['FirstConv'] part of config
        """
        self.in_channels = cfg['in_channels']
        self.out_channels = cfg['out_channels']
        self.kernel_size = cfg['kernel_size']
        self.stride = cfg['stride']
        self.padding = cfg['padding']
        self.bias = cfg['bias']

        self.conv = nn.Conv2d(in_channels=self.in_channels, out_channels=self.out_channels,
                              kernel_size=self.kernel_size,
                              stride=self.stride, padding=self.padding, bias=self.bias)
        self.bn = nn.BatchNorm2d(self.out_channels)
        self.relu = nn.ReLU(inplace=True)

    def __call__(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x


class Bottleneck(nn.Module):
    def __init__(self, cfg, in_channels, out_channels, stride=1, is_downsampling=False):
        super(Bottleneck, self).__init__()
        """
        :param cfg: cfg['model']['LayersGroup']['BottleNeck'] part of config
        """
        self.kernel_size = cfg['kernel_size']
        self.padding = cfg['padding']
        self.bias = cfg['bias']

        out_channels_2 = 4 * out_channels
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=self.kernel_size[0], padding=self.padding[0],
                               bias=self.bias, stride=stride)
        self.bn1 = nn.BatchNorm2d(out_channels)

        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=self.kernel_size[1], padding=self.padding[1],
                               bias=self.bias)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.conv3 = nn.Conv2d(out_channels, out_channels_2, kernel_size=self.kernel_size[2], padding=self.padding[2],
                               bias=self.bias)
        self.bn3 = nn.BatchNorm2d(out_channels_2)
        self.relu = nn.ReLU(inplace=True)
        if is_downsampling:
            self.downsample = nn.Sequential(*[nn.Conv2d(in_channels, out_channels_2,
                                                        kernel_size=cfg['downsample']['kernel_size'], stride=stride,
                                                        bias=cfg['downsample']['bias']), nn.BatchNorm2d(out_channels_2)])
        else:
            self.downsample = None

    def forward(self, x):
        identity = x
        conv1 = self.relu(self.bn1(self.conv1(x)))
        conv2 = self.relu(self.bn2(self.conv2(conv1)))
        conv3 = self.bn3(self.conv3(conv2))

        if self.downsample is not None:
            identity = self.downsample(x)

        out = conv3 + identity
        return self.relu(out)


class LayersGroup(nn.Module):
    def __init__(self, cfg, name):
        super(LayersGroup, self).__init__()
        """
        :param cfg: cfg['model']['LayersGroup'] part of config
        :param name: name of current layer group
        """
        self.in_channels = cfg[name]['in_channels']
        self.out_channels = cfg[name]['out_channels']
        self.nb_layers = cfg[name]['nb_layers']
        self.stride = cfg[name]['stride']

        self.out_channels_2 = 4 * self.out_channels
        self.layers_group = [Bottleneck(cfg['BottleNeck'], self.in_channels, self.out_channels, stride=self.stride,
                                        is_downsampling=True)]
        for _ in range(1, self.nb_layers):
            self.layers_group.append(Bottleneck(cfg['BottleNeck'], self.out_channels_2, self.out_channels))
        self.layers_group = nn.Sequential(*self.layers_group)


class ResNet50(nn.Module):
    def __init__(self, cfg):
        super(ResNet50, self).__init__()
        """
        Collects all parts of ResNet50 model
        :param cfg: cfg['model'] part of config
        """
        self.conv1 = FirstConv(cfg['FirstConv'])
        self.maxpool = nn.MaxPool2d(kernel_size=cfg['MaxPool']['kernel_size'], stride=cfg['MaxPool']['stride'],
                                    padding=cfg['MaxPool']['padding'])

        self.layer1 = LayersGroup(cfg['LayersGroup'], name='layer1').layers_group
        self.layer2 = LayersGroup(cfg['LayersGroup'], name='layer2').layers_group
        self.layer3 = LayersGroup(cfg['LayersGroup'], name='layer3').layers_group
        self.layer4 = LayersGroup(cfg['LayersGroup'], name='layer4').layers_group

        self.avgpool = nn.AdaptiveAvgPool2d(output_size=cfg['AvgPool']['output_size'])
        self.fc = nn.Linear(in_features=cfg['Linear']['in_features'], out_features=cfg['Linear']['out_features'],
                            bias=cfg['Linear']['bias'])

    def forward(self, x):
        conv1 = self.conv1(x)
        maxpool = self.maxpool(conv1)
        layer1 = self.layer1(maxpool)
        layer2 = self.layer2(layer1)
        layer3 = self.layer3(layer2)
        layer4 = self.layer4(layer3)
        avgpool = self.avgpool(layer4)
        fc = self.fc(torch.flatten(avgpool, 1))
        return fc


def get_model(cfg):
    """
    Gets ResNet-50 model
    :param cfg: cfg['model'] part of config
    :return: ResNet-50 model
    """
    model = ResNet50(cfg)

    nb_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'Trainable parameters number: {nb_trainable_params}')

    if cfg['pretrained']['load_pretrained']:
        print(f'Loading pretrained weights to initialize model...')
        state_dict = load_state_dict_from_url(cfg['pretrained']['url'], progress=cfg['pretrained']['progress'])
        model.load_state_dict(state_dict)
    else:
        print(f'Initializing weights with xavier uniform...')
        model_parameters = model.parameters()
        for i, param in enumerate(model_parameters):
            if len(param.size()) == 4:
                torch.nn.init.xavier_uniform_(param)
    return model

In [7]:
""" utils/debug_utils.py """

import torchvision
import torch
import numpy as np


def overfit_on_batch(cfg_overfit_on_batch, cfg_train, train_dl, model, optimizer, criterion):
    """
    Overfits on one batch
    :param cfg_overfit_on_batch: cfg['debug']['overfit_on_batch'] part of config
    :param train_dl: train dataloader
    :param model: resnet50 model
    :param optimizer: optimizer
    :param criterion: criterion
    """
    train_dl = iter(train_dl)
    images, labels = next(train_dl)
    model = model.cuda()
    accuracies = []

    for iter_ in range(cfg_overfit_on_batch['nb_iters']):
        optimizer.zero_grad()
        logits = model(images.cuda()).cpu()
        # calculate loss
        cross_entropy_loss = criterion(logits, labels)
        l2_reg = torch.tensor(0.0, requires_grad=True)
        # for p in model.named_parameters():
        #     if '.bias' not in p[0] and '.bn' not in p[0]:  # no biases or BN params
        #         l2_reg = l2_reg + cfg_train['opt']['weight_decay'] * p[1].norm(2)
        loss = cross_entropy_loss + l2_reg
        # calculate accuracy
        _, predicted = torch.max(logits.data, 1)
        accuracy = torch.sum(predicted == labels).item() / labels.size(0) * 100
        print(f'iter: {iter_}, acc: {accuracy}, cross_entropy_loss: {cross_entropy_loss.item()}, l2_reg: {l2_reg.item()}, '
              f'total loss: {loss.item()}')

        accuracies.append(accuracy)
        if len(accuracies) >= 5 and np.min(accuracies[-5:]) == 100:
            break

        loss.backward()
        optimizer.step()
    print(f'Overfitting on batch is finished.')


def save_batch_images(cfg, train_dl, valid_dl):
    """
    Saves several batches of images as .png file
    :param cfg: cfg['debug']['save_batch'] part of config
    :param train_dl: train dataloader to saves batches from
    :param valid_dl: valid dataloader to saves batches from
    """
    for dl in [train_dl, valid_dl]:
        dataset_type = dl.dataset.dataset_type
        print(dataset_type)
        dl = iter(dl)
        for i in range(cfg['nrof_batches_to_save']):
            images, labels = next(dl)
            print(f'batch {i} labels: {labels}')
            torchvision.utils.save_image(images, cfg['path_to_save'] + f'{dataset_type}_batch_{i}.png')

In [8]:
""" utils/eval_utils.py """

import time
import numpy as np
import torch

# from utils.log_utils import log_metrics


def evaluate(cfg_train, cfg_logging, model, dl, epoch, dataset_type, criterion):
    """
    Evaluates on train/valid data
    :param cfg_eval: cfg['train'] part of config
    :param cfg_logging: cfg['logging'] part of config
    :param model: resnet-50 model
    :param dl: train/valid dataloader
    :param epoch: epoch for logging
    :param dataset_type: type of current data ('train' or 'valid')
    """
    print(f'Evaluating on {dataset_type} data...')
    eval_start_time = time.time()
    correct, total = 0, 0
    cross_entropy_losses, reg_losses, losses = [], [], []
    unique_labels = np.unique(dl.dataset.labels)
    accuracies_for_classes = [0 for _ in unique_labels]
    model = model.cuda()

    dl_len = len(dl)
    for i, (images, labels) in enumerate(dl):
        images, labels = images.cuda(), labels.cuda()

        if i % 50 == 0:
            print(f'iter: {i}/{dl_len}')

        logits = model(images)
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += torch.sum(predicted == labels)

        for i, l in enumerate(labels):
            accuracies_for_classes[l] += torch.sum((predicted[i] == l))

        # calculate losses
        cross_entropy_loss = criterion(logits, labels)
        cross_entropy_losses.append(cross_entropy_loss.item())
        l2_reg = torch.tensor(0.0, requires_grad=True)
        for p in model.parameters():
            l2_reg = l2_reg + cfg_train['opt']['weight_decay'] * p.norm(2)
        reg_losses.append(l2_reg.item())
        losses.append((cross_entropy_loss + l2_reg).item())

    log_metrics([f'{dataset_type}_eval/cross_entropy_loss', f'{dataset_type}_eval/reg_loss_train',
                 f'{dataset_type}_eval/total_loss_train'],
                [np.mean(cross_entropy_losses), np.mean(reg_losses), np.mean(losses)], epoch, cfg_logging)

    accuracy = 100 * correct.item() / total
    print(f'Accuracy on {dataset_type} data: {accuracy}')
    accuracies_for_classes = [100 * acc.item() / dl.dataset.nb_images_per_class[i] for i, acc in
                              enumerate(accuracies_for_classes)]
    print(f'accuracies for classes: {accuracies_for_classes}')
    balanced_acc = sum(accuracies_for_classes) / dl.dataset.nb_classes
    print(f'Balanced accuracy: {balanced_acc}')

    for i, acc in enumerate(accuracies_for_classes):
        log_metrics([f'{dataset_type}_eval/accuracy_class_{i}'], [acc], epoch, cfg_logging)

    log_metrics([f'{dataset_type}_eval/accuracy', f'{dataset_type}_eval/balanced_accuracy'], [accuracy, balanced_acc],
                epoch, cfg_logging)
    print(f'Evaluating time: {round((time.time() - eval_start_time) / 60, 3)} min')

In [9]:
""" utils/log_utils.py """

import mlflow


def start_logging(cfg, experiment_name=None):
    """
    Starts mlflow logging
    :param cfg: cfg['logging'] part of config
    :param experiment_name: experiment name for mlflow visualization
    """
    if cfg['log_metrics']:
        experiment_name = cfg['train']['experiment_name'] if experiment_name is None else experiment_name
        mlflow.start_run(run_name=experiment_name)


def end_logging(cfg):
    """
    Finishes mlflow logging
    :param cfg: cfg['logging'] part of config
    """
    if cfg['log_metrics']:
        mlflow.end_run()


def log_metrics(names, metrics, step, cfg):
    """
    Logs metrics in given list with corresponding names
    :param names: list of names of given metrics
    :param metrics: list of given metrics
    :param step: step to log
    :param cfg: cfg['logging'] part of config
    """
    if cfg['log_metrics']:
        for name, metric in zip(names, metrics):
            mlflow.log_metric(name, metric, step)


def log_params(cfg):
    """
    Logs experiment config with all parameters
    :param cfg: cfg['logging'] part of config
    """
    if cfg['log_metrics']:
        mlflow.log_param('cfg', cfg)


In [10]:
""" utils/train_utils.py """

import torch


def get_optimizer(cfg, model):
    """
    Gets optimizer for parameters update
    :param cfg: cfg['train']['opt'] part of config
    :param model: ResNet-50 model
    :return: optimizer
    """
    if cfg['optim_type'] == 'SGD':
        opt = torch.optim.SGD(params=model.parameters(),
                              lr=cfg['learning_rate'],
                              momentum=cfg['momentum'],
                              weight_decay=cfg['weight_decay'],
                              nesterov=cfg['nesterov'])
    else:
        raise Exception
    return opt


def get_criterion():
    """
    Gets loss function
    :return: loss function
    """
    criterion = torch.nn.CrossEntropyLoss()
    return criterion


def make_training_step(cfg_train, batch, model, criterion, optimizer):
    """
    Makes single parameters updating step.
    :param cfg_train: cfg['train'] part of config
    :param batch: current batch
    :param model: resnet50 model
    :param criterion: criterion
    :param optimizer: optimizer
    :param iter_: current iteration
    :return: current loss value
    """
    images, labels = batch
    images, labels, model = images.cuda(), labels.cuda(), model.cuda()
    optimizer.zero_grad()
    logits = model(images)
    cross_entropy_loss = criterion(logits, labels)
    l2_reg = torch.tensor(0.0, requires_grad=True)
    # for p in model.named_parameters():
    #     if '.bias' not in p[0] and '.bn' not in p[0]:  # no biases or BN params
    #         l2_reg = l2_reg + cfg_train['opt']['weight_decay'] * p[1].norm(2)
    loss = cross_entropy_loss + l2_reg
    loss.backward()
    optimizer.step()
    return loss.item(), l2_reg.item(), cross_entropy_loss.item()

In [None]:
""" train/main.py """

import time
import numpy as np
import torch
import tarfile
import os

# from data.dataloader import get_dataloader
# from utils.train_utils import get_optimizer, get_criterion, make_training_step
# from utils.eval_utils import evaluate
# from utils.debug_utils import save_batch_images, overfit_on_batch
# from utils.log_utils import start_logging, end_logging, log_metrics, log_params
# from models.resnet_model import get_model
# from configs.config import cfg


def train(cfg, train_dl, valid_dl, model, opt, criterion):

    # check data before training
    if cfg['debug']['save_batch']['enable']:
        save_batch_images(cfg['debug']['save_batch'], train_dl, valid_dl)

    # check training procedure before training
    if cfg['debug']['overfit_on_batch']['enable']:
        overfit_on_batch(cfg['debug']['overfit_on_batch'], cfg['train'], train_dl, model, opt, criterion)

    # save experiment name and experiment params to mlflow
    start_logging(cfg['logging'], experiment_name='baseline')
    log_params(cfg['logging'])

    global_step, start_epoch = 0, 0
    if cfg['logging']['load_model']:
        print(f'Trying to load checkpoint from epoch {cfg["logging"]["epoch_to_load"]}...')
        checkpoint = torch.load(cfg['logging']['checkpoints_dir'] + f'checkpoint_{cfg["logging"]["epoch_to_load"]}.pth')
        load_state_dict = checkpoint['model']
        model.load_state_dict(load_state_dict)
        start_epoch = checkpoint['epoch'] + 1
        global_step = checkpoint['global_step'] + 1
        print(f'Successfully loaded checkpoint from epoch {cfg["logging"]["epoch_to_load"]}.')

    # evaluate on train and test data before training
    if cfg['eval']['evaluate_before_training']:
        model.eval()
        with torch.no_grad():
            if cfg['eval']['evaluate_on_train_data']:
                evaluate(cfg['train'], cfg['logging'], model, train_dl, -1, 'train', criterion)
            evaluate(cfg['train'], cfg['logging'], model, valid_dl, -1, 'valid', criterion)
        model.train()

    nb_iters_per_epoch = len(train_dl.dataset) // train_dl.batch_size

    # training loop
    for epoch in range(start_epoch, cfg['train']['epochs']):
        losses, reg_losses, cross_entropy_losses = [], [], []
        epoch_start_time = time.time()
        print(f'Epoch: {epoch}')
        for iter_, batch in enumerate(train_dl):
            loss, reg_loss, cross_entropy_loss = make_training_step(cfg['train'], batch, model, criterion, opt)
            losses.append(loss)
            reg_losses.append(reg_loss)
            cross_entropy_losses.append(cross_entropy_loss)
            global_step += 1

            log_metrics(['train/loss', 'train/reg_loss', 'train/cross_entropy_loss'], [loss, reg_loss, cross_entropy_loss],
                        global_step, cfg['logging'])

            if global_step % 100 == 0:
                mean_loss = np.mean(losses[:-20]) if len(losses) > 20 else np.mean(losses)
                mean_reg_loss = np.mean(reg_losses[:-20]) if len(reg_losses) > 20 else np.mean(reg_losses)
                mean_cross_entropy_loss = np.mean(cross_entropy_losses[:-20]) if len(cross_entropy_losses) > 20 \
                    else np.mean(cross_entropy_losses)
                print(f'step: {global_step}, total_loss: {mean_loss}, cross_entropy_loss: {mean_cross_entropy_loss}, '
                      f'reg_loss: {mean_reg_loss}')

        # log mean loss per epoch
        log_metrics(['train/mean_loss'], [np.mean(losses[:-nb_iters_per_epoch])], epoch, cfg['logging'])
        print(f'Epoch time: {round((time.time() - epoch_start_time) / 60, 3)} min')

        # save model
        if cfg['logging']['save_model'] and epoch % cfg['logging']['save_frequency'] == 0:
            print('Saving current model...')
            state = {
                'model': model.state_dict(),
                'epoch': epoch,
                'global_step': global_step,
                'opt': opt.state_dict(),
            }
            torch.save(state, f'checkpoint_{epoch}.pth')

        # save mlruns as tarfile
        files_to_save = os.listdir('/kaggle/working/')
        tar = tarfile.open(f'mlruns_epoch_{epoch}.tar.gz', 'w:gz')
        for item in files_to_save:
            if item.startswith('mlruns') and not item.startswith('mlruns_epoch'):
                tar.add(item)
        tar.close()

        # evaluate on train and test data
        model.eval()
        with torch.no_grad():
            if cfg['eval']['evaluate_on_train_data']:
                evaluate(cfg['train'], cfg['logging'], model, train_dl, epoch, 'train', criterion)
            evaluate(cfg['train'], cfg['logging'], model, valid_dl, epoch, 'valid', criterion)
        model.train()

    end_logging(cfg['logging'])


def run(cfg):
    train_dl = get_dataloader(cfg['data'], 'train')
    valid_dl = get_dataloader(cfg['data'], 'valid')

    model = get_model(cfg['model'])
    opt = get_optimizer(cfg['train']['opt'], model)
    criterion = get_criterion()

    # run training
    train(cfg, train_dl, valid_dl, model, opt, criterion)


if __name__ == '__main__':
    start_time = time.time()
    run(cfg)
    print(f'Total time: {round((time.time() - start_time) / 60, 3)} min')

Creating ImageFolder for train set...
Creating ImageFolder for valid set...
Trainable parameters number: 23532620
Initializing weights with xavier uniform...
Evaluating on valid data...
iter: 0/94
iter: 50/94
Accuracy on valid data: 11.524225410688294
accuracies for classes: [0.0, 0.0, 0.0, 0.0, 0.0, 9.328726554787758, 0.0, 94.68280161349469, 0.0, 0.0, 0.0, 0.0]
Balanced accuracy: 8.667627347356872
Evaluating time: 5.252 min
Epoch: 0
step: 100, total_loss: 2.8380133479833605, cross_entropy_loss: 2.8380133479833605, reg_loss: 0.0
step: 200, total_loss: 2.6127919528219437, cross_entropy_loss: 2.6127919528219437, reg_loss: 0.0
step: 300, total_loss: 2.5220432571002416, cross_entropy_loss: 2.5220432571002416, reg_loss: 0.0
step: 400, total_loss: 2.47405909174367, cross_entropy_loss: 2.47405909174367, reg_loss: 0.0
step: 500, total_loss: 2.4401849458614984, cross_entropy_loss: 2.4401849458614984, reg_loss: 0.0
Epoch time: 23.958 min
Saving current model...
Evaluating on valid data...
iter: 