# Cerebro

#### <font color='blue'>Imports</font>

In [None]:
from cerebro.etl.etl_spec import ETLSpec
from cerebro.experiment import Experiment
from cerebro.mop.sub_epoch_spec import SubEpochSpec

### <font color='blue'> Initialize Data Preprocessing </font>

In [None]:
class ImagenetETLSpec(ETLSpec):
    """ETL spec that converts one ImageNet row into a normalized image tensor."""

    def __init__(self):
        # NOTE(review): presumably one flag per input column telling the ETL
        # layer which feature must be downloaded (here only the third) --
        # confirm against the ETLSpec contract.
        self.is_feature_download = [False, False, True, False, False]

    def initialize_worker(self):
        """No per-worker setup is needed for this spec."""
        pass

    def row_preprocessor(self, row, mode, object_dir):
        """Load, resize and normalize the image referenced by ``row``.

        Args:
            row: Mapping with a ``"filepath"`` entry (path relative to
                ``object_dir``) and, outside test mode, a ``"label"`` entry.
            mode: When equal to ``'test'`` no label is read or returned.
            object_dir: Directory that ``row["filepath"]`` is relative to.

        Returns:
            ``(image, label)`` where ``image`` is a float tensor of shape
            ``(3, 112, 112)`` and ``label`` is a tensor built from
            ``row["label"]``, or ``None`` in test mode.
        """
        import torch
        import numpy as np
        from PIL import Image

        input_image_path = object_dir + "/" + str(row["filepath"])

        # Close the underlying file as soon as the pixels have been copied out;
        # otherwise every processed row leaks an open file handle.
        with Image.open(input_image_path) as pil_image:
            image = np.asarray(pil_image.convert('RGB').resize((112, 112)))

        # Scale to [0, 1], then apply per-channel mean/std normalization
        # (channels are the last axis here, so broadcasting lines up).
        image = image / 255.0
        image = image - [0.485, 0.456, 0.406]
        image = image / [0.229, 0.224, 0.225]

        torch_image = torch.from_numpy(image).float()
        # HWC -> CHW must be an axis permutation. A reshape would only
        # reinterpret the flat buffer and scramble pixels across channels.
        image = torch_image.permute(2, 0, 1).contiguous()
        if mode == 'test':
            return image, None

        label = torch.tensor(row["label"])
        return image, label

### <font color='blue'> Initialize Model Building </font>

In [None]:
class ImagenetTrainingSpec(SubEpochSpec):
    """Sub-epoch training/evaluation spec for ResNet-50 / VGG-16 on ImageNet."""

    def __init__(self):
        import torch
        import torch.nn as nn

        use_cuda = torch.cuda.is_available()
        criterion = nn.CrossEntropyLoss()
        # dim=1 is what the implicit-dim fallback picks for 2-D logits; stating
        # it explicitly avoids the deprecation warning.
        log_softmax = nn.LogSoftmax(dim=1)

        self.criterion = criterion.cuda() if use_cuda else criterion
        self.log_softmax = log_softmax.cuda() if use_cuda else log_softmax

    def initialize_worker(self):
        """No per-worker setup is needed for this spec."""
        pass

    def _build_model(self, model_type):
        """Return a freshly initialized torchvision model for ``model_type``.

        Raises:
            ValueError: If ``model_type`` is neither ``"resnet50"`` nor
                ``"vgg16"``.
        """
        from torchvision import models

        if model_type == "resnet50":
            return models.resnet50(pretrained=False)
        if model_type == "vgg16":
            return models.vgg16(pretrained=False)
        raise ValueError(
            "Unsupported model_type %r; expected 'resnet50' or 'vgg16'" % (model_type,)
        )

    def accuracy(self, output, target, topk=(1, ), binary=False):
        """Compute the precision@k for the specified values of k.

        Args:
            output: Score tensor of shape ``(batch, num_classes)``.
            target: Tensor of ``batch`` class indices.
            topk: Values of k to evaluate; each k is effectively capped at
                ``num_classes``.
            binary: When true, ignore ``topk`` and return plain arg-max
                accuracy.

        Returns:
            A list of scalar tensors, one per entry in ``topk`` (a single
            entry when ``binary`` is true), each a fraction in ``[0, 1]``.
        """
        import torch

        batch_size = target.size(0)

        if binary:
            _, pred = torch.max(output.data, 1)
            correct = (pred == target).sum().item()
            return [torch.tensor(correct / batch_size)]

        # Never ask for more predictions than there are classes.
        maxk = min(max(topk), output.shape[1])

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        # Row i of `correct` marks samples whose (i+1)-th best guess is right.
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res

    def train(self, parallelize, save_checkpoint, model_file, train_loader, hyperparams, device, logger):
        """Train for one sub-epoch, resuming from ``model_file`` if it exists.

        Args:
            parallelize: Callable that wraps the model and returns the model
                to train.
            save_checkpoint: Callable that receives a dict with the
                ``"model"`` and ``"optimizer"`` state dicts.
            model_file: Path of a previous checkpoint; loaded when the file
                exists.
            train_loader: Iterable of ``(images, labels)`` batches exposing a
                ``dataset`` attribute with a length.
            hyperparams: Mapping with ``batch_size``, ``learning_rate``,
                ``lambda_value`` (used as weight decay) and ``model_type``.
            device: Device that the model and batches are moved to.
            logger: Callable that receives the results dict, holding
                per-step ``"stats"`` and summed ``"additive_metrics"``.

        Raises:
            ValueError: If ``hyperparams["model_type"]`` is not supported.
        """
        import os
        import math
        import torch

        batch_size = hyperparams["batch_size"]
        learning_rate = hyperparams["learning_rate"]
        lambda_value = hyperparams["lambda_value"]
        model_type = hyperparams["model_type"]
        train_results = {
            "stats": [],
            "additive_metrics": {}
        }

        model = self._build_model(model_type)
        model = model.to(device)
        model = parallelize(model)
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=lambda_value)

        if os.path.isfile(model_file):
            # map_location lets a checkpoint written on another device load here.
            checkpoint = torch.load(model_file, map_location=device)
            model.load_state_dict(checkpoint["model"])
            optimizer.load_state_dict(checkpoint["optimizer"])

        model.train()

        i_step = 0
        total_subepoch_loss = 0
        total_subepoch_top_1_acc = 0
        total_subepoch_top_5_acc = 0
        subepoch_total_step = math.ceil(len(train_loader.dataset) / batch_size)

        for batch in train_loader:
            # as_tensor avoids the copy (and warning) torch.tensor() incurs
            # when the labels already arrive as a tensor.
            images, labels = batch[0].to(device), torch.as_tensor(batch[1]).to(device)
            optimizer.zero_grad()

            outputs = model(images)

            loss = self.criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            outputs_softmax = self.log_softmax(outputs)

            top_1_acc, top_5_acc = self.accuracy(outputs_softmax, labels, (1, 5))

            subepoch_loss = loss.item()
            subepoch_top_1_acc = top_1_acc.item()
            subepoch_top_5_acc = top_5_acc.item()

            total_subepoch_loss += subepoch_loss
            total_subepoch_top_1_acc += subepoch_top_1_acc
            total_subepoch_top_5_acc += subepoch_top_5_acc

            stats_dict = {
                "subepoch_loss": subepoch_loss,
                "subepoch_top_1_acc": subepoch_top_1_acc,
                "subepoch_top_5_acc": subepoch_top_5_acc
            }

            stats = "Train step [%d/%d], subepoch_loss: %.4f, subepoch_top_1_acc: %5.4f, , subepoch_top_5_acc: %5.4f" \
                        % (i_step, subepoch_total_step,subepoch_loss, 
                           subepoch_top_1_acc, subepoch_top_5_acc)

            print("\r" + stats, end="")

            train_results["stats"].append(stats_dict)

            i_step += 1

        train_results["additive_metrics"] = {
            "total_subepoch_loss": total_subepoch_loss,
            "total_subepoch_top_1_acc": total_subepoch_top_1_acc,
            "total_subepoch_top_5_acc": total_subepoch_top_5_acc
        }

        logger(train_results)

        save_checkpoint({"model": model.state_dict(),
                         "optimizer": optimizer.state_dict()
                         })

    def test(self, parallelize, save_checkpoint, model_file, test_loader, hyperparams, device, logger):
        """Evaluate the checkpointed model on ``test_loader`` and log metrics.

        Args:
            parallelize: Callable that wraps the model and returns the model
                to evaluate.
            save_checkpoint: Unused here; kept for a signature parallel to
                ``train``.
            model_file: Path of the checkpoint to evaluate; loaded when the
                file exists.
            test_loader: Iterable of ``(images, labels)`` batches exposing a
                ``dataset`` attribute with a length.
            hyperparams: Mapping with ``batch_size`` and ``model_type``.
            device: Device that the model and batches are moved to.
            logger: Callable that receives the results dict, whose
                ``"stats"`` entry holds the accumulated metrics.

        Raises:
            ValueError: If ``hyperparams["model_type"]`` is not supported.
        """
        import os
        import math
        import torch

        batch_size = hyperparams["batch_size"]
        model_type = hyperparams["model_type"]
        test_results = {
            "stats": []
        }

        model = self._build_model(model_type)
        model = model.to(device)
        model = parallelize(model)

        if os.path.isfile(model_file):
            # map_location lets a checkpoint written on another device load here.
            checkpoint = torch.load(model_file, map_location=device)
            model.load_state_dict(checkpoint["model"])

        model.eval()

        epoch_loss = 0.0
        epoch_top_1_acc = 0.0
        epoch_top_5_acc = 0.0
        subepoch_total_step = math.ceil(len(test_loader.dataset) / batch_size)

        with torch.no_grad():
            for batch_num, batch in enumerate(test_loader, start=1):
                images, labels = batch[0].to(device), torch.as_tensor(batch[1]).to(device)
                outputs = model(images)
                loss = self.criterion(outputs, labels)
                outputs_softmax = self.log_softmax(outputs)
                top_1_acc, top_5_acc = self.accuracy(outputs_softmax, labels, (1, 5))

                # NOTE(review): each batch's metric is divided by its 1-based
                # index before being summed, which is neither a plain total
                # nor a mean. Kept as-is because downstream consumers of the
                # logged values are not visible here -- confirm the intent.
                epoch_loss += (loss.item() / batch_num)
                epoch_top_1_acc += (top_1_acc.item() / batch_num)
                epoch_top_5_acc += (top_5_acc.item() / batch_num)

                stats = "Test step [%d/%d], loss: %.4f, top_1_acc: %5.4f, , top_5_acc: %5.4f" \
                            % (batch_num, subepoch_total_step,epoch_loss, 
                            epoch_top_1_acc, epoch_top_5_acc)

                # Report inside the loop: progress is visible per step and an
                # empty loader no longer hits an undefined `stats`.
                print("\r" + stats, end="")

        test_results["stats"] = {
            "total_epoch_loss": epoch_loss,
            "total_epoch_top_1_acc": epoch_top_1_acc,
            "total_epoch_top_5_acc": epoch_top_5_acc
        }

        logger(test_results)

#### <font color='blue'> Model Building specifications </font>

In [None]:
# Number of passes over the training data requested from run_fit.
num_epochs = 2

# Hyperparameter search space: every key maps to the list of candidate values.
param_grid = dict(
    batch_size=[128, 256],
    learning_rate=[1e-2, 1e-3],
    lambda_value=[1e-3, 1e-4],
    model_type=['vgg16', 'resnet50'],
)

#### <font color='blue'> Initialize Experiment </font>

In [None]:
# Create the Cerebro experiment handle plus one instance of each user-defined
# spec: the ETL (row preprocessing) spec and the training/evaluation spec.
experiment = Experiment()
etl_spec = ImagenetETLSpec()
sub_epoch_spec = ImagenetTrainingSpec()

#### <font color='blue'> Run Data Preprocessing </font>

In [None]:
# Run data preprocessing with the ETL spec.
# NOTE(review): fraction=0.1 presumably limits ETL to 10% of the dataset --
# confirm against the Cerebro Experiment.run_etl documentation.
experiment.run_etl(etl_spec, fraction=0.1)

#### <font color='blue'> Run Model Building </font>

In [None]:
# Launch model building with the training spec, the hyperparameter grid and
# the epoch count.
# NOTE(review): presumably one model is trained per grid combination -- confirm
# against the Cerebro Experiment.run_fit documentation.
experiment.run_fit(sub_epoch_spec, param_grid, num_epochs)