# Data Preprocessing

### <font color='blue'>Cerebro Imports</font>

In [None]:
from cerebro.etl_spec import ETLSpec
from cerebro.experiment import Experiment
from cerebro.sub_epoch_spec import SubEpochSpec

### <font color='blue'> Initialize Data Preprocessing </font>

##### <font color='grey'> Set up data preprocessing workers by installing necessary package dependencies. Only include one-time execution statements, such as downloading nltk's "punkt" library as shown below. </font>

In [None]:
class CocoETLSpec(ETLSpec):
    """ETL specification that converts COCO caption metadata rows to tensors.

    The constructor builds the training vocabulary from the train-split
    captions file; ``row_preprocessor`` uses it to encode each caption.
    """

    def __init__(self):
        from coco_proc.vocabulary import Vocabulary

        self.miscellaneous_path = "/data/cerebro_data_storage/miscellaneous"
        # NOTE(review): presumably one flag per metadata column telling the
        # framework which columns reference files to download -- confirm
        # against the ETLSpec contract.
        self.is_feature_download = [True, False, False, False]

        vocab_threshold = 5
        annotations_file = self.miscellaneous_path + "/captions_train2017.json"
        self.train_vocab = Vocabulary(vocab_threshold, annotations_file=annotations_file, vocab_from_file=False)
        # Maximum number of caption tokens kept per row (excluding the
        # start/end markers added in row_preprocessor).
        self.max_caption_len = 55

    def initialize_worker(self):
        """Run one-time worker setup: fetch nltk's "punkt" tokenizer data.

        The download stays best-effort (a failure does not abort the worker),
        but the failure is logged instead of being silently discarded, and
        KeyboardInterrupt/SystemExit are no longer swallowed.
        """
        try:
            import nltk
            nltk.download("punkt")
        except Exception as err:
            import logging
            logging.getLogger(__name__).warning(
                "Could not download nltk 'punkt' data: %s", err)

    def row_preprocessor(self, row, mode, object_dir):
        """
        Convert a given dataset row to tensor format (suitable for training).
        Data processing is a data parallel map operation, so this same
        row_preprocessor() is called on every row of the dataset.

        Parameters
        ----------
        row : pandas.core.series.Series
            metadata pandas dataframe row; the "file_name" and "captions"
            fields are read
        mode : str
            train/valid/test. "train" applies random crop + horizontal flip,
            every other value applies a deterministic center crop.
        object_dir : str
            Path where all multimedia files will be stored on a node

        Returns
        -------
        image_tensor : torch.Tensor
            Normalized 224x224 crop of the RGB image
        caption_tensor : torch.Tensor
            Long tensor of vocabulary ids with fixed length
            ``max_caption_len + 2``: start id, token ids, end id, then
            end-id padding.

        NOTE(review): no row id is returned here; presumably the framework
        attaches it -- confirm.
        """
        import nltk
        import torch
        from PIL import Image
        from torchvision import transforms

        max_caption_len = self.max_caption_len
        vocab = self.train_vocab

        if mode == "train":
            # Random crop + flip act as data augmentation for training rows.
            crop_steps = [
                transforms.RandomCrop(224),                  # get 224x224 crop from random location
                transforms.RandomHorizontalFlip(),           # horizontally flip image with probability=0.5
            ]
        else:
            crop_steps = [
                transforms.CenterCrop(224),                  # get 224x224 crop from the center
            ]

        img_transform = transforms.Compose(
            [transforms.Resize(256)]                         # smaller edge of image resized to 256
            + crop_steps
            + [
                transforms.ToTensor(),                       # convert the PIL Image to a tensor
                transforms.Normalize((0.485, 0.456, 0.406),  # normalize image for pre-trained model
                                     (0.229, 0.224, 0.225)),
            ])

        # reading input features and converting to tensor
        input_image_path = object_dir + "/" + str(row["file_name"])
        # The context manager closes the underlying file handle; convert()
        # returns an independent in-memory image, so it is safe to use after.
        with Image.open(input_image_path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = img_transform(image)

        # reading output features and converting to tensor
        output_caption = row["captions"]
        tokens = nltk.tokenize.word_tokenize(str(output_caption).lower())
        # Truncate over-long captions: without this a caption with more than
        # max_caption_len tokens produced a longer tensor than every other
        # row, which cannot be stacked into a batch.
        tokens = tokens[:max_caption_len]

        end_id = vocab(vocab.end_word)
        caption = [vocab(vocab.start_word)]
        caption.extend(vocab(token) for token in tokens)
        # One terminating end token plus end-token padding up to the fixed
        # length of max_caption_len + 2.
        caption.extend([end_id] * (1 + max_caption_len - len(tokens)))

        caption_tensor = torch.tensor(caption, dtype=torch.long)

        return image_tensor, caption_tensor

### <font color='blue'> Define the model training and validation functions </font>

In [None]:
class CocoTrainingSpec(SubEpochSpec):
    """Sub-epoch training/validation spec for a CNN encoder + RNN decoder
    image-captioning model on COCO.

    The constructor builds the training vocabulary from the train-split
    captions file; ``train`` and ``test`` both use it to size the decoder.
    """

    def __init__(self):
        self.miscellaneous_path = "/data/cerebro_data_storage/miscellaneous"

        from coco_proc.vocabulary import Vocabulary
        vocab_threshold = 5
        annotations_file = self.miscellaneous_path + "/captions_train2017.json"
        self.train_vocab = Vocabulary(vocab_threshold, annotations_file=annotations_file, vocab_from_file=False)

    def initialize_worker(self):
        """Run one-time worker setup: fetch nltk's "punkt" tokenizer data.

        The download stays best-effort (a failure does not abort the worker),
        but the failure is logged instead of being silently discarded, and
        KeyboardInterrupt/SystemExit are no longer swallowed.
        """
        try:
            import nltk
            nltk.download("punkt")
        except Exception as err:
            import logging
            logging.getLogger(__name__).warning(
                "Could not download nltk 'punkt' data: %s", err)

    def train(self, model_file, train_dataset, config, logging, device):
        """
        Train the captioning model for one sub-epoch and checkpoint it.

        Parameters
        ----------
        model_file : str
            Checkpoint path. If the file exists, encoder/decoder/optimizer
            state is restored from it; the state is always saved back to this
            path at the end of the sub-epoch.
        train_dataset : torch.utils.data.IterableDataset
            Dataset is in the form of Pytorch dataset which can be processed by Pytorch Dataloaders
        config : dict
            Dictionary of hyperparameters. The keys "learning_rate",
            "batch_size", "embed_size" and "hidden_size" are read.
        logging : Logger
            Logger for logging any form of training information
        device : torch.device or str
            Device the models and batches are moved to. It is also passed to
            torch.load as map_location (assumes a torch.device or device
            string -- confirm against the caller).

        Returns
        -------
        ml_metrics : dict
            ``{"stats": [...], "additive_metrics": {...}}`` where "stats"
            holds one ``{"loss", "perplexity"}`` dict per batch and
            "additive_metrics" holds "sub_epoch_total_loss" (sum of batch
            losses).

        """
        # TODO : explain ml_metrics (simplify it?)
        import os
        import time
        import sys
        import math
        import numpy as np
        import torch
        import torch.nn as nn
        from torch.utils.data import DataLoader
        from coco_proc.model import EncoderCNN, DecoderRNN

        vocab = self.train_vocab
        vocab_size = len(vocab)
        learning_rate = config["learning_rate"]
        batch_size = config["batch_size"]
        embed_size = config["embed_size"]
        hidden_size = config["hidden_size"]

        train_loader = DataLoader(train_dataset, batch_size)

        encoder = EncoderCNN(embed_size)
        decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
        # Define the loss function
        criterion = nn.CrossEntropyLoss().to(device)
        # Only the decoder and the encoder's embed/bn layers are optimized;
        # the remaining encoder parameters are not handed to the optimizer.
        params = list(decoder.parameters()) + list(encoder.embed.parameters()) + list(encoder.bn.parameters())
        # Move the models to the target device before the optimizer is built
        # so optimizer state is created on the same device.
        encoder.to(device)
        decoder.to(device)
        # Define the optimizer
        optimizer = torch.optim.Adam(params=params, lr=learning_rate)

        if os.path.isfile(model_file):
            # map_location lets a checkpoint written on a different device
            # (e.g. another worker's GPU) be restored here.
            checkpoint = torch.load(model_file, map_location=device)
            encoder.load_state_dict(checkpoint['encoder'])
            decoder.load_state_dict(checkpoint['decoder'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        encoder.train()
        decoder.train()

        start_train_time = time.time()
        total_loss = 0.0
        subepoch_total_step = math.ceil(len(train_loader.dataset) / batch_size)
        train_results = {
            "stats": [],
            "additive_metrics": {}
        }

        for i_step, batch in enumerate(train_loader):
            images, captions = batch[0].to(device), batch[1].to(device)
            features = encoder(images)
            outputs = decoder(features, captions)
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once; every .item() call forces a device sync.
            loss_value = loss.item()
            perplexity = np.exp(loss_value)
            total_loss += loss_value

            stats = "Train step [%d/%d], %ds, Loss: %.4f, Perplexity: %5.4f" \
                        % (i_step, subepoch_total_step, time.time() - start_train_time,
                           loss_value, perplexity)

            print("\r" + stats, end="")
            sys.stdout.flush()
            logging.info("Training Stats: {}".format(stats))

            train_results["stats"].append({
                "loss": loss_value,
                "perplexity": perplexity
            })

        torch.save({"encoder": encoder.state_dict(),
                    "decoder": decoder.state_dict(),
                    "optimizer" : optimizer.state_dict(),
                    "total_loss": total_loss
                   }, model_file)

        train_results["additive_metrics"]["sub_epoch_total_loss"] = total_loss

        return train_results

    def test(self, model_file, test_dataset, config, logging, device):
        """
        Validate the model for one epoch using the provided parameters and
        return the epoch's average validation loss and Bleu-4 score.

        Parameters
        ----------
        model_file : str
            Load model file for validation
        test_dataset : torch.utils.data.IterableDataset
            Dataset is in the form of Pytorch dataset which can be processed
            by Pytorch Dataloaders. Each batch is read as
            (images, captions, row_ids).
        config : dict
            Dictionary of hyperparameters. The keys "batch_size",
            "embed_size" and "hidden_size" are read.
        logging : Logger
            Logger for logging any form of training information
        device : torch.device or str
            Device the models and batches are moved to. It is also passed to
            torch.load as map_location (assumes a torch.device or device
            string -- confirm against the caller).

        Returns
        -------
        ml_metrics : dict
            ``{"stats": {"total_loss": ..., "total_bleu_4": ...}}`` where
            both values are the per-batch sums divided by the expected number
            of batches.

        """
        import json
        import os
        import sys
        import math
        import numpy as np
        import nltk
        import torch
        import torch.nn as nn
        from torch.utils.data import DataLoader
        from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

        from coco_proc.model import EncoderCNN, DecoderRNN

        def get_actual_annotations(annotations_path):
            """Map each image_id in the annotations file to its caption list."""
            with open(annotations_path, encoding="utf-8") as f:
                data_json = json.load(f)
            annotations = {}
            for entry in data_json['annotations']:
                annotations.setdefault(entry["image_id"], []).append(entry["caption"])
            return annotations

        def word_list(word_idx_list, vocab):
            """Decode ids to words, dropping start markers and stopping at
            the first end marker."""
            words = []
            for vocab_id in word_idx_list:
                word = vocab.idx2word[vocab_id]
                if word == vocab.end_word:
                    break
                if word != vocab.start_word:
                    words.append(word)
            return words

        # Reference captions for Bleu scoring, keyed by image id
        val_annotations_path = self.miscellaneous_path + "/captions_val2017.json"
        annotations_valid = get_actual_annotations(val_annotations_path)

        vocab = self.train_vocab
        vocab_size = len(vocab)
        batch_size = config["batch_size"]
        embed_size = config["embed_size"]
        hidden_size = config["hidden_size"]

        val_loader = DataLoader(test_dataset, batch_size)
        subepoch_total_step = math.ceil(len(val_loader.dataset) / batch_size)

        encoder = EncoderCNN(embed_size)
        decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

        # Define the loss function
        criterion = nn.CrossEntropyLoss().to(device)

        # move model to the target device
        encoder.to(device)
        decoder.to(device)

        # Inference only needs the model weights, so no optimizer is built
        # or restored here.
        if os.path.isfile(model_file):
            # map_location lets a checkpoint written on a different device
            # (e.g. another worker's GPU) be restored here.
            checkpoint = torch.load(model_file, map_location=device)
            encoder.load_state_dict(checkpoint['encoder'])
            decoder.load_state_dict(checkpoint['decoder'])
        else:
            logging.warning(
                "Checkpoint {} not found; validating untrained weights".format(model_file))

        # Switch to validation mode
        encoder.eval()
        decoder.eval()

        # Initialize smoothing function
        smoothing = SmoothingFunction()

        # Keep track of validation loss and Bleu-4 score
        total_loss = 0.0
        total_bleu_4 = 0.0
        step = 1
        test_results = {
            "stats": {}
        }

        # Disable gradient calculation because we are in inference mode
        with torch.no_grad():
            for batch in val_loader:
                images, captions, row_ids = batch[0].to(device), batch[1].to(device), batch[2]

                # Pass the inputs through the CNN-RNN model, then move the
                # results back to CPU for scoring
                features = encoder(images)
                outputs = decoder(features, captions).to("cpu")
                captions = captions.to("cpu")

                # outputs[i, j, k] contains the model's predicted score i.e.
                # how likely the j-th token in the i-th caption in the batch
                # is the k-th token in the vocabulary. One batched argmax
                # over k replaces a Python-level argmax per token.
                predicted_ids = outputs.argmax(dim=-1).tolist()

                # Calculate the total Bleu-4 score for the batch
                batch_bleu_4 = 0.0
                for i, caption_ids in enumerate(predicted_ids):
                    # Convert word ids to actual words
                    predicted_word_list = word_list(caption_ids, vocab)

                    tokenized_references = [nltk.tokenize.word_tokenize(str(caption).lower())
                                            for caption in annotations_valid[row_ids[i].item()]]
                    batch_bleu_4 += sentence_bleu(tokenized_references,
                                                  predicted_word_list,
                                                  smoothing_function=smoothing.method1)
                mean_batch_bleu_4 = batch_bleu_4 / len(outputs)
                total_bleu_4 += mean_batch_bleu_4

                # Calculate the batch loss
                loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
                loss_value = loss.item()
                total_loss += loss_value

                # Get validation statistics
                stats = "Val step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Batch Bleu-4: %.4f" \
                        % (step, subepoch_total_step,
                           loss_value, np.exp(loss_value), mean_batch_bleu_4)
                # Print validation statistics (on same line)
                print("\r" + stats, end="")
                sys.stdout.flush()
                logging.info("Validation Stats: {}".format(stats))

                step += 1

        test_results["stats"]["total_loss"] = total_loss / subepoch_total_step
        test_results["stats"]["total_bleu_4"] = total_bleu_4 / subepoch_total_step
        return test_results

### <font color='blue'> Run Cerebro </font>

In [None]:
# Number of epochs handed to experiment.run_fit.
num_epochs = 2

# Hyperparameter search space: each key maps to the list of candidate values.
# The keys match the config entries read by CocoTrainingSpec.train/test.
param_grid = dict(
    learning_rate=[1e-2, 1e-3],
    embed_size=[256],
    hidden_size=[256],
    batch_size=[128],
)

In [None]:
# Create the Cerebro experiment object and one instance of each user-defined
# spec. Both spec constructors build the training vocabulary from the
# train-split captions file.
experiment = Experiment()
etl_spec = CocoETLSpec()
sub_epoch_spec = CocoTrainingSpec()

In [None]:
# Run the ETL stage with the COCO spec.
# NOTE(review): fraction=0.1 presumably limits processing to 10% of the
# dataset -- confirm against Experiment.run_etl.
experiment.run_etl(etl_spec, fraction=0.1)

In [None]:
# Launch training with the sub-epoch spec, the hyperparameter grid and the
# epoch count defined above.
# NOTE(review): presumably one model is trained per param_grid combination --
# confirm against Experiment.run_fit.
experiment.run_fit(sub_epoch_spec, param_grid, num_epochs)