In [None]:
#Imports cell
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import datasets
from sklearn import model_selection


import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line, in os.walk order.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Starter Notebook with Explanations

1. The purpose of this notebook is to analyze the reasoning behind the code in each cell from Abhishek Thakur's 4 steps. 
    - https://www.kaggle.com/c/petfinder-pawpularity-score/discussion/274026
    - all credit for code goes to him. I'm just following along and trying to understand everything.
2. I wanted to go through line by line and fill in the knowledge gaps I had. Hopefully this helps the next noob. I learned a lot. 

3. When running the notebook, be sure to set the accelerator to GPU before running, or the cells that use `device="cuda"` will raise an error.

## Step 1. - Create the folds

He is doing the following: <br>
1. Creating a function where inputs are "data" and "num_splits"
    - create "kfolds" column with all values as -1
<br>    
2. int(np.floor(1 + np.log2(len(data)))) - This formula is from ["Sturges Rule"](https://answerminer.com/blog/binning-guide-ideal-histogram)
    - Ideally it can be any of the methods in the website above. He went with Sturges Rule for choice of bin numbers
    - He also did "floor" whereas in other websites they use "ceil". The two roundings differ by at most one bin, which has very little practical effect on the stratification.
<br>    
3. Using .loc method on data, he makes a change for all rows in the column "bins"
    - He adds a numeric label for each bin range for each value 
        - This is simply part of the method to finally get kfold numbers
        - The data.bins.values means he will treat the bin.values as "categories" to create splits across
<br>        
4. [StratifiedKfold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html)
    - Provides train/test indices to split data in train/test sets.
    - This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.
    - For the sake of my sanity, I renamed the "kf" variable to "skf" to signify "STRATIFIED K-FOLD"
    - In classifications tasks with imbalanced class distributions, we should prefer StratifiedKFold over KFold. The bin labels are treated as "classes"
<br>    
5. The for loop
    - Using the Stratified K-fold cross-validation object, he applies "split" method with inputs 
        - x = data
        - y = the numeric label of each bin range, this is our "df.target"
    - this returns the indices of the split data of which he iterates through using "enumerate"
    - for each of the "f, (t_, v_)" values, f is the fold number from enumerate, and (t_, v_) are the arrays of training-row indices and validation-row indices for that fold
        - access the specified index in array "v_" with column 'kfold' using .loc method on dataframe
            - set the specified index value for kfold to indicate which fold dictated by enumerate "f" from the split            
    - WE DONT CARE ABOUT THE [0] TUPLE POSITION. WE CARE ABOUT THE ARRAY OF INDICES FOR [1] TUPLE POSITION WHICH WILL UPDATE TO THE RESPECTIVE KFOLD NUMBER IN KFOLD COLUMN
<br>    
6. Once this is done, delete "bins" column and you are left with a dataframe with additional column of "kfold" which assigns a fold number to each of the rows which reflects a stratified kfold

In [None]:
def create_folds(data, num_splits):
    """Return a copy of ``data`` with a stratified ``kfold`` column.

    The continuous ``Pawpularity`` target is first cut into equal-width bins
    (bin count chosen by Sturges' rule), and the bin labels are then used as
    the "classes" for ``StratifiedKFold`` so every fold sees a similar target
    distribution.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Pawpularity`` column. It is NOT modified; earlier
        versions of this function added ``kfold`` and ``bins`` columns to the
        caller's frame as a side effect. Any pre-existing ``bins`` column is
        now left untouched instead of being overwritten and dropped.
    num_splits : int
        Number of folds to create.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose ``kfold`` column holds the validation-fold id
        (0 .. num_splits - 1) of each row.
    """
    folded = data.copy()

    # Sturges' rule for the number of histogram bins.
    num_bins = int(np.floor(1 + np.log2(len(folded))))
    bin_labels = pd.cut(folded["Pawpularity"], bins=num_bins, labels=False).to_numpy()

    skf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    # split() yields *positional* indices, so they are written into a plain
    # array. Going through label-based .loc would assign the wrong rows (or
    # create new ones) whenever the frame's index is not 0..n-1, e.g. after
    # filtering without reset_index.
    fold_ids = np.full(len(folded), -1, dtype=np.int64)
    for fold, (_, valid_idx) in enumerate(skf.split(X=folded, y=bin_labels)):
        fold_ids[valid_idx] = fold

    folded["kfold"] = fold_ids
    return folded

### Once the function is defined, call the "create_folds" function and generate whatever amount of folds you want to create!

1. In below steps, he arbitrarily chooses 5 and 10 folds. 
    - How would we figure out what is the optimal number of folds?
2. Once folds are created, save data to csv and proceed onto next step which is to train the model with the folded data
    - If you don't see that data in the "output/kaggle/working" directory, there is a little refresh button next to /kaggle/working on the right side. Click and you will see the data there.

In [None]:
# df = pd.read_csv("../input/petfinder-pawpularity-score/train.csv")

# #Call "create_folds" function
# df_5 = create_folds(df, num_splits=5)
# df_10 = create_folds(df, num_splits=10)

# #Save data to csv
# df_5.to_csv("train_5folds.csv", index=False)
# df_10.to_csv("train_10folds.csv", index=False)

## Step 2. Train the data

### Imports

1. <b>Tez</b> - My understanding of Tez is that it is an easier way of putting in parameters requirements when setting up a model in pytorch 
    - I had issues running "import sys --> sys.path.append("../input/tez-lib/") and sys.path.append("../input/timmmaster/")
    - As a workaround, I manually clicked "Add data" at the top right corner of the notebook
        - Searched for "tez-lib" and "timmmaster" in the search box and clicked the blue "Add" button
   
   - https://github.com/abhishekkrthakur/tez
2. <b>albumentations</b> - A fast/flexible python library for image augmentations
    - https://albumentations.ai/
    - used by industry leaders
3. <b>pandas</b> - for manipulating dataframes
4. <b>cv2</b> - OpenCV - A huge open-source library for computer vision, machine learning, and image processing
5. <b>numpy</b> - python library - allows for large multi-dimensional arrays and do high level math functions on them
6. <b>timm</b> - Part of pytorch image models is a deep-learning library created by Ross Wightman and is a collection of STATE OF THE ART computer vision models, layers, utilities, optimizers, schedulers, data-loaders, augmentations and also training/validating scripts with ability to reproduce ImageNet training results. 
    - https://fastai.github.io/timmdocs/
7. <b>torch and torch.nn</b> - PyTorch and .nn is a PyTorch library that simplifies creation of neural net models. We use this instead of making them from scratch.
8. <b>sklearn.metrics</b> - metrics module implements functions assessing prediction error for specific purposes
9. <b>tqdm</b> - Used for those fancy looking progress bars in the output cells
    - https://tqdm.github.io/
10. <b>Tez.EarlyStopping</b> is used to checkpoint the best model and stop the training prematurely if needed.    

In [None]:
## Keep these in here, I was having issues importing tez and timm even after manually clicking "Add Data"
import sys
sys.path.append("../input/tez-lib/")
sys.path.append("../input/timmmaster")
##
import pandas as pd
import numpy as np
from sklearn import metrics
import albumentations
import cv2
import torch
import torch.nn as nn
from tqdm import tqdm
import tez
from tez.callbacks import EarlyStopping
import timm

### The "Args" Class 
- will be called in the later functions for albumentations and the neural net model. This is made to simplify the functions and take out the hardcoding of numbers. We can make changes to the args at any time and the functions will use those changes.

In [None]:
class args:
    # Training-time settings read by the augmentation pipelines and by the
    # (currently commented-out) training cells further down.
    batch_size = 64  # training batch size; the fit call passes 2x this value as valid_bs
    image_size = 256  # height and width handed to albumentations.Resize
    epochs = 20  # epoch count passed to model.fit
    fold = 0  # fold id held out for validation; also embedded in the checkpoint file name

### "PawpularDataset" Class
https://stackoverflow.com/questions/625083/what-init-and-self-do-in-python
<br>
https://docs.python.org/3/tutorial/classes.html
<br>
https://rszalski.github.io/magicmethods/
1. Basically it is object oriented programming (OOP) where there are oop constructs that are passed in order like loops.
   - e.g. first it is init, then len, and then finally getitem
   - looks like a commonly used class structure
       - https://debuggercafe.com/image-augmentation-using-pytorch-and-albumentations/
2. within init, he is setting the arguments that he will pass into the next functions below
3. within len - `__len__` makes `len(dataset)` work. I don't see it called directly in this notebook, but the PyTorch DataLoader relies on it to know how many samples exist when it builds batches.
4. within getitem - `__getitem__` is a magic method in Python which, when defined in a class, allows its instances to use the [] (indexer) operator. Say x is an instance of this class, then x[i] is roughly equivalent to "type(x).__getitem__(x, i)"
    - https://www.geeksforgeeks.org/__getitem__-in-python/
        - magic method is usually used for list indexing, dictionary lookup, or accessing ranges of values.
    - he uses OpenCv to read each row's information under "Id" column
    - augmentations will be related to <b>albumentations</b>
    - np.transpose
        - he is transposing along the axis (2,0,1). cv2 loads images as (height, width, channels), while PyTorch image models expect (channels, height, width), so the channel axis is moved to the front before building the "torch.tensor"
    - uses torch.tensor to create tensor-matrix of the data
        - creates 3 tensors: image, features, targets

In [None]:
class PawpularDataset:
    """Indexable dataset that yields one image plus its tabular features and target.

    Parameters
    ----------
    image_paths : sequence of str
        File path of each image; its length defines the dataset length.
    dense_features : 2-D array
        Indexed as ``dense_features[item, :]`` to get the feature row of a sample.
    targets : 1-D array
        Indexed as ``targets[item]`` to get the target of a sample.
    augmentations : callable or None
        Called as ``augmentations(image=image)``; the ``"image"`` entry of the
        result replaces the loaded image. ``None`` skips augmentation.
    """

    def __init__(self, image_paths, dense_features, targets, augmentations):
        self.image_paths = image_paths
        self.dense_features = dense_features
        self.targets = targets
        self.augmentations = augmentations

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load sample ``item`` and return a dict of float tensors.

        Returns
        -------
        dict
            ``"image"`` (channel axis moved to the front), ``"features"`` and
            ``"targets"``, all as ``torch.float`` tensors.

        Raises
        ------
        FileNotFoundError
            If the image at ``image_paths[item]`` cannot be read.
        """
        path = self.image_paths[item]
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than with an opaque cvtColor assertion one line later.
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]

        # Move the last (channel) axis to the front: (H, W, C) -> (C, H, W).
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        features = self.dense_features[item, :]
        targets = self.targets[item]

        return {
            "image": torch.tensor(image, dtype=torch.float),
            "features": torch.tensor(features, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.float),
        }

### "PawpularModel" Class

1. Here, he is setting up the structure of the model without fitting it to any data. This class will have all the params and metrics related to the model. He also uses <b>Tez</b> here to simplify the setup of the model. The stuff he uses in tez are basically the inputs needed to set up a model in pytorch.
    - All the functions within the class are cut/paste template from <b>Tez</b> https://pypi.org/project/tez/
2. init(self) - is just part of the code.
    - He uses "tf_efficientnet_b0_ns" as the pytorch model. i could try v2
        - pretrained = True because we are loading in a pretrained model
    - He chooses efficientnet_b0. According to the paper, theres b0 all the way up to b7/b8. Maybe see how that improves the efficiency
        - https://arxiv.org/pdf/1905.11946.pdf
    - much more can be found on the timm documentation https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/efficientnet.py
    - To understand what "super" does: https://rhettinger.wordpress.com/2011/05/26/super-considered-super/
    - step scheduler changes the learning rate after each epoch
    - Epochs and forward pass diagrams - https://www.baeldung.com/cs/epoch-neural-networks
    - <b>Here we have the network layers. Total 4 layers. Plus the step scheduler</b>
    - dropout is important - improves regularization, reduces overfitting
        - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
        - https://arxiv.org/abs/1207.0580
    - nn.Linear - I don't understand too well
        - https://stackoverflow.com/questions/54916135/what-is-the-class-definition-of-nn-linear-in-pytorch
        - According to this, nn.Linear applies a transformation to the data
            - the data, "x" is already a tensor, but tensors don't have dimension limits. They can be multidimensional arrays
            - we restrict the tensor size with the argument for out_features where in this notebook the out_features=1
        - The reasoning behind 128 + 12 for in_features:
            - 128 is the output size of the replaced classifier layer, and 12 is the number of dense feature columns ('Subject Focus' through 'Blur') that are concatenated onto it in forward()
3. RMSE is used to track error
    
4. Apply gradient descent to the learning rate! Moar efficiency for neural net!
    - CosineAnnealingWarmRestarts is same as SGDR or "Stochastic Gradient Descent with Restarts"
        - https://github.com/pytorch/pytorch/issues/20028
        - https://towardsdatascience.com/https-medium-com-reina-wang-tw-stochastic-gradient-descent-with-restarts-5f511975163
        
5. "Adam" optimizer
    - https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/
        - "Adam is an optimization algorithm that can be used instead of the classical stochastic gradient descent procedure <b>to update network weights iterative based in training data</b>.
        - It seems this optimizer is really good. Adam was applied to the logistic regression algorithm on the MNIST digit recognition and IMDB sentiment analysis datasets.

6. Forward Pass - basically a forward run of the model. 
    - https://theneuralblog.com/forward-pass-backpropagation-example/
    - https://deeplizard.com/learn/video/MasG7tZj-hw
        - forward() method accepts a tensor as input, and then, returns a tensor as output
        - it runs through the 4 layers
    - he sets up loss function MSELoss, returns the model, loss and error
        - If there are targets, return the loss
        - If there are no more targets, return the output, 0 and empty collection

In [None]:
class PawpularModel(tez.Model):
    """Training model: EfficientNet-B0 image branch joined with the tabular features.

    The timm backbone's classifier is replaced by a 128-unit linear layer; after
    dropout, its output is concatenated with the dense features and mapped to a
    single regression output. The ``128 + 12`` input width of ``self.out``
    matches 128 backbone outputs plus the 12 dense feature columns listed in
    this notebook.
    """

    def __init__(self):
        super().__init__()

        self.model = timm.create_model("tf_efficientnet_b0_ns", pretrained=True, in_chans=3)
        self.model.classifier = nn.Linear(self.model.classifier.in_features, 128)
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(128 + 12, 1)

        self.step_scheduler_after = "epoch"

    def monitor_metrics(self, outputs, targets):
        """Return ``{"rmse": ...}`` computed between predictions and targets."""
        outputs = outputs.cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` keyword is deprecated and later removed in scikit-learn,
        # while this form gives the same value on every version.
        rmse = np.sqrt(metrics.mean_squared_error(targets, outputs))
        return {"rmse": rmse}

    def fetch_scheduler(self):
        """Build the cosine-annealing-with-warm-restarts LR scheduler."""
        sch = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )
        return sch

    def fetch_optimizer(self):
        """Build the Adam optimizer over all model parameters."""
        opt = torch.optim.Adam(self.parameters(), lr=1e-4)
        return opt

    def forward(self, image, features, targets=None):
        """Run the network and, when targets are given, compute loss and metrics.

        Returns
        -------
        tuple
            ``(predictions, loss, metrics_dict)``. Without targets the loss is
            ``0`` and the metrics dict is empty.
        """
        x = self.model(image)
        x = self.dropout(x)
        x = torch.cat([x, features], dim=1)
        x = self.out(x)

        if targets is not None:
            loss = nn.MSELoss()(x, targets.view(-1, 1))
            # Named batch_metrics so the sklearn `metrics` module is not
            # shadowed inside this method.
            batch_metrics = self.monitor_metrics(x, targets)
            return x, loss, batch_metrics
        return x, 0, {}

### Using Albumentations to change the photos

1. In the training_aug and validation_aug, he sets up the params which he will pass into "augmentations" in PawpularDatasets Class
    - Compose API in albumentations - https://albumentations.ai/docs/api_reference/core/composition/
        - Using Compose, he inputs a list of transformations he wants on the images 
    - HueSaturationValue - randomly change the hues of the images
    - RandomBrightnessContrast - randomly change contrast
2. The validation set doesnt need any randomization. We are testing the model against test data
    - Resize and Normalize, I am guessing, are used to keep the image size and pixel scale the same for all images, for better training/testing
    - The values he put in for Normalize are the exact values in the albumentations documentation
        - https://albumentations.ai/docs/api_reference/full_reference/#albumentations.augmentations.transforms.Normalize
3. When the datasets class calls these augmentations, albumentations is called and the transformations are done on the images in the dataset
    - Which then are passed into the model for training.

In [None]:
# Training-time image pipeline: resize to a square of args.image_size, apply
# random colour jitter (each with probability 0.5), then normalise.
train_aug = albumentations.Compose(
    transforms=[
        albumentations.Resize(height=args.image_size, width=args.image_size, p=1),
        albumentations.HueSaturationValue(
            hue_shift_limit=0.2,
            sat_shift_limit=0.2,
            val_shift_limit=0.2,
            p=0.5,
        ),
        albumentations.RandomBrightnessContrast(
            brightness_limit=(-0.1, 0.1),
            contrast_limit=(-0.1, 0.1),
            p=0.5,
        ),
        albumentations.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

# Validation-time image pipeline: deterministic resize and normalisation only,
# with the same size and statistics as the training pipeline.
valid_aug = albumentations.Compose(
    transforms=[
        albumentations.Resize(height=args.image_size, width=args.image_size, p=1),
        albumentations.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

### Importing the dataframes for training

1. I don't need to have a path to give for read_csv since the train_10folds.csv is in my working directory.
    - My question here is why is df_train 8920 rows and df_valid 992 rows. Shouldn't the ratio be 80/20?
        - https://towardsdatascience.com/train-validation-and-test-sets-72cb40cba9e7
        - https://miro.medium.com/max/875/1*Nv2NNALuokZEcV6hYEHdGA.png
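    - To answer my own question: with 10 folds, each fold holds 1/10 of the rows, so holding out one fold gives a roughly 90/10 split (8920 vs 992 rows); 5 folds would give 80/20. The validation fold is carved out of the training set to test model fit, and the "gold standard" will be the test set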
    - Also how do we know the optimal kfold number to begin with? 5 and 10 sound arbitrary. Maybe the most accurate model lies in the range of folds.
    
2. Super fancy guy, he calls args class and then the fold number in args class.
    - args.fold
    - He sets aside fold number = 0 for his validation set
    - the kfolds are numbered from 0-9 as we have 10 folds created. Any kfold can be chosen for validation set, doesn't have to be kfold=0. They're all the same size

In [None]:
# df = pd.read_csv("train_10folds.csv")
# df_train = df[df.kfold != args.fold].reset_index(drop=True)
# df_valid = df[df.kfold == args.fold].reset_index(drop=True)

### Dense_features, train_img_paths, valid_img_paths

1. Here, he is making a list for each of the variables which will be inputs into the "PawpularDataset.init" class
    - dense_features is the list of columns in the dataset
    - train_img_paths is the list of all image id names (the first column) in the training dataset
    - valid_img_paths is the list of all image id names (the first column) in the validation dataset
        - he uses f strings for both - https://realpython.com/python-f-strings/#python-f-strings-the-pesky-details
            - f strings execute in runtime. So he can run a for loop with his string to generate list of Id values
                 - looping using list comprehension - https://www.w3schools.com/python/python_lists_loop.asp

In [None]:
# dense_features = [
#     'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
#     'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'
# ]
# train_img_paths = [f"../input/petfinder-pawpularity-score/train/{x}.jpg" for x in df_train["Id"].values]
# valid_img_paths = [f"../input/petfinder-pawpularity-score/train/{x}.jpg" for x in df_valid["Id"].values]

### Using PawpularDataset class to generate tensors for training and validation 
1. The below cell returns class objects so my guess is that the output is the 3 tensors that were generated at the end of the PawpularDataset.
    - we get the following instance "<__main__.PawpularDataset at 0x7f9ec460d2d0>"
2. He is creating training and validation data using the PawpularDataset class created in previous cell  
      - the "image", "features", "targets" variables are used in the PawpularModel class

In [None]:
# train_dataset = PawpularDataset(
#     image_paths=train_img_paths,
#     dense_features=df_train[dense_features].values,
#     targets=df_train.Pawpularity.values,
#     augmentations=train_aug,
# )

# valid_dataset = PawpularDataset(
#     image_paths=valid_img_paths,
#     dense_features=df_valid[dense_features].values,
#     targets=df_valid.Pawpularity.values,
#     augmentations=valid_aug,
# )

### Fitting the Model
1. He runs the PawpularModel class now that we have all the variables
2. EarlyStopping is used to stop training early once the monitored validation metric has stopped improving
    - There was not much documentation on EarlyStopping. But I see Tez's Earlystopping class is similar that found in Keras
        - Earlystopping is based on validation data rmse
            - nice pics here - https://towardsdatascience.com/activate-early-stopping-in-boosting-algorithms-to-mitigate-overfitting-9c1b12cc6729
        - https://theailearner.com/2019/07/15/keras-callbacks-earlystopping/
        - Monitor - the arguments are like "val_" etc so that means similarly in tez it would be from the validation data rmse
        - Model_path - uses f-string again. Within f string he calls args class, name will be "model_f0.bin"
            - what does it do with this information? I don't see model_f0.bin anywhere as a path or something. Maybe when the model runs it will generate a .bin file?
        - patience = 3 - this is the number of epochs without improvement the function will wait before stopping
        - mode = "min" 
            - training stops when the monitored quantity has stopped decreasing
            - this from keras documentation but it makes sense here - https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
        - save_weights_only = True - this will save the weights of the best model which was stopped by EarlyStopping
3. From which import is he calling .fit for model.fit()?    
    - the tez.model
        - https://github.com/abhishekkrthakur/tez/blob/main/tez/model/model.py
            - here there is a .fit method, similar to that found in keras
4. Model is saved as model_f0.bin
5. I understand the model.fit arguments but I don't know the real documentation behind it. I think it is from the tez.Model which then goes to nn.model, but in the Tez github documentation, the model.fit part says it is similar to keras. So I'm going to assume that these params are similar to keras style model.fit.
    - Batch size we can also choose to change - might affect speed of model training, probably he chose those batch sized from experience
        - https://stats.stackexchange.com/questions/153531/what-is-batch-size-in-neural-network
    - fp16 is a float-16 (half-precision) param used with NVIDIA GPUs
    - epochs - https://stackoverflow.com/questions/44907377/what-is-epoch-in-keras-models-model-fit

In [None]:
# model = PawpularModel()

# es = EarlyStopping(
#     monitor="valid_rmse",
#     model_path=f"model_f{args.fold}.bin",
#     patience=3,
#     mode="min",
#     save_weights_only=True,
# )

# model.fit(
#     train_dataset,
#     valid_dataset=valid_dataset,
#     train_bs=args.batch_size,
#     valid_bs=2*args.batch_size,
#     device="cuda",
#     epochs=args.epochs,
#     callbacks=[es],
#     fp16=True,
# )



# Step 3. Inference

### Similar to the previous step of training the data
1. Create new class args which will be utilized by the albumentations and the final for loop for testing on the kfolds
2. The same class PawpularDataset is used
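3. For PawpularModel class, there are differences. I refer to the inference version as PawpularModel1 in this text, although the code cell below still defines it under the name PawpularModel (which replaces the training class once that cell runs)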
    - pretrained = False
    - he adds different Linear transformations. 
        - 140 is 128+12 from before
        - where and why does he use 64?
    - what is in_chans=3? It is the number of input channels the timm model expects; 3 corresponds to RGB images

In [None]:
class args1:
    # Inference-time settings, kept separate from the training `args` class.
    batch_size = 64  # the prediction loop passes 2x this value to model.predict
    image_size = 512  # intended resize edge for test images -- NOTE(review): confirm the test_aug cell reads this rather than args.image_size

### PawpularModel1 class
1. This class is much shorter than the training class. 
2. The forward pass function shows the different layers
    - uses dropout to randomly zero out values based on 0.1 probability to reduce overfitting
        - this is a value we can change to see how it affects the model
    - he uses torch.cat to add features
        - features is the torch tensor outputted from the PawpularDataset class from earlier
    - he applies the first nn.Linear transform
        - https://ashwinhprasad.medium.com/pytorch-for-deep-learning-nn-linear-and-nn-relu-explained-77f3e1007dbb
        - He might be using the nn.Linear method to add random weights and biases for that layer of the NN
    - torch.relu activation function and then another nn.Linear transform for another random weights/biases to apply to tensor
    - returns the output


In [None]:
class PawpularDataset:
    """Indexable dataset that yields one image plus its tabular features and target.

    Parameters
    ----------
    image_paths : sequence of str
        File path of each image; its length defines the dataset length.
    dense_features : 2-D array
        Indexed as ``dense_features[item, :]`` to get the feature row of a sample.
    targets : 1-D array
        Indexed as ``targets[item]`` to get the target of a sample.
    augmentations : callable or None
        Called as ``augmentations(image=image)``; the ``"image"`` entry of the
        result replaces the loaded image. ``None`` skips augmentation.
    """

    def __init__(self, image_paths, dense_features, targets, augmentations):
        self.image_paths = image_paths
        self.dense_features = dense_features
        self.targets = targets
        self.augmentations = augmentations

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load sample ``item`` and return a dict of float tensors.

        Returns
        -------
        dict
            ``"image"`` (channel axis moved to the front), ``"features"`` and
            ``"targets"``, all as ``torch.float`` tensors.

        Raises
        ------
        FileNotFoundError
            If the image at ``image_paths[item]`` cannot be read.
        """
        path = self.image_paths[item]
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than with an opaque cvtColor assertion one line later.
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]

        # Move the last (channel) axis to the front: (H, W, C) -> (C, H, W).
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        features = self.dense_features[item, :]
        targets = self.targets[item]

        return {
            "image": torch.tensor(image, dtype=torch.float),
            "features": torch.tensor(features, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.float),
        }

In [None]:
class PawpularModel(tez.Model):
    """Inference-only network: timm backbone followed by a two-layer dense head.

    The backbone named by ``model_name`` is created without pretrained weights
    (they are expected to come from a saved checkpoint). Its classifier is
    replaced by a 128-unit linear layer whose output is concatenated with the
    dense features (140 inputs in total) and passed through 140 -> 64 -> 1.
    """

    def __init__(self, model_name):
        super().__init__()

        backbone = timm.create_model(model_name, pretrained=False, in_chans=3)
        backbone.classifier = nn.Linear(backbone.classifier.in_features, 128)

        # Attribute names double as state-dict keys, so they must stay as-is.
        self.model = backbone
        self.dropout = nn.Dropout(0.1)
        self.dense1 = nn.Linear(140, 64)
        self.dense2 = nn.Linear(64, 1)

    def forward(self, image, features, targets=None):
        """Return ``(predictions, 0, {})``; ``targets`` is accepted but unused."""
        embedding = self.dropout(self.model(image))
        combined = torch.cat([embedding, features], dim=1)
        hidden = torch.relu(self.dense1(combined))
        return self.dense2(hidden), 0, {}

### test_aug - augmenting the test images
1. He uses the same albumentations standards as used in the training section for validation 

In [None]:
# Test-time image pipeline: deterministic resize and normalisation only.
# The size comes from the inference config (args1) rather than the training
# config (args), so the inference section no longer depends on training-time
# settings and args1.image_size is actually honoured.
test_aug = albumentations.Compose(
    [
        albumentations.Resize(args1.image_size, args1.image_size, p=1),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

### Final Prediction and General Steps
1. Create empty list which will be used to generate the final submission.csv file
2. Because he used 10 kfolds, he will use a for loop with range 10. We can change the range if we change the number of kfolds.
3. He loads the pretrained timm model. Here i will use the name PawpularModel1 to indicate the shorter class made during inference section
4. He creates the variables for test data path, image path, and features list
5. Similar to the training validation dataset, he runs PawpularDataset class to create the augmented/normalized dataset to run the model on with test data
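    - the targets = np.ones(len(test_img_paths)) he creates an array filled with ones
        - why? The test set has no labels, but PawpularDataset always builds a "targets" tensor, so a placeholder array is passed; the inference model's forward() ignores targets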
6. test_predictions
    - I understand he is calling predict method for the model
        - why is batch size 2x the args?
            - seems batch size is related to memory consumption and model performance
            - https://datascience.stackexchange.com/questions/12532/does-batch-size-in-keras-have-any-effects-in-results-quality
            - prediction needs no backward pass (no gradients to keep), so each sample uses less memory and a 2x larger batch can fit
            - also batch size probably related to the GPU power
7. model.predict - from tez - within model.py there is predict function which is called
8. He appends each test prediction to the list super_final_predictions
    - final_test_predictions.extend(preds.ravel().tolist())
        - returns a contiguous flattened array which he adds to the final predictions
            - https://numpy.org/doc/stable/reference/generated/numpy.ravel.html
9. np.mean(np.column_stack
    - he averages the predictions over each fold
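        - then what would the output be? np.column_stack turns the 10 per-fold lists into an array with one row per test image and one column per fold, and the mean over axis=1 leaves a single averaged prediction per test image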
10. set Pawpularity column to predicted values, add Id column and create submission.csv file.        

In [None]:
# Number of per-fold checkpoints to ensemble (model_f0.bin ... model_f9.bin).
N_FOLDS = 10

# Everything below is identical for every fold, so it is built once instead of
# being re-read and re-created on each loop iteration.
df_test = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")
test_img_paths = [f"../input/petfinder-pawpularity-score/test/{x}.jpg" for x in df_test["Id"].values]

dense_features = [
    'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
    'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'
]

test_dataset = PawpularDataset(
    image_paths=test_img_paths,
    dense_features=df_test[dense_features].values,
    # The test set has no labels; a placeholder keeps the dataset interface intact.
    targets=np.ones(len(test_img_paths)),
    augmentations=test_aug,
)

# One list of per-image predictions for each fold's model.
fold_predictions = []

for fold_ in range(N_FOLDS):
    model = PawpularModel(model_name="tf_efficientnet_b0_ns")
    model.load(f"../input/pawpular-models/model_f{fold_}.bin", device="cuda", weights_only=True)

    test_predictions = model.predict(test_dataset, batch_size=2*args1.batch_size, n_jobs=-1)

    # Flatten the per-batch outputs into one flat list for this fold.
    final_test_predictions = []
    for preds in tqdm(test_predictions):
        final_test_predictions.extend(preds.ravel().tolist())

    fold_predictions.append(final_test_predictions)

# Stack folds as columns (one row per test image) and average across folds.
super_final_predictions = np.mean(np.column_stack(fold_predictions), axis=1)
df_test["Pawpularity"] = super_final_predictions
df_test = df_test[["Id", "Pawpularity"]]
df_test.to_csv("submission.csv", index=False)