In [None]:
# Execution environment. Cells below compare ENV against "COLAB" to decide
# whether to run the Colab-only setup; any other value takes the Kaggle path.
# ENV = "COLAB"
ENV = "KAGGLE"

# Fraction of the labelled rows held out for validation (passed to
# create_datasets as val_split). The training cell skips validation when it is 0.
VAL_SPLIT = 0.1

# Side length, in pixels, of the square images built for the model.
IMG_SIZE = 128

# The training loop treats the first TRAIN_ONLY_CLASSIFIER_FOR_EPOCHS epochs as
# "classifier only" epochs; N_EPOCHS is the total number of epochs it runs.
TRAIN_ONLY_CLASSIFIER_FOR_EPOCHS = 5
N_EPOCHS = 5

# Batch size used by the training and validation DataLoaders.
BATCH_SIZE = 64

In [None]:
!ls /kaggle/input/vgg11-bn
if ENV == 'COLAB':
    # Will automatically show cell execution times.
    !pip install ipython-autotime
    %load_ext autotime

# Replicate the Kaggle directory structure in Google Colab


In [None]:
import os

# Only run this cell in Colab, and only when /kaggle has not been created yet
# (so re-running the cell does not copy and unzip the data a second time).
if ENV == "COLAB" and not os.path.exists('/kaggle'):
    from google.colab import drive
    # Mount Google Drive so the competition archive stored there can be read.
    drive.mount('/content/gdrive')
    # Copy the competition archive from Drive into the working directory.
    !cp -r '/content/gdrive/My Drive/kaggle/petfinder-pawpularity-score.zip' petfinder-pawpularity-score.zip
    # Recreate the /kaggle/input/<dataset>/ layout that the rest of the
    # notebook reads from.
    !mkdir /kaggle
    !mkdir /kaggle/input
    !unzip -qq petfinder-pawpularity-score.zip -d /kaggle/input/petfinder-pawpularity-score/
    !ls

# Get a pretrained model from PyTorch

In [None]:
import torch
import torchvision


# On Colab: build torchvision's pretrained vgg11_bn, save the whole model object
# to disk and copy the file to the mounted Google Drive folder.
# Otherwise: load a previously saved model object from the Kaggle input directory.
if ENV == "COLAB":
    # NOTE(review): this upgrade runs after torch/torchvision were imported
    # above, so presumably the running kernel keeps using the versions that
    # were already loaded — confirm whether the upgrade is meant to take effect.
    !pip install torch torchvision -U
    model = torchvision.models.vgg11_bn(pretrained=True)
    # Saves the full module object (not just a state_dict).
    torch.save(model, "vgg11_bn.model")
    !ls -lh
    !cp vgg11_bn.model '/content/gdrive/My Drive/kaggle/vgg11_bn.model'
else:  # ENV = "KAGGLE"
    # NOTE(review): torch.load on a fully saved model unpickles Python objects;
    # only load files from a trusted source.
    model = torch.load('/kaggle/input/vgg11-bn/vgg11_bn.model')

print(model)

# Replace the VGG classifier head with one that ends in a single regression output.

In [None]:
import torch

def _fc_stage(n_in, n_out):
    """Return the layers of one hidden stage: Linear -> ReLU -> Dropout(0.5)."""
    return [
        torch.nn.Linear(in_features=n_in, out_features=n_out, bias=True),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(p=0.5, inplace=False),
    ]


# Swap in a freshly initialised fully-connected head: two 4096-wide hidden
# stages followed by a single-output linear layer (one predicted score).
model.classifier = torch.nn.Sequential(
    *_fc_stage(25088, 4096),
    *_fc_stage(4096, 4096),
    torch.nn.Linear(in_features=4096, out_features=1, bias=True),
)
print(model)

# Define the Dataset

In [None]:
import matplotlib.pyplot as plt
import torchvision

def show(imgs):
    """Display one image tensor, or a list of them, side by side in one row.

    Args:
        imgs: a single image tensor, or a list of image tensors, each in a
            form accepted by ``torchvision.transforms.functional.to_pil_image``.
    """
    # Imported locally: this helper is defined before the cell that imports
    # numpy at notebook level, so it must not rely on that later import.
    import numpy as np

    if not isinstance(imgs, list):
        imgs = [imgs]
    # squeeze=False keeps `axs` two-dimensional even for a single image.
    fig, axs = plt.subplots(ncols=len(imgs), squeeze=False)
    for i, img in enumerate(imgs):
        pil_img = torchvision.transforms.functional.to_pil_image(img.detach())
        axs[0, i].imshow(np.asarray(pil_img))
        # Hide ticks and tick labels; only the picture matters here.
        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

In [None]:
import torch
import random
from torch.utils.data import Dataset
from PIL import Image, ImageOps
import numpy as np
import pandas as pd
import os
import scipy
from tqdm.notebook import trange, tqdm

# https://stackoverflow.com/questions/43391205/add-padding-to-images-to-get-them-into-the-same-shape
def padding(img, expected_size):
    """Add a border around ``img`` so it measures ``expected_size`` on both sides.

    The missing width/height is split between the two opposite edges; when the
    amount is odd, the extra pixel goes to the right/bottom edge.

    Args:
        img: image object exposing ``size`` as ``(width, height)``.
        expected_size: target side length in pixels (a single number).

    Returns:
        The result of ``ImageOps.expand`` applied with the computed border.
    """
    current_width, current_height = img.size[0], img.size[1]
    extra_width = expected_size - current_width
    extra_height = expected_size - current_height
    left = extra_width // 2
    top = extra_height // 2
    border = (left, top, extra_width - left, extra_height - top)
    return ImageOps.expand(img, border)

# https://stackoverflow.com/questions/43391205/add-padding-to-images-to-get-them-into-the-same-shape
def resize_with_padding(img, expected_size):
    """Fit ``img`` inside ``expected_size`` and pad it up to exactly that size.

    ``img.thumbnail`` is called first (it modifies ``img`` itself), then the
    remaining gap on each axis is filled with a border split between the two
    opposite edges; an odd leftover pixel goes to the right/bottom edge.

    Args:
        img: image to resize; it is modified by the ``thumbnail`` call.
        expected_size: ``(width, height)`` of the desired output.

    Returns:
        The result of ``ImageOps.expand`` applied with the computed border.
    """
    target_width, target_height = expected_size[0], expected_size[1]
    img.thumbnail((target_width, target_height))
    gap_width = target_width - img.size[0]
    gap_height = target_height - img.size[1]
    left, top = gap_width // 2, gap_height // 2
    border = (left, top, gap_width - left, gap_height - top)
    return ImageOps.expand(img, border)



class ImagesDataset(Dataset):
    """Dataset of (image tensor, score tensor) pairs backed by a labels table.

    Each sample is built from one row of ``labels``: the first column is the
    image id (file name without ``.jpg``) and the ``'Pawpularity'`` column is
    the regression target. Built samples are memoised in ``cached_items_``.

    Args:
        labels: DataFrame with the image id in column 0 and a ``'Pawpularity'``
            column. A copy is stored, so later edits by the caller have no effect.
        images_dir: directory containing the ``<id>.jpg`` files.
        img_size: side length of the square output image; ``None`` (default)
            means "use the notebook-level ``IMG_SIZE`` at load time".
    """

    def __init__(self, labels,
                 images_dir='/kaggle/input/petfinder-pawpularity-score/train/',
                 img_size=None):
        self.labels_ = labels.copy()
        self.images_dir_ = images_dir
        self.img_size_ = img_size
        self.cached_items_ = {}

    def cache_all(self):
        """Eagerly build and cache every sample, showing a progress bar."""
        print("[ImagesDataset] caching all samples")
        for i in trange(len(self.labels_)):
            self.cached_items_[i] = self._compute_for(i)

    def __len__(self):
        return len(self.labels_)

    def _compute_for(self, idx):
        """Load, resize/pad and tensorise the sample at positional row ``idx``."""
        # Both lookups are positional so they always refer to the same row,
        # even if the DataFrame index is not 0..n-1.
        img_id = self.labels_.iat[idx, 0]
        pawpularity = self.labels_['Pawpularity'].iat[idx]
        img_filename = os.path.join(self.images_dir_, img_id + '.jpg')
        side = IMG_SIZE if self.img_size_ is None else self.img_size_
        # The context manager closes the underlying file; convert('RGB')
        # returns an independent 3-channel copy that outlives the `with` block.
        with Image.open(img_filename) as raw:
            padded = resize_with_padding(raw.convert('RGB'), (side, side))
        image = torchvision.transforms.ToTensor()(np.asarray(padded))
        image = torchvision.transforms.functional.convert_image_dtype(image, dtype=torch.float)
        return image, torch.as_tensor([pawpularity], dtype=torch.float)

    def __getitem__(self, idx):
        """Return the cached sample for ``idx``, building it on first access."""
        if idx not in self.cached_items_:
            self.cached_items_[idx] = self._compute_for(idx)
        return self.cached_items_[idx]


class DatasetWithTransforms(Dataset):
    """View of a dataset in which every sample appears once per transform, plus once unchanged.

    With ``k`` transforms, indices ``j*(k+1) .. j*(k+1)+k`` all map to sample
    ``j`` of the wrapped dataset: the first ``k`` apply ``transforms[0..k-1]``
    to the input and the last one returns the input as-is. Targets are never
    transformed.
    """

    def __init__(self, dataset, transforms):
        self.dataset_, self.transforms_ = dataset, transforms

    def __len__(self):
        variants_per_sample = len(self.transforms_) + 1
        return variants_per_sample * len(self.dataset_)

    def __getitem__(self, idx):
        variants_per_sample = len(self.transforms_) + 1
        variant = idx % variants_per_sample
        x, y = self.dataset_[idx // variants_per_sample]
        if variant < len(self.transforms_):
            return self.transforms_[variant](x), y
        # The last variant of every sample is the untransformed original.
        return x, y


def create_datasets(val_split=0.2, do_cache=True, transforms=None, seed=None):
    """Split the labelled data at random into a training and a validation dataset.

    Args:
        val_split: fraction (0..1) of the rows assigned to validation.
        do_cache: when true, every sample of both datasets is loaded up front.
        transforms: callables applied to training inputs by
            ``DatasetWithTransforms``; ``None`` means no transforms.
        seed: optional seed for a reproducible split. ``None`` (default) keeps
            using the global ``random`` state.

    Returns:
        A pair ``(train_dataset, val_dataset)``; the training set is wrapped in
        ``DatasetWithTransforms``, the validation set is a plain ``ImagesDataset``.

    Raises:
        ValueError: if ``val_split`` is outside ``[0, 1]``.
    """
    if not 0.0 <= val_split <= 1.0:
        raise ValueError('val_split must be within [0, 1], got %r' % (val_split,))
    # A fresh list per call: a shared mutable default would leak edits made on
    # one returned dataset's transform list into later calls.
    if transforms is None:
        transforms = []

    labels = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')

    all_idxs = list(range(len(labels)))
    if seed is None:
        random.shuffle(all_idxs)
    else:
        random.Random(seed).shuffle(all_idxs)
    num_val_samples = int(val_split * len(all_idxs))
    val_idxs = all_idxs[:num_val_samples]
    train_idxs = all_idxs[num_val_samples:]
    # Assert that the two subsets are distinct.
    assert set(val_idxs).isdisjoint(train_idxs)

    # reset_index gives each subset its own 0..n-1 index.
    train_dataset = DatasetWithTransforms(
        ImagesDataset(labels.iloc[train_idxs].reset_index(drop=True)),
        transforms
    )
    val_dataset = ImagesDataset(labels.iloc[val_idxs].reset_index(drop=True))
    if do_cache:
        train_dataset.dataset_.cache_all()
        val_dataset.cache_all()
    return train_dataset, val_dataset


# Augmentations applied to training inputs; each sample is served once per
# entry below and once unmodified.
transforms = [
    torchvision.transforms.ColorJitter(brightness=.5, hue=.3),
    torchvision.transforms.RandomHorizontalFlip(p=1),
    torchvision.transforms.RandomAffine((-90, 90)),
    torchvision.transforms.functional.autocontrast,
]

train_dataset, val_dataset = create_datasets(
    val_split=VAL_SPLIT,
    do_cache=True,
    transforms=transforms,
)
print(len(train_dataset), len(val_dataset))

# Preview the first four training items: with four transforms these are the
# four augmented variants of the first training image.
for preview_idx in range(4):
    show(train_dataset[preview_idx][0])

# Train

In [None]:
from tqdm.notebook import trange, tqdm


trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)
# Built once (not per epoch) and without shuffling: sample order does not
# affect evaluation. Skipped entirely when there is nothing to validate on.
valloader = None
if VAL_SPLIT != 0. and len(val_dataset) > 0:
    valloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                            shuffle=False, num_workers=2)

SHOW_DISTR_EVERY_N = 15
MAX_ITERATIONS = 10000
SMOOTHING_WINDOW = 40  # number of recent batches averaged for the running metrics


criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Use the GPU when present, otherwise fall back to the CPU instead of crashing.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

running_loss = []
running_rmse = []

# Identity set of the classifier's parameters, used to decide what to freeze.
classifier_param_ids = {id(p) for p in model.classifier.parameters()}

for epoch in range(N_EPOCHS):  # loop over the dataset multiple times

    # Training
    classifier_only = epoch < TRAIN_ONLY_CLASSIFIER_FOR_EPOCHS
    # eval() alone only changes dropout/batch-norm behaviour; the optimizer
    # would still update every weight. Turning off requires_grad on the
    # non-classifier parameters is what actually keeps them fixed.
    for param in model.parameters():
        param.requires_grad_(not classifier_only or id(param) in classifier_param_ids)
    if classifier_only:
        model.eval()  # Backbone in inference mode (batch-norm statistics stay fixed).
        model.classifier.train()
    else:
        model.train()

    t = tqdm(enumerate(trainloader, 0), total=min(len(trainloader), MAX_ITERATIONS))
    for i, data in t:
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # The criterion is MSE, so the batch RMSE is simply its square root.
        rmse = torch.sqrt(loss.detach())

        # print statistics
        running_loss.append(loss.item())
        running_rmse.append(rmse.item())
        t.set_description('[TRAIN] [%d, %5d] loss: %.3f avg loss: %.3f rmse: %.3f avg rmse: %.3f' %
              (epoch + 1, i + 1, loss.item(), np.mean(running_loss[-SMOOTHING_WINDOW:]),
               rmse.item(), np.mean(running_rmse[-SMOOTHING_WINDOW:])))
        if i + 1 == MAX_ITERATIONS:
            break

    # Raw per-batch RMSE plus a trailing moving average. The window includes
    # the current point, so it is never empty (an empty window yields NaN).
    plt.plot(running_rmse, label='batch RMSE')
    plt.plot([np.mean(running_rmse[max(0, x - SMOOTHING_WINDOW + 1):x + 1])
              for x in range(len(running_rmse))],
             label='moving average (%d batches)' % SMOOTHING_WINDOW)
    plt.xlabel('training batch')
    plt.ylabel('RMSE')
    plt.legend()
    plt.title('Training - epoch %d/%d' % (epoch + 1, N_EPOCHS))
    plt.show()

    # Validation
    if valloader is None:
        continue
    model.eval()

    val_loss_all = []
    val_rmse_all = []
    val_squared_error_sum = 0.0
    val_sample_count = 0
    t = tqdm(enumerate(valloader, 0), total=len(valloader))
    # No gradients are needed for evaluation; this avoids building the graph.
    with torch.no_grad():
        for i, data in t:
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            rmse = torch.sqrt(loss)

            val_loss_all.append(loss.item())
            val_rmse_all.append(rmse.item())
            # MSELoss averages over the batch; undo that to accumulate the
            # exact sum of squared errors over the whole validation set.
            val_squared_error_sum += loss.item() * labels.numel()
            val_sample_count += labels.numel()
            t.set_description('[VAL] [%d, %5d] loss: %.3f avg loss: %.3f rmse: %.3f avg rmse: %.3f' %
                  (epoch + 1, i + 1, loss.item(), np.mean(val_loss_all),
                   rmse.item(), np.mean(val_rmse_all)))
    # The mean of per-batch RMSEs is not the RMSE of the whole set, so report
    # the dataset-level figure as well.
    print('[VAL] epoch %d dataset RMSE: %.3f' %
          (epoch + 1, (val_squared_error_sum / val_sample_count) ** 0.5))

# Generate submissions


In [None]:
with open('/kaggle/input/petfinder-pawpularity-score/test.csv', 'r') as labels_file:
    test_labels = pd.read_csv(labels_file)
print(test_labels)

predictions = []
for id in test_labels['Id']:
    img_filename = os.path.join('/kaggle/input/petfinder-pawpularity-score/test/', id + '.jpg')
    image = torchvision.transforms.ToTensor()(np.asarray(Image.open(img_filename).resize((IMG_SIZE, IMG_SIZE))))
    image = torchvision.transforms.functional.convert_image_dtype(image, dtype=torch.float)
    # show(image)
    image = torch.reshape(image, (1, 3, IMG_SIZE, IMG_SIZE)).cuda()
    pawpularity = model(image)
    predictions.append(min(100, pawpularity.item()))
    # print(pawpularity)

test_labels['Pawpularity'] = predictions
print(test_labels[['Id', 'Pawpularity']])
test_labels[['Id', 'Pawpularity']].to_csv('submission.csv', index=False)
!cat submission.csv
# 
# pawpularity = self.labels_.at[idx, 'Pawpularity']