In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [None]:
from fastai.vision.all import *
from fastai.text.all import *

from pathlib import Path

import PIL

## Data PreProcessing

Set the path using pathlib library and get all file names.
Then we can use the list to read other files

In [None]:
# Root folder of the competition data; later cells list it and build file/image paths from it.
data_path = Path('../input/petfinder-pawpularity-score')

In [None]:
# List the entries of the dataset folder and display them.
# NOTE(review): later cells pick entries out of this list by position, so they
# depend on the order ls() returns — confirm that order is stable.
files = data_path.ls()
files

**Read the CSV files**

Read the CSV files found in the listing above and take a look at their structure.

In [None]:
# Load each table by its file name rather than by its position in `files`:
# the order of a directory listing is not guaranteed, so positional indexing
# can silently assign the wrong CSV (or a folder) to a variable.
submission_df = pd.read_csv(data_path / 'sample_submission.csv')
train_df = pd.read_csv(data_path / 'train.csv')
test_df = pd.read_csv(data_path / 'test.csv')

`train_df` holds the attributes of each image. `test_df` does not have the `Pawpularity` column (this is what we want to predict).
You can also check the submission file (there is a Pawpularity value corresponding to each image Id).

In [None]:
# Preview the first rows of the training table.
train_df.head()

There are 9912 images in total, and each image has a corresponding record in the table.

In [None]:
# Summarize the columns, dtypes and non-null counts of the training table.
train_df.info()

**Check the Images**

In [None]:
# Collect the image files found under the fifth entry of the folder listing and display them.
# NOTE(review): files[4] is selected by position, so which folder this is depends on
# the listing order — presumably one of the image folders; confirm.
image_files = get_image_files(files[4])
image_files

In [None]:
# Open a single sample image (index 21 of the collected files), print its size
# and display it as the cell output.
img = PIL.Image.open(image_files[21])

print(img.size)
img

### EDA

Using a histogram we can see how the Pawpularity score is distributed.
From the graph we can see that most pet images have a Pawpularity value between 20 and 40.

In [None]:
# Plot the distribution of the target column.
train_df.Pawpularity.hist(figsize=(10, 5))

# Report a few summary statistics of the same column, one per line.
print(
    f"The mean of the images: {train_df.Pawpularity.mean()}",
    f"The median of the images: {train_df.Pawpularity.median()}",
    f"The standard deviation of the images: {train_df.Pawpularity.std()}",
    sep="\n",
)

## Data Processing

To treat the problem as regression we have to normalize the pawpularity values between 0 and 1. So that the model can predict it.

In [None]:
# Scale the target down by 100 so that it lies between 0 and 1.
# The guard keeps the cell idempotent: once the column has been scaled its
# maximum is at most 1, so re-running the cell does not divide a second time.
if train_df['Pawpularity'].max() > 1:
    train_df['Pawpularity'] /= 100
train_df.head(2)

Since we will feed the dataframe directly into the dataloader, we have to turn the image Id column into an image path so that the images can be loaded directly.

In [None]:
# Derive the relative image path of every row from its Id.
train_df['name'] = ['train/' + image_id + '.jpg' for image_id in train_df['Id']]

# The 'Id' column is left in place (it is not dropped).

train_df.head(2)

## DataLoaders

Now we have to create an image dataloader so that we can feed the images and the targets into the model for training.

Directly create dataloader without first creating a datablock.

https://docs.fast.ai/vision.data.html#ImageDataLoaders.from_df

We can pass the dataset Path and even folder name where the images reside along with image names. 

If it is not a classification problem, then we have to specify the `y_block` argument.

In [None]:
# Build the image DataLoaders straight from the training dataframe.
dls = ImageDataLoaders.from_df(
    train_df,
    path=data_path,            # path to the dataset
    fn_col='name',             # column holding the image name
    label_col='Pawpularity',   # column holding the label
    y_block=RegressionBlock,   # type of task
    seed=1000,                 # seed for reproducibility
    bs=32,                     # batch size
    num_workers=8,
    item_tfms=Resize(224),     # transformation applied to individual items
    # transformations applied per batch
    batch_tfms=setup_aug_tfms([Brightness(), Contrast(), Hue(), Saturation()]),
)

In [None]:
# take a look if created correctly
dls.show_batch()

## Model 

Now we have to create a model for training to predict the pawpularity from images.

We will try to use the Swin model. First we have to get the pretrained weights.

Need to add 2 more datasets (Both are from kaggle):-

1. swin transformer -> containing the pretrained weights of the Swin Transformer model

    https://github.com/microsoft/Swin-Transformer (official implementation)

    https://arxiv.org/pdf/2103.14030.pdf (Research paper)

2. timm (pytorch image models) -> the library used to create the image models

    https://github.com/rwightman/pytorch-image-models  (official models)

### Import create model function from timm

We also need to import the create model function from the downloaded dataset (timm).

For importing timm we need to add the path to system. So that it allows us to import timm.

In [None]:
# add this path to system paths 
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:
from timm import create_model

In [None]:
# ?? create_model

### Swin transformer model 

We have to create a checkpoints folder and copy our pretrained swin model to that folder.

Only after this we were able to create the model using timm.

In [None]:
# create checkpoints folder
if not os.path.exists('/root/.cache/torch/hub/checkpoints/'):
    os.makedirs('/root/.cache/torch/hub/checkpoints/')
    
# copy the model to this folder
!cp '../input/swin-transformer/swin_large_patch4_window7_224_22kto1k.pth' '/root/.cache/torch/hub/checkpoints/swin_large_patch4_window7_224_22kto1k.pth'

In [None]:
# Show dls.c, which is passed below as the number of model outputs.
dls.c

We are treating it as a classification problem with 1 class and predicting a score (of cuteness) for this class.

In [None]:
# LOAD THE SWIN TRANSFORMER MODEL
model = create_model('swin_large_patch4_window7_224', 
                    pretrained=True,  # use pretrained weights
                    num_classes = dls.c
                    )

## LEARNING

Create a RMSE function for pawpularity score prediction. We will multiply by 100 to remove normalization.

In [None]:
def RMSE_pawpularity(input, target):
    """Root-mean-squared error reported on the un-normalized (x100) score scale.

    Args:
        input: raw model outputs (logits); a sigmoid is applied here.
        target: normalized targets to compare the squashed outputs against.

    Returns:
        A scalar tensor equal to 100 * sqrt(MSE(sigmoid(input), target)).
    """
    # torch.sigmoid is used instead of F.sigmoid, which emits a deprecation
    # warning on older PyTorch releases.
    preds = torch.sigmoid(input.flatten())
    # Flatten the target as well: comparing a (batch,) tensor with a (batch, 1)
    # tensor would be broadcast into a batch x batch matrix and give a wrong score.
    return 100 * torch.sqrt(F.mse_loss(preds, target.flatten()))

Create a fastai learner for training. We will use BCEWithLogitsLossFlat as the loss function. `to_fp16()` (mixed precision) is used to reduce memory requirements.

In [None]:
# Assemble the learner from the DataLoaders, the model, the loss and the
# metric, then switch it to fp16.
learn = Learner(
    dls,
    model,
    loss_func=BCEWithLogitsLossFlat(),
    metrics=RMSE_pawpularity,
).to_fp16()

In [None]:
# DONT KNOW WHY - BUT THE IT WAS TRYING TO SAVE THE MODEL IN INPUT DIRECTORY
learn.path = Path('./')

In [None]:
# find a good learning rate
learn.lr_find()

In [None]:
# Train for a single epoch with the one-cycle schedule.
learn.fit_one_cycle(n_epoch=1, lr_max=4e-5)

In [None]:
# print the journey
learn.recorder.plot_loss()

In [None]:
# Save the trained model under the name 'fine_tuned_6'.
learn.save('fine_tuned_6')

In [None]:
# Export the learner using the default file name.
learn.export()

## INFERENCE

In [None]:
# using lambda function to change each row
test_df['name'] = test_df['Id'].map(lambda image_id : 'test/' + image_id + '.jpg')

In [None]:
# Preview the first two rows of the test table, including the new 'name' column.
test_df.head(2)

In [None]:
# Show the (rows, columns) shape of the test table.
test_df.shape

**CREATE TEST DATALOADER**

I only used the 'train' folder in the image dataloader. Now I should point to the 'test' folder to get the test dataloader.

In [None]:
# Build a test DataLoader from the test dataframe, based on the training DataLoaders.
test_dl = dls.test_dl(test_df)

In [None]:
# Display a sample batch from the test DataLoader as a sanity check.
test_dl.show_batch()

**GET PREDICTIONS**

In [None]:
# Run the learner over the test DataLoader and display the result;
# the first element of `preds` is used below as the predictions.
preds = learn.get_preds(dl = test_dl)
preds

**CREATE SUBMISSION FILE**

In [None]:
# Display the sample submission table to see the expected layout.
submission_df

In [None]:
# CREAT EMPTY DATAFRAME
data = {'Id':[],
       'Pawpularity':[]}

ans = pd.DataFrame(data)

In [None]:
# ADD PREDICTIONS
ans['Id'] = test_df['Id']
ans['Pawpularity'] = preds[0].float().numpy() * 100

In [None]:
# Display the filled submission table.
ans

In [None]:
# Write the submission table to submission.csv without the index column.
ans.to_csv('submission.csv', index=False)