# Pawpularity Prediction

## Group Name: Noob 

## 1. Load Metadata

In [None]:
import os
import numpy as np
import pandas as pd

# Path template for the competition files; the `{:s}` placeholder is filled
# with a file name through `str.format`.
data_path = '/kaggle/input/petfinder-pawpularity-score/{:s}'

In [None]:
# Read the training metadata table and index its rows by the 'Id' column.
train_metadata = pd.read_csv(data_path.format("train.csv")).set_index('Id')

The training metadata contains 9912 samples, each with 12 features and 1 Pawpularity label.

## 2. Preparation

First, load the PyTorch library.

In [None]:
import torch
import torchvision

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchvision import datasets, models, transforms
from torchvision.datasets import ImageFolder

from torch.utils.data import DataLoader,Dataset,ConcatDataset
from torchvision.utils import make_grid

import time
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Report the library versions available in this environment.
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

Set the global parameters.

In [None]:
# Number of samples per batch produced by the training DataLoader.
BATCH_SIZE=32
# Directory that holds the training images.
path='/kaggle/input/petfinder-pawpularity-score/train'
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Define the transform to convert the image file to input Tensor for the PyTorch model.

In [None]:
# Image preprocessing pipeline: resize, center-crop to 240, convert to a
# tensor, then normalize each channel with the given mean / std values.
# NOTE(review): the mean/std look like the usual ImageNet statistics — confirm
# they match what the pretrained backbone expects.
# NOTE: the name `transfrom` is a misspelling of "transform"; it is kept
# because other cells reference the pipeline under this name.
transfrom = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(240),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

Define the dataset class for model input.

In [None]:
from PIL import Image
from torchvision.datasets import VisionDataset

class PawpularityDataset(VisionDataset):
    """Pet images paired with their metadata features and Pawpularity score.

    Args:
        root_dir: Directory containing the ``<Id>.jpg`` image files.
        df: Metadata frame indexed by image Id. It must contain a
            ``Pawpularity`` column; every other column is treated as a
            metadata feature.
        transforms: Optional callable applied to the loaded PIL image only
            (the metadata and target are returned unchanged).
    """

    def __init__(self, root_dir, df, transforms=None):
        # Initialise the VisionDataset base class so that the attributes it
        # relies on (e.g. ``root``, read by ``repr()``) exist.
        super().__init__(root_dir)
        self.root_dir = root_dir
        self.df = df
        self.file_names = df.index
        self.targets = df['Pawpularity'].values
        self.meta = df.drop(columns=['Pawpularity']).values
        # Assigned after the base initialiser, which sets ``transforms`` itself.
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(image, metadata_row, target)`` for the sample at *index*."""
        img_path = os.path.join(self.root_dir, self.file_names[index] + '.jpg')
        with open(img_path, 'rb') as f:
            # Convert while the file is still open so the pixel data is read
            # before the handle is closed.
            img = Image.open(f).convert('RGB')

        if self.transforms is not None:
            img = self.transforms(img)

        return img, self.meta[index, :], self.targets[index]

Load the training dataset.

In [None]:
# Build the training dataset from the image directory, the metadata table and
# the preprocessing pipeline.
train_dataset = PawpularityDataset(path, train_metadata, transforms=transfrom)

Load the pretrained model. Because Internet access is unavailable in Kaggle submission notebooks, we pre-download the pretrained model and load it from the notebook's input directory.

In [None]:
# Load the pre-saved TorchScript model. `map_location` remaps tensors that
# were saved on a different device (e.g. a GPU archive opened on a CPU-only
# machine), so loading does not fail before the model can be moved.
model = torch.jit.load('../input/effcient/eff_b1.pth', map_location=device)
model = model.to(device)

Here is the code for downloading the pretrained model.

After downloading the original model, we replace the last layer with `nn.Identity` to obtain the feature vector.

In [None]:
# model = torch.load('/kaggle/input/effinetb0/EfficientNet_BaseModel')
# for param in model.parameters():
#     param.requires_grad = False
# model.classifier = nn.Identity()
# model = model.to(device)

## 3. Feature Extraction

Here, we input the training dataset to the model. The output of the model is the feature vectors.

*This cell takes about 3 min to generate all features of the training set.*

In [None]:
from tqdm import tqdm

# Keep the original sample order (shuffle=False) so that the extracted feature
# rows line up with the rows of the metadata table.
train_loader=DataLoader(train_dataset, batch_size=BATCH_SIZE,shuffle=False)
model.eval()

feature, label = [], []
# Inference only: disabling autograd avoids building a computation graph for
# every batch, and moving each output to the CPU keeps GPU memory from growing
# with the size of the dataset.
with torch.no_grad():
    for image, _, score in tqdm(train_loader):
        feature.append(model(image.to(device)).cpu())
        label.append(score)
  

To perform the regression, we define a grid-search wrapper class that can be used with any estimator from the `scikit-learn` library.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

class TradModelEstimator(object):
    '''
    Grid search for different models.

    Wraps an estimator class in a 5-fold ``GridSearchCV`` that scores by RMSE,
    MSE and MAE and refits the best model according to RMSE.

    Args:
        model: Estimator class (not an instance) to instantiate and search over.
        param: Mapping from parameter name to a sequence of candidate values.
            Entries with more than one candidate are grid-searched; entries
            with exactly one candidate are passed to the estimator's
            constructor as fixed parameters. ``None`` means no parameters.

    Attributes:
        Estimator: The estimator instance built with the fixed parameters.
        GSCV: The configured ``GridSearchCV`` object.
    '''
    def __init__(self, model=Ridge, param=None) -> None:
        super().__init__()
        # Avoid a shared mutable default: build a fresh dict per call.
        param = {} if param is None else param
        fixed_param = {}
        search_grid = {}
        for key, value in param.items():
            if len(value) > 1:
                search_grid[key] = value
            else:
                fixed_param[key] = value[0]
        self.Estimator = model(**fixed_param)
        self.GSCV = GridSearchCV(self.Estimator, param_grid=search_grid,
                                 n_jobs=-1, verbose=0, cv=5, refit='neg_root_mean_squared_error',
                                 scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error',
                                          'neg_mean_absolute_error'])

    def grid_search(self, trainX, trainY) -> dict:
        '''
        Fit the grid search on ``(trainX, trainY)`` and return ``cv_results_``.
        '''
        self.GSCV.fit(trainX, trainY)
        return self.GSCV.cv_results_


Here we use the simple **ridge regression** model to perform regression.

In [None]:
# Seed passed to the estimator as `random_state`.
SEED = 2021
# Hyper-parameter candidates, keyed by estimator class name.
# Every value is a sequence: TradModelEstimator grid-searches the entries that
# have more than one candidate and fixes single-element entries on the
# estimator itself.
PARAM = {
    'Ridge': {
        "alpha": np.logspace(0, 7, 8),  # 8 log-spaced values from 1e0 to 1e7
        "random_state": [SEED],
        "tol": [1e-1]
    }
}

The cross-validation result is shown below:

In [None]:
MODEL = Ridge
Est = TradModelEstimator(MODEL, PARAM[MODEL.__name__])

# Convert the stacked image features to a plain NumPy array explicitly;
# detach() makes the conversion safe even if the tensors track gradients.
image_feature = torch.vstack(feature).detach().cpu().numpy()

# Select columns by name (as the dataset class does) instead of relying on
# 'Pawpularity' being the last column of the metadata table.
feature = np.hstack([train_metadata.drop(columns=['Pawpularity']).to_numpy(), image_feature])
label = train_metadata['Pawpularity'].to_numpy()

# Samples whose score is 100 are excluded from fitting; build the mask once
# and apply it to both arrays so they stay aligned.
keep = label < 100
feature = feature[keep, :]
label = label[keep]

# `prediction` holds the cross-validation results dictionary of the search.
prediction = Est.grid_search(feature, label)
# The RMSE scorer is negated, so flip the sign to report a positive RMSE.
print('RMSE: {:.6f}'.format(-Est.GSCV.best_score_))

## Generate Result for Test Set

Here we set the global parameters for test set.

In [None]:
# Number of samples per batch produced by the test DataLoader.
BATCH_SIZE_test=32
# Directory that holds the test images.
path_test='/kaggle/input/petfinder-pawpularity-score/test'
# Device for the test stage: first CUDA GPU when available, otherwise the CPU.
# NOTE(review): the visible test cells move data to `device`, not
# `device_test` — confirm whether this variable is still needed.
device_test = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Here we redefine the dataset class for test set, which doesn't contain the label data.

In [None]:
class PawpularityTestDataset(VisionDataset):
    """Unlabelled pet images for inference.

    Args:
        root_dir: Directory containing the ``<Id>.jpg`` image files.
        df: Metadata frame indexed by image Id; only its index and length
            are used here.
        transforms: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, root_dir, df, transforms=None):
        # Initialise the VisionDataset base class so that the attributes it
        # relies on (e.g. ``root``, read by ``repr()``) exist.
        super().__init__(root_dir)
        self.root_dir = root_dir
        self.df = df
        self.file_names = df.index
        # Assigned after the base initialiser, which sets ``transforms`` itself.
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the (optionally transformed) image for the sample at *index*."""
        img_path = os.path.join(self.root_dir, self.file_names[index] + '.jpg')
        with open(img_path, 'rb') as f:
            # Convert while the file is still open so the pixel data is read
            # before the handle is closed.
            img = Image.open(f).convert('RGB')

        if self.transforms is not None:
            img = self.transforms(img)

        return img

Here we load the test metadata.

In [None]:
# Read the test metadata table and index its rows by the 'Id' column.
test_metadata = pd.read_csv(data_path.format("test.csv")).set_index('Id')

Load the test dataset.

In [None]:
# Build the test dataset, reusing the same preprocessing pipeline as training.
test_dataset = PawpularityTestDataset(path_test, test_metadata, transforms=transfrom)

Use the pretrained model to generate feature vectors.

In [None]:
# Keep the original sample order (shuffle=False) so that the extracted feature
# rows line up with the rows of the test metadata table.
test_loader=DataLoader(test_dataset, batch_size=BATCH_SIZE_test,shuffle=False)
model.eval()

feature_test = []
# Inference only: disabling autograd avoids building a computation graph for
# every batch, and moving each output to the CPU keeps GPU memory from growing
# with the size of the dataset.
with torch.no_grad():
    for data in tqdm(test_loader):
        # Inputs go to `device`, the device the model was moved to.
        feature_test.append(model(data.to(device)).cpu())

In [None]:
# Convert the stacked image features to a plain NumPy array explicitly;
# detach() makes the conversion safe even if the tensors track gradients.
image_feature_test = torch.vstack(feature_test).detach().cpu().numpy()
# Same column layout as used for fitting: metadata features first, then the
# image features.
y_pred = Est.GSCV.best_estimator_.predict(np.hstack([test_metadata.to_numpy(), image_feature_test]))

Use the best regressor in cross-validation to generate the predicted Pawpularity score.

In [None]:
# Pair every test Id with its predicted score.
submission = pd.DataFrame({'Id': test_metadata.index, 'Pawpularity': y_pred})

Output the result to file.

In [None]:
# Write the predictions to CSV without the DataFrame's positional index.
submission.to_csv("submission.csv", index=False)