In [None]:
import sys
sys.path.append("../input/timmmaster/")



import os
import math
import cv2
import timm
import torch
import random
import numpy as np
import pandas as pd
import torch.optim as optim
import albumentations
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from typing import List, Optional
from dataclasses import dataclass
from torch import nn
from torch.utils.data import DataLoader
from torch.utils import data as torch_data
from catalyst import dl
from catalyst.data import ToTensor
from catalyst.contrib.nn import BatchScheduler
from catalyst.utils.torch import set_optimizer_momentum
from sklearn.model_selection import StratifiedKFold, train_test_split

<img align="left" src="https://storage.googleapis.com/kaggle-competitions/kaggle/25383/logos/header.png?t=2021-08-31-18-49-29" data-canonical-src="https://storage.googleapis.com/kaggle-competitions/kaggle/25383/logos/header.png?t=2021-08-31-18-49-29" width="1350" />


1. [EDA](#1)
    * [Pawpularity](#1.1)
    * [Additional Characteristics](#1.2)
    * [Image for each characteristic](#1.3)
2. [Train](#2)
3. [Inference](#3)
4. [References](#4)

<a id="1"></a>
## 1. EDA

In [None]:
def set_seed(seed: int):
    """Seed the torch, NumPy and ``random`` global generators with ``seed``.

    Args:
        seed: Integer seed handed unchanged to each library's seeding function.
    """
    # Same order as before: torch first, then NumPy, then the stdlib generator.
    for seeder in (torch.manual_seed, np.random.seed, random.seed):
        seeder(seed)

@dataclass
class CFG:
    """Hyper-parameters and run settings shared by the cells of this notebook."""
    # Side length used for both height and width in the Resize transforms.
    image_size: int = 256
    # Probability given to the flip transform in `train_aug`.
    # NOTE(review): the name says "vflip" but `train_aug` applies a HorizontalFlip — confirm intent.
    vflip_p: float = 0.01
    # Value of the `kfold` column that is held out as the validation split.
    fold: int = 0
    # Learning rate passed to the Adam optimizer.
    lr: float = 5e-4
    # Dropout probability of the regression head built in RegModel.
    reg_dropout: float = 0.1
    # Batch size used by every DataLoader in the notebook.
    batch_size: int = 64
    # Number of training epochs; also used to size the learning-rate schedule.
    reg_epochs: int = 5
    # Seed handed to set_seed.
    seed: int = 2809
        
# Single configuration instance used by the rest of the notebook.
cfg = CFG()
# Seed the global RNGs before any sampling or model initialisation happens.
set_seed(cfg.seed)

In [None]:
# Training table with a precomputed `kfold` column.
df = pd.read_csv("../input/same-old-creating-folds/train_10folds.csv")

# Rows of the configured fold form the validation split; everything else is training data.
in_valid_fold = df.kfold == cfg.fold
df_train = df[~in_valid_fold].reset_index(drop=True)
df_valid = df[in_valid_fold].reset_index(drop=True)

# Binary metadata columns that accompany every image.
characteristics = [
    'Subject Focus',
    'Eyes',
    'Face',
    'Near',
    'Action',
    'Accessory',
    'Group',
    'Collage',
    'Human',
    'Occlusion',
    'Info',
    'Blur',
]

# Image file paths, in the same row order as the corresponding data frame.
train_img_paths = [
    f"../input/petfinder-pawpularity-score/train/{x}.jpg"
    for x in df_train["Id"].values
]
valid_img_paths = [
    f"../input/petfinder-pawpularity-score/train/{x}.jpg"
    for x in df_valid["Id"].values
]

<a id="1.1"></a>
### Pawpularity
The Pawpularity Score is derived from each pet profile's page view statistics at the listing pages, using an algorithm that normalizes the traffic data across different pages, platforms (web & mobile) and various metrics.

In [None]:
def get_center(x):
    """Return the midpoint of the most populated bin of a 128-bin histogram of ``x``.

    Args:
        x: Array-like of numeric values accepted by ``np.histogram``.

    Returns:
        The centre of the histogram bin with the highest count (the first such
        bin when several bins tie).
    """
    counts, edges = np.histogram(x, bins=128)
    peak = counts.argmax()
    lower_edge = edges[peak]
    upper_edge = edges[peak + 1]
    return (upper_edge + lower_edge) / 2

In [None]:
# Compare the Pawpularity distribution of the training and validation splits.
fig = ff.create_distplot([df_train.Pawpularity.values, df_valid.Pawpularity.values], ['Train', 'Valid'])
fig.update_layout(
    title="Pawpularity Distribution",
    xaxis_title="Pawpularity",
    yaxis_title="Density")

# CX is the centre of the most populated histogram bin over the full data set.
# LNX / RNX are the distances from CX to 0 and to 100; `norm` / `inorm` use them
# as the scales for values below and above the centre.
CX = get_center(df.Pawpularity.values)
LNX = CX
RNX = 100 - CX

# Mark the centre on the plot, reusing CX instead of recomputing the histogram.
fig.add_vline(CX, line_width=3, line_color="red")
fig.show()

In [None]:
def norm(x, center=CX, lscale=LNX, rscale=RNX):
    """Shift ``x`` by ``center`` and rescale each side of zero separately.

    Args:
        x: Array of raw scores; it is not modified (a float64 copy is used).
        center: Value subtracted from every element.
        lscale: Divisor applied to shifted values that are ``<= 0``.
        rscale: Divisor applied to values that are ``> 0`` after the first step.

    Returns:
        A float64 array with the same shape as ``x``.
    """
    shifted = x.astype(np.float64) - center

    # The left side is rescaled first; the right-side mask is evaluated afterwards.
    left = shifted <= 0
    shifted[left] = shifted[left] / lscale

    right = shifted > 0
    shifted[right] = shifted[right] / rscale
    return shifted

def inorm(x, center=CX, lscale=LNX, rscale=RNX):
    """Undo ``norm``: rescale each side of zero, then add ``center`` back.

    Args:
        x: Array of normalised scores; it is not modified (a float64 copy is used).
        center: Value added to every element after rescaling.
        lscale: Factor applied to values that are ``<= 0``.
        rscale: Factor applied to values that are ``> 0`` after the first step.

    Returns:
        A float64 array with the same shape as ``x``.
    """
    scaled = x.astype(np.float64)

    # The left side is rescaled first; the right-side mask is evaluated afterwards.
    left = scaled <= 0
    scaled[left] = scaled[left] * lscale

    right = scaled > 0
    scaled[right] = scaled[right] * rscale
    return scaled + center

In [None]:
# Same train/valid comparison as before, but on the normalised target.
normed_train = norm(df_train.Pawpularity.values)
normed_valid = norm(df_valid.Pawpularity.values)

fig = ff.create_distplot(
    [normed_train, normed_valid],
    ['Train', 'Valid'],
    bin_size=0.01,
)
fig.update_layout(
    title="Pawpularity Distribution",
    xaxis_title="Pawpularity",
    yaxis_title="Density",
)
fig.show()

In [None]:
# Columns of the correlation matrix: every metadata characteristic plus the target.
# Built from `characteristics` so the two lists cannot drift apart.
_c = characteristics + ['Pawpularity']

# Pairwise correlation of the selected columns over the full data set, drawn as a heatmap.
fig = px.imshow(df[_c].corr())
fig.update_layout(
    title="Characteristics - Pawpularity Correlation")
fig.show()

<a id="1.2"></a>
### Additional Characteristics
* Focus - Pet stands out against uncluttered background, not too close / far.
* Eyes - Both eyes are facing front or near-front, with at least 1 eye / pupil decently clear.
* Face - Decently clear face, facing front or near-front.
* Near - Single pet taking up significant portion of photo (roughly over 50% of photo width or height).
* Action - Pet in the middle of an action (e.g., jumping).
* Accessory - Accompanying physical or digital accessory / prop (i.e. toy, digital sticker), excluding collar and leash.
* Group - More than 1 pet in the photo.
* Collage - Digitally-retouched photo (i.e. with digital photo frame, combination of multiple photos).
* Human - Human in the photo.
* Occlusion - Specific undesirable objects blocking part of the pet (i.e. human, cage or fence). Note that not all blocking objects are considered occlusion.
* Info - Custom-added text or labels (i.e. pet name, description).
* Blur - Noticeably out of focus or noisy, especially for the pet’s eyes and face. For Blur entries, “Eyes” column is always set to 0.


In [None]:
# Stacked bar chart: for every characteristic, how many images have it set
# (Positive) versus not set (Negative).

# Every bar has the same width; bar positions are derived from the cumulative widths.
widths = np.array([10 for _ in characteristics])
bar_starts = np.cumsum(widths) - widths

# Column sums of the characteristic columns give the per-characteristic positive counts.
positive_counts = df[characteristics].values.sum(0)
data = {
    "Positive": positive_counts,
    "Negative": len(df) - positive_counts,
}

fig = go.Figure()
for key in data:
    fig.add_trace(go.Bar(
        name=key,
        y=data[key],
        x=bar_starts,
        width=widths,
        offset=0,
        # One row per bar: (characteristic name, count); referenced by the templates below.
        customdata=np.transpose([characteristics, data[key]]),
        texttemplate="%{customdata[0]}<br>%{customdata[1]}",
        textposition="inside",
        textangle=0,
        textfont_color="white",
        hovertemplate="<br>".join([
            "characteristic: %{customdata[0]}",
            "Total Num.: %{customdata[1]}",
        ])
    ))

# Put the tick labels at the centre of each bar.
fig.update_xaxes(
    tickvals=np.cumsum(widths) - widths / 2,
    ticktext=list(characteristics)
)

fig.update_xaxes(range=[0, sum(widths)])
fig.update_yaxes(range=[0, len(df)])

fig.update_layout(
    title_text="Characteristic Pos./Neg. Distribution",
    barmode="stack",
    uniformtext=dict(mode="hide", minsize=10),
)
# Show the figure explicitly, like the other plotting cells, instead of relying
# on the value of the last expression being rendered.
fig.show()

<a id="1.3"></a>
### Image for each characteristic

In [None]:
def _read_rgb_image(path):
    """Read the image at ``path`` with OpenCV and return it converted from BGR to RGB.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the offending path instead of inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {path}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


def visualize_img_characteristic(characteristic):
    """Show one random image with ``characteristic`` set and one without it.

    A single row is sampled from ``df`` for each label (0 first, then 1, so the
    order of random draws matches the previous implementation). The positive
    example is drawn in the left panel and the negative one in the right panel.

    Args:
        characteristic: Name of a binary column of the global ``df``.

    Raises:
        ValueError: If ``df`` has no row for one of the two labels.
        FileNotFoundError: If a sampled image cannot be read.
    """
    examples = {}
    for label in (0, 1):
        candidates = df.loc[df[characteristic] == label, 'Id']
        if candidates.empty:
            raise ValueError(
                f"No image with {characteristic} == {label} to display"
            )
        image_id = candidates.sample(1).iloc[0]
        examples[label] = _read_rgb_image(
            f"../input/petfinder-pawpularity-score/train/{image_id}.jpg"
        )

    fig, axs = plt.subplots(1, 2, figsize=(16, 5))
    # Left panel: positive example; right panel: negative example.
    for ax, label in zip(axs, (1, 0)):
        ax.imshow(examples[label])
        ax.set_title(str(label))
        ax.axis('off')
    fig.suptitle(characteristic)
    plt.show()

In [None]:
# One positive/negative example figure per characteristic column.
for characteristic_name in characteristics:
    visualize_img_characteristic(characteristic_name)

<a id="2"></a>
## 2. Train

In [None]:
class DataRetriever(torch_data.Dataset):
    """Dataset that yields an image tensor and its normalised target.

    Each item is a dict with the keys ``"features"`` (float32 image tensor with
    the last axis of the loaded image moved to the front) and ``"targets"``
    (float scalar tensor).
    """

    def __init__(self, image_paths: List[str], targets: np.ndarray,
                 augmentations: Optional[albumentations.Compose] = None):
        """Store the paths and augmentations and normalise the targets.

        Args:
            image_paths: Paths of the image files, one per sample.
            targets: Raw target values aligned with ``image_paths``; they are
                passed through ``norm`` once, here.
            augmentations: Pipeline called as ``augmentations(image=...)``;
                ``None`` disables augmentation.

        Raises:
            ValueError: If ``image_paths`` and ``targets`` differ in length.
        """
        if len(image_paths) != len(targets):
            raise ValueError(
                f"Got {len(image_paths)} image paths but {len(targets)} targets"
            )
        self.image_paths = image_paths
        self.targets = norm(targets)
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, index):
        """Load, augment and tensorise the sample at ``index``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        path = self.image_paths[index]
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports failure by returning None rather than raising;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations is not None:
            image = self.augmentations(image=image)["image"]

        # Move the last axis to the front and convert to float32.
        image = torch.tensor(np.transpose(image, (2, 0, 1)).astype(np.float32), dtype=torch.float)

        y = torch.tensor(self.targets[index], dtype=torch.float)
        return {"features": image, "targets": y}

In [None]:
class RegModel(nn.Module):
    """Wrap a backbone and replace its ``classifier`` with a one-output regression head."""

    def __init__(self, base_model: nn.Module, config: CFG):
        """Swap the backbone's classifier for dropout followed by a bias-free linear layer.

        Args:
            base_model: Module exposing a ``classifier`` attribute with ``in_features``.
            config: Configuration providing ``reg_dropout``.
        """
        super().__init__()
        # Read the input width before the original classifier is replaced.
        head_inputs = base_model.classifier.in_features
        base_model.classifier = nn.Sequential(
            nn.Dropout(config.reg_dropout),
            nn.Linear(in_features=head_inputs, out_features=1, bias=False),
        )
        self.model = base_model

    def forward(self, x):
        """Run the backbone and return column 0 of its output (one value per sample)."""
        return self.model(x)[:, 0]

In [None]:
class CosineHardRestartWarmupBatchShedulerWrapper(BatchScheduler):
    """Per-batch LR schedule: linear warmup, then cosine cycles with hard restarts.

    The learning-rate multiplier rises linearly from 0 to 1 during the warmup
    steps, then follows ``num_cycles`` cosine half-waves (each restarting at 1)
    over the remaining steps, and is 0 once the training steps are used up.
    On top of that, the multiplier is scaled by ``gamma`` once per cycle length.
    """

    def __init__(self, optimizer: optim.Optimizer,
                       num_warmup_steps: int,
                       num_training_steps: int,
                       num_cycles: int = 1,
                       last_itter: int = -1,
                       gamma: float = 0.9,
                       verbose: bool = False):
        """Store the schedule parameters and initialise the base scheduler.

        Args:
            optimizer: Optimizer whose learning rates are scheduled.
            num_warmup_steps: Number of linear warmup steps.
            num_training_steps: Total number of steps, warmup included.
            num_cycles: Number of cosine cycles after the warmup.
            last_itter: Forwarded to the base scheduler as its last-epoch index.
            gamma: Factor applied once per cycle length of elapsed steps.
            verbose: Forwarded to the base scheduler.
        """
        self.__num_warmup_steps = num_warmup_steps
        self.__num_training_steps = num_training_steps
        self.__num_cycles = num_cycles
        self.__gamma = gamma
        # Length of one cosine cycle in steps (despite the attribute name).
        # With num_cycles <= 0 (e.g. a single-epoch run configured with
        # `epochs - 1` cycles) the whole post-warmup span is one cycle; this
        # avoids the ZeroDivisionError the plain `// num_cycles` raised.
        post_warmup_steps = num_training_steps - num_warmup_steps
        if num_cycles > 0:
            self.__steps_per_epoch = post_warmup_steps // num_cycles
        else:
            self.__steps_per_epoch = post_warmup_steps
        self.total_groups = len(optimizer.param_groups)
        # Attributes must be set before this call: the attributes above are
        # read by get_lr, which may be invoked during base initialisation.
        super().__init__(optimizer, last_itter, verbose)

    def get_momentum(self):
        """Return no momentum values.

        The previous ``[] * self.total_groups`` always evaluated to ``[]``;
        the empty list is now returned explicitly.
        NOTE(review): presumably the base class applies one momentum per
        returned entry, so an empty list leaves momentum untouched — confirm.
        """
        return []

    def get_lr(self):
        """Return the learning rate of every parameter group for the current step."""
        step = self._step_count
        if self.__steps_per_epoch > 0:
            decay = self.__gamma ** (step // self.__steps_per_epoch)
        else:
            # Degenerate schedule (no post-warmup steps per cycle): skip the
            # gamma decay instead of dividing by zero.
            decay = 1.0
        shape = self._form_function(step)
        return [lr * shape * decay for lr in self.base_lrs]

    def _form_function(self, count):
        """Return the warmup/cosine multiplier for step ``count``."""
        if count < self.__num_warmup_steps:
            # Linear warmup from 0 towards 1.
            return float(count) / float(max(1, self.__num_warmup_steps))
        progress = float(count - self.__num_warmup_steps) / float(max(1, self.__num_training_steps - self.__num_warmup_steps))

        if progress >= 1.0:
            return 0
        # `% 1.0` restarts the cosine at the beginning of every cycle.
        return max(0, 0.5 * (1.0 + math.cos(math.pi * ((float(self.__num_cycles) * progress) % 1.0))))

In [None]:
# Training pipeline: resize to a square, occasionally flip horizontally, normalise.
train_aug = albumentations.Compose(
    [
        albumentations.Resize(cfg.image_size, cfg.image_size, p=1),
        # Use the public top-level name rather than the internal
        # `augmentations.transforms` module path.
        # NOTE(review): the probability comes from `cfg.vflip_p`, whose name
        # suggests a vertical flip — confirm which flip is intended.
        albumentations.HorizontalFlip(p=cfg.vflip_p),
        albumentations.Normalize(),
    ],
    p=1.0,
)

# Validation / inference pipeline: deterministic resize and normalise only.
valid_aug = albumentations.Compose(
    [
        albumentations.Resize(cfg.image_size, cfg.image_size, p=1),
        albumentations.Normalize(),
    ],
    p=1.0,
)

In [None]:
# Datasets pair each image path with its Pawpularity score and augmentation pipeline.
tr_dataset = DataRetriever(train_img_paths, df_train.loc[:, 'Pawpularity'].values, train_aug)
vl_dataset = DataRetriever(valid_img_paths, df_valid.loc[:, 'Pawpularity'].values, valid_aug)

# Shuffle the training data every epoch so batches are not always made of the
# same samples in the same order; validation keeps its fixed order.
tr_loader = DataLoader(tr_dataset, batch_size=cfg.batch_size, num_workers=8, shuffle=True)
vl_loader = DataLoader(vl_dataset, batch_size=cfg.batch_size, num_workers=8)
loaders = {"train": tr_loader, "valid": vl_loader}

In [None]:
# Build the backbone without downloading weights, then load them from a local file.
base_model = timm.create_model("tf_efficientnet_b0_ns", pretrained=False, in_chans=3)
base_model.load_state_dict(torch.load('../input/timms-effb0/tf_efficientnet_b0_ns-c0e6a31c.pth'))

# Regression model, MSE loss (targets are normalised inside DataRetriever) and Adam.
reg_model = RegModel(base_model, cfg)
criterion = nn.MSELoss()
optimizer = optim.Adam(reg_model.parameters(), lr=cfg.lr)
# Warm up for one pass over the training loader, then use one cosine cycle
# for each of the remaining epochs.
scheduler = CosineHardRestartWarmupBatchShedulerWrapper(
                                    optimizer, 
                                    num_warmup_steps=len(tr_loader), 
                                    num_training_steps=len(tr_loader) * cfg.reg_epochs,
                                    num_cycles=cfg.reg_epochs - 1
)

# Report the number of trainable parameter elements.
model_parameters = filter(lambda p: p.requires_grad, reg_model.parameters())
print("Total N params",sum([np.prod(p.size()) for p in model_parameters]))

runner = dl.SupervisedRunner()
# model training
# Logs and checkpoints go under ./logs_reg; the validation loss (lower is
# better) is the metric used to compare epochs.
runner.train(
    model=reg_model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    num_epochs=cfg.reg_epochs,
    logdir="./logs_reg",
    valid_loader="valid",
    valid_metric="loss",
    minimize_valid_metric=True,
    verbose=True
)

In [None]:
# Per-epoch metric logs written under the training log directory.
train_metrics = pd.read_csv('./logs_reg/logs/train.csv')
valid_metrics = pd.read_csv('./logs_reg/logs/valid.csv')

# The logged loss is an MSE, so its square root is plotted as RMSE.
rmse_history = pd.DataFrame({
    'Epoch': train_metrics.step,
    'Valid RMSE': np.sqrt(valid_metrics.loss.values),
    'Train RMSE': np.sqrt(train_metrics.loss.values),
})

fig = px.line(
    rmse_history,
    x='Epoch',
    y=['Valid RMSE', 'Train RMSE'],
    title='Train/Valid RMSE',
)
fig.show()

In [None]:
# Restore the model weights stored under 'model_state_dict' in the best_full checkpoint.
reg_model.load_state_dict(torch.load('logs_reg/checkpoints/best_full.pth')['model_state_dict'])

In [None]:
# Predict on the validation split and compare predicted vs. true score distributions.
reg_model.eval()
valid_pred = []
valid_target = []

# Run on whatever device the model currently lives on instead of assuming 'cuda',
# so the cell also works on a CPU-only machine.
device = next(reg_model.parameters()).device

with torch.no_grad():
    for batch in tqdm(vl_loader):
        valid_pred.extend(reg_model(batch['features'].to(device)).cpu().numpy().tolist())
        valid_target.extend(batch['targets'].numpy().tolist())

# Map predictions and targets back from the normalised scale before plotting.
fig = ff.create_distplot([inorm(np.array(valid_pred)), inorm(np.array(valid_target))], ['Predicted', 'Target'])
fig.update_layout(
    title="Pawpularity Distribution",
    xaxis_title="Pawpularity",
    yaxis_title="Density")
fig.show()

In [None]:
# Predict on the training split and compare predicted vs. true score distributions.
reg_model.eval()
pred = []
target = []

# Run on whatever device the model currently lives on instead of assuming 'cuda',
# so the cell also works on a CPU-only machine.
device = next(reg_model.parameters()).device

with torch.no_grad():
    # Predictions and targets are collected from the same batches, so they
    # stay aligned whatever order the loader yields samples in.
    for batch in tqdm(tr_loader):
        pred.extend(reg_model(batch['features'].to(device)).cpu().numpy().tolist())
        target.extend(batch['targets'].numpy().tolist())

# Map predictions and targets back from the normalised scale before plotting.
fig = ff.create_distplot([inorm(np.array(pred)), inorm(np.array(target))], ['Predicted', 'Target'])
fig.update_layout(
    title="Pawpularity Distribution",
    xaxis_title="Pawpularity",
    yaxis_title="Density")
fig.show()

<a id="3"></a>
## 3. Inference

In [None]:
# Test metadata and the matching image paths.
df_test = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")
test_img_paths = [f"../input/petfinder-pawpularity-score/test/{x}.jpg" for x in df_test["Id"].values]

# The dataset requires targets; the test set has none, so placeholders are
# passed and never read back.
test_dataset = DataRetriever(
    image_paths=test_img_paths,
    targets=np.ones(len(test_img_paths)),
    augmentations=valid_aug,
)

reg_model.eval()
final_test_predictions = []

# Run on whatever device the model currently lives on instead of assuming 'cuda',
# so the cell also works on a CPU-only machine.
device = next(reg_model.parameters()).device

with torch.no_grad():
    for batch in tqdm(DataLoader(test_dataset, batch_size=cfg.batch_size, num_workers=8)):
        final_test_predictions.extend(reg_model(batch['features'].to(device)).cpu().numpy().tolist())

# Map back from the normalised scale and clamp to the [0, 100] score range.
final_test_predictions = inorm(np.array(final_test_predictions))
final_test_predictions[final_test_predictions < 0] = 0
final_test_predictions[final_test_predictions > 100] = 100

# Write the submission file with just the id and the predicted score.
df_test["Pawpularity"] = final_test_predictions
df_test = df_test[["Id", "Pawpularity"]]
df_test.to_csv("submission.csv", index=False)

<a id="4"></a>
## 4. References
* [same old creating folds](https://www.kaggle.com/abhishek/same-old-creating-folds)