## Summary
* Trained model to identify breeds of dogs and cats
* Included script to classify between dog and cat
* Trained LightGBM to predict Pawpularity

I used several datasets for breed identification and preprocessed them so that labels do not repeat, then trained swin-tiny on the combined data. I also included a dataset that was collected using an API to gather information from [PetFinder.my](PetFinder.my) (*../input/cat-breeds-dataset*), but it is not clean, so I didn't use it. The dataset is slightly imbalanced, but none of the techniques I tried (a weighted sampler, per-class loss weights) helped. **F1 macro score on validation is ~0.8**. There are a lot of things that could improve the score: training a larger model, adding more augmentations, tuning hyperparameters, etc.
### References

- I used slightly modified code from https://github.com/amitrajitbose/cat-v-dog-classifier-pytorch.git to predict whether animal is a cat or dog.

In [None]:
!pip install pytorch-lightning timm python-box -U albumentations wandb > /dev/null

## Import all needed libraries

In [None]:
import sys
sys.path.append('../input/catvdogclassifier/cat-v-dog-classifier-pytorch')

from predict import ModelInference
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
from glob import glob
from pathlib import Path
from tqdm import tqdm
from box import Box
import wandb
import cv2
import os

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from albumentations.pytorch.transforms import ToTensorV2
from torchmetrics import ConfusionMatrix, Accuracy, F1, Precision, Recall
from sklearn.metrics import classification_report
import albumentations as A

from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import callbacks
import pytorch_lightning as pl

import torch
from timm import create_model
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()

## Preprocess data
The **paths** class lists the root directory of every dataset that will be used. You can remove datasets or add new ones; each attribute `<name>` needs a matching `<name>_data` loader method in `DogsCatsData`.

In [None]:
# Registry of dataset root directories.
# DogsCatsData.get_data() walks every attribute that does not start with '_'
# and calls the static method named `<attribute>_data` with the attribute
# value, so each name here must have a matching loader on DogsCatsData.
class paths:
  #cat_breeds_petfinder = Path('../input/cat-breeds-dataset')
  cat_breeds_oxford = Path('../input/the-oxfordiiit-pet-dataset')
  dog_breeds = Path('../input/dog-breeds')
  dog_breeds_kaggle = Path('../input/dog-breed-identification')
  petfinder_old = Path('../input/petfinder-adoption-prediction')

# Columns selected from every per-dataset frame before they are concatenated.
columns = ['filepath', 'breed', 'data_source']

In [None]:
# Fix the global random seed (2021) through pytorch-lightning's helper.
seed_everything(2021, workers=True)

Useful class to preprocess all datasets and output final one in the form of pandas DataFrame.

In [None]:
class DogsCatsData:
  """Merge several breed datasets into a single labelled DataFrame.

  ``paths`` is any object whose public attributes (names not starting with
  ``_``) are dataset root directories. For every attribute ``<name>`` this
  class must provide a static method ``<name>_data(path)`` that returns a
  DataFrame containing at least the requested ``columns``.
  """

  def __init__(self, paths, columns):
    self.paths = paths
    self.columns = columns

  def get_data(self, min_th = 10, keep_th = 300):
    """Load every registered dataset and return the combined, labelled frame.

    min_th: a breed is kept only if it has strictly more than this many images
    keep_th: the maximum number of images kept per breed
    """
    frames = []
    for attr in vars(self.paths):
      if attr.startswith('_'):
        continue
      loader = getattr(self, f'{attr}_data')
      # Read the root from the object handed to the constructor, not from a
      # module-level `paths` that may not exist or may be a different object.
      root = getattr(self.paths, attr)
      frames.append(loader(root)[self.columns])

    data = pd.concat(frames)
    data = self._preprocess_data(data, min_th, keep_th)
    data = self._label_data(data)
    return data

  def _preprocess_data(self, data: pd.DataFrame, min_th = 15, keep_th = 300):
    """Normalise breed names, drop rare breeds and cap the size of each breed.

    min_th: a breed is kept only if it has strictly more than this many images
    keep_th: the maximum number of images kept per breed
    """
    data['breed'] = data['breed'].str.lower().str.replace('_', ' ', regex=False)
    data = self._class_corections(data)

    breed_sizes = data.groupby('breed').size()
    valid_breeds = breed_sizes[breed_sizes > min_th].index

    # Draw keep_th rows per breed with replacement and then drop the repeated
    # rows: every breed ends up with at most keep_th distinct images.
    # GroupBy.sample keeps the 'breed' column regardless of how
    # groupby.apply treats grouping columns in the installed pandas version.
    data = (
        data[data['breed'].isin(valid_breeds)]
        .groupby('breed')
        .sample(n = keep_th, replace = True)
        .reset_index(drop=True)
        .drop_duplicates()
        )
    return data

  def _label_data(self, data: pd.DataFrame):
    """Add an integer ``breed_id`` column; the fitted encoder is kept in ``self.le``."""
    self.le = LabelEncoder()
    data['breed_id'] = self.le.fit_transform(data['breed'].values)
    return data

  def _class_corections(self, data: pd.DataFrame):
    """Map alternative spellings of a breed name onto one canonical name."""
    class_corection = {
        'afghan': 'afghan hound',
        'airedale': 'airedale terrier',
        'blenheim': 'blenheim spaniel',
        'boston bull': 'boston terrier',
        'chinese crested dog': 'chinese crested',
        'chow chow': 'chow',
        'cocker': 'cocker spaniel',
        'dalmation': 'dalmatian',
        'doberman pinscher': 'doberman',
        'english springer': 'english springer spaniel',
        'german sheperd': 'german shepherd',
        'german shepherd dog': 'german shepherd',
        'german shorthaired': 'german short-haired pointer',
        'irish spaniel': 'irish water spaniel',
        'jack russell terrier (parson russell terrier)': 'jack russell terrier',
        'labrador retriever': 'labrador',
        'leonberg': 'leonberger',
        'lhasa': 'lhasa apso',
        'maltese': 'maltese dog',
        'mex hairless': 'mexican hairless',
        'pekinese': 'pekingese',
        'pit bull': 'pit bull terrier',
        'rhodesian': 'rhodesian ridgeback',
        'scottish terrier scottie': 'scottish terrier',
        'shetland sheepdog sheltie': 'shetland sheepdog',
        'shih tzu': 'shih-tzu',
        'sphynx (hairless cat)': 'sphynx',
        'staffordshire bull terrier': 'staffordshire bullterrier',
        'west highland white terrier westie': 'west highland white terrier',
        'wire-haired fox terrier': 'wirehaired terrier',
        'yorkie': 'yorkshire terrier',
        'yorkshire terrier yorkie': 'yorkshire terrier',
        }

    # Names without an entry in the table are kept unchanged.
    data['breed'] = data['breed'].map(lambda name: class_corection.get(name, name))
    return data

  @staticmethod
  def get_label_weights(data: pd.DataFrame, device: str):
    """Return 1 / (number of rows) for every ``breed_id``, ordered by id, on ``device``."""
    label_count = (
        data['breed_id']
        .value_counts()
        .to_frame()
        .sort_index()
        .values
    )
    weights = torch.from_numpy(np.power(label_count, -1.)).float().squeeze()
    return weights.to(device)

  @staticmethod
  def cat_breeds_oxford_data(path: Path):
    """Load the annotation list found under ``annotations/annotations/list.txt``."""
    with open(path/'annotations/annotations/list.txt', 'r') as f:
      # The first six lines of the file are skipped before parsing.
      for _ in range(6):
        f.readline()
      cats_oxford = pd.read_csv(f, sep=" ", header=None)

    cats_oxford.columns = ["id", "CLASS-ID", "SPECIES", "BREED-ID"]
    # The id is '<breed words>_<suffix>'; everything before the last '_' is the breed.
    cats_oxford['breed'] = cats_oxford['id'].apply(lambda x: ' '.join(x.split('_')[:-1]))
    cats_oxford['data_source'] = 'cats_oxford'
    cats_oxford['filepath'] = cats_oxford['id'].apply(lambda x: path/f'images/images/{x}.jpg')
    return cats_oxford

  @staticmethod
  def cat_breeds_petfinder_data(path: Path):
    """Index images stored as ``images/<breed>/<id>.jpg`` below ``path``.

    Returns a frame with ``id``, ``breed``, ``filepath`` and ``data_source``
    columns; the frame is empty when no image matches.
    """
    records = {'id': [], 'breed': [], 'filepath': []}
    for img_path in glob(os.path.join(path, 'images/*/*.jpg')):
      img = Path(img_path)
      # `stem` removes only the extension (rstrip('.jpg') would also eat
      # trailing 'j', 'p', 'g' and '.' characters of the id itself).
      records['id'].append(img.stem)
      records['breed'].append(img.parent.name)
      records['filepath'].append(img_path)

    cats_petfinder = pd.DataFrame(records)
    cats_petfinder['data_source'] = 'cats_petfinder'
    return cats_petfinder

  @staticmethod
  def dog_breeds_data(path: Path):
    """Load ``dogs.csv``; its ``labels`` column becomes ``breed``."""
    dogs = pd.read_csv(path/'dogs.csv')
    dogs['data_source'] = 'dog_breeds'
    dogs['filepath'] = dogs['filepaths'].apply(lambda x: path/x)
    dogs = dogs.rename(columns = {'labels': 'breed'})
    return dogs

  @staticmethod
  def dog_breeds_kaggle_data(path: Path):
    """Load ``labels.csv``; images are expected at ``train/<id>.jpg``."""
    dogs_kaggle = pd.read_csv(path/'labels.csv')
    dogs_kaggle['filepath'] = dogs_kaggle['id'].apply(lambda x: path/f'train/{x}.jpg')
    dogs_kaggle['data_source'] = 'dogs_kaggle'
    return dogs_kaggle

  @staticmethod
  def petfinder_old_data(path: Path):
    """Load ``train/train.csv`` and translate ``Breed1`` ids into breed names."""
    train_petfinder = pd.read_csv(path/'train/train.csv')
    mappings = (
        pd.read_csv(path/'breed_labels.csv')
        .set_index('BreedID')['BreedName']
        .to_dict()
    )

    train_petfinder['breed'] = train_petfinder['Breed1'].map(mappings)
    # NOTE(review): images are looked up directly under the dataset root;
    # confirm they are not stored in a sub-directory, otherwise every row is
    # later discarded as unreadable.
    train_petfinder['filepath'] = train_petfinder['PetID'].apply(lambda x: path/f'{x}-1.jpg')
    train_petfinder['data_source'] = 'petfinder_old'

    # Breed ids without a name in the mapping fall back to the most frequent breed.
    most_common = train_petfinder['breed'].value_counts().index[0]
    train_petfinder['breed'] = train_petfinder['breed'].fillna(most_common)
    return train_petfinder

In [None]:
class BreedDataset(Dataset):
  """Dataset yielding ``(image, breed_id)`` pairs for the rows of a DataFrame.

  df: frame with ``filepath`` and ``breed_id`` columns
  img_size: size handed to ``A.Resize`` by the default transforms
  transforms: albumentations pipeline; when None, resize + normalize +
    tensor conversion is used
  """

  def __init__(self, df: pd.DataFrame, img_size = (224, 224), transforms = None):
    self.df = self._make_dataset(df)
    self.img_size = img_size
    self.transforms = self.__transforms(transforms)

  def _make_dataset(self, df):
    """Return a copy of ``df`` restricted to rows whose image can be read.

    Every image is opened once here. The caller's frame is left untouched;
    rows are no longer dropped in place while the frame is being iterated.
    """
    print('dropping wrong images...')
    readable = [isinstance(self.read_img(p), np.ndarray) for p in df['filepath']]
    return df.loc[np.asarray(readable, dtype=bool)].copy()

  def __transforms(self, transforms):
    """Return ``transforms``, or the default evaluation pipeline when it is None."""
    if transforms is None:
      transforms = A.Compose([
                       A.Resize(*self.img_size),
                       A.Normalize(
                           mean = [0.485, 0.456, 0.406],
                           std = [0.229, 0.224, 0.225],
                           always_apply = True
                           ),
                       ToTensorV2(),
                       ])
    return transforms

  def __len__(self):
    return len(self.df)

  def __getitem__(self, indx):
    row = self.df.iloc[indx]
    img = self.prepare_img(row.filepath)
    return img, row.breed_id

  @staticmethod
  def read_img(path):
    """Read an image as RGB; return whatever ``cv2.imread`` gave when it is not an array."""
    if isinstance(path, Path):
      path = path.as_posix()
    img = cv2.imread(path)
    if isinstance(img, np.ndarray):
      # OpenCV loads BGR; convert to RGB for the transforms.
      img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

  def prepare_img(self, path):
    """Read the image at ``path`` and run it through the transform pipeline."""
    img = self.read_img(path)
    img = self.transforms(image=img)['image']
    return img

In [None]:
class CustomDataModule(LightningDataModule):
  """Splits the breed DataFrame into train/validation parts and serves loaders.

  data: labelled frame (``filepath`` / ``breed_id`` columns) to split
  transforms: albumentations pipeline applied to the training split only
  weights: stored on the instance; not used by this class
  test_size: fraction of rows that goes to the validation split
  img_size: image size forwarded to BreedDataset
  batch_size: batch size of both loaders
  """

  def __init__(
      self,
      data: pd.DataFrame,
      transforms: A.Compose = None,
      weights = None,
      test_size = 0.25,
      img_size = (224, 224),
      batch_size = 64
      ):
    super().__init__()
    self.train_df, self.val_df = self.split_data(data, test_size)
    self.batch_size = batch_size
    self.transforms = transforms
    self.img_size = img_size
    self.weights = weights
    # BreedDataset opens every image when it is constructed, so each split is
    # built once and reused by later *_dataloader() calls.
    self._train_split = None
    self._val_split = None

  @staticmethod
  def split_data(data, test_size):
    """Random train/validation split with a fixed ``random_state`` of 2021."""
    train_df, val_df = train_test_split(
        data,
        test_size = test_size,
        random_state = 2021,
        )
    return train_df, val_df

  def train_dataloader(self):
    """Shuffled loader over the training split (with the training transforms)."""
    if self._train_split is None:
      self._train_split = BreedDataset(
          self.train_df,
          self.img_size,
          self.transforms
          )
    return DataLoader(
        self._train_split,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4,
        )

  def val_dataloader(self):
    """Ordered loader over the validation split (BreedDataset default transforms)."""
    if self._val_split is None:
      self._val_split = BreedDataset(self.val_df, self.img_size)
    return DataLoader(
        self._val_split,
        batch_size=self.batch_size,
        shuffle=False,
        num_workers=4,
        )

In [None]:
class CustomModel(LightningModule):
  """Breed classifier: a timm backbone followed by a dropout + linear head.

  cfg must provide model_name, dropout_backbone, dropout_fc, num_classes,
  weights, and optimizer / scheduler sections with ``name`` and ``params``.
  """

  def __init__(self, cfg):
    super().__init__()
    self.cfg = cfg
    self.__build_model()
    self.save_hyperparameters(cfg)

  def __build_model(self):
    """Create the backbone and the classification head."""
    # The backbone is created with num_classes=0; classification is done by
    # the separate `fc` head below.
    backbone_kwargs = dict(
        drop_rate = self.cfg.dropout_backbone,
        pretrained=True,
        num_classes=0,
        in_chans=3,
        )
    self.backbone = create_model(self.cfg.model_name, **backbone_kwargs)

    head_layers = [
        nn.Dropout(self.cfg.dropout_fc),
        nn.LazyLinear(self.cfg.num_classes),
        ]
    self.fc = nn.Sequential(*head_layers)

  def forward(self, x):
    return self.fc(self.backbone(x))

  def configure_optimizers(self):
    """Build the optimizer and LR scheduler named in the config."""
    # NOTE(review): the class names come from cfg and are resolved with
    # eval(); this is only acceptable while cfg is trusted, local input.
    optimizer_cls = eval(self.cfg.optimizer.name)
    optimizer = optimizer_cls(self.parameters(), **self.cfg.optimizer.params)

    scheduler_cls = eval(self.cfg.scheduler.name)
    scheduler = scheduler_cls(optimizer, **self.cfg.scheduler.params)
    return [optimizer], [scheduler]

  def __share_step(self, batch):
    """Forward a batch; return (loss, labels, predicted class ids)."""
    images, targets = batch
    logits = self(images)
    predicted = logits.argmax(dim = -1)
    loss = F.cross_entropy(logits, targets, weight = self.cfg.weights)
    return loss, targets, predicted

  def __share_epoch(self, outputs, stage):
    """Log macro accuracy / F1 / precision / recall over a whole epoch."""
    all_preds = torch.cat([step['preds'] for step in outputs]).cpu()
    all_labels = torch.cat([step['labels'] for step in outputs]).cpu()

    n_classes = self.cfg.num_classes
    scorers = {
        'accuracy': Accuracy(num_classes = n_classes, average = 'macro'),
        'f1': F1(n_classes, average = 'macro'),
        'precision': Precision(n_classes, average = 'macro'),
        'recall': Recall(n_classes, average = 'macro'),
        }
    # Compute every metric first, then log them under '<stage>_<metric>'.
    scores = {name: scorer(all_preds, all_labels) for name, scorer in scorers.items()}
    for name, value in scores.items():
      self.log(f'{stage}_{name}', value)

  def training_step(self, batch, batch_idx):
    loss, labels, preds = self.__share_step(batch)
    self.log('train_loss', loss)
    return dict(loss=loss, preds=preds, labels=labels)

  def validation_step(self, batch, batch_idx):
    loss, labels, preds = self.__share_step(batch)
    self.log('val_loss', loss)
    return dict(loss=loss, preds=preds, labels=labels)

  def training_epoch_end(self, outputs):
    self.__share_epoch(outputs, 'train')

  def validation_epoch_end(self, outputs):
    self.__share_epoch(outputs, 'val')

  def predict_step(self, batch, batch_idx, dataloader_idx=0):
    """Return the logits for the images of a batch; the labels are ignored."""
    images, _ = batch
    return self(images)

In [None]:
class ImagePredictionLogger(callbacks.Callback):
    """Logs a fixed batch of validation images with predicted and true labels.

    val_samples: an ``(images, labels)`` pair of tensors.
    """

    def __init__(self, val_samples):
        super().__init__()
        self.val_imgs, self.val_labels = val_samples

    def on_validation_epoch_end(self, trainer, pl_module):
        """Run the stored batch through the model and log the captioned images."""
        images = self.val_imgs.to(device=pl_module.device)
        targets = self.val_labels.to(device=pl_module.device)

        predicted = torch.argmax(pl_module(images), -1)

        examples = []
        for image, pred, target in zip(images, predicted, targets):
            caption = f"Pred:{pred}, Label:{target}"
            examples.append(wandb.Image(image, caption=caption))

        trainer.logger.experiment.log({"examples": examples})

Main config for the model. You can play with parameters and model name.

In [None]:
# Experiment configuration; wrapped in a Box below so that entries can be
# read with attribute access (cfg.optimizer.name, ...).
cfg = {
    # Name given to timm's create_model() and the two dropout rates.
    'model_name': 'swin_tiny_patch4_window7_224',
    'dropout_backbone': 0,
    'dropout_fc': 0,
    # Passed to the Trainer as max_epochs.
    'epoch': 3,
    'batch_size': 64,
    'img_size': (224, 224),
    # Fraction of the data used for validation.
    'test_size': 0.2,
    'device': 'cuda:0',
    # Per-class weights for the cross-entropy loss (None = unweighted).
    'weights': None,
    # 'name' is resolved with eval() in CustomModel.configure_optimizers and
    # called with 'params' as keyword arguments.
    'optimizer':{
        'name': 'optim.AdamW',
        'params':{
            'lr': 1e-4,
            #'weight_decay': 1e-4,
        },
    },
    # Resolved the same way as the optimizer.
    'scheduler':{
        'name': 'optim.lr_scheduler.CosineAnnealingWarmRestarts',
        'params':{
            'T_0': 10,
            'eta_min': 1e-6
        },
    },
    # save_dir / name build the checkpoint directory; name / project are
    # given to the W&B logger.
    'logger': {
        'save_dir': './',
        'name': 'swin_tiny_breed',
        'project': 'Breeds',
        'log_model': True,
    },
    # Expanded as keyword arguments into pl.Trainer(...).
    'trainer': {
        'gpus': 1,
        'accumulate_grad_batches': 1,
        'auto_lr_find': False,
        'progress_bar_refresh_rate': 3,
        'fast_dev_run': False,
        'num_sanity_val_steps': 2,
        'resume_from_checkpoint': None,
    },
}
cfg = Box(cfg)

In [None]:
# Training-time pipeline: random flips, brightness/contrast and
# shift/scale/rotate augmentations, followed by the resize to cfg.img_size,
# normalization and conversion to a tensor.
train_transforms = A.Compose([
                 A.HorizontalFlip(p = 0.5),
                 A.VerticalFlip(p = 0.5),
                 A.RandomBrightnessContrast(p=0.3),
                 A.ShiftScaleRotate(p=0.3),
                 A.Resize(height=cfg.img_size[0], width=cfg.img_size[1]),
                 A.Normalize(
                     mean = [0.485, 0.456, 0.406],
                     std = [0.229, 0.224, 0.225],
                     always_apply = True
                     ),
                 ToTensorV2(),                                
                 ])

I used WandbLogger to log results because it is convenient and easy to set up, but it is optional and you can leave it out.

In [None]:
# Build the combined breed dataset and complete the config with values that
# depend on it.
data = DogsCatsData(paths, columns).get_data()
cfg.num_classes = data['breed'].nunique()
cfg.train_transforms = train_transforms
#cfg.weights = DogsCatsData.get_label_weights(data, cfg.device)

model = CustomModel(cfg)
datamodule = CustomDataModule(
    data, 
    cfg.train_transforms, 
    cfg.weights,
    cfg.test_size, 
    cfg.img_size,
    cfg.batch_size
    )

# The first validation batch is the fixed set of images logged by the callback.
val_samples = next(iter(datamodule.val_dataloader()))
img_predictions = ImagePredictionLogger(val_samples)
earystopping = EarlyStopping(monitor="val_loss", patience = 3)
lr_monitor = callbacks.LearningRateMonitor('step')

# Keep only the single checkpoint with the lowest validation loss.
loss_checkpoint = callbacks.ModelCheckpoint(
    dirpath = os.path.join(cfg.logger.save_dir, cfg.logger.name),
    filename=cfg.logger.name,
    monitor="val_loss",
    save_top_k=1,
    mode="min",
    save_last=False,
    )
wandb_logger = WandbLogger(
    name = cfg.logger.name,
    project = cfg.logger.project,
    log_model = True,
    )

In [None]:
# Log in to Weights & Biases with the key stored in Kaggle Secrets under the
# label "api_key". Any failure is re-raised with instructions, keeping the
# original error attached as the cause.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("api_key")
    wandb.login(key=secret_value_0)
except Exception as err:
    raise RuntimeError('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as api_key. \nGet your W&B access token from here: https://wandb.ai/authorize') from err

## Training

In [None]:
# Train for cfg.epoch epochs with the callbacks created above; the remaining
# Trainer options come from the 'trainer' section of cfg.
trainer = pl.Trainer(
      max_epochs=cfg.epoch,
      logger = wandb_logger,
      callbacks=[
            lr_monitor, 
            loss_checkpoint, 
            earystopping,
            img_predictions,
            ],
      deterministic=True,
      **cfg.trainer,
      )
trainer.fit(model, datamodule=datamodule)
# Close the W&B run once training has finished.
wandb.finish()

## Inference

The functions below get predictions from the model. This is a slow, image-by-image implementation; you can adapt the code to use dataloaders and run inference in batches.

In [None]:
def get_logits(path):
    """Return the breed logits of one image followed by the predicted class id.

    path: location of the image file
    Returns a 1-D numpy array: the model's logits with the argmax index
    appended as the last element.
    Raises ValueError when the image cannot be read.
    """
    img = BreedDataset.read_img(path)
    if not isinstance(img, np.ndarray):
        raise ValueError(f'Could not read image: {path}')

    # Inference must be deterministic: only resize + normalize here, not the
    # random flips / brightness / rotation used for training.
    eval_transforms = A.Compose([
        A.Resize(height=cfg.img_size[0], width=cfg.img_size[1]),
        A.Normalize(
            mean = [0.485, 0.456, 0.406],
            std = [0.229, 0.224, 0.225],
            always_apply = True
            ),
        ToTensorV2(),
        ])

    model.eval()
    model.cuda()

    with torch.no_grad():
        tensor = eval_transforms(image = img)['image']
        logits = model(tensor.unsqueeze(0).cuda()).cpu()
        top_pred = logits.argmax(dim = -1)

    output = torch.cat([logits.squeeze(), top_pred]).numpy()
    return output

def get_breeds(path, train_or_test = 'train'):
    """Read a Pawpularity CSV and append breed and cat/dog features per image.

    path: CSV file with an ``Id`` column
    train_or_test: sub-directory of the competition data holding the images
    Returns the frame with added ``path``, ``feature_<i>``, ``breed_id``,
    ``cat`` and ``dog`` columns.
    """
    inf_model = ModelInference()
    df = pd.read_csv(path)

    image_dir = f'../input/petfinder-pawpularity-score/{train_or_test}'
    df.loc[:, 'path'] = df['Id'].apply(
        lambda image_id: os.path.join(image_dir, f'{image_id}.jpg')
    )

    # One row per image: all breed logits followed by the predicted breed id.
    breed_outputs = np.vstack(df['path'].progress_apply(get_logits).values)
    breed_columns = [f'feature_{i}' for i in range(data.breed_id.nunique())]
    df.loc[:, breed_columns + ['breed_id']] = breed_outputs

    # Two values per image from the cat-vs-dog classifier.
    species_outputs = np.vstack(
        df['path'].progress_apply(lambda image_path: inf_model(image_path)).values
    )
    df.loc[:, ['cat', 'dog']] = species_outputs
    return df

In [None]:
# Compute breed and cat/dog features for every competition train and test image.
train_pawpularity_df = get_breeds('../input/petfinder-pawpularity-score/train.csv', 'train')
test_pawpularity_df = get_breeds('../input/petfinder-pawpularity-score/test.csv', 'test')

In [None]:
# Hold out 20% of the rows, stratified on the target; the identifier, image
# path and target columns are removed from the feature matrix.
X_train, X_val, y_train, y_val = train_test_split(
    train_pawpularity_df.drop(columns = ['Id', 'path', 'Pawpularity']), 
    train_pawpularity_df['Pawpularity'], 
    stratify = train_pawpularity_df['Pawpularity'],
    test_size = 0.2
)

lgbm = LGBMRegressor(max_depth=4, n_estimators=100, learning_rate= 0.08)
lgbm.fit(X_train, y_train)
# Validation RMSE.
print(np.sqrt(mean_squared_error(y_val, lgbm.predict(X_val))))

In [None]:
# Predict the test set and write the submission file (Id, Pawpularity).
test_pawpularity_df['Pawpularity'] = lgbm.predict(test_pawpularity_df.drop(columns = ['Id', 'path']))
test_pawpularity_df[['Id', 'Pawpularity']].to_csv('./submission.csv', index = False)