In [1]:
!pip install git+https://github.com/rwightman/pytorch-image-models
!pip install --upgrade wandb

Collecting git+https://github.com/rwightman/pytorch-image-models
  Cloning https://github.com/rwightman/pytorch-image-models to /tmp/pip-req-build-xql1ykk_
  Running command git clone --filter=blob:none -q https://github.com/rwightman/pytorch-image-models /tmp/pip-req-build-xql1ykk_
  Resolved https://github.com/rwightman/pytorch-image-models to commit f55c22bebf9d8afc449d317a723231ef72e0d662
  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: timm
  Building wheel for timm (setup.py) ... [?25l- \ | / done
[?25h  Created wheel for timm: filename=timm-0.5.0-py3-none-any.whl size=425107 sha256=490d87bfac86d96d167351ef1347a34a2c9ccafd8f5b33dbfc51f11df62a8ba7
  Stored in directory: /tmp/pip-ephem-wheel-cache-1nv2mr0h/wheels/69/3d/b0/be55cbadabd87a0e1875d63c7492d199097a39cc2433637650
Successfully built timm
Installing collected packages: timm
Successfully installed timm-0.5.0
Collecting wandb
  Downloading wandb-0.12.9-py2.py

In [2]:
import os
import gc
import cv2
import copy
import time
import random
from PIL import Image

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
c_ = Fore.CYAN  # colour prefix used for the "improved" message printed during training
sr_ = Style.RESET_ALL  # resets the terminal styling after a coloured message

import warnings
# Silences every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings("ignore")

# For descriptive error messages
# NOTE(review): this is a debugging aid that makes CUDA kernel launches synchronous
# and typically slows training — consider removing it for long runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
# Location of the competition data. The image folders are derived from ROOT_DIR
# so the dataset location only has to be changed in one place.
ROOT_DIR = "../input/petfinder-pawpularity-score"
TRAIN_DIR = f"{ROOT_DIR}/train"
TEST_DIR = f"{ROOT_DIR}/test"

In [4]:
import wandb

# Log in to Weights & Biases with the API key stored in Kaggle Secrets under the
# label "wandb_api". `anony` records the outcome: None when the login succeeded,
# "must" when it did not and an anonymous run should be used instead.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except Exception as err:
    # `Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
    # still propagate. Only the error type is printed, never the key itself.
    anony = "must"
    print(f'W&B login skipped: {type(err).__name__}')
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# Run configuration shared by every cell below; it is also passed to wandb.init,
# so all of these values are recorded with the run.
CONFIG = {
    'seed': 42,
    # timm model name used as the image backbone
    'backbone': 'swin_large_patch4_window12_384_in22k',
    # handed to PawpularityModel, which accepts but does not use it
    'embedder': 'tf_efficientnet_b4_ns',
    'train_batch_size': 8,
    'valid_batch_size': 32,
    'img_size': 384,
    'epochs': 20,
    'learning_rate': 1e-4,
    'scheduler': 'CosineAnnealingLR',
    'min_lr': 1e-6,
    # Counted in scheduler.step() calls. train_one_epoch steps the scheduler once
    # per optimizer update, so this is a number of updates, not of epochs.
    'T_max': 100,
    # 'T_0' (read by fetch_scheduler for 'CosineAnnealingWarmRestarts') and
    # 'warmup_epochs' are intentionally not set; add them before using them.
    'weight_decay': 1e-6,
    'n_accumulate': 1,
    'n_fold': 5,
    'num_classes': 1,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'competition': 'PetFinder',
    '_wandb_kernel': 'Aishik',
}

In [6]:
def set_seed(seed = 42):
    """Seed every random number generator the notebook relies on.

    Seeds NumPy, Python's ``random`` and PyTorch (CPU and CUDA), puts cuDNN into
    deterministic / non-benchmark mode, and exports ``PYTHONHASHSEED``. The aim
    is reproducibility: repeated runs should draw the same random numbers.

    Args:
        seed: integer seed applied to all generators (default 42).
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # The cuDNN backend needs these two switches in addition to the seeds.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Fixed value for Python's hash seed, exported through the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed everything once, using the seed stored in CONFIG.
set_seed(CONFIG['seed'])

In [7]:
def get_train_file_path(id):
    """Return the path of the training JPEG named ``<id>.jpg`` inside ``TRAIN_DIR``."""
    return "{}/{}.jpg".format(TRAIN_DIR, id)

In [8]:
# Read the training table and attach, for every row, the path of its image file.
df = pd.read_csv("{}/train.csv".format(ROOT_DIR))
df['file_path'] = df['Id'].map(get_train_file_path)

In [9]:
# Sanity check: (number of rows, number of columns) of the training table.
df.shape

(9912, 15)

In [10]:
# Number of distinct image ids; it equals the row count when no id is duplicated.
df["Id"].nunique()

9912

In [11]:
# Metadata columns: everything except the identifier, the target and the derived
# file path. Column order follows the DataFrame.
_non_feature_cols = ('Id', 'Pawpularity', 'file_path')
feature_cols = [name for name in df.columns if name not in _non_feature_cols]

In [12]:
def create_folds(df, n_s=5, n_grp=None):
    """Assign every row of ``df`` to one of ``n_s`` cross-validation folds.

    Args:
        df: DataFrame with a numeric 'Pawpularity' column. It is not modified.
        n_s: number of folds.
        n_grp: when None, rows are split with a shuffled ``KFold``. Otherwise
            'Pawpularity' is cut into ``n_grp`` bins with ``pd.cut`` and the
            split is a ``StratifiedKFold`` on those bins.

    Returns:
        A copy of ``df`` with an extra integer column 'kfold' holding the index
        (0 .. n_s - 1) of the fold in which the row is used for validation.
    """
    # Work on a copy so the caller's frame is left untouched and the function
    # can be re-run safely.
    df = df.copy()

    if n_grp is None:
        # shuffle=True is required here: scikit-learn rejects a random_state on
        # an unshuffled KFold.
        splitter = KFold(n_splits=n_s, shuffle=True, random_state=CONFIG['seed'])
        target = df['Pawpularity']
    else:
        splitter = StratifiedKFold(n_splits=n_s, shuffle=True, random_state=CONFIG['seed'])
        # The bin labels are only needed for the split, so they are kept out of
        # the returned frame instead of being added and dropped again.
        target = pd.cut(df['Pawpularity'], n_grp, labels=False)

    # split() yields positional indices, so the assignment is done on a plain
    # array; this stays correct even if df does not have a default RangeIndex.
    fold_of_row = np.full(len(df), -1, dtype=np.int64)
    for fold_no, (_, valid_idx) in enumerate(splitter.split(target, target)):
        fold_of_row[valid_idx] = fold_no
    df['kfold'] = fold_of_row

    return df

In [13]:
# Build the folds, stratified on 14 Pawpularity bins, then preview the result
# (the new 'kfold' column is the last one).
df = create_folds(df, n_s=CONFIG['n_fold'], n_grp=14)
df.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,file_path,kfold
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,../input/petfinder-pawpularity-score/train/000...,0
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,../input/petfinder-pawpularity-score/train/000...,2
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28,../input/petfinder-pawpularity-score/train/001...,0
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15,../input/petfinder-pawpularity-score/train/001...,3
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72,../input/petfinder-pawpularity-score/train/001...,4


In [14]:
class PawpularityDataset(Dataset):
    """Dataset yielding ``(image, target)`` pairs read from a DataFrame.

    Args:
        root_dir: stored on the instance; image paths are taken from the
            'file_path' column, so this value is not used to build them.
        df: DataFrame with a 'file_path' column (image path) and a
            'Pawpularity' column (target).
        transforms: optional callable invoked as ``transforms(image=img)`` and
            expected to return a mapping with an "image" entry.
    """

    def __init__(self, root_dir, df, transforms=None):
        self.root_dir = root_dir
        self.df = df
        self.file_names = df['file_path'].values
        self.targets = df['Pawpularity'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Load the image at ``index`` as RGB, apply the transforms, return it with its target.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        img_path = self.file_names[index]
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports a missing or unreadable file by returning None;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV loads images as BGR; convert to RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        target = self.targets[index]

        if self.transforms is not None:
            img = self.transforms(image=img)["image"]

        return img, target


In [15]:
# Albumentations pipelines, keyed by split. Both resize to a square of side
# CONFIG['img_size'], normalise with the statistics below and convert to a
# tensor; only the training pipeline adds a random horizontal flip.
_side = CONFIG['img_size']
_norm_kwargs = dict(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
    max_pixel_value=255.0,
    p=1.0,
)

data_transforms = dict(
    train=A.Compose(
        [
            A.Resize(_side, _side),
            A.HorizontalFlip(p=0.5),
            A.Normalize(**_norm_kwargs),
            ToTensorV2(),
        ],
        p=1.,
    ),
    valid=A.Compose(
        [
            A.Resize(_side, _side),
            A.Normalize(**_norm_kwargs),
            ToTensorV2(),
        ],
        p=1.,
    ),
)

In [16]:
class PawpularityModel(nn.Module):
    """A timm backbone with its classifier removed, followed by one linear layer.

    Args:
        backbone: timm model name passed to ``timm.create_model``.
        embedder: accepted for interface compatibility; not used by this class.
        pretrained: forwarded to ``timm.create_model``.
    """

    def __init__(self, backbone, embedder, pretrained=True):
        super().__init__()
        net = timm.create_model(backbone, pretrained=pretrained)
        # Width of the original classifier input; must be read before the
        # classifier is reset below.
        feature_dim = net.head.in_features
        net.reset_classifier(0)

        self.backbone = net
        self.n_features = feature_dim
        self.fc = nn.Linear(feature_dim, CONFIG['num_classes'])

    def forward(self, images):
        """Run the backbone, then the linear head, and return the head's output."""
        return self.fc(self.backbone(images))
    
# Build the model from the configured backbone and move it to the configured
# device. The trailing ';' suppresses the notebook's display of the module.
model = PawpularityModel(CONFIG['backbone'], CONFIG['embedder'])
model.to(CONFIG['device']);

Downloading: "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth" to /root/.cache/torch/hub/checkpoints/swin_large_patch4_window12_384_22k.pth


In [17]:
def criterion(outputs, targets):
    """Root-mean-squared error between the flattened ``outputs`` and ``targets``."""
    flat_pred = outputs.view(-1)
    flat_true = targets.view(-1)
    mse = nn.functional.mse_loss(flat_pred, flat_true)
    return mse.sqrt()

In [18]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` using mixed precision.

    Gradients are accumulated over ``CONFIG['n_accumulate']`` batches before each
    optimizer update. The scheduler, when given, is stepped once per optimizer
    update (not once per epoch).

    Args:
        model: the network to train; it is switched to train mode.
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: learning-rate scheduler, or None.
        dataloader: iterable of ``(images, targets)`` batches that supports ``len()``.
        device: device the batches are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        The sample-weighted mean of the per-batch loss over the epoch
        (0.0 if the dataloader yields no batches).
    """
    model.train()
    # A fresh scaler is created on every call, so its loss scale restarts each epoch.
    scaler = amp.GradScaler()

    n_accumulate = CONFIG['n_accumulate']
    n_batches = len(dataloader)

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # defined up front so an empty dataloader still returns a value

    # Start from clean gradients so nothing left over from earlier work is applied.
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=n_batches)
    for step, (images, targets) in bar:
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.float)

        batch_size = images.size(0)

        with amp.autocast(enabled=True):
            outputs = model(images)
            loss = criterion(outputs, targets)

        # Only the backward pass sees the loss divided by n_accumulate; `loss`
        # itself stays un-scaled so the reported value does not depend on it.
        scaler.scale(loss / n_accumulate).backward()

        # Update after every n_accumulate batches, and also on the last batch so
        # a trailing partial window is applied instead of leaking into the next epoch.
        if (step + 1) % n_accumulate == 0 or (step + 1) == n_batches:
            scaler.step(optimizer)
            scaler.update()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

In [19]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: the network to evaluate; it is switched to eval mode.
        dataloader: iterable of ``(images, targets)`` batches that supports ``len()``.
        device: device the batches are moved to.
        epoch: epoch number, shown in the progress bar only.
        optimizer: optional; when given, its current learning rate is shown in
            the progress bar. Earlier versions read a global ``optimizer`` here;
            it is now an explicit, optional argument.

    Returns:
        ``(epoch_loss, val_rmse)``: ``epoch_loss`` is the sample-weighted mean of
        the per-batch loss, ``val_rmse`` is the RMSE computed once over all
        predictions. The two generally differ because the mean of per-batch
        RMSEs is not the RMSE of the whole set.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0

    all_targets = []
    all_preds = []

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, (images, targets) in bar:
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.float)

        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        all_preds.append(outputs.view(-1).cpu().detach().numpy())
        all_targets.append(targets.view(-1).cpu().detach().numpy())

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss)
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    if not all_preds:
        raise ValueError("valid_one_epoch received a dataloader with no batches")

    all_targets = np.concatenate(all_targets)
    all_preds = np.concatenate(all_preds)
    # Square root taken explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated in recent scikit-learn releases.
    val_rmse = np.sqrt(mean_squared_error(all_targets, all_preds))
    gc.collect()

    return epoch_loss, val_rmse

In [20]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for ``num_epochs`` epochs, checkpoint on improvement and log to W&B.

    Relies on notebook-level state that must exist before the call: the globals
    ``train_loader`` and ``valid_loader``, and an active W&B run started with
    ``wandb.init``.

    Args:
        model: the network to train.
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: learning-rate scheduler, or None.
        device: device passed to the training and validation loops.
        num_epochs: number of epochs to run.

    Returns:
        ``(model, history)``: the model with the weights of the best validation
        RMSE loaded, and a dict of per-epoch lists keyed 'Train Loss',
        'Valid Loss' and 'Valid RMSE'.
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_rmse = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Use the `device` argument rather than reading CONFIG['device'] again,
        # so the caller's choice is the one that is applied.
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss, val_epoch_rmse = valid_one_epoch(model, valid_loader,
                                                         device=device,
                                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Valid RMSE'].append(val_epoch_rmse)

        # Log the metrics in a single call so all three share one W&B step
        # (each wandb.log call advances the step by default).
        wandb.log({"Train Loss": train_epoch_loss,
                   "Valid Loss": val_epoch_loss,
                   "Valid RMSE": val_epoch_rmse})

        print(f'Valid RMSE: {val_epoch_rmse}')

        # deep copy the model
        if val_epoch_rmse <= best_epoch_rmse:
            print(f"{c_}Validation Loss Improved ({best_epoch_rmse} ---> {val_epoch_rmse})")
            best_epoch_rmse = val_epoch_rmse
            # wandb.run is the active run, so no separate global `run` is needed.
            wandb.run.summary["Best RMSE"] = best_epoch_rmse
            best_model_wts = copy.deepcopy(model.state_dict())
            # A new file is written (and uploaded) for every improvement.
            PATH = "RMSE{:.4f}_epoch{:.0f}.bin".format(best_epoch_rmse, epoch)
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            wandb.save(PATH)
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best RMSE: {:.4f}".format(best_epoch_rmse))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history


In [21]:
def prepare_loaders(fold):
    """Build the training and validation DataLoaders for one fold.

    Rows of the notebook-level ``df`` whose 'kfold' equals ``fold`` form the
    validation set; all other rows form the training set.

    Args:
        fold: fold index used for validation.

    Returns:
        ``(train_loader, valid_loader)``. The training loader shuffles and drops
        the last incomplete batch; the validation loader does neither.
    """
    is_valid_row = df.kfold == fold
    frames = {
        'train': df[~is_valid_row].reset_index(drop=True),
        'valid': df[is_valid_row].reset_index(drop=True),
    }
    datasets = {
        split: PawpularityDataset(TRAIN_DIR, frame, transforms=data_transforms[split])
        for split, frame in frames.items()
    }

    # Settings common to both loaders.
    shared = dict(num_workers=4, pin_memory=True)

    train_loader = DataLoader(datasets['train'],
                              batch_size=CONFIG['train_batch_size'],
                              shuffle=True, drop_last=True, **shared)
    valid_loader = DataLoader(datasets['valid'],
                              batch_size=CONFIG['valid_batch_size'],
                              shuffle=False, **shared)

    return train_loader, valid_loader

In [22]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler selected by ``CONFIG['scheduler']``.

    Supported values are 'CosineAnnealingLR' (reads ``CONFIG['T_max']``),
    'CosineAnnealingWarmRestarts' (reads ``CONFIG['T_0']``) and None. Both
    schedulers use ``CONFIG['min_lr']`` as their minimum learning rate. The
    period settings are counted in ``scheduler.step()`` calls.

    Args:
        optimizer: the optimizer whose learning rate is scheduled.

    Returns:
        The scheduler instance, or None when no scheduler is configured.

    Raises:
        KeyError: if 'CosineAnnealingWarmRestarts' is selected but
            ``CONFIG['T_0']`` is not set.
        ValueError: if ``CONFIG['scheduler']`` is not a supported value.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])

    if name == 'CosineAnnealingWarmRestarts':
        if 'T_0' not in CONFIG:
            raise KeyError("CONFIG['T_0'] must be set to use CosineAnnealingWarmRestarts")
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])

    # An unknown name used to fall through to an unbound local variable.
    raise ValueError(f"Unsupported scheduler: {name!r}")


In [23]:
# Only fold 0 is used here: it is the validation split, the remaining folds are
# the training split. run_training reads these two names as globals.
train_loader, valid_loader = prepare_loaders(fold=0)

In [24]:
# Adam over all model parameters with the configured learning rate and weight
# decay; the scheduler (possibly None) is chosen by CONFIG['scheduler'].
optimizer = optim.Adam(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
scheduler = fetch_scheduler(optimizer)

In [25]:
# Start the W&B run and record CONFIG with it. `anony` comes from the login
# cell: None when the login succeeded, "must" when it did not, so an anonymous
# run is only requested when no account is available.
run = wandb.init(project='Pawpularity',
                 config=CONFIG,
                 job_type='Train',
                 anonymous=anony)

[34m[1mwandb[0m: Currently logged in as: [33maishikai[0m (use `wandb login --relogin` to force relogin)


In [26]:
# Train for the configured number of epochs. `model` is rebound to the network
# with the best-RMSE weights loaded; `history` holds the per-epoch metrics.
model, history = run_training(model, optimizer, scheduler,
                              device=CONFIG['device'],
                              num_epochs=CONFIG['epochs'])

[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 991/991 [22:17<00:00,  1.35s/it, Epoch=1, LR=9.8e-5, Train_Loss=19.8]
100%|██████████| 62/62 [01:34<00:00,  1.52s/it, Epoch=1, LR=9.8e-5, Valid_Loss=20.4]


Valid RMSE: 20.67266273498535
[36mValidation Loss Improved (inf ---> 20.67266273498535)
Model Saved[0m



100%|██████████| 991/991 [22:19<00:00,  1.35s/it, Epoch=2, LR=9.23e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:34<00:00,  1.52s/it, Epoch=2, LR=9.23e-5, Valid_Loss=20.4]


Valid RMSE: 20.617847442626953
[36mValidation Loss Improved (20.67266273498535 ---> 20.617847442626953)
Model Saved[0m



100%|██████████| 991/991 [22:28<00:00,  1.36s/it, Epoch=3, LR=8.32e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:35<00:00,  1.53s/it, Epoch=3, LR=8.32e-5, Valid_Loss=20.4]


Valid RMSE: 20.62702178955078



100%|██████████| 991/991 [22:34<00:00,  1.37s/it, Epoch=4, LR=7.16e-5, Train_Loss=19.6]
100%|██████████| 62/62 [01:35<00:00,  1.54s/it, Epoch=4, LR=7.16e-5, Valid_Loss=20.5]


Valid RMSE: 20.810625076293945



100%|██████████| 991/991 [22:38<00:00,  1.37s/it, Epoch=5, LR=5.82e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:35<00:00,  1.53s/it, Epoch=5, LR=5.82e-5, Valid_Loss=20.4]


Valid RMSE: 20.595571517944336
[36mValidation Loss Improved (20.617847442626953 ---> 20.595571517944336)
Model Saved[0m



100%|██████████| 991/991 [22:42<00:00,  1.37s/it, Epoch=6, LR=4.43e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:35<00:00,  1.55s/it, Epoch=6, LR=4.43e-5, Valid_Loss=20.4]


Valid RMSE: 20.6302490234375



100%|██████████| 991/991 [22:29<00:00,  1.36s/it, Epoch=7, LR=3.08e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:34<00:00,  1.53s/it, Epoch=7, LR=3.08e-5, Valid_Loss=20.4]


Valid RMSE: 20.604991912841797



100%|██████████| 991/991 [22:35<00:00,  1.37s/it, Epoch=8, LR=1.89e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:35<00:00,  1.54s/it, Epoch=8, LR=1.89e-5, Valid_Loss=20.4]


Valid RMSE: 20.588674545288086
[36mValidation Loss Improved (20.595571517944336 ---> 20.588674545288086)
Model Saved[0m



100%|██████████| 991/991 [22:40<00:00,  1.37s/it, Epoch=9, LR=9.56e-6, Train_Loss=19.6]
100%|██████████| 62/62 [01:36<00:00,  1.56s/it, Epoch=9, LR=9.56e-6, Valid_Loss=20.4]


Valid RMSE: 20.633831024169922



100%|██████████| 991/991 [22:40<00:00,  1.37s/it, Epoch=10, LR=3.42e-6, Train_Loss=19.5]
100%|██████████| 62/62 [01:35<00:00,  1.55s/it, Epoch=10, LR=3.42e-6, Valid_Loss=20.4]


Valid RMSE: 20.5908145904541



100%|██████████| 991/991 [22:45<00:00,  1.38s/it, Epoch=11, LR=1.02e-6, Train_Loss=19.5]
100%|██████████| 62/62 [01:35<00:00,  1.53s/it, Epoch=11, LR=1.02e-6, Valid_Loss=20.4]


Valid RMSE: 20.625694274902344



100%|██████████| 991/991 [22:47<00:00,  1.38s/it, Epoch=12, LR=2.56e-6, Train_Loss=19.5]
100%|██████████| 62/62 [01:35<00:00,  1.55s/it, Epoch=12, LR=2.56e-6, Valid_Loss=20.4]


Valid RMSE: 20.623394012451172



100%|██████████| 991/991 [22:33<00:00,  1.37s/it, Epoch=13, LR=7.89e-6, Train_Loss=19.6]
100%|██████████| 62/62 [01:36<00:00,  1.55s/it, Epoch=13, LR=7.89e-6, Valid_Loss=20.5]


Valid RMSE: 20.792783737182617



100%|██████████| 991/991 [22:33<00:00,  1.37s/it, Epoch=14, LR=1.66e-5, Train_Loss=19.6]
100%|██████████| 62/62 [01:33<00:00,  1.51s/it, Epoch=14, LR=1.66e-5, Valid_Loss=20.4]


Valid RMSE: 20.61182403564453



100%|██████████| 991/991 [22:35<00:00,  1.37s/it, Epoch=15, LR=2.8e-5, Train_Loss=19.4]
100%|██████████| 62/62 [01:36<00:00,  1.56s/it, Epoch=15, LR=2.8e-5, Valid_Loss=20.4]


Valid RMSE: 20.5880184173584
[36mValidation Loss Improved (20.588674545288086 ---> 20.5880184173584)
Model Saved[0m



100%|██████████| 991/991 [22:38<00:00,  1.37s/it, Epoch=16, LR=4.12e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:36<00:00,  1.56s/it, Epoch=16, LR=4.12e-5, Valid_Loss=20.4]


Valid RMSE: 20.58802032470703



100%|██████████| 991/991 [22:38<00:00,  1.37s/it, Epoch=17, LR=5.52e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:33<00:00,  1.51s/it, Epoch=17, LR=5.52e-5, Valid_Loss=20.4]


Valid RMSE: 20.68807029724121



100%|██████████| 991/991 [22:39<00:00,  1.37s/it, Epoch=18, LR=6.87e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:37<00:00,  1.57s/it, Epoch=18, LR=6.87e-5, Valid_Loss=20.4]


Valid RMSE: 20.627836227416992



100%|██████████| 991/991 [22:42<00:00,  1.37s/it, Epoch=19, LR=8.08e-5, Train_Loss=19.4]
100%|██████████| 62/62 [01:34<00:00,  1.52s/it, Epoch=19, LR=8.08e-5, Valid_Loss=20.4]


Valid RMSE: 20.642253875732422



100%|██████████| 991/991 [22:39<00:00,  1.37s/it, Epoch=20, LR=9.05e-5, Train_Loss=19.5]
100%|██████████| 62/62 [01:36<00:00,  1.55s/it, Epoch=20, LR=9.05e-5, Valid_Loss=20.4]


Valid RMSE: 20.59156036376953

Training complete in 8h 4m 15s
Best RMSE: 20.5880


In [27]:
# Close the W&B run started above; the run summary is displayed below.
run.finish()

VBox(children=(Label(value=' 3788.79MB of 3788.79MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, ma…

0,1
Train Loss,█▃▃▄▃▃▃▂▄▃▃▃▄▄▂▃▃▃▁▂
Valid Loss,▃▂▂█▁▂▁▁▂▁▂▂▇▁▁▁▄▂▂▁
Valid RMSE,▄▂▂█▁▂▂▁▂▁▂▂▇▂▁▁▄▂▃▁

0,1
Best RMSE,20.58802
Train Loss,19.4573
Valid Loss,20.35369
Valid RMSE,20.59156
