# Get GPU Info

In [None]:
!nvidia-smi

# Installations

In [None]:
!pip install -qq timm
!pip install -qq albumentations==1.0.3
!pip install -qq ttach

# Imports

In [None]:
# General
from tqdm.auto import tqdm
from collections import defaultdict
import pandas as pd
import numpy as np
import os
import random
import time
import gc
import cv2
from PIL import Image
import glob
gc.enable()
pd.set_option('display.max_columns', None)

# Visualisation
import matplotlib.pyplot as plt
%matplotlib inline

# Image Aug
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

# Deep Learning
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, OneCycleLR, CosineAnnealingLR
import torch
import torchvision
import timm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import BCEWithLogitsLoss
#Metrics
from sklearn.metrics import mean_squared_error


In [None]:
import psutil
from memory_profiler import profile

## set device

In [None]:
# Device selection: use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

## set seed

In [None]:
# Random Seed Initialize
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's hash seed, `random`, NumPy and PyTorch (CPU and all CUDA
    devices) and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU and is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode auto-tunes convolution algorithms per run, which can pick
    # different kernels between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything()

## ignore warnings

In [None]:
# Aesthetics: hide noisy library warnings so the cell outputs stay readable.
import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Do not display any warnings at all. This filter has no category restriction,
# so it also covers the three category-specific filters registered above.
import warnings
warnings.filterwarnings('ignore')

In [None]:
def petfinder_rmse(input,target):
    """Return 100 * RMSE between sigmoid(logits) and the target.

    Both tensors are flattened before comparison so that an (N, 1) target is
    compared element-wise with (N,) predictions instead of being broadcast to
    an (N, N) matrix.

    Args:
        input: Tensor of raw logits.
        target: Tensor with the same number of elements as `input`.

    Returns:
        A 0-dim tensor holding the score.
    """
    pred = torch.sigmoid(input.flatten())
    score = 100*torch.sqrt(F.mse_loss(pred, target.flatten()))
    return score

In [None]:
def check_memory():
    """Print the total, used and available system memory reported by psutil."""
    stats = psutil.virtual_memory()
    total, used, available = stats.total, stats.used, stats.available
    print(f"Memory Total:{total}. Used:{used}. Available:{available}")

In [None]:
# History of memory readings, one entry per memory_ckpt() call.
mem_details = []

def memory_ckpt():
    """Record a memory reading and print a note when usage moved a lot.

    Appends field 3 of `psutil.virtual_memory()` to `mem_details`
    (presumably the "used" figure -- confirm the field order on your platform),
    then compares it with the previous reading (step) and with the first
    reading (cumulative). Messages are printed when the step changes by more
    than 50,000,000 in either direction or when the cumulative growth exceeds
    6,000,000,000; printed values are divided by 1073741824 and labelled GB.
    """
    mem_details.append(psutil.virtual_memory()[3])

    latest = mem_details[-1]
    if len(mem_details) > 1:
        mem_used_step = latest - mem_details[-2]
        mem_used_total = latest - mem_details[0]
    else:
        # First checkpoint: nothing to compare against yet.
        mem_used_step = 0
        mem_used_total = 0

    if mem_used_step > 50000000:
        print('Mem Warning, High memory usage step:', round(mem_used_step/1073741824, 2), ' GB\n')
    elif mem_used_step < -50000000:
        print('Mem Note, High memory release step:', round(mem_used_step/1073741824, 2), ' GB\n')

    if mem_used_total > 6000000000:
        print('Mem Warning, High memory usage cumulatively by the code in the kernel:', round(mem_used_total/1073741824,2), ' GB\n')
        print('Total Memory used at start of kernel before line 1:', round(mem_details[0]/1073741824,2), ' GB\n')
        print('Total Memory used as of this step:', round(latest/1073741824,2), ' GB\n')

# CFG

In [None]:
# Fold configuration.
# N_FOLDS is the number of stratified splits; TRAIN_FOLDS lists the fold ids
# that the Run loop actually trains. Alternative settings kept for reference:
# TRAIN_FOLDS = [0, 1 ,2 ,3, 4]
# N_FOLDS = 5

# TRAIN_FOLDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# N_FOLDS = 10
TRAIN_FOLDS = [0, 1]
N_FOLDS = 10

In [None]:
# Central hyper-parameter dictionary read by the transforms, model, scheduler
# and training loop below.
params = {
    'model': 'swin_large_patch4_window7_224_in22k',  # timm model name (see PetNet)
    'dense_features': ['is_cat','n_pets','pet_ratio'],  # tabular columns fed next to the image
    'pretrained': True,
    'inp_channels': 3,
    'im_size': 224,  # side length used by the Resize transform
    'device': device,
    'lr': 2e-5,  # AdamW learning rate
    'weight_decay': 1e-6,
    'batch_size': 32,
    'num_workers' : 2,  # DataLoader worker processes
#     'epochs': 8,
    'epochs': 2,
    'out_features': 1,
    'dropout': 0.5,
    'num_fold': N_FOLDS,
#     'mixup': True,
    'mixup': False,  # selects train_mixup_fn (True) or train_fn (False) in the Run loop
    'mixup_alpha': 1.0,  # Beta(alpha, alpha) parameter used by mixup_data
    'scheduler_name': 'CosineAnnealingWarmRestarts',  # branch taken in get_scheduler
    'T_0': 5,
    'T_max': 5,
    'T_mult': 1,
    'min_lr': 1e-6,
    'max_lr': 3e-5,
    'patience': 2  # epochs without RMSE improvement before early stopping
}

# Load Data

In [None]:
# Input locations: competition data, plus the pre-built training CSV from
# another dataset.
csv_dir = '../input/petfinder-pawpularity-score'
train_dir = '../input/petfinder-pawpularity-score/train'
# test_dir = '../input/petfinder-pawpularity-score/test'

train_file_path = '../input/make-new-csv/train_add_f.csv'
sample_sub_file_path = os.path.join(csv_dir, 'sample_submission.csv')

print(f'Train file: {train_file_path}')
# Label fixed: this path is the sample submission, not the training file.
print(f'Sample submission file: {sample_sub_file_path}')

In [None]:
# Load the training table from the CSV at train_file_path.
train_df = pd.read_csv(train_file_path)
# test_df = pd.read_csv(sample_sub_file_path)

In [None]:
# Remove every row that contains at least one missing value.
train_df = train_df.dropna(axis=0, how='any')

In [None]:
# Store pet_ratio as float32.
train_df = train_df.astype({"pet_ratio": 'float32'})

In [None]:
def return_filpath(name, folder=train_dir):
    """Return the path of the file `<name>.jpg` inside `folder`."""
    filename = f'{name}.jpg'
    return os.path.join(folder, filename)

In [None]:
# Resolve each Id to the path of its image file.
train_df['image_path'] = train_df['Id'].apply(return_filpath)

# Set fold

In [None]:
# Sturges' rule: k = 1 + log2(n) bins for n samples.
# (The 3.3 factor belongs to the log10 form, k = 1 + 3.322 * log10(n); combining
# it with log2 produced far more bins than the rule prescribes.)
num_bins = int(np.floor(1 + np.log2(len(train_df))))
# Discretise the target so the folds can be stratified on it.
train_df['bins'] = pd.cut(train_df['Pawpularity'], bins=num_bins, labels=False)
train_df['bins'].hist();

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Assign every row the id of the stratified fold in which it is held out.
train_df['fold'] = -1
# Look the column up by name: a positional -1 only works while 'fold' is the
# last column, which is no longer true if this cell is executed a second time.
fold_col = train_df.columns.get_loc('fold')

strat_kfold = StratifiedKFold(n_splits=N_FOLDS, random_state=RANDOM_SEED, shuffle=True)
# split() yields (train_indices, held_out_indices); only the held-out part is needed.
for i, (_, valid_index) in enumerate(strat_kfold.split(train_df.index, train_df['bins'])):
    train_df.iloc[valid_index, fold_col] = i

train_df['kfold'] = train_df['fold'].astype('int')

train_df.fold.value_counts().plot.bar();

In [None]:
# Name of the regression target column.
target = ['Pawpularity']
# Columns removed from the frame: the Id, the stratification helpers ('bins',
# 'fold' -- 'kfold' is kept and used by the Run loop) and the metadata flags.
drop_features = ['Id', 'bins', 'fold',
                 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory','Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']
# NOTE(review): this rebinds train_df, so running the cell twice raises a
# KeyError because the columns are already gone.
train_df = train_df.drop(columns=drop_features)
# Remaining column names after the drop.
cols = list(train_df.columns)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

## 1. Train Augmentations

In [None]:
def get_train_transforms(DIM = params['im_size']):
    """Build the albumentations pipeline applied to training images.

    The pipeline resizes the image to DIM x DIM and converts it with
    ToTensorV2. No Normalize step is applied.

    Args:
        DIM: Target height and width passed to Resize.

    Returns:
        An `albumentations.Compose` object.
    """
    steps = [
        albumentations.Resize(DIM, DIM),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

## 2. Mixup

In [None]:
def mixup_data(x, z, y, params):
    """Mix a batch with a shuffled copy of itself (mixup).

    Args:
        x: Batch of images, first dimension is the batch.
        z: Batch of dense features, first dimension is the batch.
        y: Batch of targets, first dimension is the batch.
        params: Dict providing 'mixup_alpha'. When it is > 0 the mixing
            weight is drawn from Beta(alpha, alpha); otherwise the weight is 1
            and the inputs come back unmixed.

    Returns:
        Tuple `(mixed_x, mixed_z, y_a, y_b, lam)` where `y_a` is the original
        target, `y_b` the target of the shuffled partner and `lam` the mixing
        weight of the original sample.
    """
    alpha = params['mixup_alpha']
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    batch_size = x.size()[0]
    # Create the permutation on the device the batch already lives on.
    # Previously the index was moved to CUDA whenever params['device'] was a
    # GPU, even if the batch was still on the CPU, and indexing a CPU tensor
    # with a CUDA index raises a device-mismatch error.
    index = torch.randperm(batch_size, device=x.device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    mixed_z = lam * z + (1 - lam) * z[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, mixed_z, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the loss against both mixup targets, weighted by `lam`.

    Returns `lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)`.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

## 3. Valid Augmentations

In [None]:
def get_valid_transforms(DIM = params['im_size']):
    """Build the albumentations pipeline applied to validation images.

    The pipeline resizes the image to DIM x DIM and converts it with
    ToTensorV2. No Normalize step is applied.

    Args:
        DIM: Target height and width passed to Resize.

    Returns:
        An `albumentations.Compose` object.
    """
    steps = [
        albumentations.Resize(DIM, DIM),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

# Dataset

In [None]:
class CuteDataset(Dataset):
    """Dataset yielding `(image, dense_features, label)` triples.

    Args:
        images_filepaths: Indexable collection of image file paths.
        dense_features: 2-D array indexed as `dense_features[idx, :]`.
        targets: Indexable collection of labels, aligned with the paths.
        transform: Optional callable used as `transform(image=img)['image']`.
    """

    def __init__(self, images_filepaths, dense_features, targets, transform=None):
        self.images_filepaths = images_filepaths
        self.dense_features = dense_features
        self.targets = targets
        self.transform = transform

    def __len__(self):
        """Return the number of images."""
        return len(self.images_filepaths)

    def __getitem__(self, idx):
        """Load, colour-convert and transform the image at position `idx`.

        Returns:
            Tuple `(image, dense, label)`; `image` is an int32 tensor.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        image_filepath = self.images_filepaths[idx]
        image = cv2.imread(image_filepath)
        # cv2.imread signals a missing/unreadable file by returning None;
        # fail with the offending path instead of a cryptic cvtColor error.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_filepath}")
        # OpenCV loads images as BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image=image)['image']

        # as_tensor accepts both ndarrays and tensors, avoiding the
        # copy-construct warning torch.tensor() emits for tensor inputs.
        image = torch.as_tensor(image, dtype=torch.int32)
        dense = self.dense_features[idx, :]
        label = self.targets[idx]

        return image, dense, label

## 1. Visualize Some Examples

In [None]:
# Build a dataset over the whole training frame.
# NOTE(review): targets here are the raw Pawpularity values, whereas the Run
# loop divides them by 100; X_train, X_train_dense, y_train and train_dataset
# are all re-bound inside the Run loop.
X_train = train_df['image_path']
X_train_dense = train_df[params['dense_features']]
y_train = train_df['Pawpularity']
train_dataset = CuteDataset(
    images_filepaths=X_train.values,
    dense_features=X_train_dense.values,
    targets=y_train.values,
    transform=get_train_transforms()
)

# Metrics

In [None]:
def usr_rmse_score(output, target):
    """RMSE between 100 * sigmoid(output) and 100 * target.

    Computed with NumPy so the result does not depend on the `squared`
    argument of scikit-learn's `mean_squared_error`, which newer releases
    deprecate and remove.

    Args:
        output: Tensor of raw logits, shape (N,) or (N, k).
        target: Tensor of targets with a matching number of rows/columns.

    Returns:
        The RMSE as a float. With several output columns the per-column RMSEs
        are averaged.

    Raises:
        ValueError: If predictions and targets do not have matching shapes.
    """
    y_pred = torch.sigmoid(output).detach().cpu().numpy() * 100
    y_true = target.detach().cpu().numpy() * 100

    # Bring both to (N, columns) so an (N,) target lines up with (N, 1) logits
    # instead of broadcasting against them.
    y_pred = y_pred.reshape(len(y_pred), -1).astype(np.float64)
    y_true = y_true.reshape(len(y_true), -1).astype(np.float64)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"Shape mismatch: predictions {y_pred.shape} vs targets {y_true.shape}"
        )

    per_column_rmse = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return float(per_column_rmse.mean())

In [None]:
def get_scheduler(optimizer, scheduler_params=params):
    """Create the learning-rate scheduler named in `scheduler_params`.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        scheduler_params: Dict with 'scheduler_name' plus the settings of the
            chosen scheduler ('T_0'/'T_mult'/'min_lr', 'max_lr'/'num_fold'/
            'batch_size'/'epochs', or 'T_max'/'min_lr').

    Returns:
        The scheduler instance.

    Raises:
        ValueError: If 'scheduler_name' is not one of the supported names.
    """
    name = scheduler_params['scheduler_name']

    if name == 'CosineAnnealingWarmRestarts':
        return CosineAnnealingWarmRestarts(
            optimizer,
            T_0=scheduler_params['T_0'],
            # Honour the configured multiplier; 1 is the scheduler's default.
            T_mult=scheduler_params.get('T_mult', 1),
            eta_min=scheduler_params['min_lr'],
            last_epoch=-1
        )

    if name == 'OneCycleLR':
        num_fold = scheduler_params['num_fold']
        # Batches per epoch when training on (num_fold - 1) / num_fold of train_df.
        steps_per_epoch = int(((num_fold - 1) * train_df.shape[0]) / (num_fold * scheduler_params['batch_size'])) + 1
        return OneCycleLR(
            optimizer,
            max_lr=scheduler_params['max_lr'],
            steps_per_epoch=steps_per_epoch,
            epochs=scheduler_params['epochs'],
        )

    if name == 'CosineAnnealingLR':
        return CosineAnnealingLR(
            optimizer,
            T_max=scheduler_params['T_max'],
            eta_min=scheduler_params['min_lr'],
            last_epoch=-1
        )

    # Fail with a clear message instead of an UnboundLocalError.
    raise ValueError(f"Unknown scheduler_name: {name!r}")

# Loss

In [None]:
from torch.nn.modules.loss import _WeightedLoss

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with optional label smoothing.

    Args:
        weight: Optional per-element rescaling weight passed to the BCE call.
        reduction: 'mean', 'sum' or 'none'.
        smoothing: Value in [0, 1); targets are pulled towards 0.5 by it.
        pos_weight: Optional weight of positive examples passed to the BCE call.
    """

    def __init__(self, weight = None, reduction = 'mean', smoothing = 0.0, pos_weight = None):
        # The parent constructor already stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.pos_weight = pos_weight

    @staticmethod
    def _smooth(targets, n_labels, smoothing = 0.0):
        """Return `targets * (1 - smoothing) + 0.5 * smoothing`.

        `n_labels` is accepted for interface compatibility and not used.
        """
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the (smoothed) BCE-with-logits loss.

        Args:
            inputs: Raw logits.
            targets: Targets with the same shape as `inputs`.

        Returns:
            The loss reduced according to `self.reduction`.
        """
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1), self.smoothing)
        # Pass the reduction through. Previously the call used the default
        # 'mean', so 'sum' returned the mean and 'none' returned a scalar.
        return F.binary_cross_entropy_with_logits(
            inputs, targets, self.weight,
            pos_weight=self.pos_weight, reduction=self.reduction
        )

# Model

In [None]:
class PetNet(nn.Module):
    """Image backbone plus dense-feature head.

    A timm backbone maps the image to a 128-d embedding; the embedding goes
    through dropout, is concatenated with the dense features and passed to a
    small MLP that produces `out_features` logits.

    Args:
        model_name: timm model name.
        out_features: Number of output logits.
        inp_channels: Number of image channels.
        pretrained: Whether to load pretrained backbone weights.
        num_dense: Number of dense (tabular) features per sample.
        dropout: Dropout probability used on the embedding and inside the MLP.
    """

    def __init__(self, model_name=params['model'], out_features=params['out_features'], inp_channels=params['inp_channels'],
                 pretrained=params['pretrained'], num_dense=len(params['dense_features']), dropout=params['dropout']):
        super().__init__()
        # Let timm build the 128-d classifier via `num_classes` instead of
        # overwriting `.head` by hand: in timm releases where the head module
        # also performs pooling, replacing it with a bare nn.Linear breaks the
        # forward pass, and many backbones do not expose `.head` at all.
        self.model = timm.create_model(model_name, pretrained=pretrained, in_chans=inp_channels, num_classes=128)
        self.fc = nn.Sequential(
            nn.Linear(128 + num_dense, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, out_features)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, image, dense):
        """Return logits for a batch of images and their dense features."""
        embeddings = self.model(image)
        x = self.dropout(embeddings)
        # Append the dense features to the image embedding, column-wise.
        x = torch.cat([x, dense], dim=1)
        output = self.fc(x)
        return output

# Train and Validation Functions

## 1. Train Function

In [None]:
def train_mixup_fn(train_loader, model, criterion, optimizer, epoch, params, loss_train_tracker, scheduler=None):
    """Train `model` for one epoch using mixup.

    Args:
        train_loader: Iterable yielding `(images, dense, target)` batches.
        model: Model called as `model(images, dense)`.
        criterion: Loss callable `criterion(output, target)`.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used for progress messages only.
        params: Dict providing 'device' and the mixup settings.
        loss_train_tracker: Accepted for interface compatibility; not used.
        scheduler: Optional scheduler stepped once per batch.

    Note:
        The summary line reads the notebook-level variable `fold`.
    """
    model.train()
    stream = tqdm(train_loader)
    loss = None
    rmse_score = None

    for i, (images, dense, target) in enumerate(stream, start=1):
        if loss is not None:
            del loss

        # Move the batch to the training device BEFORE mixing. mixup_data builds
        # its permutation for params['device']; mixing while the batch was still
        # on the CPU indexed a CPU tensor with a CUDA index and raised an error.
        images = images.to(params['device'], dtype=torch.float32)
        dense = dense.to(params['device'], dtype=torch.float32)
        target = target.to(params['device'], dtype=torch.float32).view(-1, 1)
        images, dense, target_a, target_b, lam = mixup_data(images, dense, target, params)

        output = model(images, dense)

        loss = mixup_criterion(criterion, output, target_a, target_b, lam)

        # Metric is reported against the unmixed target.
        rmse_score = usr_rmse_score(output, target)

        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        optimizer.zero_grad()

        stream.set_description(f"Epoch:{epoch:02}. Train Loss:{loss.item():.4f}. RMSE:{rmse_score:.4f}")
        del images, dense, target, target_a, target_b, lam, output

    if loss is None:
        # Empty loader: nothing was trained, so there is no loss/RMSE to report.
        print(f"Fold: {fold}. Epoch: {epoch}. No training batches were produced.")
    else:
        print(f"Fold: {fold}. Epoch: {epoch}. Train Loss. {loss:.4f}. Train RMSE. {rmse_score:.4f}.")
    del loss, stream, rmse_score
            

In [None]:
def train_fn(train_loader, model, criterion, optimizer, epoch, params, loss_train_tracker, scheduler=None):
    """Train `model` for one epoch without mixup.

    Args:
        train_loader: Iterable yielding `(images, dense, target)` batches.
        model: Model called as `model(images, dense)`.
        criterion: Loss callable `criterion(output, target)`.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used for progress messages only.
        params: Dict providing 'device'.
        loss_train_tracker: Accepted but not used.
        scheduler: Optional scheduler stepped once per batch.

    Note:
        The summary line reads the notebook-level variable `fold`. Memory
        usage is printed before every batch via check_memory().
    """
    model.train()
    progress = tqdm(train_loader)
    loss = images = dense = target = None

    for step, (images, dense, target) in enumerate(progress, start=1):
        # Release the previous step's loss before this step allocates anything.
        if loss is not None:
            del loss

        print(" ")
        print("load data")
        check_memory()

        images = images.to(params['device'], dtype=torch.float32)
        dense = dense.to(params['device'], dtype=torch.float32)
        target = target.to(params['device'], dtype=torch.float32).view(-1, 1)

        output = model(images, dense)
        loss = criterion(output, target)
        rmse_score = usr_rmse_score(output, target)

        # Backward pass, parameter update, optional LR update, then reset grads.
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

        progress.set_description(f"Epoch:{epoch:02}. Train Loss:{loss.item():.4f}. RMSE:{rmse_score:.4f}")
        del images, dense, target, output

    # Values reported here are those of the last batch.
    print(f"Fold: {fold}. Epoch: {epoch}. Train Loss. {loss:.4f}. Train RMSE. {rmse_score:.4f}.")
    del loss, progress, rmse_score

## 2. Validate Function

In [None]:
def validate_fn(valid_loader, model, criterion, epoch, params):
    """Run `model` over the validation loader without gradients.

    Args:
        valid_loader: Iterable yielding `(images, dense, target)` batches.
        model: Model called as `model(images, dense)`.
        criterion: Loss callable `criterion(output, target)`.
        epoch: Epoch number, used for progress messages only.
        params: Dict providing 'device'.

    Returns:
        Tuple `(final_outputs, final_targets)`: lists holding
        100 * sigmoid(logits) and 100 * target for every validation sample.

    Note:
        The summary line reads the notebook-level variable `fold`.
    """
    model.eval()
    print("stream")
    check_memory()
    progress = tqdm(valid_loader)
    final_targets, final_outputs = [], []
    loss = None

    with torch.no_grad():
        for step, (images, dense, target) in enumerate(progress, start=1):
            # Release the previous batch's loss first.
            if loss is not None:
                del loss

            images = images.to(params['device'], dtype=torch.float32)
            dense = dense.to(params['device'], dtype=torch.float32)
            target = target.to(params['device'], dtype=torch.float32).view(-1, 1)

            output = model(images, dense)
            loss = criterion(output, target)
            rmse_score = usr_rmse_score(output, target)
            progress.set_description(f"Epoch: {epoch:02}. Valid Loss:{loss.item():.4f}. RMSE:{rmse_score:.4f}")

            # Collect targets and predictions on the x100 scale.
            targets = (target.detach().cpu().numpy()*100).tolist()
            outputs = (torch.sigmoid(output).detach().cpu().numpy()*100).tolist()
            final_targets.extend(targets)
            final_outputs.extend(outputs)

    # Values reported here are those of the last batch.
    print(f"Fold: {fold}. Epoch: {epoch}. Valid Loss. {loss:.4f}. Valid RMSE. {rmse_score:.4f}.")
    del loss, progress, rmse_score, output, targets, outputs

    return final_outputs, final_targets

In [None]:
# Memory diagnostics: `sys` is used by the variable-size dumps further down,
# and a pympler SummaryTracker instance is created here.
import sys
from pympler.tracker import SummaryTracker
tracker = SummaryTracker()

In [None]:
def print_variable_sizes(namespace=None):
    """Print a table with the `sys.getsizeof` value of every public variable.

    Names starting with an underscore are skipped. Values are looked up
    directly in the namespace dict instead of being passed through `eval`.

    Args:
        namespace: Mapping of variable names to objects. Defaults to the
            module-level (notebook) namespace.
    """
    if namespace is None:
        namespace = globals()
    print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
    print(" ------------------------------------ ")
    # sorted() takes a snapshot of the names, so the mapping may change safely.
    for var_name in sorted(namespace):
        if not var_name.startswith("_"):
            print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(namespace[var_name]),'|'))

print_variable_sizes()

In [None]:
# Total memory of GPU 0 in GiB; 0.0 when no CUDA device is available, so the
# cell also runs on the CPU fallback chosen in the device cell.
tot_mem = round(torch.cuda.get_device_properties(0).total_memory/(1024*1024*1024),1) if torch.cuda.is_available() else 0.0

# Run

In [None]:
# CUDA-only calls below are skipped when the notebook runs on the CPU fallback.
use_cuda = params['device'].type == 'cuda'

for fold in TRAIN_FOLDS:
    print(f"---Train fold:{fold}.---")
    # Data Split to train and Validation
    train = train_df[train_df['kfold'] != fold]
    valid = train_df[train_df['kfold'] == fold]

    X_train = train['image_path']
    X_train_dense = train[params['dense_features']]
    # Targets are divided by 100 for training; metrics multiply back by 100.
    y_train = train['Pawpularity']/100

    # Pytorch Dataset Creation
    train_dataset = CuteDataset(
        images_filepaths=X_train.values,
        dense_features=X_train_dense.values,
        targets=y_train.values,
        transform=get_train_transforms()
    )

    # Pytorch Dataloader creation
    train_loader = DataLoader(
        train_dataset, batch_size=params['batch_size'], shuffle=True,
        num_workers=params['num_workers'], pin_memory=True
    )

    # set valid data
    X_valid = valid['image_path']
    X_valid_dense = valid[params['dense_features']]
    y_valid = valid['Pawpularity']/100

    valid_dataset = CuteDataset(
        images_filepaths=X_valid.values,
        dense_features=X_valid_dense.values,
        targets=y_valid.values,
        transform=get_valid_transforms()
    )
    valid_loader = DataLoader(
        valid_dataset, batch_size=params['batch_size'], shuffle=False,
        num_workers=params['num_workers'], pin_memory=True
    )

    # Model, cost function and optimizer instancing
    model = PetNet()
    model = model.to(params['device'])
    criterion = SmoothBCEwLogits()
    optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'],
                                  weight_decay=params['weight_decay'],
                                  amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # Training and Validation Loop
    best_rmse = np.inf
    best_epoch = np.inf
    best_model_name = None
    early_stop_cnt = 0
    # Pre-bind so the clean-up at the end of the fold cannot hit an undefined name.
    predictions, valid_targets, elapsed_time = None, None, None
    print(f"Start Training")
    print(f"Fold:{fold} is trained.")

    # Train & Valid Loop
    for epoch in range(1, params['epochs'] + 1):
        print(f"Start Training. Fold:{fold}. Epoch:{epoch}")

        # Check Early Stopping: leave the epoch loop instead of spinning
        # through the remaining epochs.
        if early_stop_cnt >= params['patience']:
            print(f"Early stoppping. Fold:{fold} Epoch:{epoch}.")
            break

        # measure elapsed time
        if use_cuda:
            torch.cuda.synchronize()
        start = time.time()
        print(f"Before Train.")
        check_memory()
        # Pass the scheduler by keyword. Passed positionally it landed in the
        # `loss_train_tracker` slot, leaving `scheduler=None`, so the learning
        # rate schedule was never applied. The training functions step the
        # scheduler once per batch.
        train_step = train_mixup_fn if params['mixup'] else train_fn
        train_step(train_loader, model, criterion, optimizer, epoch, params,
                   loss_train_tracker=None, scheduler=scheduler)
        print(f"After Train. Before valid")
        check_memory()
        predictions, valid_targets = validate_fn(valid_loader, model, criterion, epoch, params)

        print(f"After valid.")
        check_memory()
        # sqrt of the MSE: works on every scikit-learn release, unlike the
        # deprecated `squared=False` argument.
        rmse = round(float(np.sqrt(mean_squared_error(valid_targets, predictions))), 5)
        print(f"After Scoring.")
        check_memory()

        if rmse < best_rmse:
            early_stop_cnt = 0
            best_rmse = rmse
            best_epoch = epoch
            # Keep only the best checkpoint of this fold on disk.
            if best_model_name is not None:
                os.remove(best_model_name)
            best_model_name = f"{params['model']}_{epoch}_epoch_f{fold+1}_{rmse}_rmse.pth"
            torch.save(model.state_dict(), best_model_name)
        else:
            early_stop_cnt += 1

        if use_cuda:
            torch.cuda.synchronize()
        elapsed_time = time.time() - start

        print(f"Epoch:{epoch}. Elapsed_time: {elapsed_time/60:.2f} minutes.")
        print(f"Pred RSME rmse :{rmse}")
        print(f"Pred values MAX:{max(predictions)}, MIN:{min(predictions)}")
        print(f"Precision is not improved:{early_stop_cnt} times. Patience:{params['patience']}")

        #memory check function
        res_mem = round(torch.cuda.memory_reserved(0)/(1024*1024*1024),1) if use_cuda else 0.0
        print("Reserved GPU memory: ",res_mem)
        memory_ckpt()

    # Print summary of this fold
    print('')
    print(f'The best RMSE: {best_rmse} for fold {fold+1} was achieved on epoch: {best_epoch}.')
    print(f'The Best saved model is: {best_model_name}')

    # check memory before and after del
    print(f'before delete and gc collect and cuda empty cache')
    check_memory()

    # The optimizer and scheduler hold references to the model parameters (and
    # the optimizer state); drop them too, otherwise the old weights stay
    # allocated while the next fold's model is being created.
    del model, optimizer, scheduler, criterion
    del train, X_train, X_train_dense, y_train, train_dataset, train_loader
    del valid, X_valid, X_valid_dense, y_valid, valid_dataset, valid_loader
    del predictions, valid_targets, elapsed_time

    gc.collect()
    torch.cuda.empty_cache()

    print(f'after delete and gc collect and cuda empty cache')
    check_memory()

    # Size of every public notebook-level variable (direct lookup, no eval).
    print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
    print(" ------------------------------------ ")
    for var_name in dir():
        if not var_name.startswith("_"):
            print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(globals()[var_name]),'|'))

    print(f"---Trained fold:{fold}.---")

In [None]:
# Final table: sys.getsizeof of every public notebook-level variable.
print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
print(" ------------------------------------ ")
for var_name in dir():
    # Skip private / IPython-internal names.
    if var_name.startswith("_"):
        continue
    print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(globals()[var_name]),'|'))