In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree. The print that would list each file is
# commented out, so the loop body currently does nothing per file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [None]:
import sys

# Data manipulation
import numpy as np
import pandas as pd

# Utils
import gc
import math
import random
import threading
from glob import glob
from urllib import request
from urllib.parse import urlencode

import cv2
from tqdm import tqdm

sys.path.append('../input/pytorch-install/tez/')
sys.path.append('../input/pytorch-install/pytorch-image-models/')

import timm
import tez
from tez.callbacks import EarlyStopping

# Augmentations
import albumentations

# Pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Rapids
import cuml, pickle
from cuml.svm import SVR
print('RAPIDS version',cuml.__version__,'\n')

# Boost
import xgboost as xgb

# Sklearn
import sklearn

# Sklearn model selection
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, auc, roc_curve, roc_auc_score, classification_report, confusion_matrix

# Wandb
# import wandb

# try:
#     from kaggle_secrets import UserSecretsClient
#     user_secrets = UserSecretsClient()
#     api_key = user_secrets.get_secret("wandb_api")
#     wandb.login(key=api_key)
#     anony = None
# except:
#     anony = "must"
#     print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

# Set display options
# Pandas display limits, set one option per call
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', None)

# Report interpreter and library versions for this run
print(f'Python {sys.version}')
print(f'NumPy {np.__version__}')
print(f'Pandas {pd.__version__}')
print(f'Pytorch Version: {torch.__version__}')
print(f'Scikit-Learn {sklearn.__version__}')

# Arguments

In [None]:
class cfg:
    """Static configuration namespace; values are read as ``cfg.<name>``."""

    # --- Reproducibility / cross-validation ---
    seed = 4221
    kfolds = 10
    bfolds = 1

    # --- Data and batching ---
    image_path = '../input/petfinder-pawpularity-score'
    target = 'Pawpularity'
    image_size = 384
    batch_size = 16

    # --- Saved artifacts ---
    pretrained_path = '../input/pf-st-patch4'
    # None -> the regression head is fitted in inf_test; otherwise a directory to load it from
    load_reg_from = None
    #load_reg_from = '../input/pf-st-patch4'
    # Directory from which inf_test reads train.csv
    load_train_from = '.'
    #load_train_from = '../input/pf-st-patch4'

    # --- Dweet logging ---
    dweet_channel = 'pytorch-pawpularity'
    dweet_enabled = False

# Load datasets

In [None]:
# Load datasets
# Read the competition tables from cfg.image_path so that the CSV files and the
# image paths built below always come from the same dataset root.
train = pd.read_csv(f'{cfg.image_path}/train.csv')
test = pd.read_csv(f'{cfg.image_path}/test.csv')
train['file_path'] = cfg.image_path + '/train/' + train['Id'] + '.jpg'
test['file_path'] = cfg.image_path + '/test/' + test['Id'] + '.jpg'
# Every column other than the identifier, the target and the image path is
# treated as a tabular ("dense") feature.
dense_features = [col for col in train.columns if col not in ['Id', cfg.target, 'file_path']]

# 'Id' is only needed to build file_path, so drop it afterwards.
train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

def create_folds(df, splits=5):
    """Assign a stratified fold id to every row of ``df``.

    The target column (``cfg.target``) is scaled by 1/100 into ``norm_score``,
    cut into equal-width bins, and the bin index is used as the stratification
    label for a shuffled ``StratifiedKFold`` seeded with ``cfg.seed``.

    Args:
        df: DataFrame containing the ``cfg.target`` column. It is modified in
            place: columns ``norm_score``, ``bins`` and ``kfold`` are added or
            overwritten.
        splits: number of folds.

    Returns:
        The same DataFrame object, with an integer ``kfold`` column holding the
        index of the fold in which the row is a validation row.

    Side effects:
        Draws a histogram of the bins and a bar chart of the fold sizes through
        the pandas plotting API.
    """
    df['norm_score'] = df[cfg.target]/100
    # NOTE(review): this is a Sturges-like rule but with the 3.3 factor applied to
    # log2, which yields more bins than the classical formula. It is left as is
    # because changing it would change the fold assignment.
    num_bins = int(np.floor(1+(3.3)*(np.log2(len(df)))))
    df['bins'] = pd.cut(df['norm_score'], bins=num_bins, labels=False)
    df['bins'].hist()
    df['kfold'] = -1

    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=cfg.seed)

    # Look the column up by name: 'kfold' is only the last column when it did not
    # already exist in df, so a positional -1 could silently overwrite other data.
    kfold_col = df.columns.get_loc('kfold')
    for fold, (_, valid_idx) in enumerate(skf.split(df.index, df['bins'])):
        df.iloc[valid_idx, kfold_col] = fold

    df['kfold'] = df['kfold'].astype('int')
    df.kfold.value_counts().plot.bar()

    return df

# Assign cfg.kfolds stratified folds, then save the table (including the fold
# column) as train.csv in the current directory.
train = create_folds(train, splits=cfg.kfolds)
train.to_csv('train.csv', index=False)
train.head()

# Utils

In [None]:
# Dweet
class Logger(object):
    """Echo messages to the console and forward them to a dweet.io channel.

    The object exposes ``write`` and ``flush`` so it can stand in for a
    file-like stream. Each message is sent on its own thread so that the
    caller does not wait for the HTTP request.
    """

    # Channel name used in the dweet.io URLs.
    CHANNEL_NAME = cfg.dweet_channel

    def __init__(self):
        # Remember the stream active at construction time; messages are echoed
        # to it even if sys.stdout is later replaced by this object.
        self.terminal = sys.stdout

    def write(self, message):
        """Echo ``message`` and post it to the channel; a lone newline is ignored."""
        if message != '\n':
            self.terminal.write(message + '\n')
            quoted = urlencode({'msg': message})
            threading.Thread(target=self.send, args=(quoted,)).start()

    def flush(self):
        """No-op, present for file-like compatibility."""
        pass

    @staticmethod
    def send(msg):
        """Post an already URL-encoded query string ``msg`` to the dweet channel.

        Failures are reported on the console and never raised, so a logging
        problem cannot interrupt the run.
        """
        url = 'https://dweet.io/dweet/for/' + Logger.CHANNEL_NAME + '?' + msg
        # If sys.stdout has been replaced by a Logger, write to its underlying
        # stream instead: print() would re-enter write() and spawn sends forever.
        console = getattr(sys.stdout, 'terminal', sys.stdout)
        console.write(f'Check on https://dweet.io/get/dweets/for/{Logger.CHANNEL_NAME}\n')
        try:
            # The timeout keeps an unreachable server from leaving the thread
            # blocked indefinitely; the context manager closes the response.
            with request.urlopen(url, timeout=10) as response:
                response.read()
        except Exception as e:
            console.write(f'Dweet failed: {e}\n')

# Optional remote logging, controlled by cfg.dweet_enabled.
if cfg.dweet_enabled:
    # Redirecting all of stdout through the Logger is left disabled; only the
    # explicit instance below is used.
    #sys.stdout = Logger()
    mydweet = Logger()
    mydweet.write('Waiting for metrics...')

# Why divide the target values by 100 and multiply sigmoid(x) by 100?
# This converts the problem from an RMSE loss to a BCE loss.
# Different losses encourage the model to learn in different ways. If you try both RMSE (leaving the targets as is) and BCE (dividing the targets by 100),
# BCE gives more accurate predictions. Why this is true is hard to say.
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar ``x``.

    The negative branch uses the algebraically equal form e^x / (1 + e^x) so
    that large negative inputs return 0.0 instead of raising OverflowError
    from ``math.exp``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

# RMSE
def get_score(y_true, y_preds):
    """Return the root mean squared error between ``y_true`` and ``y_preds``.

    Both arguments may be NumPy arrays or plain sequences of numbers; they are
    converted with ``np.asarray`` so that two Python lists can be subtracted
    element-wise.
    """
    diff = np.asarray(y_true) - np.asarray(y_preds)
    return np.sqrt(np.mean(diff**2))

def set_seed(seed = 42):
    """Seed NumPy, Python's ``random`` and PyTorch (CPU and CUDA) with ``seed``.

    Also switches cuDNN to deterministic mode with benchmarking off, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.
    """
    # Same seeding order as before: NumPy, random, torch CPU, torch CUDA
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN needs both flags set to avoid non-deterministic algorithm selection
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the random number generators with the configured seed.
set_seed(cfg.seed)

class BuildDataset:
    """Indexable dataset returning an image tensor, tabular features and a target.

    Args:
        image_paths: indexable collection of image file paths.
        dense_features: 2-D array indexed as ``dense_features[item, :]``.
        targets: indexable collection with one target per image.
        augmentations: callable invoked as ``augmentations(image=...)`` and
            returning a mapping with an ``"image"`` entry, or ``None`` to skip it.
    """

    def __init__(self, image_paths, dense_features, targets, augmentations):
        self.image_paths = image_paths
        self.dense_features = dense_features
        self.targets = targets
        self.augmentations = augmentations

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load sample ``item`` and return it as a dict of float tensors.

        Returns:
            dict with keys ``"image"`` (channels-first), ``"features"`` and
            ``"targets"``.

        Raises:
            FileNotFoundError: if the image cannot be read from disk.
        """
        path = self.image_paths[item]
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports a missing or unreadable file by returning None;
            # fail here with the path rather than with an opaque cvtColor error.
            raise FileNotFoundError(f'Could not read image: {path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]

        # Height x width x channel -> channel x height x width
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        features = self.dense_features[item, :]
        targets = self.targets[item]

        return {
            "image": torch.tensor(image, dtype=torch.float),
            "features": torch.tensor(features, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.float),
        }

class BuildModel(tez.Model):
    """timm backbone with a 128-wide head plus a small dense regression head.

    ``forward`` returns the regression output concatenated with the backbone
    embedding and the tabular features, so callers can reuse the embedding.
    """

    def __init__(self, model_name):
        super().__init__()
        # Build the backbone without pretrained weights and swap its head for a 128-d projection
        backbone = timm.create_model(model_name, pretrained=False, in_chans=3)
        backbone.head = nn.Linear(backbone.head.in_features, 128)
        self.model = backbone
        self.dropout = nn.Dropout(0.1)
        # 140 inputs = 128 embedding values + the tabular features concatenated in forward
        self.dense1 = nn.Linear(140, 64)
        self.dense2 = nn.Linear(64, 1)

    def forward(self, image, features, targets=None):
        """Return ``(output, 0, {})``; ``targets`` is accepted but not used.

        ``output`` has the regression value in column 0, followed by the
        backbone embedding (before dropout) and then ``features``.
        """
        embedding = self.model(image)
        head_input = torch.cat([self.dropout(embedding), features], dim=1)
        score = self.dense2(self.dense1(head_input))

        combined = torch.cat([score, embedding, features], dim=1)
        return combined, 0, {}

# Preprocessing pipeline passed to every BuildDataset in this notebook: resize to a
# square of cfg.image_size, then normalise each channel.
# NOTE(review): the mean/std values look like the commonly used ImageNet statistics — confirm.
augs = albumentations.Compose(
    [
        albumentations.Resize(cfg.image_size, cfg.image_size, p=1),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

def inf():
    """Predict the test set with every fold's network and write ``submission.csv``.

    For each of the ``cfg.kfolds`` checkpoints ``{cfg.pretrained_path}/model_f{fold}.bin``
    the network output is passed through ``sigmoid`` and scaled by 100; the
    per-fold predictions are then averaged per test row.

    Returns:
        None. The result is written to ``submission.csv`` in the current directory.
    """
    super_final_preds = []

    # The test set is the same for every fold, so build the dataset once.
    test_dataset = BuildDataset(
        image_paths=test['file_path'].values,
        dense_features=test[dense_features].values,
        targets=np.ones(len(test['file_path'].values)),
        augmentations=augs,
    )

    for fold_ in range(cfg.kfolds):
        model = BuildModel(model_name="swin_large_patch4_window12_384")
        model.load(f"{cfg.pretrained_path}/model_f{fold_}.bin", device="cuda", weights_only=True)

        test_preds = model.predict(test_dataset, batch_size=2*cfg.batch_size, n_jobs=-1)

        final_test_preds = []
        for preds in tqdm(test_preds):
            # BuildModel.forward returns [score, embedding, features]; only the
            # first column is the prediction.
            final_test_preds.extend(preds[:, :1].ravel().tolist())

        final_test_preds = [sigmoid(x) * 100 for x in final_test_preds]
        super_final_preds.append(final_test_preds)

        # Release the network before the next fold's checkpoint is loaded.
        del model, test_preds
        gc.collect()

    super_final_preds = np.mean(np.column_stack(super_final_preds), axis=1)
    submission = pd.read_csv("../input/petfinder-pawpularity-score/sample_submission.csv")
    submission["Pawpularity"] = super_final_preds
    submission.to_csv('submission.csv', index=False)

def _split_head_and_embeddings(batches, width=128+12):
    """Split prediction batches into raw NN scores and the feature matrix.

    Each batch is indexed as a 2-D array: column 0 is the raw network score and
    the remaining ``width`` columns are the values fed to the regression head.

    Returns:
        (list of raw scores, float64 array with ``width`` columns).
    """
    logits = []
    # The empty float64 seed keeps the result dtype and the (0, width) shape
    # when there are no batches; concatenating once avoids quadratic copying.
    chunks = [np.empty((0, width))]
    for batch in batches:
        logits.extend(batch[:, :1].ravel().tolist())
        chunks.append(batch[:, 1:])
    return logits, np.concatenate(chunks, axis=0)


def inf_test(prefix_dir):
    """Fold-wise inference that blends the NN head with a per-fold SVR head.

    For each checkpoint ``{prefix_dir}{fold}.bin`` the function:
      1. loads the network and predicts that fold's validation rows (OOF);
      2. fits an SVR on the network's embedding output for the remaining rows
         and pickles it, or loads a pickled one from ``cfg.load_reg_from``;
      3. predicts the test set with both heads.
    It then prints RMSE for both heads, searches the blend weight on the OOF
    predictions in steps of 0.05, and writes the blended, fold-averaged test
    predictions to ``submission.csv``.

    Args:
        prefix_dir: path prefix of the checkpoints. The number of files matching
            ``{prefix_dir}*.bin`` gives the number of folds ``n``; the files are
            expected to be named ``{prefix_dir}0.bin`` ... ``{prefix_dir}{n-1}.bin``.

    Raises:
        FileNotFoundError: if no file matches ``{prefix_dir}*.bin``.
    """
    # Per-fold accumulators ("o" = NN head, "n" = regression head)
    final_y_val = []
    final_o_oof_preds = []
    final_n_oof_preds = []
    final_o_test_preds = []
    final_n_test_preds = []

    nfiles = len(glob(f'{prefix_dir}*.bin'))
    if nfiles == 0:
        # Without this guard the function fails later with an unrelated error.
        raise FileNotFoundError(f'No model files found matching {prefix_dir}*.bin')

    train_df = pd.read_csv(f'{cfg.load_train_from}/train.csv')

    # The test set is the same for every fold, so build the dataset once.
    test_dataset = BuildDataset(
        image_paths=test['file_path'].values,
        dense_features=test[dense_features].values,
        targets=np.ones(len(test['file_path'].values)),
        augmentations=augs
    )

    # Fixed blend weight used for the per-fold and the overall 50/50 report
    w = 0.5

    for fold in range(nfiles):
        # Each fold must be scored with the checkpoint trained for that fold;
        # reusing checkpoint 0 everywhere would leak training rows into the OOF score.
        fileid = fold
        model_path = f'{prefix_dir}{fileid}.bin'
        print(f'Number of files: {nfiles} | Loading file {model_path} ...')
        model = BuildModel(model_name="swin_large_patch4_window12_384")
        model.load(model_path, device="cuda", weights_only=True)

        df_train = train_df[train_df.kfold != fold].reset_index(drop=True)
        df_val = train_df[train_df.kfold == fold].reset_index(drop=True)

        y_train = df_train[cfg.target].values
        y_val = df_val[cfg.target].values

        final_y_val.append(y_val)

        print(f'\n===== File ID: {fileid} | Fold {fold+1}/{nfiles} ===============================================================================================')

        print('Predicting OOF (Validation) from NN...')
        valid_dataset = BuildDataset(
            image_paths=df_val['file_path'].values,
            dense_features=df_val[dense_features].values,
            targets=y_val/100.0,
            augmentations=augs,
        )

        val_preds = model.predict(valid_dataset, batch_size=2*cfg.batch_size, n_jobs=-1)
        val_logits, x_val = _split_head_and_embeddings(val_preds)
        prev_oof_preds = np.array([sigmoid(x) * 100 for x in val_logits])

        # Fit or load the regression head
        name = f'REG_fold_{fold}.pkl'
        if cfg.load_reg_from is None:
            reg_path = name
            train_dataset = BuildDataset(
                image_paths=df_train['file_path'].values,
                dense_features=df_train[dense_features].values,
                targets=df_train[cfg.target].values/100.0,
                augmentations=augs,
            )

            print('Extracting train embedding...')
            train_preds = model.predict(train_dataset, batch_size=2*cfg.batch_size, n_jobs=-1)
            _, x_train = _split_head_and_embeddings(train_preds)

            print(f'Train Feature Set Shape: {x_train.shape}')

            reg_model = SVR(C=20.0)
            reg_model.fit(x_train, y_train)

            # Save the fitted head; the context manager closes the file handle
            with open(reg_path, 'wb') as fh:
                pickle.dump(reg_model, fh)
        else:
            reg_path = f'{cfg.load_reg_from}/{name}'
            print(f'Loading Regression model {reg_path}')
            # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
            with open(reg_path, 'rb') as fh:
                reg_model = pickle.load(fh)

        print('Predicting Regression validation...')
        y_pred = reg_model.predict(x_val)

        final_o_oof_preds.append(prev_oof_preds)
        final_n_oof_preds.append(y_pred)

        print('Predicting Test from NN...')
        test_preds = model.predict(test_dataset, batch_size=2*cfg.batch_size, n_jobs=-1)
        test_logits, x_test = _split_head_and_embeddings(test_preds)

        print(f'Test Feature Set Shape: {x_test.shape}')

        prev_test_preds = [sigmoid(x) * 100 for x in test_logits]
        reg_test_preds = reg_model.predict(x_test)

        final_o_test_preds.append(prev_test_preds)
        final_n_test_preds.append(reg_test_preds)

        # OOF scores for this fold
        nn_oof_score = get_score(final_y_val[-1], final_o_oof_preds[-1])
        reg_oof_score = get_score(final_y_val[-1], final_n_oof_preds[-1])

        print(f'Model path: {model_path} | Fold: {fold+1}/{nfiles} | NN OOF Score: {nn_oof_score}')
        print(f'Model path: {reg_path} | Fold: {fold+1}/{nfiles} | Regression OOF Score: {reg_oof_score}')

        # Fixed-weight blend of the two heads for this fold
        oof2 = (1-w)*np.array(final_o_oof_preds[-1]) + w*np.array(final_n_oof_preds[-1])
        w_score = get_score(final_y_val[-1], oof2)
        print('Ensemble score =',w_score,'\n')

        print('Test Predictions Cumulative...')
        print(final_n_test_preds[:5])

        # Release per-fold objects before the next checkpoint is loaded
        del df_train, df_val, y_train, y_val
        del model, reg_model
        del val_preds, test_preds
        gc.collect()

    # Overall OOF scores across all folds
    true = np.hstack(final_y_val)

    oof = np.hstack(final_o_oof_preds)
    score = get_score(true, oof)
    print('Overall CV NN head score =',score)

    oof2 = np.hstack(final_n_oof_preds)
    score = get_score(true, oof2)
    print('Overall CV Regression head score =',score)

    oof3 = (1-w)*oof + w*oof2
    score = get_score(true, oof3)
    print('Overall CV Ensemble heads score with 50% NN and 50% Regression =',score)

    # Grid-search the regression-head weight on the OOF predictions
    scores = [get_score(true, (1-ww)*oof + ww*oof2) for ww in np.arange(0,1.05,0.05)]
    best_w = np.argmin(scores)*0.05

    # Average each head over the folds, then blend with the best OOF weight
    final_o_test_preds = np.mean(np.column_stack(final_o_test_preds), axis=1)
    final_n_test_preds = np.mean(np.column_stack(final_n_test_preds), axis=1)

    submission = pd.read_csv("../input/petfinder-pawpularity-score/sample_submission.csv")
    submission["Pawpularity"] = (1-best_w)*final_o_test_preds + best_w*final_n_test_preds
    submission.to_csv('submission.csv', index=False)

In [None]:
def inf():
    """Placeholder that replaces the earlier ``inf`` definition; does nothing and returns None."""

# Inference

In [None]:
# Run inference over every checkpoint matching '<prefix_dir>*.bin'; inf_test writes submission.csv.
inf_test(prefix_dir='../input/pf-st-patch4/model_f')