[EDA, FE, folds and stacking (meta data)](https://www.kaggle.com/sagarikajadon/pawpreds-eda-fe-folds-and-stacking-meta-data)

[[pawpreds] efficientnetB3(RMSE) training](https://www.kaggle.com/sagarikajadon/pawpreds-efficientnetb3-rmse-training)

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
import torchvision.models as model
from torch.optim.lr_scheduler import ReduceLROnPlateau

import albumentations as A
from albumentations.pytorch import ToTensorV2

# CONFIG CLASS

In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated)."""

    # --- runtime -----------------------------------------------------------
    # Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    SEED = 1234

    # --- task / backbone ---------------------------------------------------
    # PawDataset divides the target by 100 when this is 'classification'.
    PROBLEM = 'regression'
    MODEL_NAME = 'efficientnet_b3'

    # --- image pipeline ----------------------------------------------------
    IMG_SIZE = 512
    IMAGENET_MEAN = [0.485, 0.456, 0.406]  # RGB
    IMAGENET_STD = [0.229, 0.224, 0.225]  # RGB

    # --- cross-validation --------------------------------------------------
    N_FOLDS = 5

    # --- optimiser / scheduler --------------------------------------------
    # These values are not referenced by the code visible in this notebook.
    LEARNING_RATE = 1e-3
    WEIGHT_DECAY = 0
    T_MAX = 10
    T_0 = 5
    ETA_MIN = 0
    SCHEDULER = 'CosineAnnealingLR'

    # --- batching ----------------------------------------------------------
    BATCH_SIZE = 16       # loader over the training images
    BATCH_SIZE_TEST = 4   # loader over the test images
    EPOCHS = 5

In [None]:
def set_seed(seed):
    """Seed the RNGs used in this notebook with the same value.

    Calls, in order, ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``, then switches the
    cuDNN backend's ``deterministic`` flag on.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

    
# Seed every RNG once, before any data loading or model construction.
set_seed(CFG.SEED)

# READ DATA AND PREPROCESS

In [None]:
# Training table with fold assignments (presumably produced by the companion
# EDA/folds notebook). NOTE(review): PawDataset later reads `image_path`,
# `width` and `height` from it, so the CSV is assumed to already contain
# those columns -- confirm against the upstream notebook.
train= pd.read_csv('../input/pawpreds-eda-fe-folds-and-stacking-meta-data/train_folds.csv')
train_ids= train['Id']
train_targets= train['Pawpularity']
# Competition test table; `image_path`, `width` and `height` are added below.
test= pd.read_csv('../input/petfinder-pawpularity-score/test.csv')

def get_test_file_path(image_id):
    """Build the path of the test JPEG that belongs to ``image_id``."""
    test_dir = '../input/petfinder-pawpularity-score/test'
    return '{}/{}.jpg'.format(test_dir, image_id)


# Attach the on-disk image path to every test row and keep the ids, which are
# reused when the submission table is built.
test['image_path']= test['Id'].apply(get_test_file_path)
test_ids= test['Id']

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with ``width`` and ``height`` columns added.

    Every file listed in ``df['image_path']`` is opened with PIL just long
    enough to read its ``size`` tuple, whose first element is stored as
    ``width`` and second as ``height``.

    Args:
        df: DataFrame with an ``image_path`` column of readable image files.

    Returns:
        A new DataFrame. The caller's ``df`` is not modified. As before, the
        result never contains a ``size`` column.
    """
    def _image_size(path):
        # Close the file handle immediately instead of leaving it to the
        # garbage collector; over thousands of images the open handles
        # would otherwise pile up.
        with Image.open(path) as img:
            return img.size

    sizes = [_image_size(path) for path in df['image_path']]
    # `drop` returns a new frame, so the input is never mutated.
    result = df.drop(columns=['size'], errors='ignore')
    return result.assign(
        width=[size[0] for size in sizes],
        height=[size[1] for size in sizes],
    )

# Add `width` / `height` columns to the test table; both are listed in
# `meta_features` below.
test= preprocess(test)

In [None]:
# Tabular columns handed to PawDataset alongside each image: the metadata
# columns from the competition tables followed by the image dimensions.
meta_features = [
    'Subject Focus',
    'Eyes',
    'Face',
    'Near',
    'Action',
    'Accessory',
    'Group',
    'Collage',
    'Human',
    'Occlusion',
    'Info',
    'Blur',
    # Image dimensions (added to the test table by `preprocess`).
    'width',
    'height',
]

# DATASET CLASS

In [None]:
class PawDataset:
    """Map-style dataset yielding ``(image, meta_features, target)`` triples.

    Args:
        df: DataFrame with an ``image_path`` column, a ``Pawpularity`` column
            and every column named in the module-level ``meta_features``.
        augmentations: Optional callable invoked as
            ``augmentations(image=img)['image']`` (e.g. an albumentations
            ``Compose``). When falsy, the RGB image array is returned as
            loaded.
    """

    def __init__(self, df, augmentations=None):
        self.df = df
        self.image_paths = df['image_path'].values
        self.meta_df = df[meta_features].values

        targets = df['Pawpularity'].values
        if CFG.PROBLEM == 'classification':
            # Classification mode trains on the score divided by 100.
            targets = targets / 100
        self.targets = targets

        self.augmentations = augmentations

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            Tuple of the (optionally augmented) RGB image, the meta-feature
            row as a float tensor, and the target as a float tensor.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        path = self.image_paths[idx]
        image = cv2.imread(path)  # BGR
        if image is None:
            # cv2.imread signals a missing/corrupt file by returning None
            # rather than raising; fail here with the offending path instead
            # of an opaque cvtColor assertion.
            raise FileNotFoundError(f'Could not read image: {path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # RGB

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        meta_data = torch.tensor(self.meta_df[idx, :], dtype=torch.float)
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return image, meta_data, target

In [None]:
# Albumentations pipelines keyed by split. Both end with ImageNet
# normalisation followed by conversion to a tensor.
augmentations= {
    # Training: random resized crop to IMG_SIZE x IMG_SIZE, then random flip,
    # perspective warp and rotation (limit 40, constant border).
    'train': A.Compose([
        A.RandomResizedCrop(height= CFG.IMG_SIZE, width= CFG.IMG_SIZE, scale= (0.85, 1.0)),
        A.Flip(p= 0.7),
        A.Perspective(p= 0.7),
        A.Rotate(limit= 40, p= 0.5, border_mode= cv2.BORDER_CONSTANT),
        A.Normalize(mean= CFG.IMAGENET_MEAN, std= CFG.IMAGENET_STD),
        ToTensorV2()]),
    
    # Validation / inference: plain resize to IMG_SIZE x IMG_SIZE, no random
    # transforms.
    'valid': A.Compose([
        A.Resize(height= CFG.IMG_SIZE, width= CFG.IMG_SIZE),
        A.Normalize(mean= CFG.IMAGENET_MEAN, std= CFG.IMAGENET_STD),
        ToTensorV2()])
}

In [None]:
# PawDataset always reads a 'Pawpularity' column, so give the unlabeled test
# table a placeholder target of zeros.
test['Pawpularity']= np.zeros(test.shape[0])

# MODEL

In [None]:
class PawpularityModel(nn.Module):
    """timm backbone with its classifier replaced by ``nn.Identity``, followed
    by a two-layer fully connected head that emits one value per image.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name, pretrained=True):
        super().__init__()

        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Record the classifier's input width before discarding the
        # classifier; the backbone then returns its features unchanged.
        self.input_dim = backbone.classifier.in_features
        backbone.classifier = nn.Identity()
        self.image_model = backbone

        hidden_dim = self.input_dim // 2
        self.fc1 = nn.Linear(self.input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, image):
        """Return ``(embedding, prediction)`` for a batch of images."""
        emb = self.image_model(image)            # [N, input_dim]
        hidden = F.relu(self.fc1(emb))
        output = self.fc2(hidden).flatten()      # [N]
        return emb, output

# GET EMBEDDINGS

In [None]:
def get_embeddings(dataloader, model):
    """Run ``model`` over ``dataloader`` and stack the image embeddings.

    Args:
        dataloader: Iterable yielding ``(image, meta, target)`` batches. Only
            the image batch is used.
        model: Module exposing ``input_dim`` and returning
            ``(embedding, prediction)`` for an image batch. It is switched to
            eval mode.

    Returns:
        ``numpy.ndarray`` of shape ``(num_samples, model.input_dim)`` with
        rows in dataloader order. An empty loader yields a ``(0, input_dim)``
        array.
    """
    model.eval()
    # Start from an empty (0, input_dim) array so an empty loader still
    # returns the right shape and the result keeps the dtype it had when the
    # array was grown batch by batch.
    chunks = [np.empty(shape=(0, model.input_dim))]

    with torch.no_grad():
        # meta and target are not needed for the forward pass, so they are
        # not moved to the device.
        for image, _meta, _target in dataloader:
            emb, _ = model(image.to(CFG.DEVICE))
            chunks.append(emb.cpu().numpy())

    # Concatenate once at the end; doing it inside the loop re-copies the
    # whole accumulated array on every batch.
    return np.concatenate(chunks)

In [None]:
def get_embeddings_df(train, test, fold):
    """Embed the train and test images with the checkpoint trained for ``fold``.

    Args:
        train: Training DataFrame accepted by ``PawDataset``.
        test: Test DataFrame accepted by ``PawDataset``.
        fold: Integer fold index selecting ``effnetb3_rmse<fold>.pth``.

    Returns:
        ``(train_df, test_df)``: DataFrames with one row per image and integer
        column labels ``0 .. model.input_dim - 1``.
    """
    # Build and load the checkpoint once; the same eval-mode model serves
    # both splits.
    model = PawpularityModel(CFG.MODEL_NAME, pretrained=False).to(CFG.DEVICE)
    checkpoint_path = '../input/pawpreds-efficientnetb3-rmse-training/effnetb3_rmse%d.pth' % fold
    model.load_state_dict(torch.load(checkpoint_path, map_location=CFG.DEVICE))
    columns = list(range(model.input_dim))

    # NOTE(review): training embeddings are extracted through the random
    # 'train' augmentations while test embeddings use 'valid', so the
    # training features are stochastic -- confirm this is intentional.
    # The loader is not shuffled, so rows stay aligned with `train`.
    train_dataset = PawDataset(train, augmentations=augmentations['train'])
    train_loader = data.DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE)
    train_df = pd.DataFrame(get_embeddings(train_loader, model), columns=columns)

    test_dataset = PawDataset(test, augmentations=augmentations['valid'])
    test_loader = data.DataLoader(test_dataset, batch_size=CFG.BATCH_SIZE_TEST)
    test_df = pd.DataFrame(get_embeddings(test_loader, model), columns=columns)

    return train_df, test_df

In [None]:
# Stack LightGBM on top of the CNN embeddings.
# For each of the CFG.N_FOLDS checkpoints, embed train/test, then fit one
# LightGBM regressor per KFold split of the training embeddings.
kfold = KFold(n_splits=10)
# Derive the averaging weight from the splitter so changing n_splits cannot
# silently mis-scale the test predictions.
n_splits = kfold.get_n_splits()

test_preds = np.zeros(test.shape[0])
# Out-of-fold predictions averaged over all checkpoints, so the OOF score
# describes the same ensemble that produces `test_preds` (rather than only
# the last checkpoint).
oof_preds = np.zeros(train.shape[0])

for fold in range(CFG.N_FOLDS):
    train_df, test_df = get_embeddings_df(train, test, fold)

    for train_idx, val_idx in kfold.split(train_df):
        X_train, X_val = train_df.iloc[train_idx], train_df.iloc[val_idx]
        y_train, y_val = train_targets.iloc[train_idx], train_targets.iloc[val_idx]

        lgbr = lgb.LGBMRegressor(objective='rmse', n_estimators=1000)
        # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments
        # were removed in LightGBM 4.0; switch to the `callbacks` argument if
        # the environment is upgraded.
        lgbr.fit(X_train, y_train, eval_metric='rmse',
                 eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=10)

        test_preds += lgbr.predict(test_df) / n_splits
        # Each row is in exactly one validation split per checkpoint.
        oof_preds[val_idx] += lgbr.predict(X_val) / CFG.N_FOLDS

test_preds /= CFG.N_FOLDS

In [None]:
# RMSE of the out-of-fold predictions against the true targets. Computed as
# sqrt(MSE) instead of passing `squared=False`, which scikit-learn deprecated
# in 1.4 and removed in 1.6.
oof_loss = np.sqrt(mean_squared_error(train_targets, oof_preds))
oof_loss

In [None]:
# Display the averaged test predictions.
test_preds

In [None]:
# Submission table: one row per test Id with its predicted Pawpularity.
sub= pd.DataFrame({'Id': test_ids, 'Pawpularity': test_preds})

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index= False)