## Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import torch
import cv2
import matplotlib.patches as patches
from tqdm import tqdm
import os
import shutil
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

## Add image path to training data

In [None]:
# Read the competition's training table (one row per image) and preview it.
df_train = pd.read_csv(
    '../input/petfinder-pawpularity-score/train.csv'
)
df_train.head()

In [None]:
def get_image_file_path(image_id):
    """Return the path of the training JPEG that belongs to *image_id*."""
    train_dir = '/kaggle/input/petfinder-pawpularity-score/train'
    return f'{train_dir}/{image_id}.jpg'

# Attach the on-disk image location to every training row, then preview.
df_train['file_path'] = [
    get_image_file_path(image_id) for image_id in df_train['Id']
]
df_train.head()

## YOLOv5 features

In [None]:
!mkdir /root/.config/Ultralytics/
!cp ../input/yolo-arial/Arial.ttf /root/.config/Ultralytics/Arial.ttf

In [None]:
# Load the YOLOv5x6 detector from the local copy of the YOLOv5 repository,
# using the pre-downloaded weights file.
yolov5x6_model = torch.hub.load(
    '../input/yolov5',
    'custom',
    path='../input/ultralyticsyolov5aweights/yolov5x6.pt',
    source='local',
    force_reload=True,
)

In [None]:
def get_image_info(file_path, plot=False):
    """Detect cats and dogs in an image with YOLOv5 and summarise the boxes.

    Args:
        file_path: Path of the image file to read.
        plot: When True, show the image with the detected pet boxes next to
            a mask of the pixels covered by those boxes.

    Returns:
        A dict with the keys:
            n_pets, n_dogs, n_cats: number of detected pets / dogs / cats.
            labels: label ('cat' or 'dog') of each detected pet, in
                detection order.
            x_min, x_max, y_min, y_max: region enclosing every detected pet,
                clipped to the image; the whole image when no pet is found.
            avg_w, avg_h, avg_area: mean width, height and area of the pet
                boxes (0 when no pet is found).
            pet_ratio: fraction of image pixels covered by at least one pet
                box.
    """
    img = plt.imread(file_path)
    # Only the first two dimensions are needed; not unpacking a channel axis
    # keeps 2-D (grayscale) arrays from raising here.
    h, w = img.shape[:2]

    results = yolov5x6_model(img, augment=True)

    pet_pixels = np.zeros(shape=[h, w], dtype=np.uint8)
    pets_found = []

    # Each detection row is (x1, y1, x2, y2, confidence, class index).
    for x1, y1, x2, y2, _confidence, class_idx in results.xyxy[0].cpu().detach().numpy():
        label = results.names[int(class_idx)]
        if label not in ('cat', 'dog'):
            continue

        pets_found.append([x1, x2, y1, y2, label])

        # Clamp to 0 so a slightly negative coordinate cannot be interpreted
        # as a from-the-end index and mark the wrong pixels.
        row_start, row_stop = max(int(y1), 0), max(int(y2), 0)
        col_start, col_stop = max(int(x1), 0), max(int(x2), 0)
        pet_pixels[row_start:row_stop, col_start:col_stop] = 1

    labels = [box[4] for box in pets_found]

    image_info = {
        'n_pets': len(pets_found),
        'n_dogs': labels.count('dog'),
        'n_cats': labels.count('cat'),
        'labels': labels,
        'x_min': 0,
        'x_max': w - 1,
        'y_min': 0,
        'y_max': h - 1,
        'avg_w': 0,
        'avg_h': 0,
        'avg_area': 0,
    }

    if pets_found:
        x1s = np.array([box[0] for box in pets_found], dtype=float)
        x2s = np.array([box[1] for box in pets_found], dtype=float)
        y1s = np.array([box[2] for box in pets_found], dtype=float)
        y2s = np.array([box[3] for box in pets_found], dtype=float)

        # Enclosing region of all pets (smallest start, largest end). Taking
        # the largest start / smallest end instead would give the
        # intersection, which inverts when the pets do not overlap.
        image_info['x_min'] = max(float(x1s.min()), 0)
        image_info['x_max'] = min(float(x2s.max()), w - 1)
        image_info['y_min'] = max(float(y1s.min()), 0)
        image_info['y_max'] = min(float(y2s.max()), h - 1)

        # Averages are computed once, over per-box values, after all
        # detections have been collected.
        widths = np.abs(x2s - x1s)
        heights = np.abs(y2s - y1s)
        image_info['avg_w'] = float(widths.mean())
        image_info['avg_h'] = float(heights.mean())
        image_info['avg_area'] = float((widths * heights).mean())

    image_info['pet_ratio'] = pet_pixels.sum() / (h*w)

    if plot:
        fig, ax = plt.subplots(1, 2, figsize=(8,8))
        ax[0].set_title('Detected pets', size=16)
        ax[0].imshow(img)

        for x1, x2, y1, y2, label in pets_found:
            colour = 'red' if label == 'dog' else 'blue'
            rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=2, edgecolor=colour, facecolor='none')
            ax[0].add_patch(rect)
            ax[0].text(max(25, (x2+x1)/2), max(25, y1-h*0.02), label, c=colour, ha='center', size=14)

        ax[1].set_title('Pixels Containing Pets', size=16)
        ax[1].imshow(pet_pixels)
        plt.show()

    return image_info

In [None]:
%matplotlib inline
# Visual sanity check: run the detector on the first five training images
# and plot the detected pet boxes for each.
for file_path in df_train['file_path'].iloc[:5]:
    get_image_info(file_path, plot=True)

In [None]:
# Column-oriented store of the per-image detection features; every key maps
# to a list with one entry per training image, in df_train order.
IMAGES_INFO = {
    column: []
    for column in (
        'Id',
        'n_pets',
        'n_cats',
        'n_dogs',
        'label',
        'x_min',
        'x_max',
        'y_min',
        'y_max',
        'avg_w',
        'avg_h',
        'avg_area',
        'pet_ratio',
    )
}

for file_path in tqdm(df_train['file_path'], total=len(df_train)):
    image_info = get_image_info(file_path, plot=False)
    labels = image_info['labels']

    record = dict(image_info)
    # The image Id is the file name up to its first dot.
    record['Id'] = file_path.split('/')[-1].split('.')[0]
    # One label per image: the first detected pet's label, or 'unknown' when
    # no cat or dog was detected.
    record['label'] = labels[0] if labels else 'unknown'

    for column, values in IMAGES_INFO.items():
        values.append(record[column])

In [None]:
# Turn the collected per-image feature lists into a table and preview it.
img_info_train = pd.DataFrame(data=IMAGES_INFO)
img_info_train.head()

## Create folds

In [None]:
def create_folds(data, num_splits):
    """Return a copy of *data* with a stratified ``kfold`` column added.

    The ``Pawpularity`` target is cut into equal-width bins (bin count from
    Sturges' rule, ``floor(1 + log2(n))``) and the rows are split with a
    shuffled ``StratifiedKFold`` (``random_state=42``) stratified on those
    bins.

    Args:
        data: DataFrame containing a ``Pawpularity`` column.
        num_splits: Number of folds to create.

    Returns:
        A new DataFrame with the same rows and index as *data* plus an
        integer ``kfold`` column holding each row's validation fold
        (``0 .. num_splits - 1``). The input frame is left untouched.
    """
    # Work on a copy so repeated calls do not leak helper columns into the
    # caller's frame.
    data = data.copy()

    num_bins = int(np.floor(1 + np.log2(len(data))))
    bins = pd.cut(data["Pawpularity"], bins=num_bins, labels=False).to_numpy()

    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    # split() yields positional indices, so collect the folds in a positional
    # array instead of writing through label-based .loc (which is only
    # correct when the index happens to be 0..n-1).
    folds = np.full(len(data), -1, dtype=int)
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins)):
        folds[valid_idx] = fold

    data["kfold"] = folds

    return data

In [None]:
# Re-read the competition table and join the detection features onto it by
# image Id.
train_data = pd.read_csv('../input/petfinder-pawpularity-score/train.csv')
df_train = pd.merge(left=img_info_train, right=train_data, on='Id')

# Replace the string pet label with an integer code.
le = LabelEncoder()
le.fit(df_train['label'])
df_train['label'] = le.transform(df_train['label'])
df_train.head()

In [None]:
# Build one fold assignment per split count (5 through 10) ...
df_5, df_6, df_7, df_8, df_9, df_10 = [
    create_folds(df_train, num_splits=n_splits) for n_splits in range(5, 11)
]

# ... then write each one to its own CSV in the working directory.
for n_splits, folded in zip(range(5, 11), [df_5, df_6, df_7, df_8, df_9, df_10]):
    folded.to_csv(f"train_{n_splits}folds.csv", index=False)

In [None]:
# Preview the 10-fold table to check that the `kfold` column was added.
df_10.head()

In [None]:
# Correlation heatmap of the numeric features. Non-numeric columns (such as
# the string 'Id') are excluded explicitly: DataFrame.corr() stopped ignoring
# them by default in pandas 2.0 and raises instead.
plt.figure(figsize=(20,20))
sns.heatmap(
    df_train.select_dtypes(include=['number', 'bool']).corr(),
    linewidths=.5,
    annot=True,
)
plt.show()