In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error

In [None]:
from glob import glob
import shutil, os
from tqdm.notebook import tqdm
import torch
import cv2

# Directory with the competition's test images; interpolated into the
# `detect.py` shell command as `$test_dir`.
test_dir = "/kaggle/input/petfinder-pawpularity-score/test/"

# Copy the YOLOv5 checkout from the input dataset into the working directory.
# The copy is guarded so that re-running this cell in the same session does not
# fail with FileExistsError (copytree refuses to overwrite an existing target).
yolo_src = '/kaggle/input/yolov5-official-v31-dataset/yolov5'
yolo_dst = '/kaggle/working/yolov5'
if not os.path.isdir(yolo_dst):
    shutil.copytree(yolo_src, yolo_dst)

# detect.py is invoked by relative path, and its outputs are later globbed
# relative to the current directory (runs/detect/...), so switch into the checkout.
os.chdir(yolo_dst)

In [None]:
# Run YOLOv5 detection (yolov5x weights, image size 512, confidence threshold 0.1,
# IoU threshold 0.5) over the images in `test_dir`. `--save-txt --save-conf` make
# detect.py write the label files that are parsed afterwards (six values per
# detection); stdout is appended to /kaggle/working/log_yolo.txt.
!python detect.py \
--weights /kaggle/input/ultralyticsyolov5aweights/yolov5x.pt \
--img 512\
--conf 0.1\
--iou 0.5\
--source $test_dir\
--name infer_fold \
--save-txt --save-conf --exist-ok >> /kaggle/working/log_yolo.txt

# Label files written by the detect.py run above; the image id is taken from
# the file name (everything before the first dot).
labelpaths = glob("runs/detect/infer_fold/labels/*")

predbox_ens = []
image_ids = []
score = []
label = []
x = []
y = []
w = []
h = []
for file_path in tqdm(labelpaths):
    image_id = file_path.split('/')[-1].split('.')[0]
    # Context manager so the handle is closed for every label file.
    with open(file_path, 'r') as f:
        # split() without arguments tolerates newlines, repeated and trailing
        # whitespace, and yields an empty list for an empty file.
        values = f.read().split()
    if not values:
        # No detections recorded for this image: nothing to add.
        continue
    # Six values per detection: label, x, y, w, h, score (score is the last column).
    data = np.array(values).astype(np.float32).reshape(-1, 6)
    image_ids.extend([image_id] * len(data))
    label.extend(data[:, 0])
    x.extend(data[:, 1])
    y.extend(data[:, 2])
    w.extend(data[:, 3])
    h.extend(data[:, 4])
    score.extend(data[:, 5])

pred_df_yolo = pd.DataFrame({'Id':image_ids,'score':score,'label':label,
                             'x':x,'y':y,'w':w,'h':h
                            })

# Box area and confidence-weighted box area per detection.
pred_df_yolo["s"] = pred_df_yolo["w"]*pred_df_yolo["h"]
pred_df_yolo["sw"] = pred_df_yolo["s"]*pred_df_yolo["score"]
pred_df_yolo["label"] = pred_df_yolo["label"].astype(int)

# Sum the weighted area per (image, class) ...
objext = pred_df_yolo.groupby(["Id","label"]).agg(
    sw=("sw","sum")
).reset_index()

# ... and spread the classes into columns: one row per image, one column per
# class id, 0 where the class was not detected in that image.
test_ext = pd.pivot_table(objext, index='Id', columns='label',values='sw').fillna(0)

In [None]:
# Per-image statistics (height, width, pixel mean, pixel std) for the test set,
# combined with the test metadata and the per-class detection features.
mean = []
std = []
h = []
w = []

test = pd.read_csv("/kaggle/input/petfinder-pawpularity-score/test.csv")

# Build the image paths from the Id column so that the statistics are computed
# in exactly the row order of `test`. glob() returns files in arbitrary order,
# so assigning glob-ordered lists to the frame can attach features to the wrong Id.
# Assumes images are stored as "<Id>.jpg", which is also what the merge on "Id"
# with the detection features relies on.
image_dir = "/kaggle/input/petfinder-pawpularity-score/test/"
paths = [f"{image_dir}{image_id}.jpg" for image_id in test["Id"]]
print(len(paths))

for path in paths:
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"could not read test image: {path}")
    h.append(img.shape[0])
    w.append(img.shape[1])
    mean.append(img.mean())
    std.append(img.std())

test["height"] = h
test["width"] = w
test["mean"] = mean
test["std"] = std
test["ratio_hw"] = test["height"]/test["width"]
test["diff_hw"] = abs(test["height"] - test["width"])

test = pd.merge(test, test_ext, on=["Id"], how="left")
# Images without any detection have no row in test_ext and come out of the left
# merge as NaN. Fill with 0 (same treatment as the training frame) instead of
# dropping them: every test Id needs a prediction, and dropping rows would make
# the frame shorter than the submission it is predicted into.
test = test.fillna(0)

display(test)

In [None]:
# Training metadata (indexed by the first CSV column) and precomputed YOLO
# detections for the training images.
train_base = pd.read_csv("/kaggle/input/simple-eda-using-pandas-profiling/train.csv",index_col=0)
display(train_base.head(5))
train_obj = pd.read_csv("/kaggle/input/yolopred/objectdetection.csv",index_col=0)
display(train_obj.head(5))

# Keep only the detection classes used as features.
# NOTE(review): 0/15/16/77 look like COCO ids (person, cat, dog, teddy bear) - confirm.
# .copy() so the column assignments below write to an independent frame rather
# than a slice of the original (avoids SettingWithCopyWarning).
train_obj = train_obj[train_obj.label.isin([0,15,16,77])].copy()

# Box area and confidence-weighted box area per detection.
train_obj["s"] = train_obj["w"]*train_obj["h"]
train_obj["sw"] = train_obj["s"]*train_obj["score"]
train_obj["label"] = train_obj["label"].astype(int)

# Sum the weighted area per (image, class), then one column per class id.
objext = train_obj.groupby(["Id","label"]).agg(
    sw=("sw","sum")
).reset_index()

train_ext = pd.pivot_table(objext, index='Id', columns='label',values='sw').fillna(0)
train = pd.merge(train_base,train_ext,on=["Id"],how="left")
# Images without detections of the kept classes get 0 for every class column.
train = train.fillna(0)

display(train)

# Separate the target from the features; Id is an identifier, not a feature.
y = train.Pawpularity.copy()
train = train.drop(columns=["Id", "Pawpularity"])

from sklearn.model_selection import KFold, StratifiedKFold
kfold = KFold(n_splits=5, random_state=35, shuffle=True)

In [None]:
# Feature columns of the original competition train.csv (everything except the
# identifier and the target); passed to CatBoost as `cat_features`.
cat_col = (
    pd.read_csv("/kaggle/input/petfinder-pawpularity-score/train.csv")
    .drop(columns=["Id", "Pawpularity"])
    .columns
    .to_list()
)
cat_col

In [None]:
# Submission frame; test predictions are accumulated as the mean over folds.
preds = pd.read_csv("/kaggle/input/petfinder-pawpularity-score/sample_submission.csv")
preds['Pawpularity'] = 0

# Take the fold count from the splitter so the averaging stays correct if
# `kfold` is reconfigured.
n_splits = kfold.get_n_splits()

for fold, (train_idx, val_idx) in enumerate(kfold.split(train)):
    reg = CatBoostRegressor(cat_features=cat_col,
                            early_stopping_rounds=10,
                            random_state = 42,
                            verbose = 0)
    # kfold.split yields positional indices, so select rows with .iloc.
    X_fit, y_fit = train.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = train.iloc[val_idx], y.iloc[val_idx]

    # The validation fold doubles as the early-stopping eval set.
    reg.fit(X_fit, y_fit, eval_set=(X_val, y_val))
    val_p = reg.predict(X_val)
    print(f"fold {fold} validation error: {np.sqrt(mean_squared_error(y_val, val_p))}")
    # Test prediction is skipped when the submission has exactly 8 rows.
    # NOTE(review): presumably the small placeholder test set available while
    # editing the notebook - confirm.
    if len(preds) != 8:
        preds["Pawpularity"] += reg.predict(test[train.columns]) / n_splits

In [None]:
# Normalised feature importances of `reg`, i.e. the model fitted in the last
# fold of the loop above, sorted from most to least important.
feature_importance = reg.get_feature_importance()
impdf = pd.DataFrame({
    "name": train.columns.to_list(),
    "value": feature_importance,
})
# Rescale so the importances sum to 1.
impdf["value"] = impdf["value"] / impdf["value"].sum()

# Allow up to 300 rows in the rendered table.
pd.options.display.max_rows = 300
impdf_sorted = impdf.sort_values("value", ascending=False)
display(impdf_sorted)

In [None]:
# Write the fold-averaged predictions as the submission file (no index column).
submission_path = "/kaggle/working/submission.csv"
preds.to_csv(submission_path, index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
preds