In [None]:
import cv2

import glob
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold
import optuna
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

# List the entries directly under the competition input directory; as the
# cell's last expression, the resulting list of paths is displayed as its output.
glob.glob('/kaggle/input/petfinder-pawpularity-score/*')

In [None]:
# Root directory of the competition input data; every other path derives from it.
base_path = '/kaggle/input/petfinder-pawpularity-score'

# Image directories: the image-size cell reads "<dir>/<Id>.jpg" from these.
train_path = f'{base_path}/train'
test_path = f'{base_path}/test'

# Tabular inputs: one row per image in train/test, plus the submission template.
train = pd.read_csv(f'{base_path}/train.csv')
test = pd.read_csv(f'{base_path}/test.csv')
sample_submission = pd.read_csv(f'{base_path}/sample_submission.csv')

In [None]:
def add_image_size_features(frame, image_dir):
    """Return ``frame`` with ``width``, ``height`` and ``rate`` columns appended.

    For every value in ``frame['Id']`` the image ``<image_dir>/<Id>.jpg`` is
    loaded with OpenCV and its pixel dimensions are recorded.

    Args:
        frame: DataFrame with an ``Id`` column naming the image files.
        image_dir: Directory that contains the ``.jpg`` files.

    Returns:
        A new DataFrame: the input columns followed by ``width``, ``height``
        and ``rate`` (``width / height``).

    Raises:
        OSError: If an image is missing or cannot be decoded.
    """
    sizes = []
    for id_ in tqdm(frame['Id']):
        image_file = f"{image_dir}/{id_}.jpg"
        img = cv2.imread(image_file)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than on `.shape` below.
        if img is None:
            raise OSError(f"missing or unreadable image: {image_file}")
        height, width = img.shape[:2]  # shape is (rows, cols[, channels])
        sizes.append([width, height])

    # Reuse the caller's index so the column-wise concat lines up row for row.
    size_df = pd.DataFrame(sizes, columns=['width', 'height'], index=frame.index)

    # Drop results of an earlier run so re-executing the cell does not create
    # duplicate column names.
    base = frame.drop(columns=['width', 'height', 'rate'], errors='ignore')
    out = pd.concat([base, size_df], axis=1)
    out['rate'] = out['width'] / out['height']
    return out


train = add_image_size_features(train, train_path)
test = add_image_size_features(test, test_path)

In [None]:
# Metadata columns used as the base features; later cells pair them up and
# compute target statistics over them.
cols = [
    'Subject Focus',
    'Eyes',
    'Face',
    'Near',
    'Action',
    'Accessory',
    'Group',
    'Collage',
    'Human',
    'Occlusion',
    'Info',
    'Blur',
]

In [None]:
# Stack train and test so the pairwise features are computed over both splits.
# ignore_index gives every row a unique label, which keeps the column-wise
# concat below unambiguous.
n_train = len(train)
df = pd.concat([train, test], ignore_index=True)
cols2 = cols.copy()

# For every ordered pair (c1, c2) with c1 != c2, the new column `c1_c2` holds
# the number of distinct c2 values among the rows that share the row's c1 value.
pair_features = {}
for c1 in tqdm(cols):
    for c2 in cols:
        if c1 == c2:
            continue
        newcol = f'{c1}_{c2}'
        # transform() broadcasts the per-group count back onto every row.
        # A plain .nunique() returns one row per group, indexed by the c1
        # values, which does not align with df's row index on assignment.
        pair_features[newcol] = df.groupby(c1)[c2].transform('nunique')
        cols2.append(newcol)

# Attach all new columns in one concat instead of inserting them one at a
# time; drop same-named columns first so re-running the cell stays clean.
pair_df = pd.DataFrame(pair_features, index=df.index)
df = pd.concat([df.drop(columns=list(pair_df.columns), errors='ignore'), pair_df], axis=1)

# Split back into independent frames; copies (not views of df) so that later
# cells can add columns without chained-assignment warnings.
train = df.iloc[:n_train].copy()
test = df.iloc[n_train:].reset_index(drop=True)

In [None]:
# Number of folds used for the out-of-fold target statistics on train.
k = 5


def _pawpularity_stats(frame, key):
    """Per-group mean and std of ``Pawpularity``, indexed by the values of ``frame[key]``."""
    return frame.groupby(key)['Pawpularity'].agg(['mean', 'std'])


# Positional fold indices are written straight into arrays below, so make sure
# both frames carry a plain 0..n-1 index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train_encoded = {}
test_encoded = {}

for c in tqdm(cols2):
    mean_col = f'{c}_Pawpularity_mean'
    std_col = f'{c}_Pawpularity_std'
    tmp_df = train[[c, 'Pawpularity']]

    # Train rows: each fold's rows are encoded with statistics computed from
    # the other folds only, so a row never sees its own target.
    oof_mean = np.full(len(train), np.nan)
    oof_std = np.full(len(train), np.nan)
    for trn_idx, val_idx in KFold(k).split(tmp_df):
        fold_stats = _pawpularity_stats(tmp_df.iloc[trn_idx], c)
        val_keys = tmp_df[c].iloc[val_idx]
        # Values of c unseen in the fitting folds map to NaN, as does the std
        # of a single-row group.
        oof_mean[val_idx] = val_keys.map(fold_stats['mean']).to_numpy()
        oof_std[val_idx] = val_keys.map(fold_stats['std']).to_numpy()
    train_encoded[mean_col] = oof_mean
    train_encoded[std_col] = oof_std

    # Test rows: encoded with statistics computed from the whole of train.
    full_stats = _pawpularity_stats(tmp_df, c)
    test_encoded[mean_col] = test[c].map(full_stats['mean']).to_numpy()
    test_encoded[std_col] = test[c].map(full_stats['std']).to_numpy()

# Append every encoded column in a single concat per frame (mean, then std,
# for each source column). Same-named columns from an earlier run are dropped
# first so re-executing the cell does not duplicate them.
train = pd.concat(
    [train.drop(columns=list(train_encoded), errors='ignore'),
     pd.DataFrame(train_encoded, index=train.index)],
    axis=1,
)
test = pd.concat(
    [test.drop(columns=list(test_encoded), errors='ignore'),
     pd.DataFrame(test_encoded, index=test.index)],
    axis=1,
)

In [None]:
# Persist the engineered train/test frames (without the index column) to the
# working directory.
for frame, csv_name in ((train, 'train_dataset.csv'), (test, 'test_dataset.csv')):
    frame.to_csv(csv_name, index=False)

In [None]:
# Identifier and target columns are not model inputs.
non_feature_cols = ['Id', 'Pawpularity']

X_train = train.drop(non_feature_cols, axis=1)
y_train = train['Pawpularity']

X_test = test.drop(non_feature_cols, axis=1)

# The model is fitted on bare arrays, so features are matched purely by
# position: require identical names in identical order, not just equal counts.
assert list(X_train.columns) == list(X_test.columns), (
    "train/test feature columns differ: "
    f"{sorted(set(X_train.columns) ^ set(X_test.columns))}"
)

In [None]:
# K-fold cross-validation of a default LightGBM regressor.
# Produces: `oof` (out-of-fold predictions for train), `p_test` (fold-averaged
# predictions for test) and `cv_score` (per-fold RMSE).
splits = 5
folds = KFold(n_splits=splits, shuffle=True, random_state=9)
oof = np.zeros(len(X_train))
p_test = np.zeros(len(X_test))

cv_score = []

# Bounds every prediction is clamped to.
# NOTE(review): presumably the valid Pawpularity range -- confirm.
score_min, score_max = 1, 100

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("Fold {}".format(fold_))
    X_fit, y_fit = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    lgb_reg = lgb.LGBMRegressor(random_state=9)
    # Fit and predict on plain arrays throughout, so the model never sees
    # feature names at one stage but not the other.
    lgb_reg.fit(X_fit.values, y_fit.values)

    # Clamp on both sides; np.clip also catches predictions in [0, 1),
    # which a `pred < 0` test would let through below the floor.
    pred = np.clip(lgb_reg.predict(X_val.values), score_min, score_max)
    oof[val_idx] = pred

    cv_score.append(np.sqrt(mean_squared_error(y_val, pred)))

    # Apply the same clamp to the test predictions so the submission gets the
    # post-processing that the reported CV score was measured with.
    p_test += np.clip(lgb_reg.predict(X_test.values), score_min, score_max)

p_test /= splits

print('cv mean : {}'.format(np.mean(cv_score)))
print('oof mean : {}'.format(np.sqrt(mean_squared_error(y_train, oof))))

In [None]:
# Pair each true target with its out-of-fold prediction (aligned on index).
oof_series = pd.Series(oof)
data = pd.concat([y_train, oof_series], axis=1, ignore_index=True)
data = data.rename(columns={0: 'true', 1: 'pred'})

# Square scatter of prediction vs. truth with the y = x reference line.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(x='true', y='pred', data=data)
ax.plot([0, 100], [0, 100])

ax.set_xlim((0, 100))
ax.set_ylim((0, 100))
ax.set_xlabel('true')
ax.set_ylabel('pred')
plt.show()

In [None]:
# Work on a copy so the loaded template frame is left untouched.
submission_df = sample_submission.copy()

# p_test is positional (one value per row of `test`); make sure the template
# lists the same Ids in the same order before pasting the predictions in.
assert submission_df['Id'].tolist() == test['Id'].tolist(), (
    "sample_submission and test rows are not in the same Id order"
)
submission_df['Pawpularity'] = p_test

display(submission_df)

# index=False: write only the Id and Pawpularity columns.
submission_df.to_csv('submission.csv', index=False)