In [None]:
import os
from pathlib import Path

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

In [None]:
ls ../input/petfinder-pawpularity-score/

In [None]:
# Single source of truth for the competition input location; every other
# path in this cell is derived from it instead of being re-typed.
INPUT = Path('../input/petfinder-pawpularity-score/')
TRAIN_IMG_DIR = INPUT / 'train'
TEST_IMG_DIR = INPUT / 'test'

train = pd.read_csv(INPUT / 'train.csv')
test = pd.read_csv(INPUT / 'test.csv')

# Displayed as the cell output: (rows, columns) of each table.
train.shape, test.shape

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Build each image path from the directory constants defined with the data
# load, rather than repeating the directory strings. Paths are stored as
# plain strings.
train['img_path'] = [str(TRAIN_IMG_DIR / f'{pet_id}.jpg') for pet_id in train['Id']]
test['img_path'] = [str(TEST_IMG_DIR / f'{pet_id}.jpg') for pet_id in test['Id']]

target_col = 'Pawpularity'
metadata_cols = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
                 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']

# Column means of the metadata flags and the target on the training set.
train[metadata_cols + [target_col]].mean()

In [None]:
# Column means of the metadata flags on the test set, for comparison with train.
test[metadata_cols].mean()

# target: Pawpularity

In [None]:
# Histogram of the target column; the trailing `;` suppresses the text repr.
plt.hist(train[target_col], bins=50);

# Image shape

In [None]:
def create_shape_feature(df):
    """Add image-dimension and file-size columns for every path in ``df['img_path']``.

    The following columns are written onto ``df`` in place:

    * ``width_height`` - the ``size`` tuple reported by PIL for the image
    * ``file_size``    - result of ``os.path.getsize`` for the file
    * ``width``        - first element of ``width_height``
    * ``height``       - second element of ``width_height``

    Args:
        df: DataFrame with an ``img_path`` column of image file paths.

    Returns:
        The same ``df`` object, with the new columns attached.
    """
    width_height_list = []
    file_size_list = []
    for path_ in tqdm(df['img_path']):
        # Use a context manager so each file handle is released right away
        # instead of staying open until garbage collection.
        with Image.open(path_) as img:
            width_height_list.append(img.size)
        file_size_list.append(os.path.getsize(path_))
    df['width_height'] = width_height_list
    df['file_size'] = file_size_list
    df['width'] = [size[0] for size in width_height_list]
    df['height'] = [size[1] for size in width_height_list]
    return df

In [None]:
# Attach width/height/file-size columns to both tables (reads every image file).
train = create_shape_feature(train)
test = create_shape_feature(test)

In [None]:
# First 20 entries of the image-size frequency table for the training images.
train['width_height'].value_counts()[:20]

In [None]:
# Image-size frequency table for the test images.
test['width_height'].value_counts()

>test/ - Folder containing randomly generated images in a format similar to the training set photos. The actual test data comprises about 6800 pet photos similar to the training set photos.

The test images available while developing this notebook are therefore randomly generated placeholders, not real pet photos.

In [None]:
# Display the first test image.
im = Image.open(test['img_path'].values[0])
plt.imshow(im);

# Image

In [None]:
# Install ipyplot from the files under ../input/ipyplot rather than from a
# package index (--no-index together with --find-links).
# Package-index alternative, kept for reference:
# !pip install ipyplot
!python -m pip install --no-index --find-links=../input/ipyplot ipyplot
import ipyplot

## train

In [None]:
# Collect a few example training images per metadata flag for the gallery:
# the image path, the flag name as its label, and a caption holding the
# target plus all metadata values and the image dimensions.
image_paths = []
labels = []
custom_texts = []

n_examples_per_flag = 4
caption_cols = metadata_cols + ['width', 'height']

for col in metadata_cols:
    # head() simply yields fewer rows when a flag has fewer positives than
    # requested, whereas positional iloc[i] would raise IndexError.
    flagged_rows = train[train[col] == 1].head(n_examples_per_flag)
    for _, row in flagged_rows.iterrows():
        image_paths.append(row['img_path'])
        labels.append(col)
        target = str(row[target_col])
        # Distinct loop names here avoid shadowing the outer `col`.
        meta = ''.join(
            f'{name}:{value}, '
            for value, name in zip(row[caption_cols].values, caption_cols)
        )
        custom_texts.append(f'target: {target}\n{meta}')

In [None]:
# Show the collected examples grouped by label (the metadata column name),
# passing the target/metadata captions as custom_texts.
ipyplot.plot_class_tabs(image_paths, labels, custom_texts=custom_texts, force_b64=True, img_width=350)

## test

In [None]:
# Display the test images as small thumbnails.
ipyplot.plot_images(test['img_path'].values, force_b64=True, img_width=100)

# LightGBM

In [None]:
# Derive the same two features on both tables: pixel area and the number of
# file bytes per pixel. The column key 'size_per_ pixel' (with its embedded
# space) is kept exactly as-is because later cells feed all columns to the model.
for frame in (train, test):
    frame['area'] = frame['width'] * frame['height']
    frame['size_per_ pixel'] = frame['file_size'] / frame['area']

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')


def calc_model_importance(model, feature_names=None, importance_type='gain'):
    """Return a model's feature importances as a one-column DataFrame.

    ref https://www.kaggle.com/corochann/permutation-importance-for-feature-selection-part1

    Args:
        model: object exposing ``feature_importance(importance_type=...)``.
        feature_names: labels used as the frame's index (default index if None).
        importance_type: forwarded unchanged to ``model.feature_importance``.

    Returns:
        DataFrame with a single ``importance`` column, sorted ascending.
    """
    raw_importance = model.feature_importance(importance_type=importance_type)
    frame = pd.DataFrame(raw_importance, index=feature_names, columns=['importance'])
    return frame.sort_values('importance')


def calc_mean_importance(importance_df_list):
    """Average per-fold importances feature by feature.

    Each input frame is expected to have an ``importance`` column indexed by
    feature label. The frames may list the features in different orders (for
    example, each sorted by its own importance), so values are aligned on the
    index label before averaging rather than combined by row position.

    Args:
        importance_df_list: non-empty list of DataFrames with an
            ``importance`` column.

    Returns:
        A copy of the first frame whose ``importance`` column holds the
        per-feature mean across all frames, sorted ascending by that mean.
    """
    # concat(axis=1) aligns the per-fold columns on the feature label.
    per_fold = pd.concat(
        [df['importance'] for df in importance_df_list], axis=1)
    mean_importance = per_fold.mean(axis=1)

    mean_df = importance_df_list[0].copy()
    mean_df['importance'] = mean_importance.reindex(mean_df.index)
    # Keep the "largest importance last" ordering that callers slice on.
    return mean_df.sort_values('importance')


def plot_importance(importance_df, title='',
                    save_filepath=None, figsize=(4, 6)):
    """Draw a horizontal bar chart of the last 50 rows of ``importance_df``.

    Args:
        importance_df: DataFrame to plot; only its final 50 rows are drawn.
        title: optional figure title (skipped when empty).
        save_filepath: when given, the figure is saved there instead of shown.
        figsize: figure size passed to ``plt.subplots``.
    """
    tail_rows = importance_df.iloc[-50:, :]
    fig, ax = plt.subplots(figsize=figsize)
    tail_rows.plot.barh(ax=ax)
    if title:
        ax.set_title(title)
    fig.tight_layout()
    if save_filepath is not None:
        fig.savefig(save_filepath)
    else:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def do_train(all_feature, params):
    """Train one LightGBM model per fold of a shuffled 5-fold split.

    The target is the ``Pawpularity`` column; the features are every other
    column except ``Id``, ``img_path`` and ``width_height``. The module-level
    ``metadata_cols`` list is passed to LightGBM as the categorical features.

    NOTE(review): ``verbose_eval`` and ``early_stopping_rounds`` are passed as
    keyword arguments to ``lgb.train``; newer LightGBM releases are understood
    to expect callbacks instead - confirm against the installed version.

    Args:
        all_feature: training DataFrame containing the target and features.
        params: LightGBM parameter dict forwarded to ``lgb.train``.

    Returns:
        Tuple ``(models, gain_importance_df, split_importance_df, oof, score)``:
        the per-fold models, fold-averaged gain and split importances, the
        out-of-fold predictions, and the overall out-of-fold RMSE rounded to
        three decimals.
    """
    target = all_feature['Pawpularity'].values
    features = all_feature.drop(['Id', 'img_path', 'width_height', 'Pawpularity'], axis=1)
    print(f'features: {features.columns.values}')
    print(f'num features: {len(features.columns)}')

    fold_models = []
    fold_scores = []
    gain_frames = []
    split_frames = []

    # Out-of-fold predictions, filled in one validation slice at a time.
    oof = np.zeros(len(features))

    splitter = KFold(n_splits=5, shuffle=True, random_state=0)

    for fold_no, (trn_idx, val_idx) in enumerate(splitter.split(features), start=1):

        print(f"Fold :{fold_no}")

        # Slice this fold's train/validation rows.
        X_train, X_valid = features.iloc[trn_idx], features.iloc[val_idx]
        y_train, y_valid = target[trn_idx], target[val_idx]

        # No sample weights are used for either dataset.
        train_set = lgb.Dataset(X_train, y_train, weight=None)
        valid_set = lgb.Dataset(X_valid, y_valid, reference=train_set, weight=None)

        booster = lgb.train(
            params=params,
            train_set=train_set,
            valid_sets=[train_set, valid_set],
            num_boost_round=5000,
            verbose_eval=100,
            categorical_feature=metadata_cols,
            early_stopping_rounds=30,
        )

        # Predict the held-out rows with the best iteration found.
        fold_pred = booster.predict(X_valid, num_iteration=booster.best_iteration)
        oof[val_idx] = fold_pred

        fold_rmse = round(np.sqrt(mean_squared_error(y_true=y_valid, y_pred=fold_pred)), 3)
        print(f'RMSE: {fold_rmse}')

        fold_scores.append(fold_rmse)
        fold_models.append(booster)
        print("*" * 5)

        # Per-fold feature importances, by gain and by split count.
        names = X_train.columns.values.tolist()
        gain_frames.append(
            calc_model_importance(booster, feature_names=names, importance_type='gain'))
        split_frames.append(
            calc_model_importance(booster, feature_names=names, importance_type='split'))

    print(fold_scores)
    score = round(np.sqrt(mean_squared_error(y_true=target, y_pred=oof)), 3)
    print('score: ', score)

    gain_importance_df = calc_mean_importance(gain_frames)
    split_importance_df = calc_mean_importance(split_frames)

    return fold_models, gain_importance_df, split_importance_df, oof, score

In [None]:
# LightGBM settings forwarded unchanged to lgb.train inside do_train.
lgb_params = dict(
    objective='regression',
    max_depth=3,
    metric='rmse',
    boosting_type='gbdt',
    learning_rate=0.1,
    lambda_l1=1,
    lambda_l2=1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=2,
    verbosity=-1,
)

# Run the cross-validated training and unpack the results.
(models,
 gain_importance_df,
 split_importance_df,
 oof,
 score) = do_train(train, lgb_params)

In [None]:
# One bar chart per importance type, gain first and then split.
for importance_frame, chart_title in (
    (gain_importance_df, 'importance_gain'),
    (split_importance_df, 'importance_split'),
):
    plot_importance(importance_frame, chart_title)

In [None]:
# Out-of-fold predictions against the true target, titled with the CV score.
fig, ax = plt.subplots()
ax.scatter(train[target_col], oof, s=2)
ax.set_xlabel('target')
ax.set_ylabel('oof')
ax.set_title(f'cv: {score}');

In [None]:
# Baseline: RMSE obtained by always predicting the training-set mean of the target.
mean_of_target = train[target_col].mean()
print(f'Target average: {mean_of_target}')

# Stored under its own name so the cross-validation `score` returned by
# do_train is not overwritten (the OOF scatter plot's title reads `score`).
baseline_score = np.sqrt(mean_squared_error(
    y_true=train[target_col],
    y_pred=np.full(len(train), mean_of_target),
))
print(f'RMSE when predicting the average value of the target: {baseline_score}')

The metadata features and image-size features seem to contribute little to the score: the cross-validation RMSE can be compared with the mean-prediction baseline above.

## test predict

In [None]:
# Predict the test set with every fold model and average the predictions.
sample = pd.read_csv('../input/petfinder-pawpularity-score/sample_submission.csv')
test_feature = test.drop(columns=['Id', 'img_path', 'width_height'])

preds = [
    fold_model.predict(test_feature, num_iteration=fold_model.best_iteration)
    for fold_model in models
]

# The averaged predictions are written positionally onto the sample submission rows.
sample[target_col] = np.mean(preds, axis=0)
sample.to_csv('submission.csv', index=False)