In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from PIL import Image
from tqdm import tqdm
import os
from pathlib import Path
import lightgbm as lgb

In [None]:
# Locations of the competition CSV files on Kaggle.
_INPUT_DIR = '/kaggle/input/petfinder-pawpularity-score'

sampel_sub = f'{_INPUT_DIR}/sample_submission.csv'
train_metadata = f'{_INPUT_DIR}/train.csv'
test_metadata = f'{_INPUT_DIR}/test.csv'

In [None]:
# Credit to: https://www.kaggle.com/currypurin/petfinder-eda-lgb-meta-features-and-img-size
def create_shape_feature(df):
    """Add image-dimension and file-size features to ``df`` and return it.

    Every path in ``df['img_path']`` is opened with PIL to read its
    ``(width, height)`` and stat'ed for its size on disk. The columns below
    are written onto ``df`` itself (the frame is modified in place):

    * ``width_height`` -- the ``(width, height)`` tuple reported by PIL
    * ``file_size``    -- result of ``os.path.getsize`` for the file
    * ``width``, ``height`` -- the two components of ``width_height``
    * ``area``         -- ``width * height``
    * ``size_per_pixel`` -- ``area / file_size`` (i.e. pixels per byte of
      file, despite what the column name suggests)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``img_path`` column of readable image paths.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the feature columns added.
    """
    width_height_list = []
    file_size_list = []
    for path_ in tqdm(df['img_path']):
        # Use a context manager so the underlying file handle is released
        # right away instead of being leaked once per image.
        with Image.open(path_) as img:
            width_height_list.append(img.size)
        file_size_list.append(os.path.getsize(path_))

    df['width_height'] = width_height_list
    df['file_size'] = file_size_list
    df['width'] = [width for width, _ in width_height_list]
    df['height'] = [height for _, height in width_height_list]
    df['area'] = df['width'] * df['height']
    df['size_per_pixel'] = df['area'] / df['file_size']
    return df

In [None]:
# Load the competition metadata tables.
df_train = pd.read_csv(train_metadata)
df_test = pd.read_csv(test_metadata)

# The image folders sit next to the CSV files. Deriving them from the CSV
# locations keeps the image paths consistent with the CSV paths and removes
# the dependency on the notebook's current working directory.
_train_img_dir = os.path.join(os.path.dirname(train_metadata), 'train')
_test_img_dir = os.path.join(os.path.dirname(test_metadata), 'test')

df_train['img_path'] = df_train['Id'].apply(lambda x: os.path.join(_train_img_dir, f'{x}.jpg'))
df_test['img_path'] = df_test['Id'].apply(lambda x: os.path.join(_test_img_dir, f'{x}.jpg'))

# Add width / height / file-size derived columns to both frames.
df_train = create_shape_feature(df_train)
df_test = create_shape_feature(df_test)

# Metadata columns of train.csv / test.csv that are used as model features.
metadata = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']

In [None]:
# Preview the first rows of the training frame, including the image-shape
# columns added by create_shape_feature.
df_train.head()

In [None]:
# Per-column histograms of the training frame on a single 12x12 figure.
fig, ax = plt.subplots(figsize=(12, 12))
df_train.hist(ax=ax)
plt.show()

In [None]:
# Per-column histograms of the test frame on a single 12x12 figure.
fig, ax = plt.subplots(figsize=(12, 12))
df_test.hist(ax=ax)
plt.show()

In [None]:
# One Pawpularity box plot per metadata column, laid out on a 4x3 grid.
fig, ax = plt.subplots(nrows=4, ncols=3, figsize=(15, 18))

# The grid is filled column by column: the k-th metadata column is drawn
# in row k % 4 of grid column k // 4.
for k, column in enumerate(metadata):
    sns.boxplot(x=column, y="Pawpularity", data=df_train, ax=ax[k % 4, k // 4])

In [None]:
# Correlation matrix of the numeric columns only. df_train also holds
# non-numeric columns (Id, img_path, width_height) and DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently dropping them.
corr = df_train.select_dtypes(include='number').corr()

# Hide the upper triangle (and the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

## "Baselining"

In previous versions I used Tweedie regression from LightGBM, transforming the target variable so that it matches a Tweedie distribution (credit to the following discussion: https://www.kaggle.com/c/petfinder-pawpularity-score/discussion/274073). However, it seems that for this particular problem the standard RMSE objective yields better results.

In [None]:
#df_train['Pawpularity_tgt'] = 100 - df_train['Pawpularity']

In [None]:
def rmse(y, yhat):
    """Return the root mean squared error between ``y`` and ``yhat``.

    Parameters
    ----------
    y : array-like
        Observed target values.
    yhat : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    float
        ``sqrt(mean((y - yhat) ** 2))``.
    """
    residuals = np.asarray(y, dtype=float) - np.asarray(yhat, dtype=float)
    # Average (not sum) the squared residuals so the score does not grow
    # with the number of samples.
    return np.sqrt(np.mean(np.square(residuals)))

In [None]:
seed = 42


def train_and_optimize_lgb(p):
    """Hyperopt objective: 4-fold cross-validated error of a LightGBM model.

    Trains one LightGBM regressor per fold of ``df_train`` with the
    hyperparameters in ``p`` and scores the out-of-fold predictions of
    ``Pawpularity`` with the module-level ``rmse`` helper.

    Parameters
    ----------
    p : dict
        Sampled hyperparameters with the keys ``max_depth``, ``max_bin``,
        ``min_data_in_leaf``, ``learning_rate``, ``subsample``,
        ``subsample_freq``, ``feature_fraction``, ``lambda_l1`` and
        ``lambda_l2``.

    Returns
    -------
    float
        ``rmse`` of the out-of-fold predictions over all of ``df_train``
        (the value to be minimized).
    """
    print(p)
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        # Integer-valued parameters are cast explicitly, as make_predictions does.
        'max_depth': int(p['max_depth']),
        'max_bin': int(p['max_bin']),
        'min_data_in_leaf': int(p['min_data_in_leaf']),
        'learning_rate': p['learning_rate'],
        'subsample': p['subsample'],
        'subsample_freq': int(p['subsample_freq']),
        'feature_fraction': p['feature_fraction'],
        'lambda_l1': p['lambda_l1'],
        'lambda_l2': p['lambda_l2'],
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'n_jobs': -1,
        'verbose': -1}

    features = metadata + ['width', 'height', 'file_size', 'area', 'size_per_pixel']
    target = df_train['Pawpularity']
    oof_predictions = np.zeros(df_train.shape[0])
    kfold = KFold(n_splits=4, random_state=seed, shuffle=True)

    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(df_train)):
        # KFold yields positional indices, so select rows with .iloc; .loc
        # is only equivalent while df_train keeps its default RangeIndex.
        x_train, x_val = df_train[features].iloc[trn_ind], df_train[features].iloc[val_ind]
        y_train, y_val = target.iloc[trn_ind], target.iloc[val_ind]

        train_dataset = lgb.Dataset(x_train, y_train)
        val_dataset = lgb.Dataset(x_val, y_val)

        # Early stopping is requested through a callback: the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments were
        # removed from lgb.train in LightGBM 4.0.
        model = lgb.train(params=params,
                          train_set=train_dataset,
                          num_boost_round=800,
                          valid_sets=[train_dataset, val_dataset],
                          callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)])

        # Add predictions to the out of folds array
        fold_predictions = model.predict(x_val)
        oof_predictions[val_ind] = fold_predictions
        # Report the score of this fold only; scoring the whole array here
        # would count the still-zero entries of folds not yet predicted.
        print(rmse(y_val, fold_predictions))
    return rmse(target, oof_predictions)

def make_predictions(p, kf_size=4):
    """Train ``kf_size`` fold models and write the averaged test predictions.

    One LightGBM regressor is trained per fold of ``df_train``; each model
    predicts ``df_test`` and the predictions are averaged over the folds.
    The result is stored in ``df_test['Pawpularity']`` and the ``Id`` /
    ``Pawpularity`` columns are written to ``submission.csv``.

    Parameters
    ----------
    p : dict
        Hyperparameters with the keys ``max_depth``, ``max_bin``,
        ``min_data_in_leaf``, ``learning_rate``, ``subsample``,
        ``subsample_freq``, ``feature_fraction``, ``lambda_l1`` and
        ``lambda_l2``. Integer-valued entries are cast with ``int`` so that
        float values are accepted.
    kf_size : int, default 4
        Number of cross-validation folds (and therefore of averaged models).

    Returns
    -------
    None
    """
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'max_depth': int(p['max_depth']),
        'max_bin': int(p['max_bin']),
        'min_data_in_leaf': int(p['min_data_in_leaf']),
        'learning_rate': p['learning_rate'],
        'subsample': p['subsample'],
        'subsample_freq': int(p['subsample_freq']),
        'feature_fraction': p['feature_fraction'],
        'lambda_l1': p['lambda_l1'],
        'lambda_l2': p['lambda_l2'],
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'n_jobs': -1,
        'verbose': -1}

    features = metadata + ['width', 'height', 'file_size', 'area', 'size_per_pixel']
    target = df_train['Pawpularity']
    kfold = KFold(n_splits=kf_size, random_state=seed, shuffle=True)
    pawpularity_test = np.zeros(df_test.shape[0])

    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(df_train)):
        # KFold yields positional indices, so select rows with .iloc; .loc
        # is only equivalent while df_train keeps its default RangeIndex.
        x_train, x_val = df_train[features].iloc[trn_ind], df_train[features].iloc[val_ind]
        y_train, y_val = target.iloc[trn_ind], target.iloc[val_ind]

        train_dataset = lgb.Dataset(x_train, y_train)
        val_dataset = lgb.Dataset(x_val, y_val)

        # Early stopping is requested through a callback: the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments were
        # removed from lgb.train in LightGBM 4.0.
        model = lgb.train(params=params,
                          train_set=train_dataset,
                          num_boost_round=800,
                          valid_sets=[train_dataset, val_dataset],
                          callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)])

        # Each fold model contributes an equal share of the final prediction.
        pawpularity_test += model.predict(df_test[features]) / kf_size

    df_test['Pawpularity'] = pawpularity_test
    df_test[['Id', 'Pawpularity']].to_csv('submission.csv', index=False)

In [None]:
# Hyperparameter search space. Integer-valued parameters are drawn from a
# uniform distribution and converted with scope.int; the remaining ones are
# continuous uniform draws.
param_space = dict(
    max_depth=scope.int(hp.uniform('max_depth', 2, 8)),
    max_bin=scope.int(hp.uniform('max_bin', 2, 100)),
    min_data_in_leaf=scope.int(hp.uniform('min_data_in_leaf', 10, 1000)),
    learning_rate=hp.uniform('learning_rate', 0.001, 0.1),
    subsample=hp.uniform('subsample', 0.2, 0.9),
    subsample_freq=scope.int(hp.uniform('subsample_freq', 1, 30)),
    feature_fraction=hp.uniform('feature_fraction', 0.5, 0.9),
    lambda_l1=hp.uniform('lambda_l1', 0.1, 3),
    lambda_l2=hp.uniform('lambda_l2', 0.1, 3),
)

# Keeps the record of every evaluation made during the search.
trials = Trials()

# Minimize the cross-validated error with the TPE algorithm for 500 evaluations.
hopt = fmin(train_and_optimize_lgb, param_space, algo=tpe.suggest, max_evals=500, trials=trials)

In [None]:
# Fit the fold models with the best hyperparameters found and write submission.csv.
make_predictions(hopt, kf_size=4)