In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#!pip install -q imagesize
#import imagesize

from IPython import display as ipd

# Packages
import gc
import time
import logging
import re, math

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the read-only Kaggle input directory.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import random
from tqdm import tqdm

import lightgbm as lgb
print(lgb.__version__)

from kaggle_datasets import KaggleDatasets

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import roc_curve, auc, accuracy_score, cohen_kappa_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix

import optuna 
from optuna.visualization.matplotlib import plot_optimization_history
from optuna.visualization.matplotlib import plot_param_importances


## PetFinder.my - Pawpularity Contest

In this competition, you’ll analyze raw images and metadata to predict the “Pawpularity” of pet photos. You'll train and test your model on PetFinder.my's thousands of pet profiles. Winning versions will offer accurate recommendations that will improve animal welfare.

If successful, your solution will be adapted into AI tools that will guide shelters and rescuers around the world to improve the appeal of their pet profiles, automatically enhancing photo quality and recommending composition improvements. As a result, stray dogs and cats can find their "furever" homes much faster. With a little assistance from the Kaggle community, many precious lives could be saved and more happy families created.


## Photo Metadata

The train.csv and test.csv files contain metadata for photos in the training set and test set, respectively. Each pet photo is labeled with the value of 1 (Yes) or 0 (No) for each of the following features:

- Focus - Pet stands out against uncluttered background, not too close / far.
- Eyes - Both eyes are facing front or near-front, with at least 1 eye / pupil decently clear.
- Face - Decently clear face, facing front or near-front.
- Near - Single pet taking up significant portion of photo (roughly over 50% of photo width or height).
- Action - Pet in the middle of an action (e.g., jumping).
- Accessory - Accompanying physical or digital accessory / prop (i.e. toy, digital sticker), excluding collar and leash.
- Group - More than 1 pet in the photo.
- Collage - Digitally-retouched photo (i.e. with digital photo frame, combination of multiple photos).
- Human - Human in the photo.
- Occlusion - Specific undesirable objects blocking part of the pet (i.e. human, cage or fence). Note that not all blocking objects are considered occlusion.
- Info - Custom-added text or labels (i.e. pet name, description).
- Blur - Noticeably out of focus or noisy, especially for the pet’s eyes and face. For Blur entries, “Eyes” column is always set to 0.

In [None]:
def seeding(SEED, use_tf=False):
    """Seed the random number generators used by this notebook.

    Parameters
    ----------
    SEED : int
        Seed given to NumPy's global RNG and the stdlib ``random`` module, and
        exported as ``PYTHONHASHSEED``.
    use_tf : bool, default False
        When True, TensorFlow is imported and its global seed is set as well.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # TF_CUDNN_DETERMINISTIC is an on/off switch, not a seed: it must be '1'
    # (or 'true') to take effect, so it is no longer set to the seed value.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        # Imported lazily: the notebook never imports TensorFlow at the top,
        # so referencing a global `tf` here would raise NameError.
        import tensorflow as tf
        tf.random.set_seed(SEED)
    print('seeding done!!!')

In [None]:
# --- Run configuration -------------------------------------------------------
RANDOM_SEED = 42
DEBUG = True
TUNING = False  # when True, the Optuna search further down is executed

DATA_PATH = '/kaggle/input/petfinder-pawpularity-score/'

# --- Load the competition tables (metadata, test metadata, submission template)
train, test, submission = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

seeding(RANDOM_SEED)
display(train.head(2))

In [None]:
# Binary photo-metadata columns used as model inputs.
FEATURES = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']

# Keep the regression target aside before the frames are reduced to the feature columns.
target = train.Pawpularity

# `.copy()` gives independent frames: later cells add columns to `train`/`test`,
# and assigning into a bare column selection triggers SettingWithCopy behaviour.
train = train[FEATURES].copy()
test = test[FEATURES].copy()

print('train shape:',train.shape)
print('test shape:',test.shape)
print('target shape:',target.shape)

## Adding features

The idea comes from an excellent notebook by Ekaterina Dranitsyna:
https://www.kaggle.com/ekaterinadranitsyna/xgboost-for-tabular-data

In [None]:
# Names of all categorical columns; starts with the base metadata columns and is
# extended by add_cross_features() with every cross feature it creates.
CAT_FEATURES = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']

def add_cross_features(df, features=None):
    """Add string cross features for every ordered pair and triple of base columns.

    For each ordered pair ``(f1, f2)`` of distinct columns a column named
    ``'f1-f2'`` is added holding ``str(f1) + '_' + str(f2)``; for each ordered
    triple of distinct columns a column ``'f1-f2-f3'`` is added in the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the base columns. It is modified in place.
    features : list of str, optional
        Base columns to combine. Defaults to the module-level ``FEATURES``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cross-feature columns appended.

    Notes
    -----
    Every new column name is registered in the module-level ``CAT_FEATURES``
    exactly once, so calling this function for both train and test no longer
    leaves duplicate names in that list.
    """
    if features is None:
        features = FEATURES

    # Convert each base column to text once instead of once per combination.
    as_text = {name: df[name].astype(str) for name in features}
    registered = set(CAT_FEATURES)

    def _register(column_name):
        # Record a new categorical column name, skipping names already known.
        if column_name not in registered:
            registered.add(column_name)
            CAT_FEATURES.append(column_name)

    for feature1 in features:
        for feature2 in features:
            if feature1 == feature2:
                continue
            x2_feature_name = f'{feature1}-{feature2}'
            pair_text = as_text[feature1] + '_' + as_text[feature2]
            df[x2_feature_name] = pair_text
            _register(x2_feature_name)
            for feature3 in features:
                if feature3 == feature1 or feature3 == feature2:
                    continue
                x3_feature_name = f'{feature1}-{feature2}-{feature3}'
                # The triple extends the pair text already built above.
                df[x3_feature_name] = pair_text + '_' + as_text[feature3]
                _register(x3_feature_name)
    return df
                
                
# Extend both frames with the pairwise / triple cross features.
train = add_cross_features(train)
test = add_cross_features(test)

# Cast every column (base and cross features) to pandas' categorical dtype,
# driving the loop from the train columns and applying it to both frames.
for column_name in train.columns:
    for frame in (train, test):
        frame[column_name] = frame[column_name].astype('category')

for frame_label, frame in (('train', train), ('test', test)):
    print(f'{frame_label} shape:', frame.shape)

## PCA

In [None]:
# Scale every feature with a scaler fitted on train only, then apply it to test.
# NOTE(review): the cross features are strings such as '0_1'; confirm they are
# converted to numbers the way you intend before they reach the scaler.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Fit PCA once, keeping as many components as needed to explain 99.9% of the
# variance, and keep the projections. Previously these results were discarded
# and PCA was fitted a second time only to obtain the same projections.
pca = PCA(n_components=0.999)
pca_train = pca.fit_transform(train_scaled)
pca_test = pca.transform(test_scaled)

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

## number of components actually retained by the fit above
n_components = len(pca.explained_variance_ratio_)

# Training matrix as a DataFrame (PCA0, PCA1, ...) aligned with the original train index.
X = pd.DataFrame(pca_train, columns=['PCA%i' % i for i in range(n_components)], index=train.index)

In [None]:
## doublecheck shapes

print('train shape:',X.shape)
print('test shape:',pca_test.shape)
print('target shape:',target.shape)

# Fail fast instead of relying on eyeballing the printed shapes.
assert X.shape[0] == target.shape[0], 'train features and target must have the same number of rows'
assert X.shape[1] == pca_test.shape[1], 'train and test must have the same number of PCA components'

## Tuning

In [None]:
NUM_BOOST_ROUND = 500
EARLY_STOPPING_ROUNDS = 100
VERBOSE_EVAL = 100

def objective(trial, X, y):
    """Optuna objective: train LightGBM on a fixed hold-out split and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like
        Regression target.

    Returns
    -------
    float
        Mean squared error of the model on the 25% validation split.
    """
    param_grid = {
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'n_estimators': trial.suggest_categorical('n_estimators', [2000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 50, 2000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 2000, step=100),
        'max_bin': trial.suggest_int('max_bin', 200, 300),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100, step=5),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100, step=5),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # 'min_child_samples' is no longer sampled: it is an alias of
        # 'min_data_in_leaf', so sampling both added a search dimension that
        # had no effect on the model.
    }

    # 'n_estimators' is an alias for the number of boosting iterations and,
    # when left in the params, silently overrides `num_boost_round`. Resolve
    # the conflict explicitly; the effective value stays the sampled one.
    num_boost_round = param_grid.pop('n_estimators', NUM_BOOST_ROUND)

    # Same split for every trial (fixed random_state) so trials are comparable.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_SEED, shuffle=True
    )

    # The verbose_eval / early_stopping_rounds keyword arguments were removed
    # from lgb.train in LightGBM 4; the callbacks work on old and new releases.
    # `print_evaluation` is the pre-3.3 name of `log_evaluation`.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    model = lgb.train(
        param_grid,
        train_set=lgb.Dataset(X_train, y_train),
        num_boost_round=num_boost_round,
        valid_sets=[lgb.Dataset(X_valid, y_valid)],
        # Only one validation set is passed, so only one name: the previous
        # ["train", "valid"] list labelled the hold-out metrics as "train".
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(EARLY_STOPPING_ROUNDS),
            log_evaluation(VERBOSE_EVAL),
        ],
    )

    yhat = model.predict(X_valid)
    # scikit-learn's convention is (y_true, y_pred).
    return mean_squared_error(y_valid, yhat)

In [None]:
N_TRIALS = 200

if TUNING:
    # Seed the sampler so a re-run of the search proposes the same trials;
    # an unseeded study is not reproducible even though the notebook seeds
    # NumPy and `random`.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    )
    objective_func = lambda trial: objective(trial, X, target)
    study.optimize(objective_func, n_trials=N_TRIALS)  # number of iterations

    # Report the best trial and its hyperparameters.
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

## Model and Training

In [None]:
def run_train(X, y, run_params, splits, num_boost_round, verbose_eval, early_stopping_rounds ):
    """Train one LightGBM model per cross-validation fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target; also used as the stratification labels for the folds.
    run_params : dict
        LightGBM parameters passed to ``lgb.train``.
    splits : int
        Number of folds.
    num_boost_round : int
        Maximum number of boosting rounds per fold.
    verbose_eval : int or bool
        Logging period for the validation metric; a falsy value disables logging.
    early_stopping_rounds : int or None
        Patience for early stopping; ``None`` or a non-positive value disables it.

    Returns
    -------
    models : list
        One trained booster per fold.
    oof_predicted : list
        One array of validation-fold predictions per fold.
    evals_results : dict
        Evaluation history recorded by LightGBM.
        NOTE(review): the same dict is passed to every fold, so it presumably
        ends up holding only the most recent fold's history - confirm.
    """
    models = []
    oof_predicted = []
    evals_results = {}  # to record eval results for plotting

    # The verbose_eval / evals_result / early_stopping_rounds keyword arguments
    # were removed from lgb.train in LightGBM 4; the callbacks work on old and
    # new releases. `print_evaluation` is the pre-3.3 name of `log_evaluation`.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    folds = StratifiedKFold(n_splits=splits)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'Fold {fold_n+1} started')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Fresh callbacks for every fold.
        callbacks = [lgb.record_evaluation(evals_results)]
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            callbacks.append(lgb.early_stopping(early_stopping_rounds, verbose=bool(verbose_eval)))
        if verbose_eval:
            callbacks.append(log_evaluation(int(verbose_eval)))

        model = lgb.train(
            run_params,
            train_set=lgb.Dataset(X_train, y_train),
            num_boost_round=num_boost_round,
            valid_sets=[lgb.Dataset(X_valid, y_valid)],
            # Only one validation set is passed, so only one name: the previous
            # ["train", "valid"] list labelled the hold-out metrics as "train".
            valid_names=['valid'],
            callbacks=callbacks,
        )

        oof_predicted.append(model.predict(X_valid))
        models.append(model)
    return models, oof_predicted, evals_results


In [None]:
TOTAL_SPLITS = 3
# The params used to carry 'n_estimators': 2000, an alias for the number of
# boosting iterations that takes precedence over `num_boost_round`, so the
# previous value of 4000 never applied. The effective limit is now stated here.
NUM_BOOST_ROUND = 2000
EARLY_STOPPING_ROUNDS = 100
VERBOSE_EVAL = 100

# Hyperparameters found by the Optuna search.
run_params = {
    'verbose': -1,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},

    # 'n_estimators' was listed twice (1000, then 2000) and is now expressed
    # through NUM_BOOST_ROUND above.
    'learning_rate': 0.09802528890284778,
    'num_leaves': 1200,
    'max_depth': 7,
    'min_data_in_leaf': 200,
    'max_bin': 257,
    'lambda_l1': 100,
    'lambda_l2': 10,
    'feature_fraction': 0.5291871221439385,
    'bagging_fraction': 0.4529000420532695,
    'bagging_freq': 7,
    # 'min_child_samples': 7 was dropped: it is an alias of 'min_data_in_leaf',
    # which is already set above, so the alias value had no effect.
}

# Keyword arguments keep the call correct even if the signature order changes.
models, oof_predicted, evals_results = run_train(
    X, target, run_params,
    splits=TOTAL_SPLITS,
    num_boost_round=NUM_BOOST_ROUND,
    verbose_eval=VERBOSE_EVAL,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
)

In [None]:
# One prediction vector per fold model on the PCA-projected test set ...
predicted = [fold_model.predict(pca_test) for fold_model in models]

# ... averaged element-wise across the fold models.
avg_preds = np.mean(predicted, axis=0)

In [None]:
# Pair the ids from the sample submission with the averaged predictions.
submission_ids = submission['Id']
submission = pd.DataFrame({'Id': submission_ids, 'Pawpularity': avg_preds})

# Write the file Kaggle scores, with six decimals and no index column.
submission.to_csv('submission.csv', float_format='%.6f', index=False)
submission.head(20)