In [None]:
# Install skorch 0.10.0 from a local wheel file (needed for NeuralNetRegressor below)
!pip install ../input/torchnotebook/skorch-0.10.0-py3-none-any.whl

In [None]:
import os
import numpy as np
import pandas as pd
from copy import deepcopy
import cv2


from sklearn.model_selection import StratifiedKFold

# encoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

# machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# neuralnet
from skorch import NeuralNetRegressor
from sklearn.base import BaseEstimator, TransformerMixin

# inspection
from sklearn.inspection import permutation_importance

# torch
from torch import optim
from torch.optim.lr_scheduler import CyclicLR

import torch
import torch.nn as nn

# visualize
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

# Seed passed to StratifiedKFold and permutation_importance
random_seed = 2021
# Number of cross-validation folds used by split_data() and run()
num_folds = 5

### Load Data

In [None]:
# Training metadata; the code below relies on the 'Id' and 'Pawpularity' columns
df = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
df.head()

In [None]:
# Print the number of distinct values per column
for col, n_unique in df.nunique().items():
    print(col, n_unique)

### Feature Engineering

In [None]:
def load_img(img_file_path):
    """Read an image from disk and return it in RGB channel order.

    Parameters
    ----------
    img_file_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        The image after converting OpenCV's BGR channel order to RGB.

    Raises
    ------
    OSError
        If OpenCV could not read the file (missing, unreadable or undecodable).
    """
    img = cv2.imread(img_file_path)
    # cv2.imread reports failure by returning None rather than raising, so
    # check here to get an error that names the offending file.
    if img is None:
        raise OSError(f'Could not read image file: {img_file_path}')
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
# One JPEG path per training row, in the same order as df['Id']
img_file_paths = [
    f'/kaggle/input/petfinder-pawpularity-score/train/{img_id}.jpg'
    for img_id in df['Id']
]

#### strategy1: add image info (width, height, file-size)

In [None]:
# Work on a copy so the original metadata frame stays untouched
img_stg1_df = df.copy()

In [None]:
def get_shape_info(img, img_file_path):
    """Return size features of an image as ``[width, height, width/height, file size]``.

    The order matches the column names the values are stored under
    (``'width', 'height', 'w_h_ratio', 'size'``).

    Parameters
    ----------
    img : numpy.ndarray
        Image array whose first axis is rows (height) and second axis is
        columns (width).
    img_file_path : str
        Path of the image file; used only to read its size on disk.

    Returns
    -------
    pandas.Series
        Four values: width in pixels, height in pixels, width divided by
        height, and file size in bytes.
    """
    # Array layout is (rows, cols, ...), i.e. shape[0] is the height and
    # shape[1] is the width.
    height, width = img.shape[0], img.shape[1]
    return pd.Series([width, height, width / height, os.path.getsize(img_file_path)])

def add_shape_info(img_file_path):
    """Load the image at ``img_file_path`` and return its shape features."""
    return get_shape_info(load_img(img_file_path), img_file_path)

In [None]:
# Column names assigned positionally to the four values returned by get_shape_info()
shape_cols = ['width', 'height', 'w_h_ratio', 'size']

In [None]:
# Reads every training image once; tqdm shows progress
img_stg1_df[shape_cols] = [add_shape_info(path) for path in tqdm(img_file_paths)]

#### strategy2: add image rgb info (min, max, mean, std)

In [None]:
# Work on a copy so the original metadata frame stays untouched
img_stg2_df = df.copy()

In [None]:
def get_stats(data):
    """Return ``(min, max, mean, std)`` computed over all values in ``data``."""
    return np.min(data), np.max(data), np.mean(data), np.std(data)

def get_rgb_info(img):
    """Compute per-channel intensity statistics of an RGB image.

    Parameters
    ----------
    img : numpy.ndarray
        Image with the colour channels on the last axis in R, G, B order
        (the layout returned by ``load_img``).

    Returns
    -------
    numpy.ndarray
        Twelve values: ``min, max, mean, std`` for R, then G, then B.
    """
    # Channels live on the LAST axis; img[0] / img[1] / img[2] would select
    # the first three pixel rows instead of the colour planes.
    r = img[:, :, 0]
    g = img[:, :, 1]
    b = img[:, :, 2]

    return np.concatenate([get_stats(r), get_stats(g), get_stats(b)])

def add_rgb_info(img_file_path):
    """Load the image at ``img_file_path`` and return its RGB statistics."""
    return get_rgb_info(load_img(img_file_path))

In [None]:
# Column names for the per-channel statistics, channel-major: r_min, r_max, ..., b_std
rgb_cols = np.array([
    f'{channel}_{stat}'
    for channel in ('r', 'g', 'b')
    for stat in ('min', 'max', 'mean', 'std')
])

In [None]:
# Reads every training image once; tqdm shows progress
img_stg2_df[rgb_cols] = [add_rgb_info(path) for path in tqdm(img_file_paths)]

#### strategy3: strategy1 + strategy2

In [None]:
# Work on a copy so the original metadata frame stays untouched
img_stg3_df = df.copy()

In [None]:
# Reuse the columns already computed for strategies 1 and 2 instead of re-reading the images
img_stg3_df[shape_cols] = img_stg1_df[shape_cols]
img_stg3_df[rgb_cols] = img_stg2_df[rgb_cols]

In [None]:
# Preview the combined feature frame
img_stg3_df.head()

### Split Data

In [None]:
def split_data(df):
    """Split ``df`` into stratified K-fold train/test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'Id' column, the 'Pawpularity' target and the
        feature columns.

    Returns
    -------
    dict
        Maps fold index to a dict with keys 'X_train', 'X_test', 'y_train'
        and 'y_test'. Uses the module-level ``num_folds`` and ``random_seed``.
    """
    inputs = df.drop(['Id', 'Pawpularity'], axis=1)
    target = df['Pawpularity']
    # Stratification needs discrete labels, so bucket the target by integer division by 4.
    strata = [score // 4 for score in target]

    splitter = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_seed)

    return {
        fold: {
            'X_train': inputs.iloc[train_idx],
            'X_test': inputs.iloc[test_idx],
            'y_train': target.iloc[train_idx],
            'y_test': target.iloc[test_idx],
        }
        for fold, (train_idx, test_idx) in enumerate(splitter.split(inputs, strata))
    }

### Build Pipeline 
- All features are binary

In [None]:
class RegressorModule(nn.Module):
    """Fully connected regressor: num_input -> 16 -> 16 -> 12 -> 8 -> 1.

    A ReLU follows every linear layer except the final one-unit output layer.
    """

    def __init__(self, num_input):
        super().__init__()

        widths = [num_input, 16, 16, 12, 8]
        layers = []
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(in_dim, out_dim))
            layers.append(nn.ReLU())
        # Output head: single value, no activation
        layers.append(nn.Linear(widths[-1], 1))

        self.sequence = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.sequence(x)


class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(yhat, y) + eps)``."""

    def __init__(self, eps=1e-6):
        super(RMSELoss, self).__init__()
        # Small constant added under the square root
        # (presumably to keep the gradient finite when the MSE is exactly 0).
        self.eps = eps
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return the scalar RMSE between predictions ``yhat`` and targets ``y``."""
        mean_sq_err = self.mse(yhat, y)
        return torch.sqrt(mean_sq_err + self.eps)

In [None]:
def get_preprocessor(features, num_features, degree):
    """Build the column preprocessor for the numeric features.

    Parameters
    ----------
    features : list of str
        All input column names (currently unused; kept for interface
        compatibility).
    num_features : sequence of str
        Names of the numeric columns to expand and scale. Not modified.
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    ColumnTransformer
        Applies polynomial expansion followed by ``RobustScaler`` to the
        numeric columns (in sorted name order) and passes every other column
        through unchanged.
    """
    # sorted() builds a new list; calling .sort() on the argument would
    # reorder the caller's list/ndarray in place (e.g. a shared column list).
    num_features = sorted(str(name) for name in num_features)
    num_transformer = Pipeline(
        steps=[
            ("polynomial", PolynomialFeatures(degree=degree)),
            ("scaler", RobustScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[('num', num_transformer, num_features)],
        remainder='passthrough'
    )

    return preprocessor

In [None]:
class FloatTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that converts its input to a float32 NumPy array."""

    def __init__(self):
        """No parameters to configure."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, x):
        """Return ``x`` as a new ``numpy.float32`` array."""
        as_float32 = np.array(x, dtype=np.float32)
        return as_float32

In [None]:
def get_model(method, num_features):
    """Create an unfitted regressor of the requested kind.

    Parameters
    ----------
    method : str
        ``'lr'`` for linear regression, ``'rf'`` for a random forest,
        ``'nn'`` for the skorch neural network.
    num_features : int
        Number of input columns the model receives; only used to size the
        first layer of the neural network.

    Returns
    -------
    estimator
        A scikit-learn compatible regressor.

    Raises
    ------
    ValueError
        If ``method`` is not one of ``'lr'``, ``'rf'``, ``'nn'``.
    """
    if method == 'lr':
        return LinearRegression(fit_intercept=True)

    if method == 'rf':
        return RandomForestRegressor(
            n_estimators=200,
            min_samples_split=3,
            min_samples_leaf=2,
            # Seed the forest like the other seeded steps so reruns are repeatable.
            random_state=random_seed,
        )

    if method == 'nn':
        return NeuralNetRegressor(
            RegressorModule(num_input=num_features),
            max_epochs=100, verbose=0,
            warm_start=True,
            # device='cuda',
            criterion=RMSELoss,
            optimizer=optim.AdamW,
            optimizer__lr=0.001
        )

    # Previously an unknown method fell through to `return model` and raised
    # an UnboundLocalError with no hint about the cause.
    raise ValueError(f"Unknown method {method!r}; expected 'lr', 'rf' or 'nn'")

In [None]:
def build_pipeline(features, method, degree=1, num_features=None):
    """Assemble the preprocessing + model pipeline.

    Parameters
    ----------
    features : list of str
        Names of all input columns.
    method : str
        Model kind understood by ``get_model`` (``'lr'``, ``'rf'``, ``'nn'``).
    degree : int, default 1
        Polynomial degree applied to the numeric columns.
    num_features : sequence of str or None, default None
        Numeric columns to expand and scale. When None, no preprocessor step
        is added and the inputs are only cast to float32.

    Returns
    -------
    Pipeline
        ``[preprocessor,] float64to32, model``.
    """
    steps = []

    if num_features is None:
        num_inputs = len(features)
    else:
        steps.append(('preprocessor', get_preprocessor(features, num_features, degree)))

        # PolynomialFeatures (with its default bias column) turns n columns
        # into C(n + degree, degree) columns. For degree=1 that is n + 1,
        # which is the previously hard-coded `len(features) + 1`.
        n_numeric = len(num_features)
        n_poly = 1
        for k in range(1, degree + 1):
            n_poly = n_poly * (n_numeric + k) // k
        num_inputs = n_poly + (len(features) - n_numeric)

    steps.append(('float64to32', FloatTransformer()))
    steps.append(('model', get_model(method, num_inputs)))

    return Pipeline(steps)

### Metric

In [None]:
def rmse_score(pred, true):
    """Return the root-mean-squared error between ``pred`` and ``true``.

    Parameters
    ----------
    pred, true : array-like
        Predictions and targets. If they hold the same number of elements
        but differ in shape (e.g. ``(n, 1)`` vs ``(n,)``), ``pred`` is
        reshaped to match ``true`` before comparing.

    Returns
    -------
    float
        ``sqrt(mean((pred - true) ** 2))``.
    """
    pred = np.asarray(pred)
    true = np.asarray(true)
    # A column vector minus a 1-D vector broadcasts to an (n, n) matrix and
    # silently yields a wrong score; align the shapes first.
    if pred.shape != true.shape and pred.size == true.size:
        pred = pred.reshape(true.shape)
    return np.sqrt(np.mean((pred - true) ** 2))

### Run

In [None]:
def run(df, num_features=None):
    """Cross-validate the three models on ``df`` and plot permutation importances.

    For every fold this fits a linear regression, a random forest and the
    neural network, prints each model's RMSE and the RMSE of their simple
    average, then draws one permutation-importance bar chart per model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Id', 'Pawpularity' and the feature columns.
    num_features : sequence of str or None, default None
        Numeric columns to preprocess; forwarded to ``build_pipeline``.
    """
    data_per_fold = split_data(df)
    features = data_per_fold[0]['X_train'].columns.tolist()

    for fold in range(num_folds):
        print('fold',fold)

        data = data_per_fold[fold]
        X_train, X_test = data['X_train'], data['X_test']
        y_train, y_test = data['y_train'], data['y_test']
        y_true = y_test.to_numpy()

        # Build fresh pipelines for every fold. The network is created with
        # warm_start=True, so reusing one instance would keep training the
        # weights fitted on earlier folds, whose training data contains this
        # fold's test rows.
        model_lr = build_pipeline(features, 'lr', num_features=num_features)
        model_rf = build_pipeline(features, 'rf', num_features=num_features)
        model_nn = build_pipeline(features, 'nn', num_features=num_features)

        model_lr.fit(X_train, y_train)
        pred_lr = model_lr.predict(X_test)
        rmse_lr = rmse_score(pred_lr, y_true)
        print('lr: ', rmse_lr)

        model_rf.fit(X_train, y_train)
        pred_rf = model_rf.predict(X_test)
        rmse_rf = rmse_score(pred_rf, y_true)
        print('rf: ', rmse_rf)

        model_nn.fit(X_train, y_train.astype(np.float32).values.reshape(-1, 1))
        # The network predicts a column of shape (n, 1); flatten it so it
        # lines up element-wise with the 1-D targets and the other
        # predictions instead of broadcasting to (n, n).
        pred_nn = model_nn.predict(X_test).squeeze(1)
        rmse_nn = rmse_score(pred_nn, y_true)
        print('nn: ', rmse_nn)

        rmse_ensemble = rmse_score((pred_lr + pred_rf + pred_nn) / 3, y_true)
        print('ensemble: ', rmse_ensemble)

        pi_lr = permutation_importance(model_lr, X_test, y_test, n_repeats=30, random_state=random_seed)
        pi_rf = permutation_importance(model_rf, X_test, y_test, n_repeats=30, random_state=random_seed)
        pi_nn = permutation_importance(model_nn, X_test, y_test, n_repeats=30, random_state=random_seed)

        # visualize
        fig, axs = plt.subplots(ncols=3, figsize=(15, 5), constrained_layout=True, sharey=True)

        for ax, pi, title in zip(axs, [pi_lr, pi_rf, pi_nn], ["Linear Reg.", "Random Forest", "Neural Net"]):
            ax.barh(X_test.columns, pi.importances_mean, xerr=pi.importances_std, color="orange")
            ax.invert_yaxis()
            ax.set_xlim(0, )
            ax.set_title(title, pad=16)

        plt.show()

In [None]:
# run(df)

In [None]:
# run(img_stg1_df, shape_cols)

In [None]:
# run(img_stg2_df, rgb_cols)

In [None]:
# run(img_stg3_df, np.concatenate([shape_cols, rgb_cols]))

### Feature selection

- Drop the features that had the least effect in the experiments above:
    - 'Subject Focus', 'Action', 'Collage'
- Add the features that had a positive effect in the experiments above:
    - 'w_h_ratio', 'size'
    - I think 'width' and 'height' can be replaced by 'w_h_ratio' and 'size'
- Drop the RGB features, because embeddings from a deep learning model would likely work better

In [None]:
final_df = df.drop(['Subject Focus', 'Action', 'Collage'], axis=1)
# Reuse the shape features already computed into img_stg1_df instead of
# reading every training image a second time; only the two kept columns are
# copied, so there is no need to add and then drop 'width' / 'height'.
final_df[['w_h_ratio', 'size']] = img_stg1_df[['w_h_ratio', 'size']]

# Still includes 'Id' and 'Pawpularity'; reassigned from X_train in the Predict section.
features = final_df.columns.tolist()

### Predict

In [None]:
# Model inputs exclude the identifier and the target
y_train = final_df['Pawpularity']
X_train = final_df.drop(columns=['Id', 'Pawpularity'])

features = list(X_train.columns)
# Columns routed through the polynomial + scaling preprocessor
num_features = ['w_h_ratio', 'size']

In [None]:
# run(final_df, num_features)

In [None]:
test_df = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv')

test_img_file_paths = [
    f'/kaggle/input/petfinder-pawpularity-score/test/{img_id}.jpg'
    for img_id in test_df['Id']
]

# Apply the same column selection and shape features as for the training frame
X_test = test_df.drop(columns=['Id', 'Subject Focus', 'Action', 'Collage'])
X_test[shape_cols] = [add_shape_info(path) for path in test_img_file_paths]
X_test = X_test.drop(columns=['width', 'height'])

In [None]:
# One pipeline per model family, all trained on the full training set
model_lr, model_rf, model_nn = (
    build_pipeline(features, method, num_features=num_features)
    for method in ('lr', 'rf', 'nn')
)

pred_lr = model_lr.fit(X_train, y_train).predict(X_test)

pred_rf = model_rf.fit(X_train, y_train).predict(X_test)

# The network is trained on a float32 column-vector target and predicts a
# column; squeeze it to 1-D so it adds element-wise to the other predictions.
model_nn.fit(X_train, y_train.astype(np.float32).values.reshape(-1, 1))
pred_nn = model_nn.predict(X_test).squeeze(1)

# Unweighted average of the three models
pred_ensemble = (pred_lr + pred_rf + pred_nn) / 3

In [None]:
# Two-column submission file: Id and the ensemble prediction
submission = test_df[['Id']].assign(Pawpularity=pred_ensemble)
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as a final sanity check
submission