In [None]:
import numpy as np 
import pandas as pd
import random
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten
from tensorflow.keras import layers
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Image geometry and training hyper-parameters.
IMG_SIZE, CHANNELS = 224, 3
BATCH_SIZE, EPOCHS = 16, 10
SEED = 2021

# Input locations; every directory string keeps its trailing slash so that
# file names can be appended directly.
DATA_DIR = '../input/petfinder-pawpularity-score/'
TRAIN_DIR = f'{DATA_DIR}train/'
TEST_DIR = f'{DATA_DIR}test/'

In [None]:
# Configure the distribution strategy: try to attach to a TPU first and fall
# back to the default strategy (GPU/CPU) when that attempt raises ValueError.
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable name; the former
    # tf.distribute.experimental.TPUStrategy alias is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # No TPU path available: enable XLA auto-clustering and use the default strategy.
    tf.config.optimizer.set_jit(enabled="autoclustering")
    strategy = tf.distribute.get_strategy()

In [None]:
# Load the training metadata and turn every Id into the path of its JPEG.
sample_df = pd.read_csv(DATA_DIR + 'train.csv')
sample_df['Id'] = [f'{TRAIN_DIR}{image_id}.jpg' for image_id in sample_df['Id']]

# Pawpularity divided by 100; this is the label fed to the image pipeline below.
sample_df['target_value'] = sample_df['Pawpularity'].div(100.)

In [None]:
# Shuffle reproducibly, then keep the first 80% of rows for training and the
# remaining rows for validation.
sample_df = shuffle(sample_df, random_state=SEED)
train_size = int(0.8 * len(sample_df))
train_df, validation_df = sample_df.iloc[:train_size], sample_df.iloc[train_size:]
train_df.head()

In [None]:
def _as_path_target_dataset(frame):
    """Build a tf.data.Dataset of (Id, target_value) pairs taken from ``frame``."""
    return tf.data.Dataset.from_tensor_slices(
        (frame['Id'].values, frame['target_value'].values)
    )


training_data = _as_path_target_dataset(train_df)
validation_data = _as_path_target_dataset(validation_df)

In [None]:
def load_image_and_label_from_path(image_path, label):
    """Read one JPEG file and return the processed image together with its label.

    The file at ``image_path`` is decoded into ``CHANNELS`` channels, resized
    with padding to ``IMG_SIZE`` x ``IMG_SIZE``, then passed through a
    brightness adjustment (delta 0.5) and a saturation adjustment (factor 3).
    ``label`` is returned unchanged.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    padded = tf.image.resize_with_pad(decoded, IMG_SIZE, IMG_SIZE)
    # NOTE(review): the resized image is presumably float on a 0-255 scale, in
    # which case a brightness delta of 0.5 is almost a no-op — confirm intent.
    brightened = tf.image.adjust_brightness(padded, 0.5)
    saturated = tf.image.adjust_saturation(brightened, 3)
    return saturated, label

# AUTOTUNE lets tf.data pick the parallelism / buffer sizes dynamically.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Replace the (path, label) datasets with (image, label) datasets for both
# the training and the validation split.
training_data, validation_data = (
    path_dataset.map(load_image_and_label_from_path, num_parallel_calls=AUTOTUNE)
    for path_dataset in (training_data, validation_data)
)

In [None]:
# Cut the train and validation datasets into batches suitable for training.
# Only the training stream is shuffled, and the shuffle is seeded with SEED so
# the buffer order is repeatable. Validation order does not influence the
# evaluation metrics, so the 1000-image shuffle buffer is skipped there.
training_data_batches = (
    training_data
    .shuffle(buffer_size=1000, seed=SEED)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)
validation_data_batches = (
    validation_data
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Data preparation


In [None]:
# Tabular metadata indexed by image Id. The paths are built from DATA_DIR so
# the dataset location is defined in exactly one place (the config cell).
tabular_train = pd.read_csv(f'{DATA_DIR}train.csv', index_col='Id')
tabular_test = pd.read_csv(f'{DATA_DIR}test.csv', index_col='Id')

# EDA

In [None]:
# Preview the first five rows of the training split.
# NOTE(review): this shows the image-path frame, not tabular_train — confirm
# which one the EDA section is meant to inspect.
train_df.head(n=5)

# Correlation heatmap

In [None]:
# Pairwise correlations between all tabular columns, drawn as an annotated heatmap.
corr_matrix = tabular_train.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt='.2f',
    linewidths=.5,
    cmap='Blues',
    ax=ax,
)

In [None]:
# Metadata columns left out of the feature set for both train and test.
dropped_features = ['Action', 'Human']

# Target is the raw Pawpularity score; features are everything else that remains.
y = tabular_train['Pawpularity']
X = tabular_train.drop(columns=dropped_features + ['Pawpularity'])
tabular_test = tabular_test.drop(columns=dropped_features)

In [None]:
# Hold out 20% of the tabular rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Modeling

In [None]:
# XGBoost regressor for the tabular features.
# `silent` and `nthread` are no longer accepted by current XGBoost releases;
# `verbosity=0` and `n_jobs=-1` are the supported equivalents.
# NOTE(review): max_depth=500 is unusually deep for a handful of binary
# features — confirm it is intentional.
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.01,
    max_depth=500,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    verbosity=0,
    random_state=7,
    n_jobs=-1,
)

In [None]:
# LightGBM regressor; hyper-parameters are collected in a dict and unpacked
# into the constructor.
lgb_params = {
    'objective': 'regression',
    'num_leaves': 5,
    'learning_rate': 0.01,
    'n_estimators': 2000,
    'max_bin': 55,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.2319,
    'feature_fraction_seed': 9,
    'bagging_seed': 9,
    'min_data_in_leaf': 6,
    'min_sum_hessian_in_leaf': 11,
}
model_lgb = lgb.LGBMRegressor(**lgb_params)

In [None]:
# CatBoost regressor; hyper-parameters are collected in a dict and unpacked
# into the constructor.
catboost_params = {
    'iterations': 3000,
    'learning_rate': 0.01,
    'depth': 1,
    'l2_leaf_reg': 20,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.6,
    'eval_metric': 'RMSE',
    'metric_period': 50,
    'od_type': 'Iter',
    'od_wait': 45,
    'random_seed': 17,
    'allow_writing_files': False,
    'verbose': False,
}
model_CBR = CatBoostRegressor(**catboost_params)

In [None]:
# Fit the three tabular regressors on the same training split.
for tabular_model in (model_xgb, model_lgb, model_CBR):
    tabular_model.fit(X_train, y_train)

In [None]:
# Validation RMSE of each model, printed in the order xgb, lgb, CatBoost.
# mean_squared_error is documented as (y_true, y_pred), so the true labels
# are passed first.
for fitted_model in (model_xgb, model_lgb, model_CBR):
    print(np.sqrt(mean_squared_error(y_val, fitted_model.predict(X_val))))
# Values noted in the original notebook (presumably from an earlier run):
# 21.15766550903435, 21.167607670109277, 21.15390571577174

In [None]:
# Validation-set predictions of each fitted model.
pred_xgb, pred_lgb, pred_CBR = (
    fitted_model.predict(X_val)
    for fitted_model in (model_xgb, model_lgb, model_CBR)
)

In [None]:
# Equal-weight average of the three models' validation predictions.
val_predictions = (pred_xgb, pred_lgb, pred_CBR)
pred_ensemble = sum(val_predictions) / len(val_predictions)

In [None]:
# Validation RMSE of the averaged ensemble; arguments follow the documented
# mean_squared_error(y_true, y_pred) order.
ensemble_rmse = np.sqrt(mean_squared_error(y_val, pred_ensemble))
print(ensemble_rmse)
# Value noted in the original notebook (presumably from an earlier run):
# 21.155598840619007

In [None]:
# Load the test metadata and turn every Id into the path of its JPEG.
test_df = pd.read_csv(DATA_DIR + 'test.csv')
test_df['Id'] = [f'{TEST_DIR}{image_id}.jpg' for image_id in test_df['Id']]
# Constant 0 column (presumably a placeholder for the score to be predicted).
test_df['Pawpularity'] = 0

# Summary
print(f'test_df: {test_df.shape}')
test_df.head()

In [None]:
# Test-set predictions of each fitted model and their equal-weight average.
test_predictions = [
    fitted_model.predict(tabular_test)
    for fitted_model in (model_xgb, model_lgb, model_CBR)
]
pred_xgb1, pred_lgb1, pred_CBR1 = test_predictions

pred_ensemble1 = sum(test_predictions) / 3

In [None]:
# Assemble the submission frame with an explicit, ordered column layout.
# (The previous `columns={...}` was a set: its order is not guaranteed and
# recent pandas versions reject a set for `columns`.)
submission = pd.DataFrame({
    # Recover the bare image id from '<TEST_DIR><id>.jpg'.
    'Id': test_df['Id'].map(lambda image_path: image_path.split('/')[-1].split('.')[0]),
    # The tabular models were fit on the raw Pawpularity column, so their
    # predictions are already on the 0-100 scale and must not be multiplied
    # by 100 again.
    'Pawpularity': pred_ensemble1,
})
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)