In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import cv2
import enum
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, roc_auc_score
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# (The Kaggle template's directory-listing snippet is not included here; file paths are built from PATH in the next cell.)

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root folder of the competition data set.
PATH = '../input/petfinder-pawpularity-score'

# Read the three competition CSVs in one pass: training metadata,
# test metadata and the submission template.
train, test, sub = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Dimensions of the training frame.
train.shape

In [None]:
# Peek at the first rows of the training metadata.
train.head()

In [None]:
# Peek at the first rows of the test metadata.
test.head()

In [None]:
# Peek at the submission template loaded from sample_submission.csv.
sub.head()

In [None]:
# Default figure size for this and every later plot.
plt.rcParams.update({"figure.figsize": (15, 6)})

# Box plot of the Pawpularity target with whiskers at the 5th and 95th percentiles.
ax = sns.boxplot(x=train["Pawpularity"], color='red', whis=[5, 95])

In [None]:
# Correlation heatmap of the metadata features and the target.
# `train` still contains the string `Id` column; recent pandas releases no
# longer drop non-numeric columns silently in DataFrame.corr() and raise
# instead, so restrict the frame to numeric/bool columns explicitly.
dataplot = sns.heatmap(
    train.select_dtypes(include=["number", "bool"]).corr(),
    cmap="BuPu",
    annot=True,
    color='red',
)
plt.show()

In [None]:
# Feature columns: every column of `train` except the identifier, the target
# and the stratification helper column.
columns = train.columns[~train.columns.isin(['Id', 'Pawpularity', 'bins'])].tolist()

In [None]:
# One .jpg path under PATH/train per row of `train`, in row order.
IMG_PATH = [
    os.path.join(PATH, 'train/' + image_id + '.jpg')
    for image_id in train['Id']
]
def visualize_image(figsize=(16, 16), n_images=2):
    """Display a random sample of training images in a near-square grid.

    Parameters
    ----------
    figsize : tuple
        Size passed to ``plt.figure``.
    n_images : int
        How many images to draw. They are sampled without replacement from
        the module-level ``IMG_PATH`` list; the count is capped at the number
        of available paths. Non-positive values draw nothing.

    Returns
    -------
    None
    """
    # Nothing to draw: avoids the division by zero in the grid computation.
    if n_images <= 0 or not IMG_PATH:
        return

    # random.sample raises if asked for more items than exist.
    n_images = min(n_images, len(IMG_PATH))

    n_rows = int(n_images ** .5)
    n_cols = math.ceil(n_images / n_rows)
    image_names = random.sample(IMG_PATH, n_images)

    plt.figure(figsize=figsize)
    for ind, image_name in enumerate(image_names):
        img = cv2.imread(image_name)
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            print(f'Could not read image: {image_name}')
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(n_rows, n_cols, ind + 1)
        plt.imshow(img)
        plt.axis("off")
    plt.show()

In [None]:
# Preview 25 randomly sampled training images (laid out as a 5 x 5 grid).
visualize_image(n_images=25)

In [None]:
# Keyword arguments unpacked into xgb.XGBRegressor for every CV fold.
xgb_params = dict(
    n_estimators=30000,
    max_depth=20,
    objective="reg:squarederror",
    n_jobs=4,
    seed=3001,
    tree_method="gpu_hist",
    gpu_id=0,
    eval_metric="rmse",
    subsample=0.7,
    colsample_bytree=0.7,
    learning_rate=0.05,
)

In [None]:
def random_seed(seed=42):
    """Seed the global RNGs used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int
        Seed value shared by all generators (default 42).
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    # NOTE(review): presumably only affects processes started after this
    # point, not the hash seed of the running interpreter - confirm.
    os.environ['PYTHONHASHSEED'] = f'{seed}'
    
@enum.unique
class Config(enum.Enum):
    """Experiment constants; read each setting through ``.value``."""

    # Seed passed to random_seed() and to the fold splitter.
    SEED = 3001
    # Number of cross-validation folds.
    N_FOLDS = 25
    # Early-stopping patience passed to model.fit().
    EARLY_STOP = 300
# Apply the shared experiment seed from Config to the global RNGs.
random_seed(Config.SEED.value)

In [None]:

# Stratified K-fold training of an XGBoost regressor on the metadata features.
# Produces: `oof` (out-of-fold predictions for every training row),
# `test_preds` (test predictions averaged over folds) and `rmse_score`
# (validation RMSE of each fold, in fold order).
rmse_score = []

targets = train['Pawpularity'].values

# Sturges' rule (1 + log2(n)) picks how many bins the continuous target is
# cut into, so that StratifiedKFold can stratify on the binned target.
num_bins = int(np.floor(1 + np.log2(len(train))))

train.loc[:, "bins"] = pd.cut(train["Pawpularity"], bins=num_bins, labels=False)

n_folds = Config.N_FOLDS.value
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=Config.SEED.value)

oof = np.zeros((train.shape[0],))
test_preds = 0

# Wrap the split generator (not enumerate) and pass `total` so the progress
# bar knows how many folds to expect.
fold_splits = tqdm(kf.split(X=train, y=train.bins.values), total=n_folds)
for f, (train_idx, val_idx) in enumerate(fold_splits):
    # Feature columns are selected once here; no need to re-index below.
    df_train, df_val = train.iloc[train_idx][columns], train.iloc[val_idx][columns]
    train_target, val_target = targets[train_idx], targets[val_idx]

    model = xgb.XGBRegressor(**xgb_params)

    # NOTE(review): newer XGBoost releases appear to expect
    # `early_stopping_rounds` on the estimator constructor rather than on
    # fit() - confirm against the installed version.
    model.fit(
        df_train,
        train_target,
        eval_set=[(df_val, val_target)],
        early_stopping_rounds=Config.EARLY_STOP.value,
        verbose=500
    )

    oof_tmp = model.predict(df_val)
    test_tmp = model.predict(test[columns])

    oof[val_idx] = oof_tmp
    # Each fold contributes an equal share to the averaged test prediction.
    test_preds += test_tmp / n_folds

    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated/removed in recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(val_target, oof_tmp))
    rmse_score.append(rmse)
    print(f'FOLD: {f} RMSE: {rmse} Mean RMSE: {np.mean(rmse_score)}')

# Score the complete out-of-fold vector once all folds are done.
print(f'Overall OOF RMSE: {np.sqrt(mean_squared_error(targets, oof))}')

In [None]:
# Write the fold-averaged test predictions into the submission template
# and save it as submission.csv (without the index column).
sub['Pawpularity'] = test_preds
sub.to_csv('submission.csv', index=False)