In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Summarise what is available under the input directory.
# The original loop built every file path and threw the result away (its
# per-file print was commented out), so it produced nothing; a per-directory
# file count gives a short, useful overview instead. Using a comprehension also
# keeps the throwaway `_` out of the notebook namespace, where it would shadow
# IPython's "last output" variable.
input_file_counts = {
    dirname: len(filenames)
    for dirname, _, filenames in os.walk('/kaggle/input')
}
for dirname, n_files in input_file_counts.items():
    print(f'{dirname}: {n_files} files')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

In [None]:
# Read the competition's train.csv into a DataFrame; the bare name on the last
# line makes the notebook render it as the cell output.
train_df = pd.read_csv(
    filepath_or_buffer='../input/petfinder-pawpularity-score/train.csv'
)
train_df

In [None]:
# Read the competition's test.csv into a DataFrame and display it.
test_df = pd.read_csv(
    filepath_or_buffer='../input/petfinder-pawpularity-score/test.csv'
)
test_df

In [None]:
# Read the competition's sample_submission.csv into a DataFrame and display it.
scr_df = pd.read_csv(
    filepath_or_buffer='../input/petfinder-pawpularity-score/sample_submission.csv'
)
scr_df

**The test set is too small! So we first merge the train and test data, and later split the combined data into train and test sets.**

In [None]:
# Attach the sample-submission columns to the test rows, matching on 'Id'.
#
# Dropping any existing 'Pawpularity' column first makes this cell idempotent:
# re-running the plain merge on an already-merged test_df would yield
# 'Pawpularity_x' / 'Pawpularity_y' columns and silently break the positional
# column slicing (`df.columns[1:-1]`) used further down.
# Assumes the raw test.csv has no 'Pawpularity' column of its own — confirm.
#
# NOTE(review): the values in sample_submission.csv are presumably placeholder
# predictions, not ground-truth scores — confirm before treating these rows as
# labelled data in the combined frame.
test_df = (
    test_df
    .drop(columns=['Pawpularity'], errors='ignore')
    .merge(scr_df, how='outer', on='Id')
)
test_df

In [None]:
# Stack train_df and test_df row-wise into one frame with a fresh 0..n-1 index,
# then display it.
df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)
df

**Let's get some information about the data, such as the attribute types, and check for missing values.**

In [None]:
# Print a concise summary of the combined frame (columns, non-null counts,
# dtypes, memory usage) to check attribute types and missing values.
df.info()

In [None]:
# Descriptive statistics for the numeric columns, transposed so that each
# column of df becomes one row of the summary table.
df.describe().transpose()

# Data Visualization

In [None]:
import plotly 
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Columns to plot: everything between the first and the last column of df.
# NOTE(review): this positional slice assumes the first column is the
# identifier and the last one is 'Pawpularity' — confirm against df.columns.
col_var = df.columns[1:-1].tolist()

# One figure per attribute: a box plot of Pawpularity per attribute value on
# the left, and the Pawpularity distribution split by attribute value on the right.
for i in col_var:
    fig, ax = plt.subplots(1, 2, figsize=(14, 4))
    sns.boxplot(data=df, x=i, y='Pawpularity', ax=ax[0])
    sns.histplot(data=df, x='Pawpularity', hue=i, kde=True, ax=ax[1])
    fig.suptitle(i, fontsize=20)
    # plt.show() renders the figure under the inline backend; Figure.show()
    # expects a GUI backend and only emits a warning in a notebook.
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

**As you can see, there isn't any significant relationship between the target (Pawpularity) and the data attributes! For almost every attribute, the zeros and the ones cover the same range of Pawpularity values!**

**For this reason, it can be assumed that this data is not suitable for training, and that it is better to use the images themselves.**

# Data Training

**Preprocessing:**

* Box-Cox normalization for Pawpularity only, because the other attributes are binary!

In [None]:
# Histogram of the target column with 100 bins (before any transformation).
df.loc[:, 'Pawpularity'].hist(bins=100)

In [None]:
from scipy import stats

# Box-Cox transform the target in place and plot the transformed distribution.
#
# The fitted lambda is kept (instead of being discarded into `_`, which would
# also shadow IPython's "last output" variable) because it is required to map
# predictions back to the original scale, e.g. with
# scipy.special.inv_boxcox(pred, pawpularity_lambda).
#
# NOTE: this overwrites df['Pawpularity']; re-running the cell without first
# re-creating df would transform the already-transformed values again.
pawpularity_boxcox, pawpularity_lambda = stats.boxcox(df['Pawpularity'])
df['Pawpularity'] = pawpularity_boxcox
df['Pawpularity'].hist(bins=100)

In [None]:
# Feature matrix: every column between the first and the last one.
# Target vector: the 'Pawpularity' column.
X = df.loc[:, df.columns[1:-1]]
y = df.loc[:, 'Pawpularity']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

**1. Random Forest Regression**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Grid search over random-forest hyperparameters with 5-fold cross-validation.
rfr = RandomForestRegressor()
rfr_param = {
    # 'criterion' is intentionally not listed: the default is the squared-error
    # criterion under both its old name ('mse') and its new name
    # ('squared_error'), so omitting it works across scikit-learn versions
    # ('mse' was removed in 1.2).
    #
    # None replaces 'auto': for a regressor both mean "use all features", and
    # 'auto' is no longer accepted by recent scikit-learn releases.
    'max_features': ['sqrt', None, 'log2'],
    'max_depth': [4, 5, 6],
    #'warm_start': [True],
    'bootstrap': [True],
    #'oob_score': [True],
    #'min_samples_split': [5, 10, 20, 30],
    'min_samples_leaf': [10, 20, 30],
    'max_samples': [0.75],
    'n_estimators': [50, 100, 150, 200],
    'random_state': [42],
}

rfr_grd = GridSearchCV(rfr,
                       rfr_param,
                       cv=5,
                       n_jobs=-1,
                       verbose=1)

# Tune on the training split only. Fitting on the full X, y would let the
# hold-out rows influence the chosen hyperparameters and make the later
# score on X_test optimistic; this also matches the other grid searches.
rfr_grd.fit(X_train, y_train)
print(rfr_grd.best_params_)

In [None]:
# Refit a random forest with fixed hyperparameters and evaluate it on the
# hold-out split.
# 'criterion' is left at its default (squared error): the explicit 'mse'
# spelling is rejected by scikit-learn >= 1.2, while the default selects the
# same criterion in every version.
rfr = RandomForestRegressor(
    bootstrap=True,
    max_depth=5,
    max_features='sqrt',
    max_samples=0.75,
    min_samples_leaf=10,
    n_estimators=150,
    random_state=42,
)

rfr.fit(X_train, y_train)
print(rfr.score(X_test, y_test))  # R^2 (coefficient of determination) on the hold-out split

**As we guessed, the RandomForestRegressor model did not train well because of the data used.**

**2. K-Neighbors Regression**

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Grid search over k-nearest-neighbours hyperparameters (5-fold CV, all cores).
# NOTE(review): 'euclidean' does not use `p`, and 'minkowski' with p=2 is the
# euclidean distance, so several grid points look redundant — confirm before
# trimming the grid.
knr = KNeighborsRegressor()
knr_param = dict(
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=[2, 5, 10, 20, 30],
    metric=['minkowski', 'euclidean'],
    p=[2, 3],
    n_neighbors=[20, 40, 80, 100],
)

knr_grd = GridSearchCV(estimator=knr, param_grid=knr_param, cv=5, n_jobs=-1, verbose=1)

# Search on the training split only, then report the winning combination.
knr_grd.fit(X_train, y_train)
print(knr_grd.best_params_)

In [None]:
# Refit k-nearest-neighbours with fixed hyperparameters and print its R^2 on
# the hold-out split.
knr = KNeighborsRegressor(
    n_neighbors=100,
    algorithm='kd_tree',
    leaf_size=2,
    metric='minkowski',
    p=2,
)

knr.fit(X_train, y_train)
print(knr.score(X_test, y_test))

**OMG! It just got worse :)**

**3. XGBRegressor**

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Grid search over XGBoost regressor hyperparameters (5-fold CV).
# Single-valued entries pin a setting; multi-valued entries are searched.
xgbr = xgb.XGBRegressor()
xgbr_param1 = dict(
    booster=['gbtree'],
    tree_method=['gpu_hist'],
    predictor=['gpu_predictor'],
    objective=['reg:squarederror'],
    eval_metric=['rmse'],
    learning_rate=[0.01, 0.05, 0.1, 0.5],
    gamma=[0, 0.25, 0.5, 1],
    max_depth=[4, 5, 6, 7],
    subsample=[0.75],
    n_estimators=[50, 100, 200],
    random_state=[42],
)

xgbr_grd1 = GridSearchCV(estimator=xgbr, param_grid=xgbr_param1, cv=5, n_jobs=-1, verbose=1)

# Search on the training split only, then report the winning combination.
xgbr_grd1.fit(X_train, y_train)
print(xgbr_grd1.best_params_)

In [None]:
# Refit XGBoost with fixed hyperparameters and evaluate on the hold-out split.
xgbr1 = xgb.XGBRegressor(
    booster='gbtree',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    objective='reg:squarederror',
    eval_metric='rmse',
    learning_rate=0.05,
    gamma=1,
    max_depth=4,
    # 0.75 is the only subsample value the grid search evaluated; the previous
    # 0.8 was a configuration that had never been cross-validated.
    subsample=0.75,
    n_estimators=100,
    random_state=42,
)

# eval_metric is already set on the estimator above; passing it to fit() as
# well is deprecated and no longer accepted by recent XGBoost releases.
xgbr1.fit(X_train, y_train, eval_set=[(X_test, y_test)])
xgbr1.score(X_test, y_test)  # R^2 on the hold-out split

# Result

**As we guessed from the diagrams, the CSV files are not suitable for training, and we should use the images themselves.**