# <div style="text-align:center; border: 2px solid #FFA500; border-radius: 25px"><span style="color:purple">Pawpularity Prediction</span></div>
In this competition, our goal is to predict the engagement with a pet's profile based on the appearance of that profile: for example, what kinds of pictures are likely to be attractive to someone (including the pet's name, using props in the picture, using multiple pictures, using accessories, etc.).

We are provided with the following data, including metadata (data about data):
- Image data
- Tabular data


*We want to predict the **Pawpularity score**. We can expect pets with attractive photos to generate more interest and be adopted faster.*
 

# Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth',None)

Loading the `csv` files:

In [None]:
# Read the three competition tables in one pass: training rows,
# test rows, and the submission template.
train, test, sample_submission = map(
    pd.read_csv,
    (
        '../input/petfinder-pawpularity-score/train.csv',
        '../input/petfinder-pawpularity-score/test.csv',
        '../input/petfinder-pawpularity-score/sample_submission.csv',
    ),
)

In [None]:
# Preview the first two rows of the training table.
train.head(2)

In [None]:
# Preview the first two rows of the test table.
test.head(2)

In [None]:
# Preview the first two rows of the submission template.
sample_submission.head(2)

In [None]:
# (rows, columns) of each loaded table, printed on one line.
print(*(frame.shape for frame in (train, test, sample_submission)))

# Exploratory Data Analysis

**The EDA Kernel** can be found [here](https://www.kaggle.com/esratmaria/petfinder-pawpularity-prediction-eda)

# Model Creation

[Reference Kernel](https://github.com/databatman/kaggle-petfinder-competition/blob/master/stacking-lgb-xgb-mlp-bug-fixed-again.ipynb)

In [None]:
# Booster hyper-parameters handed to `xgb.train`.
xgb_params = {
    'eval_metric': 'rmse',        # metric reported on the watchlist and used for early stopping
    'seed': 1337,                 # fixed seed so row/column subsampling is repeatable
    'eta': 0.0123,                # learning rate
    'subsample': 0.8,             # fraction of rows sampled per tree
    'colsample_bytree': 0.85,     # fraction of columns sampled per tree
    # `silent` is deprecated in favour of `verbosity` and is no longer recognised
    # by current XGBoost releases; `verbosity: 0` keeps the library quiet.
    'verbosity': 0,
}


In [None]:
# Cross-validation configuration shared by the training code below:
# a shuffled, stratified splitter with a fixed seed so the folds are reproducible.
N_FOLDS = 4
FOLDS = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

In [None]:
import xgboost as xgb
def run_xgb(params, X_train, X_test):
    """Train one XGBoost model per CV fold and collect out-of-fold predictions.

    The folds come from the module-level ``FOLDS`` splitter, stratified on the
    ``'Pawpularity'`` target. Only numeric/boolean columns are used as features.

    Parameters
    ----------
    params : dict
        Booster parameters forwarded unchanged to ``xgb.train``.
    X_train : pandas.DataFrame
        Training frame. Must contain the target column ``'Pawpularity'``.
    X_test : pandas.DataFrame
        Test frame whose numeric columns match the training features.

    Returns
    -------
    model : xgboost.Booster
        The booster trained on the last fold.
    oof_train : numpy.ndarray
        1-D array with one out-of-fold prediction per training row.
    oof_test : numpy.ndarray
        2-D array of shape ``(len(X_test), n_folds)``, one column per fold.

    Raises
    ------
    KeyError
        If ``X_train`` has no ``'Pawpularity'`` column.
    """
    target_col = 'Pawpularity'
    verbose_eval = 1000
    num_rounds = 60000
    early_stop = 500

    def numeric_features(frame):
        # Public-API equivalent of the private ``DataFrame._get_numeric_data``.
        # The target is dropped so it can never leak into the feature matrix.
        return (
            frame.drop(columns=[target_col], errors='ignore')
                 .select_dtypes(include=['number', 'bool'])
        )

    y_all = X_train[target_col].values
    train_features = numeric_features(X_train)
    test_features = numeric_features(X_test)
    train_feature_names = list(train_features.columns)

    # Ask the splitter for its fold count so the output buffer always matches it.
    n_splits = FOLDS.get_n_splits()
    oof_train = np.zeros(X_train.shape[0])
    oof_test = np.zeros((X_test.shape[0], n_splits))

    # The test matrix does not depend on the fold, so build it once.
    d_test = xgb.DMatrix(test_features, feature_names=list(test_features.columns))

    for fold, (train_idx, valid_idx) in enumerate(FOLDS.split(train_features, y_all)):
        d_train = xgb.DMatrix(
            data=train_features.iloc[train_idx],
            label=y_all[train_idx],
            feature_names=train_feature_names,
            enable_categorical=True,
        )
        d_valid = xgb.DMatrix(
            data=train_features.iloc[valid_idx],
            label=y_all[valid_idx],
            feature_names=train_feature_names,
            enable_categorical=True,
        )

        # The last watchlist entry ('valid') is the one early stopping monitors.
        model = xgb.train(
            params=params,
            dtrain=d_train,
            num_boost_round=num_rounds,
            evals=[(d_train, 'train'), (d_valid, 'valid')],
            early_stopping_rounds=early_stop,
            verbose_eval=verbose_eval,
        )

        # `ntree_limit` / `best_ntree_limit` were removed from XGBoost;
        # `iteration_range` is the supported way to predict with the best round.
        best_rounds = (0, model.best_iteration + 1)
        oof_train[valid_idx] = model.predict(d_valid, iteration_range=best_rounds)
        oof_test[:, fold] = model.predict(d_test, iteration_range=best_rounds)

    return model, oof_train, oof_test

# Run the cross-validated training; keeps the last-fold booster plus the
# out-of-fold train predictions and the per-fold test predictions.
model, oof_train, oof_test = run_xgb(params=xgb_params, X_train=train, X_test=test)


# Prediction

In [None]:
# Training predictions are the out-of-fold values as-is; the test prediction
# for each row is the mean across the per-fold columns.
train_pred = oof_train
test_pred = oof_test.mean(axis=1)
(train_pred.shape, test_pred.shape)

In [None]:
# Show the fold-averaged test predictions.
test_pred

# Submission

In [None]:
# Take the Ids from `test`, the frame the predictions were computed from, so every
# score is paired with the row it was predicted for, without relying on
# `sample_submission` happening to share the same row order.
submission = pd.DataFrame({'Id': test['Id'], 'Pawpularity': test_pred})
# Show only the first rows; the full frame is written to disk in the next cell.
submission.head()

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submission.to_csv("submission.csv", index=False)