## ***About this Competition***

### ***In this competition, your task is to predict engagement with a pet's profile based on the photograph for that profile. You are also provided with hand-labelled metadata for each photo. The dataset for this competition therefore comprises both images and tabular data.***


# Import necessary libraries

In [None]:
#mathematical analysis
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import cv2
#preprocessing
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
#Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb


# Read data

In [None]:
# Load the three competition tables: training data, test data and the
# sample submission file.
train_data = pd.read_csv('../input/petfinder-pawpularity-score/train.csv')
test_data = pd.read_csv('../input/petfinder-pawpularity-score/test.csv')
sample = pd.read_csv('../input/petfinder-pawpularity-score/sample_submission.csv')

# Report the (rows, columns) shape of each table on a single line.
frame_shapes = [frame.shape for frame in (train_data, test_data, sample)]
print('Shape of input data :', *frame_shapes)

# Count the missing values in every column of the training table.
missing_per_column = train_data.isnull().sum()
print('Identify the Null values : ', missing_per_column)

# Preview two randomly drawn training rows.
train_data.sample(2)


# KFOLD

In [None]:
# Fold label for every row; -1 marks a row that has not been assigned yet.
train_data['kfold'] = -1

# Shuffled 10-way split with a fixed seed so the assignment is reproducible.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)

# Tag each row with the index of the fold in which it is held out.
for fold, (fit_rows, holdout_rows) in enumerate(kfold.split(X=train_data)):
    train_data.loc[holdout_rows, 'kfold'] = fold

# Show how many rows landed in each of the 10 folds.
fold_sizes = train_data['kfold'].value_counts()
print(fold_sizes)

# Persist the fold-annotated training table for the modelling cells.
train_data.to_csv("trainfold_10.csv", index=False)

# Visualize feature correlations

In [None]:
# Reload the fold-annotated training table written by the K-fold cell.
train = pd.read_csv("./trainfold_10.csv")

# Pairwise correlations between the numeric columns only. Non-numeric
# columns (for example a string identifier) cannot be correlated and make
# DataFrame.corr() raise on recent pandas releases, so they are dropped first.
heat = train_data.select_dtypes(include="number").corr().round(5)

# The matrix is symmetric, so hide the upper triangle (diagonal included)
# with a boolean mask to avoid showing every value twice.
mask = np.triu(np.ones_like(heat, dtype=bool))

# Draw the heatmap.
plt.figure(figsize=(16, 16))
ax = sns.heatmap(heat, annot=False, mask=mask, cmap="RdYlGn",
                 annot_kws={"weight": "bold", "fontsize": 13})
ax.set_title("Feature correlation heatmap", fontsize=17)
# Vertical x tick labels and horizontal y tick labels, anchored on the right.
plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
         rotation_mode="anchor", weight="normal")
plt.setp(ax.get_yticklabels(), weight="normal",
         rotation_mode="anchor", rotation=0, ha="right")
plt.show()

In [None]:
# Display the column labels of the reloaded, fold-annotated training table.
train.columns

# XGBRegressor

In [None]:
# Per-fold test predictions and validation RMSE values for the XGBoost model.
final_predictions = []
score = []

# Every column except the identifier, the target and the fold label is a feature.
useful_features = [c for c in train.columns if c not in ("Id", "Pawpularity", "kfold")]
# All features are routed through the ordinal encoder below.
object_cols = list(useful_features)
test = test_data[useful_features]

# XGBRegressor hyperparameters; identical for every fold, so built once.
xgb_params = {
    'learning_rate': 0.2113303692287,
    'subsample': 0.12703520389320402,
    'colsample_bytree': 0.2566392406542389,
    'max_depth': 2,
    'booster': 'gbtree',
    'reg_lambda': 0.0005172374569093787,
    'reg_alpha': 0.001273145009879541,
    'random_state': 256,
    'n_estimators': 30000,
}

for fold in range(10):
    # Rows labelled with the current fold are held out for validation;
    # all remaining rows are used for fitting.
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    ytrain = xtrain.Pawpularity
    yvalid = xvalid.Pawpularity

    # Explicit copies so the encoded values are written into independent
    # frames instead of column slices (avoids SettingWithCopy warnings).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Ordinal-encode the features. The encoder is fitted on the training
    # part only and reused for validation and test; no scaling is applied.
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): newer XGBoost releases appear to expect
    # early_stopping_rounds on the constructor and a `device` option instead
    # of the gpu_* arguments -- confirm against the installed version.
    model = XGBRegressor(**xgb_params,
                         tree_method='gpu_hist',
                         predictor='gpu_predictor',
                         gpu_id=0)
    model.fit(xtrain, ytrain, early_stopping_rounds=100,
              eval_set=[(xvalid, yvalid)], verbose=False)
    preds_valid = model.predict(xvalid)

    # Predict the test set with this fold's model; folds are averaged later.
    test_pre = model.predict(xtest)
    final_predictions.append(test_pre)

    # Validation RMSE. The square root is taken explicitly because the
    # `squared=False` shortcut is not available in every scikit-learn release.
    rms = np.sqrt(mean_squared_error(yvalid, preds_valid))
    score.append(rms)
    print(f"fold:{fold},rmse:{rms}")

# Mean and standard deviation of the validation RMSE across the folds.
print(np.mean(score), np.std(score))

In [None]:
# Per-fold test predictions and validation RMSE values for the LightGBM model.
final_predictions = []
score = []

# Every column except the identifier, the target and the fold label is a feature.
useful_features = [c for c in train.columns if c not in ("Id", "Pawpularity", "kfold")]
# All features are routed through the ordinal encoder below.
object_cols = list(useful_features)
test = test_data[useful_features]

# LightGBM hyperparameters; identical for every fold, so built once.
# The former 'subsample' and 'colsample_bytree' entries were aliases of
# 'bagging_fraction' and 'feature_fraction', which are set explicitly and
# take priority, so the ignored aliases are dropped. The thread-count key is
# spelled 'n_jobs'; the previous 'njobs' is not a LightGBM parameter.
params_lgb = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    'learning_rate': 0.11635,
    "max_depth": 2,
    "feature_fraction": 0.2256038826485174,
    "bagging_fraction": 0.7705303688019942,
    "min_child_samples": 290,
    "reg_alpha": 14.68267919457715,
    "reg_lambda": 66.156,
    "max_bin": 772,
    "min_data_per_group": 177,
    "bagging_freq": 1,
    "cat_smooth": 96,
    "cat_l2": 17,
    "verbosity": -1,
    'random_state': 42,
    'n_estimators': 5000,
    'n_jobs': 4,
}

for fold in range(10):
    # Rows labelled with the current fold are held out for validation;
    # all remaining rows are used for fitting.
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    ytrain = xtrain.Pawpularity
    yvalid = xvalid.Pawpularity

    # Explicit copies so the encoded values are written into independent
    # frames instead of column slices (avoids SettingWithCopy warnings).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Ordinal-encode the features. The encoder is fitted on the training
    # part only and reused for validation and test; no scaling is applied.
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    lgb_train = lgb.Dataset(xtrain, ytrain)
    lgb_val = lgb.Dataset(xvalid, yvalid)

    # NOTE(review): newer LightGBM releases appear to replace the
    # early_stopping_rounds / verbose_eval keywords with callbacks --
    # confirm against the installed version.
    model = lgb.train(params=params_lgb,
                      train_set=lgb_train,
                      valid_sets=lgb_val,
                      early_stopping_rounds=300,
                      verbose_eval=1000)

    # Predict with the iteration selected by early stopping.
    preds_valid = model.predict(xvalid, num_iteration=model.best_iteration)
    test_pre = model.predict(xtest, num_iteration=model.best_iteration)
    final_predictions.append(test_pre)

    # Validation RMSE. The square root is taken explicitly because the
    # `squared=False` shortcut is not available in every scikit-learn release.
    rms = np.sqrt(mean_squared_error(yvalid, preds_valid))
    score.append(rms)
    print(f"fold:{fold},rmse:{rms}")

# Mean and standard deviation of the validation RMSE across the folds.
print(np.mean(score), np.std(score))

In [None]:
from catboost import CatBoostRegressor

# Per-fold test predictions and validation RMSE values for the CatBoost model.
final_predictions = []
score = []

# Every column except the identifier, the target and the fold label is a feature.
useful_features = [c for c in train.columns if c not in ("Id", "Pawpularity", "kfold")]
# All features are routed through the ordinal encoder below.
object_cols = list(useful_features)
test = test_data[useful_features]

# CatBoostRegressor hyperparameters; identical for every fold, so built once.
catpara = {
    'subsample': 0.95312,
    'learning_rate': 0.0011356,
    "max_depth": 6,
    "min_data_in_leaf": 77,
    'random_state': 42,
    'n_estimators': 8000,
    'rsm': 0.5,
    'l2_leaf_reg': 0.02247766515106271,
}

for fold in range(10):
    # Rows labelled with the current fold are held out for validation;
    # all remaining rows are used for fitting.
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    ytrain = xtrain.Pawpularity
    yvalid = xvalid.Pawpularity

    # Explicit copies so the encoded values are written into independent
    # frames instead of column slices (avoids SettingWithCopy warnings).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Ordinal-encode the features. The encoder is fitted on the training
    # part only and reused for validation and test; no scaling is applied.
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Fit once per fold. A second, identical fit call used to follow this
    # one and simply repeated the whole training run on the same data.
    model = CatBoostRegressor(**catpara)
    model.fit(xtrain, ytrain, early_stopping_rounds=100,
              eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)

    # Predict the test set with this fold's model; folds are averaged later.
    test_pre = model.predict(xtest)
    final_predictions.append(test_pre)

    # Validation RMSE. The square root is taken explicitly because the
    # `squared=False` shortcut is not available in every scikit-learn release.
    rms = np.sqrt(mean_squared_error(yvalid, preds_valid))
    score.append(rms)
    print(f"fold:{fold},rmse:{rms}")

# Mean and standard deviation of the validation RMSE across the folds.
print(np.mean(score), np.std(score))

# Prediction output

In [None]:
# Place the per-fold test predictions side by side (one column per fold) and
# average across the folds to obtain a single value per test row.
# NOTE(review): every model cell resets `final_predictions`, so this averages
# only the folds of whichever model cell ran last -- confirm that is intended.
fold_matrix = np.column_stack(final_predictions)
preds = fold_matrix.mean(axis=1)
print(preds)

# Fill the submission template with the averaged predictions and save it.
sample.Pawpularity = preds
sample.to_csv("submission.csv", index=False)
print("success")