In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.xception import preprocess_input, decode_predictions
# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
import tensorflow as tf

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, VotingRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Evaluation metrics
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_validate

from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *

from tqdm.notebook import tqdm

# Standard imports
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import trange
from colorama import Fore
from glob import glob
import json
from pprint import pprint
import time
import cv2
from enum import Enum
from IPython.display import display



# Image classifier that load_images.make_training_data() calls to label each
# picture. It is loaded from a file of an attached dataset; the commented-out
# lines show the alternative construction (Xception with 'imagenet' weights)
# and the call that would save such a model as "model_name.h5".
# NOTE(review): presumably model_name.h5 is exactly that saved Xception model - confirm.
#model_test = Xception(weights='imagenet')

model_test = tf.keras.models.load_model("/kaggle/input/pet-pawpularity-v2/model_name.h5")

#model_test.save("model_name.h5")

In [None]:
class load_images():
    """
    Build one feature row per training image.

    A row is ``[image id, <META_COLUMNS values from train_csv>, predicted label]``
    where the predicted label is the top-1 class name that the module-level
    ``model_test`` classifier assigns to the image.
    """
    # Images are resized to IMAGE_SIZE x IMAGE_SIZE before being classified.
    IMAGE_SIZE = 299
    IMAGE_LOCATION = "/kaggle/input/petfinder-pawpularity-score/train/"
    # Class-level default only: make_training_data() binds a fresh list on the
    # instance, so rows are never accumulated in this shared object.
    training_data = []
    train_csv = pd.read_csv("/kaggle/input/petfinder-pawpularity-score/train.csv")
    # train_csv columns copied into every row, in output order.
    META_COLUMNS = [
        'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
        'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur', 'Pawpularity',
    ]

    def make_training_data(self):
        """
        Classify every file in IMAGE_LOCATION and store the rows in
        ``self.training_data``.

        Each call starts from an empty list, so re-running the cell does not
        duplicate rows. A file that cannot be processed (unreadable image, no
        matching row in train_csv, ...) is reported together with its name and
        skipped; it never aborts the run.
        """
        # Bind the list up front so partial results survive an interrupted run.
        rows = self.training_data = []
        for f in tqdm(os.listdir(self.IMAGE_LOCATION)):
            image_id = f.replace(".jpg", "")
            try:
                # Look the metadata up first: it is cheap, and a file without
                # metadata is skipped before the expensive model call.
                location = self.train_csv.loc[self.train_csv['Id'] == image_id]
                if location.empty:
                    raise LookupError("no row with this Id in train.csv")
                meta = [location[col].values[0] for col in self.META_COLUMNS]

                path = os.path.join(self.IMAGE_LOCATION, f)
                img_pred = image.load_img(path, target_size=(self.IMAGE_SIZE, self.IMAGE_SIZE))

                # The model expects a batch, hence the added leading axis.
                x = image.img_to_array(img_pred)
                x = np.expand_dims(x, axis=0)
                x = preprocess_input(x)

                preds = model_test.predict(x)

                # decode_predictions yields (class id, class name, score) tuples;
                # keep the name of the best match for the single image in the batch.
                animal_predicted = decode_predictions(preds, top=1)[0][0][1]

                rows.append([image_id, *meta, animal_predicted])
            except Exception as e:
                # Best effort: report which file failed and carry on.
                print(f"{f}: {e}")

In [None]:
# Create the training-image loader and fill loader.training_data with one row
# per image (this runs the classifier on every file in the training folder).
loader = load_images()
loader.make_training_data()

In [None]:
# Turn the collected rows into a named-column training frame.
# The rows are written to CSV and read back, so the positional columns return
# with the string names '0', '1', ... that the rename below maps to real names.
pd.DataFrame(loader.training_data).to_csv("my_data_train.csv")
training_data = (
    pd.read_csv("my_data_train.csv")
    # 'Unnamed: 0' is the saved row index and '0' the image id; neither is kept.
    .drop(columns=['Unnamed: 0', '0'])
    .rename(columns={
        '1': 'Subject_focus',
        '2': 'Eyes',
        '3': 'Face',
        '4': 'Near',
        '5': 'Action',
        '6': 'Accessory',
        '7': 'Group',
        '8': 'Collage',
        '9': 'Human',
        '10': 'Occlusion',
        '11': 'Info',
        '12': 'Blur',
        '13': 'Pawpularity',
        '14': 'Animal_type',
    })
)

In [None]:
def rmse_score(y_label, y_preds):
    """
    Root-mean-squared error between the true labels and the predictions.
    """
    mse = mean_squared_error(y_label, y_preds)
    return np.sqrt(mse)

def trainRegModels(df : pd.DataFrame, features : list, label: str):
    """
    Cross-validate a fixed set of regression models and summarise their scores.

    Every model is fitted once per fold found in ``df["kfold"]``: the rows of
    that fold form the validation set, all other rows the training set. R2 and
    RMSE on both sets are averaged over the folds.

    Parameters
    ----------
    df : frame holding the feature columns, the label column and a ``kfold``
        column that assigns each row to a fold.
    features : names of the columns used as model inputs.
    label : name of the target column.

    Returns
    -------
    DataFrame with one row per model and the columns "Model",
    "Avg R2 Train Score", "Avg R2 Val Score", "Avg RSME Train Score" and
    "Avg RSME Val Score".

    Raises
    ------
    ValueError
        If ``df`` contains fewer than two distinct folds.
    """
    regModels = {
            "LinearRegression": LinearRegression(),
            "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=2),
            "AdaBoostRegressor": AdaBoostRegressor(random_state=0, n_estimators=100),
            "LGBMRegressor": LGBMRegressor(),
            "Ridge": Ridge(alpha=1.0),
            "ElasticNet": ElasticNet(random_state=0),
            "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
            "DecisionTreeRegressor": DecisionTreeRegressor(),
            "ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
            "RandomForestRegressor": RandomForestRegressor(n_jobs=-1),
            "XGBRegressor": XGBRegressor(n_jobs=-1),
            "CatBoostRegressor": CatBoostRegressor(iterations=900, depth=5, learning_rate=0.05, loss_function = 'RMSE'),
        }
    # Collected column-wise and returned as a data frame.
    # The "RSME" spelling is kept on purpose: callers select columns by these keys.
    summary = {
        "Model" : [],
        "Avg R2 Train Score" : [],
        "Avg R2 Val Score" : [],
        "Avg RSME Train Score" : [],
        "Avg RSME Val Score" : []
    }

    # Use the folds that are actually present instead of assuming exactly five.
    folds = sorted(df["kfold"].unique())
    n_folds = len(folds)
    if n_folds < 2:
        raise ValueError("df must contain at least two distinct values in 'kfold'")

    model_names = list(regModels)

    # Training
    for idx in trange(len(model_names), desc = "Models are training...", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position = 0, leave = True):
        name = model_names[idx]
        model = regModels[name]

        # CatBoost logs every iteration unless told otherwise.
        fit_kwargs = {"verbose": False} if name == 'CatBoostRegressor' else {}

        # Score accumulators, summed over the folds and averaged afterwards.
        r2_train = 0; r2_val = 0
        rmse_train = 0; rmse_val = 0

        # Running K-fold cross-validation on every model
        for fold in folds:
            in_val_fold = df["kfold"] == fold
            train_df = df.loc[~in_val_fold].reset_index(drop = True)
            val_df = df.loc[in_val_fold].reset_index(drop = True)

            train_X = train_df[features]; train_Y = train_df[label]
            val_X = val_df[features]; val_Y = val_df[label]

            model.fit(train_X, train_Y, **fit_kwargs)

            Y_train_preds = model.predict(train_X)
            Y_val_preds = model.predict(val_X)

            # Collecting the scores
            r2_train += r2_score(train_Y, Y_train_preds)
            r2_val += r2_score(val_Y, Y_val_preds)

            rmse_train += rmse_score(train_Y, Y_train_preds)
            rmse_val += rmse_score(val_Y, Y_val_preds)

        # Pushing the averaged scores and the model name
        summary["Model"].append(name)
        summary["Avg R2 Train Score"].append(r2_train/n_folds)
        summary["Avg R2 Val Score"].append(r2_val/n_folds)
        summary["Avg RSME Train Score"].append(rmse_train/n_folds)
        summary["Avg RSME Val Score"].append(rmse_val/n_folds)

    # Finally returning the summary dictionary as a dataframe
    summary_df = pd.DataFrame(summary)
    return summary_df

In [None]:
# Feature columns fed to the regression models: the twelve metadata flags
# plus the label predicted by the image classifier (Animal_type).
req_cols = [
    'Subject_focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
       'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur','Animal_type'
]

def create_folds_regression(data, target="target", num_splits = 5, random_state=None):
    """
    Return a shuffled copy of ``data`` with a ``kfold`` column for stratified CV.

    The continuous target is cut into equal-width bins and the folds are
    stratified on those bins, so each fold sees a similar target distribution.

    Parameters
    ----------
    data : DataFrame containing the ``target`` column. It is not modified.
    target : name of the continuous target column.
    num_splits : number of folds; ``kfold`` takes the values 0 .. num_splits - 1.
    random_state : seed forwarded to ``DataFrame.sample`` for a reproducible
        shuffle; the default ``None`` gives a different shuffle on every call.

    Returns
    -------
    DataFrame with the rows of ``data`` in shuffled order, a fresh 0..n-1
    index and the added ``kfold`` column.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("cannot create folds for an empty DataFrame")

    # sample() returns a shuffled copy, so the caller's frame stays untouched.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1

    # Applying Sturges' rule to calculate the no. of bins for target
    num_bins = int(1 + np.log2(len(data)))

    # Bin labels are only needed for stratification, so they stay out of the frame.
    bins = pd.cut(data[target], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits)

    # After reset_index the positional indices from split() equal the row labels.
    for f, (_, v_) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[v_, 'kfold'] = f

    return data

# Assign every training row to one of 5 folds, stratified on binned Pawpularity,
# then show how many rows ended up in each fold.
training_data = create_folds_regression(training_data, target = 'Pawpularity', num_splits = 5)
training_data.kfold.value_counts()

In [None]:
# Replace the predicted class names with the integer codes of their categories.
training_data["Animal_type"] = training_data["Animal_type"].astype('category').cat.codes

In [None]:
# Cross-validate every candidate regressor on the selected features and
# display the per-model average scores.
training_summary = trainRegModels(training_data, req_cols, "Pawpularity")
training_summary

In [None]:
# Rank the models by validation RMSE, best (lowest) first.
training_summary.sort_values(by="Avg RSME Val Score")

In [None]:
# Rebind the global classifier to a newly built Xception network with the
# 'imagenet' weights. Every later call to model_test uses this object instead
# of the model that was loaded from model_name.h5 earlier.
# NOTE(review): weights='imagenet' presumably requires the weights to be
# downloadable in the execution environment - confirm, or reuse the loaded model.
model_test = Xception(weights='imagenet')

import cv2
from tqdm.notebook import tqdm

class load_images_test():
    """
    Build one feature row per test image and save the rows to my_data.csv.

    A row is ``[image id, <META_COLUMNS values from train_csv>, predicted label]``
    where the predicted label is the top-1 class name that the module-level
    ``model_test`` classifier assigns to the image.

    NOTE: a later cell defines another class named load_images_test; once that
    cell has run, the name refers to the later definition.
    """
    # Images are resized to IMAGE_SIZE x IMAGE_SIZE before being classified.
    IMAGE_SIZE = 299
    IMAGE_LOCATION = "/kaggle/input/petfinder-pawpularity-score/test/"
    # Class-level default only: make_training_data() binds a fresh list on the
    # instance, so rows are never accumulated in this shared object.
    training_data = []
    # Despite the name, this is the metadata table of the test images.
    train_csv = pd.read_csv("/kaggle/input/petfinder-pawpularity-score/test.csv")
    # Metadata columns copied into every row, in output order. Pawpularity is
    # not among them: it is the value the notebook predicts for the test images.
    META_COLUMNS = [
        'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
        'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur',
    ]

    def make_training_data(self):
        """
        Classify every file in IMAGE_LOCATION, store the rows in
        ``self.training_data`` and write them to ``my_data.csv``.

        Each call starts from an empty list, so re-running the cell does not
        duplicate rows. A file that cannot be processed is reported together
        with its name and skipped; it never aborts the run.
        """
        # Bind the list up front so partial results survive an interrupted run.
        rows = self.training_data = []
        for f in tqdm(os.listdir(self.IMAGE_LOCATION)):
            image_id = f.replace(".jpg", "")
            try:
                # Look the metadata up first: it is cheap, and a file without
                # metadata is skipped before the expensive model call.
                location = self.train_csv.loc[self.train_csv['Id'] == image_id]
                if location.empty:
                    raise LookupError("no row with this Id in test.csv")
                meta = [location[col].values[0] for col in self.META_COLUMNS]

                path = os.path.join(self.IMAGE_LOCATION, f)
                img_pred = image.load_img(path, target_size=(self.IMAGE_SIZE, self.IMAGE_SIZE))

                # The model expects a batch, hence the added leading axis.
                x = image.img_to_array(img_pred)
                x = np.expand_dims(x, axis=0)
                x = preprocess_input(x)

                preds = model_test.predict(x)

                # Keep the class name of the best match for the single image.
                animal_predicted = decode_predictions(preds, top=1)[0][0][1]

                rows.append([image_id, *meta, animal_predicted])
            except Exception as e:
                # Best effort: report which file failed and carry on.
                print(f"{f}: {e}")
        # A plain list has no to_csv(); wrap the rows in a DataFrame to save them.
        pd.DataFrame(rows).to_csv('my_data.csv',index=False)


In [None]:
# Voting ensemble of CatBoost and gradient boosting, scored with the same
# 5-fold scheme stored in training_data.kfold.
cbr = CatBoostRegressor(random_state=0)
gbr = GradientBoostingRegressor(random_state=0)
VR_model = VotingRegressor(estimators=[('cbr', cbr), ('gbr', gbr)], n_jobs=-1)
model = VR_model

# Running sums of the per-fold scores; divided by the fold count when printed.
r2_train = r2_val = 0
rmse_train = rmse_val = 0

for fold in trange(5, desc = "Models are training...", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position = 0, leave = True):
    # Rows of the current fold validate; every other row trains.
    train_df = training_data[training_data.kfold != fold].reset_index(drop=True)
    val_df = training_data[training_data.kfold == fold].reset_index(drop=True)

    train_X, train_Y = train_df[req_cols], train_df["Pawpularity"]
    val_X, val_Y = val_df[req_cols], val_df["Pawpularity"]

    model.fit(train_X, train_Y)

    Y_train_preds = model.predict(train_X)
    Y_val_preds = model.predict(val_X)

    # Accumulate the scores of this fold
    r2_train += r2_score(train_Y, Y_train_preds)
    r2_val += r2_score(val_Y, Y_val_preds)

    rmse_train += rmse_score(train_Y, Y_train_preds)
    rmse_val += rmse_score(val_Y, Y_val_preds)

print(f"Avg R2 Train Score : {r2_train/5}")
print(f"Avg R2 Val Score : {r2_val/5}")
print(f"Avg RSME Train Score : {rmse_train/5}")
print(f"Avg RSME Val Score : {rmse_val/5}")

In [None]:
class load_images_test():
    """
    Build one feature row per test image.

    A row is ``[image id, <META_COLUMNS values from train_csv>, predicted label]``
    where the predicted label is the top-1 class name that the module-level
    ``model_test`` classifier assigns to the image.
    """
    # Images are resized to IMAGE_SIZE x IMAGE_SIZE before being classified.
    IMAGE_SIZE = 299
    IMAGE_LOCATION = "/kaggle/input/petfinder-pawpularity-score/test/"
    # Class-level default only: make_training_data() binds a fresh list on the
    # instance, so rows are never accumulated in this shared object.
    training_data = []
    # Despite the name, this is the metadata table of the test images.
    train_csv = pd.read_csv("/kaggle/input/petfinder-pawpularity-score/test.csv")
    # Metadata columns copied into every row, in output order.
    META_COLUMNS = [
        'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
        'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur',
    ]

    def make_training_data(self):
        """
        Classify every file in IMAGE_LOCATION and store the rows in
        ``self.training_data``.

        Each call starts from an empty list, so re-running the cell does not
        duplicate rows. A file that cannot be processed (unreadable image, no
        matching row in test.csv, ...) is reported together with its name and
        skipped; it never aborts the run.
        """
        # Bind the list up front so partial results survive an interrupted run.
        rows = self.training_data = []
        for f in tqdm(os.listdir(self.IMAGE_LOCATION)):
            image_id = f.replace(".jpg", "")
            try:
                # Look the metadata up first: it is cheap, and a file without
                # metadata is skipped before the expensive model call.
                location = self.train_csv.loc[self.train_csv['Id'] == image_id]
                if location.empty:
                    raise LookupError("no row with this Id in test.csv")
                meta = [location[col].values[0] for col in self.META_COLUMNS]

                path = os.path.join(self.IMAGE_LOCATION, f)
                img_pred = image.load_img(path, target_size=(self.IMAGE_SIZE, self.IMAGE_SIZE))

                # The model expects a batch, hence the added leading axis.
                x = image.img_to_array(img_pred)
                x = np.expand_dims(x, axis=0)
                x = preprocess_input(x)

                preds = model_test.predict(x)

                # Keep the class name of the best match for the single image.
                animal_predicted = decode_predictions(preds, top=1)[0][0][1]

                rows.append([image_id, *meta, animal_predicted])
            except Exception as e:
                # Best effort: report which file failed and carry on.
                print(f"{f}: {e}")

In [None]:
# Create the test-image loader and fill loader_test.training_data with one row
# per image (this runs the classifier on every file in the test folder).
loader_test = load_images_test()
loader_test.make_training_data()

In [None]:
# Turn the collected test rows into a named-column frame.
# The rows are written to CSV and read back, so the positional columns return
# with the string names '0', '1', ... that the rename below maps to real names.
pd.DataFrame(loader_test.training_data).to_csv("my_data_test.csv")
test_data = (
    pd.read_csv("my_data_test.csv")
    # 'Unnamed: 0' is the saved row index; the image id ('0') is kept as 'Id'.
    .drop(columns=['Unnamed: 0'])
    .rename(columns={
        '0': 'Id',
        '1': 'Subject_focus',
        '2': 'Eyes',
        '3': 'Face',
        '4': 'Near',
        '5': 'Action',
        '6': 'Accessory',
        '7': 'Group',
        '8': 'Collage',
        '9': 'Human',
        '10': 'Occlusion',
        '11': 'Info',
        '12': 'Blur',
        '13': 'Animal_type',
    })
)

In [None]:
# Encode Animal_type with the SAME label -> code mapping as the training data.
# Taking .cat.codes of the test column on its own numbers the test labels
# independently, so one integer would stand for different animals in the
# training and test frames and the fitted models would misread the feature.
# The raw training labels are re-read from my_data_train.csv (column "14"),
# the file the training cell writes before it encodes the column.
# The dtype guard keeps the cell safe to re-run: once the column holds integer
# codes it is left alone instead of being mapped to "unknown".
if not pd.api.types.is_integer_dtype(test_data["Animal_type"]):
    train_animal_categories = (
        pd.read_csv("my_data_train.csv", usecols=["14"])["14"]
        .astype('category')
        .cat.categories
    )
    # Labels that never occurred in training get the code -1.
    test_data["Animal_type"] = pd.Categorical(
        test_data["Animal_type"], categories=train_animal_categories
    ).codes
test_data

In [None]:
# Predict Pawpularity for the test rows and write the submission file.
# NOTE(review): `model` is whatever was fitted last under that global name; in
# the cross-validation cell that is the voting ensemble after its final fold,
# i.e. trained without that fold's rows - consider refitting on all rows.
test_X = test_data.loc[:, req_cols]
model_preds = model.predict(test_X)

# The predictions are stored on test_data itself, next to the features.
test_data["Pawpularity"] = model_preds

submission = test_data.loc[:, ["Id", "Pawpularity"]]
submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission table (Id and predicted Pawpularity) as a final check.
submission