# Import essentials

In [None]:
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
import pandas as pd
from abc import ABC, abstractmethod

from typing import Optional, Callable
import numpy as np

# Pipeline

## Import encoded data pipeline

In [None]:
class Pipeline(ABC):
    """Common base class for the pipelines defined in this notebook.

    It only carries the shared location of the competition input files.
    """

    # Directory prefix (trailing slash included) that subclasses prepend to
    # file names when reading competition data.
    input_path: str = "../input/petfinder-pawpularity-score/"


class ImagePipeline(Pipeline):
    """Feed the images listed in a competition CSV through a Keras image generator.

    After construction:

    * ``df`` is a column-less DataFrame whose index holds ``"<Id>.jpg"`` file
      names in the order they appear in the CSV.
    * ``X`` is the iterator returned by ``flow_from_dataframe`` over those
      files. It is created with ``shuffle=False`` and ``class_mode=None``, so
      it yields image batches only, in ``df.index`` order.
    """

    # NOTE(review): flow_from_dataframe may return a DataFrameIterator rather
    # than a DirectoryIterator — confirm before relying on this annotation.
    X: tf.keras.preprocessing.image.DirectoryIterator

    def get_dataframe(self, csv: str):
        """Read the ``Id`` column of ``csv`` and store the file names in ``self.df``.

        Args:
            csv: CSV file name, appended to ``input_path``.
        """
        frame = pd.read_csv(self.input_path + csv)[['Id']]
        # The index becomes "<Id>.jpg" and keeps the name "Id", so that
        # reset_index() later yields an "Id" column of image file names.
        frame.index = frame['Id'].astype(str) + ".jpg"
        self.df = frame.drop(columns="Id")

    def __init__(
        self,
        csv: str,
        datagen: Optional[ImageDataGenerator]=None,
        seed: Optional[int]=1234
    ):
        """Build the file-name frame and the image iterator.

        Args:
            csv: CSV file name under ``input_path``; the image directory is the
                same name with ``".csv"`` removed.
            datagen: Generator used to load the images. When ``None`` a
                generator that only rescales pixels by ``1/255`` is created.
            seed: Seed forwarded to ``flow_from_dataframe``.
        """
        # Previously this argument was ignored and a new generator was always
        # created; honour it so callers can supply their own preprocessing.
        if datagen is None:
            datagen = ImageDataGenerator(rescale=1./255)
        self.datagen = datagen

        self.get_dataframe(csv)
        self.X = self.datagen.flow_from_dataframe(
            dataframe=self.df.reset_index(),
            directory=self.input_path + csv.replace(".csv", ""),
            x_col="Id",
            class_mode=None,
            target_size=(224, 224),
            batch_size=32,
            # Keep CSV order so predictions can be matched back to df.index.
            shuffle=False,
            seed=seed
        )

class POCEncoder:
    """Feature extractor cut out of the saved proof-of-concept Keras model.

    The saved model is loaded from disk, its summary is printed, and a
    sub-model ending at the layer named ``"encode"`` is kept for prediction.
    """

    def __init__(self):
        full_model = tf.keras.models.load_model("../input/01-keras-images-model-poc/poc.h5")
        full_model.summary()
        encode_output = full_model.get_layer("encode").output
        self.poc_extract = tf.keras.models.Model(full_model.input, encode_output)

    def predict(self, X):
        """Return the ``"encode"`` layer activations for ``X``."""
        encoded = self.poc_extract.predict(X)
        return encoded
    

class ImportPipeline(ABC):
    """Interface for objects that load a feature table via ``import_data``."""

    @abstractmethod
    def import_data(self):
        """Load the data; concrete subclasses decide what is read and stored."""
    
    
class ImportPOCPipeline(ImportPipeline):
    """Provide POC-encoder features for both the train and the test images.

    Train features are read from a pre-computed CSV; test features are
    computed here by running the test images through ``POCEncoder``.
    """

    def __init__(self):
        self.poc_encoder = POCEncoder()

    def import_data(self):
        """Populate ``train_encode`` and ``test_encode``, both indexed by ``Id``.

        ``test_encode`` is restricted to, and ordered like, the columns of
        ``train_encode`` so that both frames expose the same features.
        """
        # Train side: encodings were computed ahead of time and saved to CSV.
        self.train_encode = pd.read_csv("../input/02-encode/train_poc_encoded.csv")
        self.train_encode = self.train_encode.set_index("Id")
        self.selected_columns = self.train_encode.columns

        # Test side: encode the images now.
        self.test_image_pipeline = ImagePipeline("test.csv")
        self.test_image_encoded = self.poc_encoder.predict(self.test_image_pipeline.X)

        # One row per image, labelled with the "<Id>.jpg" names of the pipeline.
        self.test_encode = pd.DataFrame(self.test_image_encoded)
        self.test_encode.index = self.test_image_pipeline.df.index
        # Integer column positions become "poc_0", "poc_1", ...
        self.test_encode.columns = "poc_" + self.test_encode.columns.astype(str)

        self.test_encode = pd.concat([self.test_image_pipeline.df, self.test_encode], axis=1)
        # regex=False: ".jpg" must be removed as literal text. With regex
        # semantics the "." would match any character, and older pandas
        # versions default to regex=True (and warn about it).
        self.test_encode.index = self.test_encode.index.str.replace(".jpg", "", regex=False)
        self.test_encode = self.test_encode[self.selected_columns]

## Main pipeline

In [None]:
class MainPipeline(Pipeline):
    """Base for the train/test pipelines; holds one shared POC feature import.

    NOTE: the two statements below run once, when this class body is executed,
    so the encoder is built and ``import_data()`` is called at class-definition
    time. Subclasses read the result through ``self.mergepocpipeline``.
    """
    mergepocpipeline = ImportPOCPipeline()
    mergepocpipeline.import_data()

class TrainPipeline(MainPipeline):
    """Assemble the training feature matrix ``X`` and target vector ``y``."""

    def __init__(self, filepath, target):
        """Read ``filepath`` into ``self.df`` (indexed by ``Id``) and remember ``target``."""
        self.target = target
        self.df = pd.read_csv(filepath).set_index("Id")

    def run(self):
        """Split off the target column and append the pre-computed train encodings."""
        self.y = self.df[self.target]
        tabular_features = self.df.drop(columns=self.target)
        # Column-wise concat aligns the CSV columns and the encodings on the index.
        self.X = pd.concat(
            [tabular_features, self.mergepocpipeline.train_encode],
            axis=1,
        )

class TestPipeline(MainPipeline):
    """Assemble the test feature matrix ``X``."""

    def __init__(self, filepath):
        """Read ``filepath`` into ``self.df``, indexed by ``Id``."""
        self.df = pd.read_csv(filepath).set_index("Id")

    def run(self):
        """Append the freshly computed test encodings to the CSV columns."""
        frames = [self.df, self.mergepocpipeline.test_encode]
        # Column-wise concat aligns both frames on the index.
        self.X = pd.concat(frames, axis=1)
        
        
# Build the training features/target first, then the test features.
train = TrainPipeline(
    filepath="../input/petfinder-pawpularity-score/train.csv",
    target='Pawpularity',
)
train.run()

test = TestPipeline(filepath="../input/petfinder-pawpularity-score/test.csv")
test.run()

# Modeling

In [None]:
# Pairwise correlation of every feature with the target; the last line shows
# the target's column of that matrix.
features_and_target = pd.concat([train.X, train.y], axis=1)
corr = features_and_target.corr()
corr['Pawpularity']

In [None]:
import os
from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold
import pickle
import json
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error



def rmse(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5


def fit_catboost(X_train, y_train, X_test, y_test):
    """Fit a GPU CatBoost regressor on one training fold.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_test: Validation features. Accepted because the cross-validation
            loop calls every ``fit_*`` function with the same four arguments;
            not used for fitting.
        y_test: Validation target. Unused, for the same reason.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    reg = CatBoostRegressor(
        iterations=500,
        grow_policy='Lossguide',
        loss_function='RMSE',
        verbose=False,
        l2_leaf_reg=100.5,
        task_type='GPU',
    )
    # The validation fold was never handed to fit(): an eval Pool used to be
    # built here and then discarded. That dead work is removed. Passing an
    # eval_set would be a modelling change (CatBoost may then keep only the
    # best iteration), so it is deliberately not introduced here.
    reg.fit(X_train, y_train)
    return reg


# Cross-validated training.
# For every fold and every fit function: load the fold's checkpoint if one
# exists, otherwise train; score on the train and validation parts; store the
# model and its scores in `fit_results[model_name]`.
fit_models = [fit_catboost]
kf = KFold(n_splits=5, shuffle=True, random_state=1234)
# Alternative splitter:
# kf = RepeatedKFold(n_splits=5, n_repeats=30, random_state=1234)

# model name -> list with one {'model', 'rmse_train', 'rmse_test'} dict per fold
fit_results = {}
for k, (train_index, test_index) in enumerate(kf.split(train.X)):
    print(f"K Fold: {k + 1}")
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = train.X.iloc[train_index], train.X.iloc[test_index]
    y_train, y_test = train.y.iloc[train_index], train.y.iloc[test_index]

    for fit_model in fit_models:
        # "fit_catboost" -> "catboost"
        model_name = '_'.join(fit_model.__name__.split('fit_')[1:])
        model_result_path = f"model_results/{model_name}"
        model_checkpoint = f"{model_result_path}/fold_{k+1}.pickle"

        os.makedirs(model_result_path, exist_ok=True)
        fit_results.setdefault(model_name, [])

        loaded_from_checkpoint = os.path.isfile(model_checkpoint)
        if loaded_from_checkpoint:
            print("Found trained model")
            # NOTE: unpickling runs arbitrary code; only load checkpoints that
            # this notebook wrote itself. A checkpoint is reused as-is, even if
            # the data or hyper-parameters changed since it was saved.
            with open(model_checkpoint, 'rb') as f:
                model = pickle.load(f)
        else:
            model = fit_model(X_train, y_train, X_test, y_test)

        train_pred = model.predict(X_train)
        rmse_train = rmse(y_train, train_pred)

        test_pred = model.predict(X_test)
        rmse_test = rmse(y_test, test_pred)

        print(f"rmse train: {rmse_train}")
        print(f"rmse test: {rmse_test}")

        # Optional extra hold-out set: scored only when both X_oof and y_oof
        # have been defined as notebook globals. Checking explicitly avoids
        # using NameError as control flow, which could also hide a genuine
        # NameError raised inside predict()/rmse().
        if "X_oof" in globals() and "y_oof" in globals():
            oof_pred = model.predict(X_oof)
            rmse_oof = rmse(y_oof, oof_pred)
            print(f"rmse OOF: {rmse_oof}")

        # Saving stays best-effort, but a failure is now reported instead of
        # being swallowed. A model that was just loaded is not written back.
        if not loaded_from_checkpoint:
            try:
                with open(model_checkpoint, "wb") as f:
                    pickle.dump(model, f)
            except Exception as exc:
                print(f"Could not save checkpoint {model_checkpoint}: {exc!r}")

        fit_results[model_name].append({
            'model': model,
            'rmse_train': rmse_train,
            'rmse_test': rmse_test
        })

In [None]:
# Mean validation RMSE over the folds recorded for `model_name`.
fold_rmse_test = [fold['rmse_test'] for fold in fit_results[model_name]]
np.mean(fold_rmse_test)

In [None]:
# Spread (standard deviation) of the validation RMSE across the folds.
fold_rmse_test = [fold['rmse_test'] for fold in fit_results[model_name]]
np.std(fold_rmse_test)

In [None]:
# Ensemble: predict the test set with every fold's model, then average the
# predictions element-wise across models.
fold_predictions = [fold['model'].predict(test.X) for fold in fit_results[model_name]]
sub = np.mean(fold_predictions, axis=0)

In [None]:
# Wrap the averaged predictions in a DataFrame so they can be labelled and saved.
sub = pd.DataFrame(data=sub)

In [None]:
# Align rows with the test ids, name the prediction column, and write the file.
sub.index = test.X.index
sub.columns = ['Pawpularity']
sub.to_csv("submission.csv")