# Imports

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random as rand
import math
import time
import glob
import pickle
import scipy
import cv2
import re
import plotly.express as px
import collections
import seaborn as sns
import pydot
import graphviz
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import RobustScaler, Normalizer
from tensorflow.keras import Sequential
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import Dense, Input, Dropout, Conv2D, MaxPooling2D, Flatten, concatenate, add
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model

!pip install keras_tuner
import keras_tuner as kt

# Loading Images

In [None]:
# Input locations of the competition images and their metadata CSV files.
train_path = '../input/petfinder-pawpularity-score/train/*.jpg'
test_path = '../input/petfinder-pawpularity-score/test/*.jpg'
train_csv_path = '../input/petfinder-pawpularity-score/train.csv'
test_csv_path = '../input/petfinder-pawpularity-score/test.csv'

# glob.glob returns paths in arbitrary (filesystem-dependent) order. Sorting
# pins the sample order, so the seeded train/test split further down yields
# the same partition on every machine and run.
train_images = sorted(glob.glob(train_path))
test_images = sorted(glob.glob(test_path))

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)
df_train

# Data Exploration

In [None]:
# Count how often every distinct combination of metadata flags occurs.
# `reset_index(name='Count')` names the count column explicitly: depending on
# the pandas version the unnamed column is called 0 or 'count', so renaming
# column 0 is not reliable.
df_examine = df_train.drop(['Id', 'Pawpularity'], axis=1).value_counts()
df_examine = df_examine.reset_index(name='Count')

# Names of the metadata flag columns (everything except the count).
columns = [col for col in df_examine.columns if col != 'Count']

# Mean Pawpularity of the training rows that share each flag combination,
# computed in one grouped pass. The left merge keeps df_examine's row order.
combo_means = (
    df_train.groupby(columns)['Pawpularity']
    .mean()
    .rename('k_values')
    .reset_index()
)
df_examine = df_examine.merge(combo_means, on=columns, how='left')
k_values = df_examine['k_values'].tolist()

# Correlate every flag (and the count) with the mean score; compute it once
# and reuse it for both the CSV export and the column selection.
corr_matrix = df_examine.corr()
corr_matrix.to_csv('./correlation.csv')
corr = corr_matrix['k_values']

# Select the columns whose correlation with the mean Pawpularity lies within
# +/-0.05, i.e. columns showing almost no linear relationship with the score.
mask = corr.between(-0.05, 0.05)
# 'Count' is a helper column, not a metadata flag, so it is never a removal
# candidate; filtering (instead of list.remove) also works when 'Count' is
# not inside the band.
columns_to_remove = [col for col in corr[mask].index if col != 'Count']
columns_to_remove

In [None]:
# Side length (in pixels) of the square images; passed to Preprocess and used
# for the model's image Input shape and for the output file names.
sq_shape = 128

class Preprocess:
    """Turn image files and their metadata rows into model-ready arrays.

    Each image path is matched to the row of ``df`` whose ``Id`` equals the
    file's base name (without extension). Images are loaded as greyscale,
    resized to ``sq_shape`` x ``sq_shape`` and divided by 255.
    """

    def __init__(self, df, images, columns_to_remove=columns_to_remove, sq_shape=128, is_test=0):
        """Store the inputs; no image is read until ``compile`` is called.

        Args:
            df: DataFrame with an ``Id`` column, the metadata columns and,
                for training data, a ``Pawpularity`` column.
            images: Iterable of image file paths.
            columns_to_remove: Metadata columns dropped from ``df``.
            sq_shape: Side length in pixels of the resized square image.
            is_test: Truthy when ``df`` has no ``Pawpularity`` target.
        """
        self.df = df.drop(columns=columns_to_remove)
        self.images = images
        self.sq_shape = sq_shape
        self.is_test = is_test
        self.X = None
        self.y = None
        self.X_meta = None

    def __process_image(self, image):
        """Load ``image`` as greyscale, resize it to a square and scale by 1/255.

        Raises:
            OSError: If OpenCV cannot read the file.
        """
        im = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
        if im is None:
            # cv2.imread signals failure by returning None; fail here with the
            # offending path instead of an opaque error inside cv2.resize.
            raise OSError(f'Could not read image file: {image}')
        im = cv2.resize(im, (self.sq_shape, self.sq_shape))
        return im / 255

    def compile(self):
        """Load every image and collect its metadata (and target, if any).

        Returns:
            Tuple ``(X, X_meta, y)`` of numpy arrays. ``y`` is left as ``None``
            when ``is_test`` is truthy.

        Raises:
            IndexError: If an image has no row with a matching ``Id``.
            OSError: If an image file cannot be read.
        """
        import os  # local import: `os` is not among the notebook-level imports

        # Map each Id to the position of its first row once, so the lookup per
        # image is O(1) instead of a boolean mask over the whole frame.
        row_of = {}
        for position, image_id in enumerate(self.df['Id']):
            row_of.setdefault(image_id, position)

        if self.is_test:
            meta_values = self.df.drop(columns=['Id']).values
            scores = None
        else:
            meta_values = self.df.drop(columns=['Pawpularity', 'Id']).values
            scores = self.df['Pawpularity'].values

        self.X = []
        self.X_meta = []
        if not self.is_test:
            self.y = []

        for image_number, image in enumerate(self.images):
            # Base name without extension; portable across path separators.
            image_name = os.path.splitext(os.path.basename(image))[0]
            if image_name not in row_of:
                raise IndexError(f'No metadata row with Id {image_name!r} for image {image}')
            position = row_of[image_name]

            self.X.append(self.__process_image(image))
            self.X_meta.append(meta_values[position])
            if not self.is_test:
                self.y.append(scores[position])

            # Live progress display in the notebook output area.
            clear_output(wait=True)
            print(f'{round(image_number/len(self.images)*100,4)}%, {image_name}')

        self.X = np.asarray(self.X)
        self.X_meta = np.asarray(self.X_meta)
        if not self.is_test:
            self.y = np.asarray(self.y)

        if self.is_test:
            print("Processed Testing Data")
        else:
            print("Processed Training Data")
        return self.X, self.X_meta, self.y
        
# Build the training arrays: images, metadata rows and Pawpularity targets.
pp_train = Preprocess(
    df=df_train,
    images=train_images,
    columns_to_remove=columns_to_remove,
    sq_shape=sq_shape,
    is_test=0,
)
X, X_meta, y = pp_train.compile()

# Train Test Split

In [None]:
# Split images, metadata and targets in ONE call: train_test_split applies the
# same row indices to every array it receives, so the three stay aligned by
# construction rather than by relying on two calls sharing a seed.
(X_train, X_test,
 X_meta_train, X_meta_test,
 y_train, y_test) = train_test_split(X, X_meta, y, test_size=0.2, random_state=42)

# The metadata rows share the targets of their images; these names are kept
# for the checks below.
y_meta_train, y_meta_test = y_train, y_test

print('Image train',X_train.shape, y_train.shape)
print('Image test',X_test.shape, y_test.shape)

print('Meta train', X_meta_train.shape, y_meta_train.shape)
print('Meta test', X_meta_test.shape, y_meta_test.shape)

# Sanity check: both target views must list the same scores.
print(y_train[:10],y_meta_train[:10])


# Define Model

In [None]:
def model_builder(hp):
    """Build and compile the two-input (image + metadata) regression model.

    The image arm is a stack of Conv2D layers grouped as 1, 2 and 3
    convolutions, each group followed by max-pooling and dropout. The
    metadata arm is a stack of Dense layers. Both arms are concatenated and
    fed through two Dense layers to a single linear output.

    Reads the module-level ``sq_shape`` (image side length) and
    ``X_meta_train`` (for the number of metadata features).

    Args:
        hp: Hyperparameter container; ``hp.Int`` supplies every tunable layer
            width ('C_1'..'C_6' and 'N_Dense_1'..'N_Dense_6').

    Returns:
        The compiled Keras ``Model``. As a side effect the model summary is
        printed and an architecture diagram is written to the working
        directory.
    """
    min_node = 8
    max_node = 1000
    step_size = 20

    def tuned_width(name):
        """Return the tunable integer width registered under ``name``."""
        return hp.Int(name, min_value=min_node, max_value=max_node, step=step_size)

    # Image arm: conv groups in the order C_1 | C_2, C_3 | C_4, C_5, C_6.
    image_input = Input(shape=(sq_shape, sq_shape, 1))
    x = image_input
    for conv_group in (('C_1',), ('C_2', 'C_3'), ('C_4', 'C_5', 'C_6')):
        for hp_name in conv_group:
            x = Conv2D(tuned_width(hp_name), kernel_size=4, activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)
        x = Dropout(0.1)(x)
    image_arm = Flatten()(x)

    # Metadata arm. The shape must be a tuple: `(n)` is just the int `n`,
    # which not every Keras version accepts, hence `(n_meta,)`.
    n_meta = X_meta_train.shape[1]
    meta_input = Input(shape=(n_meta,))
    m = meta_input
    for hp_name in ('N_Dense_1', 'N_Dense_2', 'N_Dense_3', 'N_Dense_4'):
        m = Dense(tuned_width(hp_name), activation='relu')(m)
    meta_arm = Dense(10, activation='relu')(m)

    # Merge both arms and regress a single score.
    merge = concatenate([image_arm, meta_arm])
    z = merge
    for hp_name in ('N_Dense_5', 'N_Dense_6'):
        z = Dense(tuned_width(hp_name), activation='relu')(z)
    output = Dense(1, activation='linear')(z)

    model = Model(inputs=[image_input, meta_input], outputs=output)
    model.compile(loss='mse', optimizer='adam', metrics=['mean_absolute_percentage_error'])

    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    plot_model(model, to_file=f'./{sq_shape}x{sq_shape}_{n_meta}_paw_model_design.png')
    return model

# Rank trials by the VALIDATION metric. The search is given validation data,
# and selecting on the training metric would simply reward the models that
# overfit the most. The direction is stated explicitly because the tuner
# cannot always infer it from a custom objective name.
tuner = kt.Hyperband(model_builder,
                     objective=kt.Objective('val_mean_absolute_percentage_error', direction='min'),
                     max_epochs=10,
                     factor=3,
                     directory='./',
                     project_name='Paw_model_final')
print("Model Established")

# Fit Model

In [None]:
# Early stopping on the validation metric: stopping on the training metric
# would never react to overfitting. `mode='min'` is explicit because the
# metric is an error and automatic direction inference varies between Keras
# versions.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_mean_absolute_percentage_error",
    mode="min",
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# NOTE(review): batch_size=1 makes every epoch very slow and the gradient
# noisy; confirm this is intended before a long search.
tuner.search(
    [X_train, X_meta_train], y_train,
    validation_data=([X_test, X_meta_test], y_test),
    batch_size = 1,
    shuffle=True,
    callbacks=[callback],
    verbose=1
    )

# Pick Best Model

In [None]:
# Ask the tuner for its single best model and keep it as `model`.
best_models = tuner.get_best_models(num_models=1)
model = best_models[0]
print("Model Loaded")

# Evaluate Model

In [None]:
# Predict on the held-out split, then plot the predicted and the true score
# distributions one after the other for a visual comparison.
y_pred = model.predict([X_test, X_meta_test])
for scores in (y_pred, y_test):
    sns.displot(scores)

# Raw values followed by the range of the predictions.
print(y_pred, y_test)
highest_pred = y_pred.max()
lowest_pred = y_pred.min()
print(highest_pred, lowest_pred)

# Optional: persist the predictions for later cells.
#with open('./y_pred', 'wb') as pickle_file:
#    pickle.dump(y_pred, pickle_file)

# Example Test [WIP Below]

In [None]:
def model_adjusted(prediction, y_pred):
    """Min-max rescale ``prediction`` to 0-100 relative to ``y_pred``.

    The smallest value in ``y_pred`` maps to 0 and the largest to 100.

    Args:
        prediction: Scalar or array of predictions to rescale.
        y_pred: Non-empty array-like of reference predictions that defines
            the minimum and the range.

    Returns:
        ``(prediction - min(y_pred)) / (max(y_pred) - min(y_pred)) * 100``.
        When every reference value is identical the range is treated as 1,
        so the result is finite instead of NaN/inf.

    Raises:
        ValueError: If ``y_pred`` is empty.
    """
    reference = np.asarray(y_pred)
    if reference.size == 0:
        raise ValueError("y_pred must contain at least one prediction")

    min_val = reference.min()
    value_range = reference.max() - min_val
    if value_range == 0:
        # Degenerate case (all reference predictions equal): avoid 0/0.
        value_range = 1
    return ((prediction - min_val) / value_range) * 100

# Save Model

In [None]:
# Name the file after the image size and the NUMBER of metadata features,
# matching the architecture diagram written by model_builder. The global
# `columns` is the list of column names, which would embed a list repr
# (brackets, quotes, spaces) in the file name.
model_file = f'{sq_shape}x{sq_shape}_{X_meta_train.shape[1]}_paw_model.h5'
model.save(f'./{model_file}')
print(f"Model Saved as: {model_file}")

# Test Training Data

In [None]:
# Locations of a previously saved model and of pickled predictions.
# NOTE(review): neither file name matches what the cells above write by
# default - confirm both files exist before running this cell.
model_path = './64x64_greyscaled_paw.h5'
pred_path = './y_pred'

pre_trained_model = tf.keras.models.load_model(model_path)

# NOTE: unpickling can execute arbitrary code; only load files you created.
with open(pred_path, mode='rb') as stored_predictions:
    y_pred = pickle.load(stored_predictions)
y_pred

# WIP: build the submission file.
# df_final = pd.DataFrame(zip(Name, outputs)).rename(columns={0:'Id',1:'Pawpularity'}).set_index('Id')
# df_final.to_csv('submission.csv')
# df_final
# df_final