In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tensorflow import keras
import glob
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib 


In [None]:

# Location of the competition's training images; train.csv sits one level up.
base_dir = "../input/petfinder-pawpularity-score/train/"
image_files = glob.glob(f"{base_dir}/*.jpg")  # every *.jpg found in base_dir
image_df = pd.read_csv(f"{base_dir}/../train.csv")
image_df
# (swap the line above for image_files to inspect the matched paths instead)

In [None]:

# Attach the on-disk JPEG path of every training image to its metadata row.
id_path = (base_dir + "/" + image_df["Id"] + ".jpg").tolist()
image_df["id_path"] = id_path

# Test metadata gets the same treatment; its images live in the sibling
# "test" folder next to the training folder.
test_df = pd.read_csv(base_dir + "/../test.csv")
test_df["id_path"] = (base_dir + "/../test/" + test_df["Id"] + ".jpg").tolist()

# Hold out 15% of the labelled rows for validation. The seed is fixed so the
# split is reproducible; cross-validation could replace this later.
train_df, val_df = train_test_split(
    image_df,
    train_size=0.85,
    random_state=1989,
)

# Centre the target so the network starts near the right scale (this avoids
# the "hockey stick" loss curve). Both splits are shifted by the TRAINING
# mean, so no information from the validation rows is used.
ybar_train = train_df["Pawpularity"].values.mean()
y_train = train_df["Pawpularity"].values - ybar_train
y_val = val_df["Pawpularity"].values - ybar_train

# Baseline to beat: always predicting the training average, which is 0 on
# the centred scale, so the RMSE reduces to the root mean square of y_val.
naive_rmse = np.sqrt(np.mean(np.square(y_val)))
print("Naive guess-the-average strategy gives RMSE of ",
      round(naive_rmse, 2),
      " cuteness points",
      sep="")

In [None]:
# Display the training table again; it now includes the id_path column.
image_df

In [None]:
# Distribution of the target over all labelled images.
hist_axes = plt.gca()
hist_axes.hist(image_df["Pawpularity"], bins=100)
hist_axes.set_title("Pawpularity score")

In [None]:
# Convert the centred targets and the image paths to tensors, ready to be
# sliced into tf.data datasets.
y_train_tf, y_val_tf = (
    tf.convert_to_tensor(targets) for targets in (y_train, y_val)
)

train_file_list, val_file_list, test_file_list = (
    tf.convert_to_tensor(frame["id_path"].values)
    for frame in (train_df, val_df, test_df)
)

def parse_image(filename, label=None):
    """Read one image file and decode it to a 3-channel tensor.

    Intended for use with ``tf.data.Dataset.map``.

    Args:
        filename: Scalar string tensor holding the path of the image file.
        label: Optional target passed through unchanged; it stays ``None``
            for unlabelled data.

    Returns:
        A ``(img, label)`` tuple where ``img`` has shape
        ``(height, width, 3)``.
    """
    file = tf.io.read_file(filename) # this will work only with filename as tensor
    # channels=3 forces RGB even when a file is stored as grayscale or with an
    # alpha channel, so every image matches the model's 3-channel input.
    # expand_animations=False guarantees a rank-3 result with a static shape;
    # without it decode_image returns a tensor of unknown shape.
    img = tf.image.decode_image(file, channels=3, expand_animations=False)
    #Can resize here if you don't have latest Keras
    return img, label

#"apply" to tensor dataset
# One input pipeline per split. Images are read and decoded in parallel and
# the next batches are prefetched while the model works on the current one.
# Both options keep the element order, so predictions still line up
# row-for-row with the source dataframes.
#
# batch(1) is kept on purpose: images are only resized inside the model, so
# (presumably) differently sized photos cannot be stacked into larger batches
# here. To train with bigger batches, resize in parse_image first.
train_tfd = tf.data.Dataset.from_tensor_slices((train_file_list,y_train_tf))
train_tfd = (train_tfd
             .map(parse_image, num_parallel_calls=tf.data.AUTOTUNE)
             .batch(1)
             .prefetch(tf.data.AUTOTUNE))

val_tfd = tf.data.Dataset.from_tensor_slices((val_file_list, y_val_tf))
val_tfd = (val_tfd
           .map(parse_image, num_parallel_calls=tf.data.AUTOTUNE)
           .batch(1)
           .prefetch(tf.data.AUTOTUNE))

# The test split has no labels, so parse_image yields (image, None).
test_tfd = tf.data.Dataset.from_tensor_slices(test_file_list)
test_tfd = (test_tfd
            .map(parse_image, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(1)
            .prefetch(tf.data.AUTOTUNE))


In [None]:
#Plot some cute pets
# Show the first nine training images in a 3x3 grid, each titled with its
# original (un-centred) Pawpularity score.
plt.figure(figsize=(10,10))
for i, (image, label) in enumerate(train_tfd.take(9)):
    ax = plt.subplot(3,3,i+1)
    ax.imshow(image[0])  # each batch holds a single image
    # Add the training mean back BEFORE rounding. Truncating the centred
    # label with int() first can show above-average scores one point too low.
    score = round(float(label[0]) + ybar_train)
    ax.set_title(str(score))
    ax.axis("off")

In [None]:
# Preview the first four test images in a 2x2 grid. They have no label, so
# each panel gets a fixed caption.
plt.figure(figsize=(10, 10))
for position, (image, label) in enumerate(test_tfd.take(4), start=1):
    ax = plt.subplot(2, 2, position)
    ax.imshow(image[0])  # each batch holds a single image
    ax.set_title("Placeholder image")
    ax.axis("off")

In [None]:
# Images are resized inside the model, so the input pipeline can feed raw
# photos. resize_dim is the (height, width, channels) the backbone expects.
resize_dim = (224,224,3)

# Xception backbone with ImageNet weights and no classification head.
xception_model = keras.applications.Xception(
    include_top = False,
    weights = "imagenet",
    input_shape = resize_dim)

# The hidden test set is scored without internet access, so for a submission
# run the weights must be loaded from a local file instead:
#xception_model = keras.models.load_model('../input/xception-h5/xception_no_top.h5')

# Keep every backbone weight fixed; only the new regression head is trained.
xception_model.trainable = False

# Baseline model: resize -> rescale -> frozen Xception -> average pool -> linear output.
inputs = keras.Input((None, None, 3))                 # accepts any image height/width
x = keras.layers.Resizing(*resize_dim[:2])(inputs)
x = keras.layers.Rescaling(1/127.5, offset=-1)(x)     # maps 0..255 onto -1..1
x = xception_model(x, training = False)               # backbone called in inference mode
x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(units=1, activation=None)(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
# Regression setup: minimise mean squared error and report RMSE, the same
# quantity as the naive baseline computed earlier.
model.compile(optimizer = keras.optimizers.Adam(learning_rate = 3e-4),
              loss = keras.losses.MeanSquaredError(),
              metrics = [keras.metrics.RootMeanSquaredError(name="rmse")])

epochs = 10
# train_tfd and val_tfd are already-batched tf.data Datasets, so the batch
# size is set by their .batch(...) call. Keras' batch_size argument must not
# be passed for Dataset inputs, so the earlier batch_size=64 is dropped.
history = model.fit(train_tfd, epochs = epochs,
                    validation_data = val_tfd)

In [None]:
# Larger fonts for this and all later figures.
matplotlib.rcParams.update({'font.size': 16})

# Learning curves, with the naive baseline and a leaderboard reference as
# horizontal guide lines.
fig, ax = plt.subplots(figsize=(10, 10))
ax.axhline(y=naive_rmse, color='k', linestyle='dotted', label = "Guess-the-average")
for history_key, curve_label in (("val_rmse", "Val RMSE"), ("rmse", "Train RMSE")):
    ax.plot(history.history[history_key], label = curve_label)
ax.set_xlabel("Epoch")
ax.set_ylabel("RMSE loss")
# Hard-coded reference value, labelled as the top Kaggle score.
ax.axhline(y=17.72594, color='r', linestyle='dotted', label = "Kaggle top score")
ax.legend(frameon=False, loc='center right')

In [None]:
# Predict every split and add the training mean back, returning the
# predictions to the original Pawpularity scale.
yhat_train, yhat_val, yhat_test = (
    ybar_train + model.predict(split_tfd, verbose=1)
    for split_tfd in (train_tfd, val_tfd, test_tfd)
)

In [None]:
# Predicted vs. actual score on the training rows; the red diagonal marks
# perfect predictions.
train_axes = plt.gca()
train_axes.scatter(train_df.Pawpularity, yhat_train)
train_axes.plot([0, 100], [0, 100], 'r')
train_axes.set_xlabel("Actual cuteness")
train_axes.set_ylabel("Predicted cuteness")
train_axes.set_title("Training data")

In [None]:
# Predicted vs. actual score on the held-out validation rows; the red
# diagonal marks perfect predictions.
val_axes = plt.gca()
val_axes.scatter(val_df.Pawpularity, yhat_val)
val_axes.plot([0, 100], [0, 100], 'r')
val_axes.set_xlabel("Actual cuteness")
val_axes.set_ylabel("Predicted cuteness")
val_axes.set_title("Validation data")

In [None]:
# Build the submission table: one row per test image, holding its Id and
# the first (only) column of the test predictions.
submission_df = pd.DataFrame(
    {
        "Id": test_df["Id"].to_numpy(),
        "Pawpularity": yhat_test[:, 0],
    }
)
submission_df.to_csv("submission.csv", index=False)
submission_df