In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from pathlib import Path
from shutil import copyfile
import glob
import os, sys
import pathlib
import PIL

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential


In [None]:
# Shared experiment settings: the seed used for the train/hold-out split and
# the fraction of labelled rows that is held out for evaluation.
random_state, test_size = 42, 0.2

In [None]:
# Load the competition metadata tables; rows are keyed by the image Id.
train = pd.read_csv("../input/petfinder-pawpularity-score/train.csv", index_col="Id")
test = pd.read_csv("../input/petfinder-pawpularity-score/test.csv", index_col="Id")

# The target is whatever column(s) exist in train but not in test.
# Kept as a list in train's column order: a set is unordered and pandas 2.x
# refuses a set as an indexer ("Passing a set as an indexer is not supported").
target_column = [column for column in train.columns if column not in test.columns]

# y stays a DataFrame (not a Series) because later cells select
# y["Pawpularity"] / y_train["Pawpularity"] by column name.
y = train[target_column].copy()
X = train.drop(columns=target_column)

In [None]:
# Hold out `test_size` of the labelled rows for evaluation; the fixed seed
# keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
# Baseline: a regressor that always predicts the mean target of the training split.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)

# RMSE of the baseline on the hold-out split. The square root is taken
# explicitly because the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on both.
mean_squared_error(y_true=y_test, y_pred=dummy_regr.predict(X_test)) ** 0.5

In [None]:
# Sort the training-split images into one folder per 10-point Pawpularity bin
# (pets/seg_1_10 ... pets/seg_91_100) so the image loader can derive class
# labels from the folder names.
# `targets` maps each folder name to the score that will be predicted for
# images classified into that bin.
targets = {}
for i in range(1, 100, 10):
    target = f"seg_{i}_{i+9}"
    target_path = "pets/" + target
    # exist_ok=True makes re-running the cell harmless. If the path exists as a
    # regular file mkdir raises FileExistsError; let it surface instead of
    # silently abandoning the remaining bins.
    Path(target_path).mkdir(parents=True, exist_ok=True)
    selected_bin = y_train[(y_train["Pawpularity"] >= i) & (y_train["Pawpularity"] <= i+9)]
    # Store a plain float: DataFrame.mean() would give a one-element Series,
    # which cannot be used as a replacement value or fed to the metric later.
    # An empty bin has no mean (NaN), so fall back to the bin's midpoint.
    if selected_bin.empty:
        targets[target] = i + 4.5
    else:
        targets[target] = float(selected_bin["Pawpularity"].mean())
    files_index = selected_bin.index
    for data_file in files_index:
        copyfile(f"../input/petfinder-pawpularity-score/train/{data_file}.jpg", f"{target_path}/{data_file}.jpg")


data_dir = pathlib.Path("pets")

# Every training-split row must have ended up in exactly one bin folder.
assert len(list(data_dir.glob('*/*.jpg'))) == len(y_train), "copied image count differs from len(y_train)"
targets

In [None]:
# Visual sanity check: open the first file found in the lowest-score folder.
seg_1_10 = [*data_dir.glob('seg_1_10/*')]
first_image_path = str(seg_1_10[0])
PIL.Image.open(first_image_path)


In [None]:
# Loader settings: images per batch and the square size every image is
# resized to when it is read.
batch_size = 32
img_height = img_width = 180

In [None]:
# "training" portion (80%) of the folder-labelled images. The split fraction
# and seed are the same values the validation loader uses.
train_loader_options = dict(
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
train_ds = tf.keras.utils.image_dataset_from_directory(data_dir, **train_loader_options)

In [None]:
# "validation" portion (20%) of the same folder tree. The split fraction and
# seed are the same values the training loader uses.
val_loader_options = dict(
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
val_ds = tf.keras.utils.image_dataset_from_directory(data_dir, **val_loader_options)

In [None]:
# Class labels as reported by the training dataset. Kept in a variable because
# later cells index it with the predicted class id to recover the label name.
class_names = train_ds.class_names
print(class_names)

In [None]:
import matplotlib.pyplot as plt

# Show the first nine images of one training batch in a 3x3 grid, each
# titled with its class label.
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for idx in range(9):
        plt.subplot(3, 3, idx + 1)
        pixels = images[idx].numpy().astype("uint8")
        plt.imshow(pixels)
        plt.title(class_names[labels[idx]])
        plt.axis("off")

In [None]:
# Print the tensor shapes of a single batch (images, then labels).
for image_batch, labels_batch in train_ds.take(1):
    print(image_batch.shape)
    print(labels_batch.shape)


In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Input pipeline tuning: cache elements, shuffle the training stream with a
# buffer of 1000, and let tf.data choose the prefetch depth.
train_ds = (
    train_ds
    .cache()
    .shuffle(1000)
    .prefetch(buffer_size=AUTOTUNE)
)
val_ds = (
    val_ds
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)


In [None]:
# Stand-alone rescaling layer that multiplies inputs by 1/255.
normalization_layer = layers.Rescaling(scale=1. / 255)


In [None]:
# Apply the rescaling layer to the images of every batch; labels pass through.
normalized_ds = train_ds.map(
    lambda images, labels: (normalization_layer(images), labels)
)
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]
# Notice the pixel values are now in `[0,1]`.
lowest, highest = np.min(first_image), np.max(first_image)
print(lowest, highest)


In [None]:
num_classes = 10

# Three Conv2D + MaxPooling2D stages with 16, 32 and 64 filters.
conv_stack = []
for filters in (16, 32, 64):
    conv_stack.append(layers.Conv2D(filters, 3, padding='same', activation='relu'))
    conv_stack.append(layers.MaxPooling2D())

# Rescale pixels, run the conv stack, then classify with a small dense head.
# The last Dense layer has no activation, so the model outputs raw logits.
model = Sequential([
    layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    *conv_stack,
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),
])


In [None]:
# from_logits=True because the model's final Dense layer has no softmax.
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])


In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Train for a fixed number of epochs, evaluating on val_ds after each one.
# `history` keeps the per-epoch metrics that the next cell plots.
epochs = 10
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)


In [None]:
# Per-epoch curves recorded by model.fit.
metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs_range = range(epochs)

# One panel per metric: (subplot position, training curve, validation curve,
# training label, validation label, legend location, panel title).
panels = (
    (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'lower right', 'Training and Validation Accuracy'),
    (2, loss, val_loss, 'Training Loss', 'Validation Loss',
     'upper right', 'Training and Validation Loss'),
)

plt.figure(figsize=(8, 8))
for position, train_curve, val_curve, train_label, val_label, legend_loc, panel_title in panels:
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()


In [None]:
# Random horizontal flip, rotation and zoom, bundled as a model so it can be
# used as the first stage of the classifier.
augmentation_layers = [
    layers.RandomFlip("horizontal", input_shape=(img_height, img_width, 3)),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
]
data_augmentation = keras.Sequential(augmentation_layers)


In [None]:
# Run the augmentation nine times on one batch and show the first image of
# each result in a 3x3 grid.
plt.figure(figsize=(10, 10))
for images, _ in train_ds.take(1):
    for panel in range(1, 10):
        augmented_images = data_augmentation(images)
        plt.subplot(3, 3, panel)
        plt.imshow(augmented_images[0].numpy().astype("uint8"))
        plt.axis("off")


In [None]:
# Three Conv2D + MaxPooling2D stages with 16, 32 and 64 filters.
conv_blocks = []
for filters in (16, 32, 64):
    conv_blocks.append(layers.Conv2D(filters, 3, padding='same', activation='relu'))
    conv_blocks.append(layers.MaxPooling2D())

# Same layout as the first model, with augmentation in front and dropout
# before the dense head. The last Dense layer outputs raw logits.
model = Sequential([
    data_augmentation,
    layers.Rescaling(1./255),
    *conv_blocks,
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),
])


In [None]:
# from_logits=True because the model's final Dense layer has no softmax.
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])


In [None]:
# Print the layer-by-layer summary of the current model.
model.summary()


In [None]:
# Train the current model, evaluating on val_ds after each epoch.
# `history` keeps the per-epoch metrics that the next cell plots.
epochs = 15
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)


In [None]:
# Per-epoch curves recorded by model.fit.
metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs_range = range(epochs)

# One panel per metric: (subplot position, training curve, validation curve,
# training label, validation label, legend location, panel title).
panels = (
    (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'lower right', 'Training and Validation Accuracy'),
    (2, loss, val_loss, 'Training Loss', 'Validation Loss',
     'upper right', 'Training and Validation Loss'),
)

plt.figure(figsize=(8, 8))
for position, train_curve, val_curve, train_label, val_label, legend_loc, panel_title in panels:
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()

In [None]:
# Copy the hold-out images into a single flat folder for the prediction loop.
target_path = "pets_test/"
holdout_dir = Path(target_path)
try:
    holdout_dir.mkdir(parents=True, exist_ok=True)
except FileExistsError:
    # With exist_ok=True this is only raised when the path exists but is not
    # a directory; it is ignored here.
    pass
files_index = y_test.index
for data_file in files_index:
    source_image = f"../input/petfinder-pawpularity-score/train/{data_file}.jpg"
    copied_image = f"{target_path}/{data_file}.jpg"
    copyfile(source_image, copied_image)

In [None]:
# The hold-out folder must contain exactly one .jpg per hold-out row.
copied_images = list(pathlib.Path(target_path).glob('*.jpg'))
assert len(copied_images) == len(y_test)


In [None]:
from tqdm.notebook import trange, tqdm

In [None]:
# Classify every hold-out image, one at a time.
# `files` collects the image Ids (file stems) and `results` the predicted
# class label for each, in the same order.
files = []
results = []
for image_path in tqdm(list(pathlib.Path(target_path).glob('*.jpg'))):
    img = tf.keras.utils.load_img(
        image_path, target_size=(img_height, img_width)
    )
    img_array = tf.keras.utils.img_to_array(img)
    img_array = tf.expand_dims(img_array, 0) # Create a batch

    # verbose=0: predict() otherwise prints its own progress output for each
    # single-image call, flooding the cell and burying the tqdm bar.
    predictions = model.predict(img_array, verbose=0)
    score = tf.nn.softmax(predictions[0])

    files.append(image_path.stem)
    results.append(class_names[np.argmax(score)])


In [None]:
# Collect the predictions into a frame keyed by image Id and put it in the
# same (sorted) row order as the hold-out targets.
results_df = (
    pd.DataFrame({"Id": files, "result": results})
    .set_index("Id")
    .sort_index()
)
# Rebind instead of sorting in place, so the frame produced by
# train_test_split is not mutated.
y_test = y_test.sort_index()

# The original check read `.all` without calling it, so the assert tested a
# bound method (always truthy) and could never fail. Index.equals also copes
# with indexes of different lengths.
assert results_df.index.equals(y_test.index), "prediction Ids do not match the hold-out Ids"

In [None]:
# Swap each predicted class label in the "result" column for the score
# stored for that label in `targets`.
results_df = results_df.replace({"result": targets})

In [None]:
# RMSE of the classifier-derived scores on the hold-out split. The square root
# is taken explicitly because the `squared=False` flag of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_true=y_test, y_pred=results_df["result"]) ** 0.5

# Retrain on all labelled data

In [None]:
# Build the same per-bin folder layout under pets_full/, this time from every
# labelled row (y) rather than only the training split.
for low in range(1, 100, 10):
    high = low + 9
    target = f"seg_{low}_{high}"
    target_path = "pets_full/" + target
    try:
        Path(target_path).mkdir(parents=True, exist_ok=True)
    except FileExistsError:
        break
    # between() is inclusive on both ends, i.e. low <= score <= high.
    in_bin = y["Pawpularity"].between(low, high)
    selected_bin = y[in_bin]
    files_index = selected_bin.index
    for data_file in files_index:
        source_image = f"../input/petfinder-pawpularity-score/train/{data_file}.jpg"
        copyfile(source_image, f"{target_path}/{data_file}.jpg")


data_dir = pathlib.Path("pets_full")

# Every labelled row must have been copied into exactly one bin folder.
copied_count = len(list(data_dir.glob('*/*.jpg')))
assert copied_count == len(y)
targets

In [None]:
# Load every image under data_dir as one dataset (no validation split).
full_train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, image_size=(img_height, img_width), batch_size=batch_size
)

In [None]:
# Continue fitting the existing `model` object on the full dataset; there is
# no validation data in this run.
epochs = 15
history = model.fit(full_train_ds, epochs=epochs)

# Prepare submission

In [None]:
# Classify every competition test image, one at a time.
# `files` collects the image Ids (file stems) and `results` the predicted
# class label for each, in the same order.
files = []
results = []
for image_path in tqdm(list(pathlib.Path("../input/petfinder-pawpularity-score/test/").glob('*.jpg'))):
    img = tf.keras.utils.load_img(
        image_path, target_size=(img_height, img_width)
    )
    img_array = tf.keras.utils.img_to_array(img)
    img_array = tf.expand_dims(img_array, 0) # Create a batch

    # verbose=0: predict() otherwise prints its own progress output for each
    # single-image call, flooding the cell and burying the tqdm bar.
    predictions = model.predict(img_array, verbose=0)
    score = tf.nn.softmax(predictions[0])

    files.append(image_path.stem)
    results.append(class_names[np.argmax(score)])

In [None]:
# Submission frame: one row per test image, holding the predicted class label
# under the "Pawpularity" column for now.
results_df = pd.DataFrame({"Id": files, "Pawpularity": results})

# There must be exactly one prediction per .jpg in the test folder.
test_images = list(pathlib.Path("../input/petfinder-pawpularity-score/test/").glob('*.jpg'))
assert len(results_df) == len(test_images)

In [None]:
# Swap each predicted class label for the score stored for it in `targets`,
# then write the submission file without the row index.
results_df = results_df.replace({"Pawpularity": targets})
results_df.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame as the cell output.
results_df

In [None]:
# Shell cleanup: delete everything in the working directory whose name starts
# with "pets" (this notebook creates pets/, pets_test/ and pets_full/).
!rm -rf pets*