# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Conv2D, Activation, Dense, MaxPooling2D, Dropout
from keras.applications.xception import Xception
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
import glob

# Define global variables

In [None]:
# Mini-batch size handed to every image generator below.
batch_size = 32

# Images are resized to a square of this many pixels per side before being fed to the network.
img_height = img_width = 255

# Folder prefix prepended to the image ids read from the label CSVs.
data_dir = 'drive/MyDrive/dist/'

# Load dataset (merge all label CSVs into one DataFrame; a bit messy :))

In [None]:
def read_label_csv(csv_path, image_dir):
    """Read one label CSV and return a frame with 'score' and 'id' columns.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    image_dir : str
        Prefix prepended to every value of the 'id' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'score' (float) and 'id' (image_dir + original id).
    """
    labels = pd.read_csv(csv_path, delimiter=',', header=None, names=['index', 'score', 'id'])
    # The file is read without a header, so row 0 holds the file's own header line and
    # column 0 its saved index: drop both. copy() avoids assigning into a slice view.
    labels = labels.iloc[1:, 1:].copy()
    labels['score'] = labels['score'].astype(float)
    labels['id'] = labels['id'].apply(lambda name: image_dir + name)
    return labels


# Collect one frame per CSV and concatenate once: DataFrame.append copied the whole
# frame on every iteration and no longer exists in pandas >= 2.0.
frames = []
for csv_path in sorted(glob.glob('drive/MyDrive/df*.csv')):  # sorted -> deterministic row order
    # Images listed in df_cnn.csv live in a different folder than the rest.
    if csv_path == 'drive/MyDrive/df_cnn.csv':
        image_dir = 'drive/MyDrive/dist2/'
    else:
        image_dir = data_dir
    frames.append(read_label_csv(csv_path, image_dir))

# Keep the original fallback of an empty frame when no CSV matches the pattern.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Shuffle the rows so the positional train/validation/test split below carries no
# ordering inherited from the source CSV files.
# The fixed random_state makes the split identical after a kernel restart; without it a
# checkpoint reloaded from disk could be scored on images it was trained on.
df = shuffle(df, random_state=42).reset_index(drop=True)
df.describe()

Unnamed: 0,score
count,25449.0
mean,7.965073
std,1.441621
min,3.937797
25%,6.35082
50%,8.363437
75%,8.438887
max,14.594172


# Create generators

In [None]:
def make_generator(frame, shuffle_batches=True):
    """Build an image/score batch iterator for a slice of the label frame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to serve; 'id' holds the image path and 'score' the regression target.
    shuffle_batches : bool, default True
        Whether the iterator serves rows in random order. Must be False whenever the
        predictions are later compared position-by-position with the labels.

    Returns
    -------
    Iterator yielding (images, scores) batches with pixel values rescaled by 1/255.
    """
    return ImageDataGenerator(rescale=1.0/255.0).flow_from_dataframe(
        dataframe=frame,
        x_col="id", y_col="score", color_mode='rgb',
        class_mode='raw',
        # Keras expects (height, width); the original passed (width, height).
        target_size=(img_height, img_width),
        batch_size=batch_size,
        shuffle=shuffle_batches)


# Positional 70 % / 10 % / 20 % split of the (already shuffled) frame.
train_end = int(0.7*len(df))
val_end = int(0.8*len(df))

train_generator = make_generator(df.iloc[:train_end, :])
val_generator = make_generator(df.iloc[train_end:val_end, :])
# No shuffling here: model.predict() returns rows in generator order, and the MAE
# computed afterwards pairs them with the labels by position.
test_generator = make_generator(df.iloc[val_end:, :], shuffle_batches=False)

  .format(n_invalid, x_col)


Found 17813 validated image filenames.
Found 2545 validated image filenames.
Found 5090 validated image filenames.


# Train model and save weights based on best epoch

In [None]:
# Save the weights only, and only when the validation MAE improves, so the checkpoint
# directory always holds the best epoch seen so far.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='drive/MyDrive/training_1/cp.ckpt',
    save_weights_only=True,
    monitor='val_mean_absolute_error',
    mode='min',
    save_best_only=True)

# Xception backbone without its classification head.
base_model = Xception(
    weights='imagenet',  # Load weights pre-trained on ImageNet.
    # Take the size from the config cell so it always matches what the generators produce
    # (was hard-coded to 255 x 255).
    input_shape=(img_height, img_width, 3),
    include_top=False)

# Regression head: global average pooling followed by a single linear unit.
x = base_model.output
x = GlobalAveragePooling2D()(x)

predictions = Dense(1, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=predictions)
# The loss and the tracked metric are both MAE; the metric's name is what the checkpoint
# callback monitors (with the 'val_' prefix).
model.compile(loss="mae", optimizer='adam', metrics=[tf.keras.metrics.MeanAbsoluteError()])


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# One epoch = one full pass over each generator. len(generator) is the number of batches
# it holds, so the step counts follow the data instead of the stale hard-coded 494 / 70,
# which covered only part of the 17813 training and 2545 validation images.
history = model.fit(x=train_generator,
                    steps_per_epoch=len(train_generator),
                    validation_data=val_generator,
                    validation_steps=len(val_generator),
                    epochs=5,
                    callbacks=[model_checkpoint_callback])

Epoch 1/5
 78/494 [===>..........................] - ETA: 4:05:29 - loss: 1.0526 - mean_absolute_error: 1.0526

# Plot loss evolution

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked quantity: training curve against validation curve.
# The second curve comes from val_generator, so it is labelled 'validation'
# (the held-out test set is never seen during fit).
for history_key, label in [('mean_absolute_error', 'MAE'), ('loss', 'loss')]:
    fig, ax = plt.subplots()
    ax.plot(history.history[history_key], label='train')
    ax.plot(history.history['val_' + history_key], label='validation')
    ax.set(title='model ' + label, ylabel=label, xlabel='epoch')
    ax.legend(loc='upper left')
    plt.show()

# Load the best saved weights and check MAE on the test set

In [None]:
# Restore the weights written by the checkpoint callback, then predict on the test set.
checkpoint_dir = 'drive/MyDrive/training_1/'
latest = tf.train.latest_checkpoint(checkpoint_dir)
# latest_checkpoint returns None when the directory holds no checkpoint; fail with a
# clear message instead of an obscure error inside load_weights.
if latest is None:
    raise FileNotFoundError('No checkpoint found in ' + checkpoint_dir)
model.load_weights(latest)
y_pred = model.predict(test_generator)

In [None]:
# Take the ground truth from the generator itself: it silently drops rows whose image file
# is missing, so re-slicing df can give a different length or alignment than y_pred.
# NOTE: the pairing is by position, so it is only valid when test_generator does not shuffle.
mean_absolute_error(test_generator.labels, y_pred)

1.4709878544442077