In [5]:
!pip uninstall umap -y
!pip install umap-learn



In [6]:
USE_GOOGLE_COLAB = False
# COLAB_WORKING_PATH = "/content/drive/My Drive/Colab Notebooks/Botanist" # Path to folder in Google Drive
COLAB_WORKING_PATH = "/content/drive/My Drive/Colab/Botanist" if USE_GOOGLE_COLAB else "." 

# PATHS
DATASET_ZIP_PATH = COLAB_WORKING_PATH # Path to "herbier.zip"
DATASET_PATH = "/content/data/" # Where the unzipped data should land ?
WORD_DATA_PATH = "/content/data/herbier/data_public/words/"
METADATA_PATH = "/content/data/herbier/data_public/ascii/words.txt"

In [7]:
if USE_GOOGLE_COLAB:
  from google.colab import drive
  drive.mount('/content/drive/', force_remount=True)

In [8]:
# Create our data folder, unzip the data
!mkdir $DATASET_PATH
!unzip "$DATASET_ZIP_PATH/herbier.zip" -d $DATASET_PATH
!cd "$DATASET_PATH/herbier"

mkdir: /content: No such file or directory
Archive:  ./herbier.zip
checkdir:  cannot create extraction directory: /content/data
           No such file or directory
zsh:cd:1: no such file or directory: /content/data//herbier


In [9]:
import os

import numpy as np

import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt

import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import sklearn.utils as sku
import sklearn.decomposition as skd
import sklearn.metrics as skm

from sklearn.manifold import TSNE

# Module for k-fold
from sklearn.model_selection import KFold

import umap.umap_ as umap

In [10]:
IMAGE_HEIGHT = IMAGE_WIDTH = 128

CLASSES = ['a01-000u', 'c03-000a']
N_CLASSES = len(CLASSES)

FLATTEN_LAYER_NAME = 'flattened'

DEBUG = True

In [11]:
def load_words_data(data_path, metadata_path, selected_writers = []):
    if selected_writers == []:
        raise ValueError("selected_writers must be a non-empty list of writer IDs")

    data = []

    with open(metadata_path, 'r') as file:
        for line in file:
            if not line.startswith("#"):
                components = line.strip().split(' ')
                word_id = components[0]

                parts = word_id.split('-')
                writer_id = '-'.join(parts[:2])

                if writer_id in selected_writers:
                    image_subfolder = parts[0]
                    image_filename = f"{word_id}.png"
                    image_path = os.path.join(data_path, image_subfolder, writer_id, image_filename)

                    if os.path.exists(image_path):
                        try:
                            img = tf.io.read_file(image_path)
                            img = tf.image.decode_png(img)
                            data.append({
                                'image_path': image_path,
                                'writer_id': writer_id,
                                'image_array': img
                            })
                        except tf.errors.InvalidArgumentError:
                            print(f"Image not found for word ID: {word_id} at {image_path}")
                    else:
                        print(f"Image not found for word ID: {word_id} at {image_path}")

    return data

words_data = load_words_data(WORD_DATA_PATH, METADATA_PATH, selected_writers=CLASSES)

if DEBUG:
  print(f"Loaded {len(words_data)} words.")
  for entry in words_data[:5]:
      print(f"  Writer ID: {entry['writer_id']}; image shape: {entry['image_array'].shape}")

if DEBUG:
    print("number of writers: ", len(set([entry['writer_id'] for entry in words_data])))

if DEBUG:
  plt.figure(figsize=(10, 10))
  for i in range(25):
      plt.subplot(5, 5, i + 1)
      plt.xticks([])
      plt.yticks([])
      plt.grid(False)
      plt.imshow(words_data[i]['image_array'], cmap=plt.cm.binary)
      plt.xlabel(words_data[i]['writer_id'])
  plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/content/data/herbier/data_public/ascii/words.txt'

**Pre-processing**

In [None]:
import cv2

def preprocess_data(data):
    labels = []
    images = []

    for entry in data:
        # Resize the image while preserving aspect ratio
        img = np.array(entry['image_array'])
        old_size = img.shape[:2]

        ratio = float(IMAGE_HEIGHT)/old_size[0]
        new_size = tuple([int(x*ratio) for x in old_size])

        img = cv2.resize(img, (new_size[1], new_size[0]))

        # Ignore images that are too narrows
        if new_size[1] < IMAGE_WIDTH:
          continue;

        # Crop images that are too wide
        if new_size[1] > IMAGE_WIDTH:
            start_x = (new_size[1] - IMAGE_WIDTH) // 2
            img = img[:, start_x:start_x + IMAGE_WIDTH]
            new_size = (new_size[0], IMAGE_WIDTH)

        img = img.astype('float32') / 255.0

        # Ensure dimensions format is correct: (sample_n, width, height, channels)
        img = np.expand_dims(img, axis=-1)
        delta_w = IMAGE_WIDTH - new_size[1]
        delta_h = IMAGE_HEIGHT - img.shape[0]
        delta_w = IMAGE_WIDTH - img.shape[1]
        padding = ((0, delta_h), (0, delta_w), (0, 0))
        img = np.pad(img, padding, 'constant')

        images.append(img)
        labels.append(entry['writer_id'])

    return np.array(images), np.array(labels)


images, labels = preprocess_data(words_data)
X_train, X_test, y_train, y_test = skms.train_test_split(images, labels, test_size=0.2, random_state=42)

if DEBUG:
  print(f"X_train: {X_train.shape}; y_train: {y_train.shape}")
  print(f"X_test: {X_test.shape}; y_test: {y_test.shape}")

  num=5
  plt.figure(figsize=(10,10))
  for i in range(num):
      plt.subplot(N_CLASSES,num,i+1)
      plt.xticks([])
      plt.yticks([])
      plt.grid(False)
      plt.imshow(images[i], cmap=plt.cm.binary)
      plt.xlabel(labels[i])
  plt.show()


**Kfold**

In [None]:
# Define number of splits
n_splits = 5

# Create Kfold instance
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

**Set function to use kfold**

In [None]:
def train_and_evaluate(encoder_model, X_train, y_train, X_test, y_test):
  BATCH_SIZE = 8 # fine tuned
  EPOCHS = 200

  class_weights = sku.compute_class_weight(
      class_weight='balanced',
      classes=np.unique(integer_class_labels),
      y=integer_class_labels
  )
  class_weights_dict = {i : weight for i, weight in enumerate(class_weights)}

  early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
  autoencoder.fit(X_train, X_train,  # input and output are the same for an autoencoder
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  shuffle=True,
                  validation_data=(X_test, X_test))

  ############

  # Extract features
  encoded_features = encoder_model.predict(X_train)

  # Standardize the features
  scaler = skp.StandardScaler()
  encoded_features_standardized = scaler.fit_transform(encoded_features.reshape(len(encoded_features), -1))

  # encoded_features_standardized now ready for UMAP

  ############

  # Standardize the features
  # Now, use the standardized features with UMAP

  def evaluate_model(n_neighbors, min_dist, ax):
      reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=32, metric='euclidean')
      embedding = reducer.fit_transform(np.nan_to_num(encoded_features_standardized))

      sc = ax.scatter(embedding[:, 0], embedding[:, 1],
                      c=integer_class_labels, edgecolor='none', alpha=0.5,
                      cmap=plt.cm.get_cmap('Accent', N_CLASSES))
      ax.set_xlabel('UMAP component 1')
      ax.set_ylabel('UMAP component 2')
      ax.set_title(f'n_neighbors={n_neighbors}, min_dist={min_dist}')

      if n_neighbors == n_neighbors_list[-1] and min_dist == min_dist_list[-1]:
          plt.colorbar(sc, ax=ax)

  n_neighbors_list = [10, 20, 30]
  min_dist_list = [0.0, 0.1, 0.2]

  fig, axes = plt.subplots(len(n_neighbors_list), len(min_dist_list), figsize=(15, 12))

  axes = axes.flatten()

  for idx, (n_neighbors, min_dist) in enumerate([(x, y) for x in n_neighbors_list for y in min_dist_list]):
      evaluate_model(n_neighbors, min_dist, axes[idx])

  plt.tight_layout()
  plt.show()

**Define model**

In [None]:
# Encoder
input_img = keras.layers.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 1)) # adapt this if using `channels_first` image data format

x = keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same')(input_img)
x = keras.layers.MaxPooling2D((2, 2), padding='same')(x)
x = keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same')(x)
x = keras.layers.MaxPooling2D((2, 2), padding='same')(x)
x = keras.layers.Conv2D(16, (3, 3), activation='relu', padding='same')(x)
encoded = keras.layers.MaxPooling2D((2, 2), padding='same', name='encoded_layer')(x)

# Decoder
x = keras.layers.Conv2D(16, (3, 3), activation='relu', padding='same')(encoded)
x = keras.layers.UpSampling2D((2, 2))(x)
x = keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same')(x)
x = keras.layers.UpSampling2D((2, 2))(x)
x = keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = keras.layers.UpSampling2D((2, 2))(x) # Additional upsampling layer
decoded = keras.layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)


# Autoencoder model
autoencoder = keras.Model(input_img, decoded)

if DEBUG:
    autoencoder.summary()

autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Create a model to retrieve the encoded features
encoder_model = keras.Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoded_layer').output)

**Use kfold**

In [None]:
# Execute kfold
for fold, (train_index, test_index) in enumerate(kf.split(images), 1):

    # encode labels
    label_encoder = skp.LabelEncoder()
    integer_encoded_labels = label_encoder.fit_transform(labels)
    one_hot_encoded_labels = keras.utils.to_categorical(integer_encoded_labels)

    X_train, X_test = images[train_index], images[test_index]
    y_train, y_test = one_hot_encoded_labels[train_index], one_hot_encoded_labels[test_index]

    print(f"\nFold {fold} - Training set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
    print(f"Fold {fold} - Testing set: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")

    integer_class_labels = np.argmax(y_train, axis=1)

    # Train and test modele
    train_and_evaluate(encoder_model, X_train, y_train, X_test, y_test)