In [None]:
# !pip uninstall umap -y
# !pip install umap-learn

In [None]:
import os

import numpy as np

import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt

import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import sklearn.utils as sku
import sklearn.decomposition as skd
import sklearn.metrics as skm


from sklearn.manifold import TSNE

import umap.umap_ as umap

In [None]:
# --- Data locations ----------------------------------------------------------
# Root folder of the word images and the metadata text file listing the word IDs.
WORD_DATA_PATH = "./herbier/data_public/words/"
METADATA_PATH = "./herbier/data_public/ascii/words.txt"

# --- Image geometry ----------------------------------------------------------
# Every image is resized to IMAGE_HEIGHT x IMAGE_WIDTH before being fed to the model.
IMAGE_HEIGHT = IMAGE_WIDTH = 128

# --- Classes -----------------------------------------------------------------
# Writer IDs (first two dash-separated fields of a word ID) used as class labels.
CLASSES = ['a01-000u', 'a01-003u']
N_CLASSES = len(CLASSES)

# When True, cells print shapes/counts and display sample images.
DEBUG = True

# --- Reproducibility ---------------------------------------------------------
# Seed the global NumPy and TensorFlow generators so that TensorFlow/Keras random
# operations (e.g. weight initialisation, dropout) give the same results after
# "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## TODO
- cross-validation
- check whether the METADATA file can be ignored
- image preprocessing
  - reduce the pixel value range
  - resize
- data augmentation

In [None]:
def load_words_data(data_path, metadata_path, selected_writers=None):
    """Load the word images that belong to the selected writers.

    The metadata file is read line by line. Blank lines and lines starting
    with ``#`` are skipped. The first space-separated field of every other
    line is the word ID (e.g. ``a01-000u-00-00``): its first two
    dash-separated parts form the writer ID (``a01-000u``) and its first part
    is the image sub-folder (``a01``). The image is expected at
    ``<data_path>/<sub-folder>/<writer_id>/<word_id>.png``.

    Args:
        data_path: Root directory of the word images.
        metadata_path: Path of the metadata text file.
        selected_writers: Non-empty collection of writer IDs to keep. A single
            writer ID given as a string is also accepted.

    Returns:
        A list of dicts, one per successfully decoded image, with the keys
        ``'image_path'``, ``'writer_id'`` and ``'image_array'`` (the tensor
        returned by ``tf.image.decode_png``).

    Raises:
        ValueError: If ``selected_writers`` is ``None`` or empty.
    """
    # `None` default instead of a shared mutable `[]`; any empty collection is rejected.
    if not selected_writers:
        raise ValueError("selected_writers must be a non-empty list of writer IDs")

    # A bare string would otherwise be treated as a collection of characters.
    if isinstance(selected_writers, str):
        selected_writers = [selected_writers]

    # Set membership keeps the per-line lookup O(1).
    wanted_writers = set(selected_writers)

    data = []

    with open(metadata_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            word_id = line.split(' ')[0]
            parts = word_id.split('-')
            writer_id = '-'.join(parts[:2])

            if writer_id not in wanted_writers:
                continue

            image_subfolder = parts[0]
            image_filename = f"{word_id}.png"
            image_path = os.path.join(data_path, image_subfolder, writer_id, image_filename)

            if not os.path.exists(image_path):
                print(f"Image not found for word ID: {word_id} at {image_path}")
                continue

            try:
                img = tf.io.read_file(image_path)
                img = tf.image.decode_png(img)
            except tf.errors.InvalidArgumentError as error:
                # The file exists but is not a readable PNG: report it as such
                # (the previous message wrongly said the image was not found).
                print(f"Could not decode image for word ID: {word_id} at {image_path}: {error}")
                continue

            data.append({
                'image_path': image_path,
                'writer_id': writer_id,
                'image_array': img
            })

    return data
 
# Load every word image written by one of the writers listed in CLASSES.
words_data = load_words_data(WORD_DATA_PATH, METADATA_PATH, selected_writers=CLASSES)

if DEBUG:
    print(f"Loaded {len(words_data)} words.")
    for entry in words_data[:5]:
        print(f"  Writer ID: {entry['writer_id']}; image shape: {entry['image_array'].shape}")

    print("number of writers: ", len({entry['writer_id'] for entry in words_data}))

    # Preview at most 25 samples on a 5x5 grid. Slicing (instead of a fixed
    # range(25)) avoids an IndexError when fewer than 25 images were loaded.
    preview = words_data[:25]
    if preview:
        plt.figure(figsize=(10, 10))
        for i, entry in enumerate(preview):
            plt.subplot(5, 5, i + 1)
            plt.xticks([])
            plt.yticks([])
            plt.grid(False)
            plt.imshow(entry['image_array'], cmap=plt.cm.binary)
            plt.xlabel(entry['writer_id'])
        plt.show()


In [None]:
def preprocess_data(data):
    """Turn the loaded entries into model-ready arrays.

    Each entry's ``'image_array'`` is resized to IMAGE_HEIGHT x IMAGE_WIDTH,
    converted to a float32 NumPy array and divided by 255.0. The matching
    ``'writer_id'`` is collected as its label.

    Args:
        data: Iterable of dicts with the keys ``'image_array'`` and ``'writer_id'``.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays, in the order of ``data``.
    """
    def _prepare(entry):
        # Resize first, then normalise the float32 pixel values.
        resized = tf.image.resize(entry['image_array'], [IMAGE_HEIGHT, IMAGE_WIDTH])
        pixels = resized.numpy().astype('float32') / 255.0
        return pixels, entry['writer_id']

    # Single pass over the entries so image and label stay aligned.
    prepared = [_prepare(entry) for entry in data]

    images = [pixels for pixels, _ in prepared]
    labels = [writer for _, writer in prepared]

    return np.array(images), np.array(labels)


# Resize/normalise every loaded image and collect the matching writer IDs.
images, labels = preprocess_data(words_data)

# Fail early with a clear message rather than deep inside the train/test split.
assert len(images) == len(labels) and len(images) > 0, \
    "no images were loaded - check WORD_DATA_PATH, METADATA_PATH and CLASSES"

# The train/test split is done once, in the next cell, after the labels have
# been encoded; splitting here as well only produced arrays that were
# immediately overwritten.
if DEBUG:
    print(f"images: {images.shape}; labels: {labels.shape}")

In [None]:
# Encode the string writer IDs as integers, then as one-hot vectors.
# The encoder is fitted on CLASSES (not on the labels that happened to be
# loaded) and the one-hot width is pinned to N_CLASSES, so the targets always
# match the N_CLASSES-unit softmax output even if a class has no image.
label_encoder = skp.LabelEncoder()
label_encoder.fit(CLASSES)
integer_encoded_labels = label_encoder.transform(labels)
one_hot_encoded_labels = keras.utils.to_categorical(integer_encoded_labels, num_classes=N_CLASSES)

# Stratify so that train and test keep the same class proportions.
X_train, X_test, y_train, y_test = skms.train_test_split(
    images,
    one_hot_encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=integer_encoded_labels,
)

# Integer class id of every training sample (used for class weights and plots).
integer_class_labels = np.argmax(y_train, axis=1)

if DEBUG:
    print(f"X_train: {X_train.shape}; y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}; y_test: {y_test.shape}")
    # number of training samples per class (minlength keeps empty classes visible)
    print("number of training samples per class: ", np.bincount(integer_class_labels, minlength=N_CLASSES))
    

In [None]:
# Single-channel input image of IMAGE_HEIGHT x IMAGE_WIDTH pixels.
input_layer = keras.layers.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 1))

# Combined L1 + L2 weight penalty shared by the convolutional and dense layers.
l1_l2 = keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)


def _conv_branch(inputs, kernel_size, regularizer):
    """Build one branch: Conv(32) -> MaxPool -> Conv(64) -> MaxPool.

    Both convolutions use `kernel_size`, ReLU activation, 'same' padding and
    `regularizer` on their kernels. Returns the four intermediate tensors in
    creation order.
    """
    first_conv = keras.layers.Conv2D(
        32, kernel_size, activation='relu', padding='same', kernel_regularizer=regularizer
    )(inputs)
    first_pool = keras.layers.MaxPooling2D((2, 2))(first_conv)
    second_conv = keras.layers.Conv2D(
        64, kernel_size, activation='relu', padding='same', kernel_regularizer=regularizer
    )(first_pool)
    second_pool = keras.layers.MaxPooling2D((2, 2))(second_conv)
    return first_conv, first_pool, second_conv, second_pool


# Branch 1 uses 3x3 kernels, branch 2 uses 5x5 kernels; both read the same input.
conv1_1, pool1_1, conv1_2, pool1_2 = _conv_branch(input_layer, (3, 3), l1_l2)
conv2_1, pool2_1, conv2_2, pool2_2 = _conv_branch(input_layer, (5, 5), l1_l2)

# Join the two branches along the channel axis.
merged = keras.layers.concatenate([pool1_2, pool2_2])

# Classification head: flatten -> Dense(128) -> Dropout -> softmax over the classes.
flat = keras.layers.Flatten()(merged)
dense1 = keras.layers.Dense(128, activation='relu', kernel_regularizer=l1_l2)(flat)
dropout = keras.layers.Dropout(0.5)(dense1)  # Consider experimenting with the dropout rate
output_layer = keras.layers.Dense(N_CLASSES, activation='softmax')(dropout)

model = keras.models.Model(inputs=input_layer, outputs=output_layer)

In [None]:
# Adam optimiser with categorical cross-entropy (the targets are one-hot and
# the output layer is a softmax); accuracy is reported during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
BATCH_SIZE = 8  # fine tuned
EPOCHS = 200
VALIDATION_FRACTION = 0.2  # share of the training data held out for early stopping

# Early stopping selects the best epoch on the validation data and restores
# those weights. Using the test set for that would leak it into model
# selection, so a separate validation subset is carved out of the training
# data and the test set is kept for the final evaluation only.
X_fit, X_val, y_fit, y_val = skms.train_test_split(
    X_train,
    y_train,
    test_size=VALIDATION_FRACTION,
    random_state=42,
    stratify=integer_class_labels,
)
fit_class_labels = np.argmax(y_fit, axis=1)

# 'balanced' weights for the classes actually present in the fitting subset.
present_classes = np.unique(fit_class_labels)
class_weights = sku.compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=fit_class_labels
)
# Key each weight by its real class id; enumerate() would mis-assign the
# weights whenever a class id is absent from the data.
class_weights_dict = {int(class_id): float(weight)
                      for class_id, weight in zip(present_classes, class_weights)}

early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_fit,
                    y_fit,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(X_val, y_val),
                    class_weight=class_weights_dict,
                    callbacks=[early_stopping])

In [None]:
# NOTE(review): model.predict returns the softmax output of the last layer
# (N_CLASSES values per sample), not an intermediate feature vector. If the
# intent is to visualise learned features, build a second model that ends at
# the 128-unit dense layer and predict with that instead - confirm the intent.
features = model.predict(X_train)

# Standardize the features
scaler = skp.StandardScaler()
features_standardized = scaler.fit_transform(features)

# Replace NaN/inf once here instead of on every UMAP run.
features_clean = np.nan_to_num(features_standardized)

# Hyper-parameter grid explored below (one subplot per combination).
n_neighbors_list = [10, 20, 30]
min_dist_list = [0.0, 0.1, 0.2]


def evaluate_model(n_neighbors, min_dist, ax, show_colorbar=None):
    """Project the standardized features to 2-D with UMAP and plot them on `ax`.

    Points are coloured by their training class id (`integer_class_labels`).

    Args:
        n_neighbors: UMAP `n_neighbors` parameter.
        min_dist: UMAP `min_dist` parameter.
        ax: Matplotlib axes to draw on.
        show_colorbar: Whether to attach a colorbar to `ax`. When left to
            None, the colorbar is drawn only for the last combination of
            `n_neighbors_list` / `min_dist_list` (previous behaviour).
    """
    if show_colorbar is None:
        show_colorbar = (n_neighbors == n_neighbors_list[-1]
                         and min_dist == min_dist_list[-1])

    # Fixed random_state so the embedding is the same on every run.
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=2,
                        metric='euclidean', random_state=42)
    embedding = reducer.fit_transform(features_clean)

    # plt.get_cmap replaces plt.cm.get_cmap, which was removed in matplotlib 3.9.
    sc = ax.scatter(embedding[:, 0], embedding[:, 1],
                    c=integer_class_labels, edgecolor='none', alpha=0.5,
                    cmap=plt.get_cmap('Accent', N_CLASSES))
    ax.set_xlabel('UMAP component 1')
    ax.set_ylabel('UMAP component 2')
    ax.set_title(f'n_neighbors={n_neighbors}, min_dist={min_dist}')

    if show_colorbar:
        plt.colorbar(sc, ax=ax)


# squeeze=False keeps `axes` two-dimensional even for a 1xN or Nx1 grid.
fig, axes = plt.subplots(len(n_neighbors_list), len(min_dist_list), figsize=(15, 12), squeeze=False)

last_row = len(n_neighbors_list) - 1
last_col = len(min_dist_list) - 1

for row, n_neighbors in enumerate(n_neighbors_list):
    for col, min_dist in enumerate(min_dist_list):
        # Only the bottom-right panel carries the colorbar.
        is_last_panel = (row == last_row and col == last_col)
        evaluate_model(n_neighbors, min_dist, axes[row, col], show_colorbar=is_last_panel)

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrix and classification report on the test set.
# Predictions are computed once and reused for both outputs.
true_labels = np.argmax(y_test, axis=1)
model_predictions = np.argmax(model.predict(X_test), axis=1)

# Explicit label list: the matrix/report keep one row per class even when a
# class is absent from the test set, and can be annotated with the writer IDs.
class_names = list(label_encoder.classes_)
class_ids = np.arange(len(class_names))

conf_matrix = skm.confusion_matrix(true_labels, model_predictions, labels=class_ids)

fig, ax = plt.subplots(figsize=(8, 6))
cax = ax.matshow(conf_matrix, cmap=plt.cm.Blues)
fig.colorbar(cax)

ax.set_xticks(class_ids)
ax.set_xticklabels(class_names)
ax.set_yticks(class_ids)
ax.set_yticklabels(class_names)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# Write the count in the centre of every cell (row = true, column = predicted).
for (i, j), val in np.ndenumerate(conf_matrix):
    ax.text(j, i, str(val), ha='center', va='center')

plt.show()

cls_report = skm.classification_report(true_labels, model_predictions,
                                       labels=class_ids, target_names=class_names)
print(f"Classification Report:\n{cls_report}\n")
