
# Botanist
An academic project to recognise and classify writings from botanists

Github repo: [github.com/satche/botanist](https://github.com/satche/botanist/)

## Get the data

First, we'll unzip the raw data from the different botanists' notebooks. There are a lot of images, so run it and go grab a coffee. We'll connect to your Google Drive so we can save some results and outputs. Make sure to change the directory according to your folder structure.

*Note: the training data won't be stored in your drive as it's heavy*

In [None]:
# from google.colab import drive
# drive.mount('/content/drive/', force_remount=True)
# GDRIVE_PATH = "/content/drive/My Drive/Colab/Botanist"

In [None]:
# !mkdir "/data/"
# !unzip herbier.zip -d "/data/"

In [None]:
# data_public = os.listdir('./herbier/data_public/')
# data_neuchatel = os.listdir('./herbier/data_neuchatel/')
# print(data_public)
# print(data_neuchatel)

In [None]:
# global packages  
import numpy as np
import os
import numpy as np
from PIL import Image

# tensorflow packages
import tensorflow as tf
import tensorflow.keras as keras
import sklearn.model_selection

In [None]:
# Root folder of the word images; images are looked up in nested sub-folders
# derived from each word ID (e.g. a01/a01-000u/a01-000u-00-00.png).
WORD_DATA_PATH = "./herbier/data_public/words/"
# Text file with one word per line: ID, segmentation info and transcription.
METADATA_PATH = "./herbier/data_public/ascii/words.txt"


In [None]:
# I tried this: https://stackoverflow.com/questions/68447126/tensorflow-giving-error-invalidargumenterror-input-is-empty-when-training-or
# In my opinion it does not work because a sample's class is stored in a separate file and not in the file name
# img_height = 64
# img_width = 64
# batch_size = 32
# train_ds = tf.keras.preprocessing.image_dataset_from_directory(WORD_DATA_PATH, labels='inferred', label_mode='int', image_size=(img_height, img_width), batch_size=batch_size)

# train_ds = train_ds.map(lambda x, y: (x/255.0, y))

# print("loaded # of images: ", len(train_ds))
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10, 10))
# for images, labels in train_ds.take(1):
#   for i in range(9):
#     ax = plt.subplot(3, 3, i + 1)
#     plt.imshow(images[i].numpy().astype("uint8"))
#     plt.title(labels[i])
#     plt.axis("off")

In [None]:
def _parse_words_line(line):
    """Parse one data line of the words metadata file into a dict.

    Expected space-separated layout, for example
    ``a01-000u-00-00 ok 154 408 768 27 51 AT A``:
    word id, segmentation result, gray level, four bounding-box integers,
    grammatical tag and finally the transcription.

    Raises:
        ValueError: if the line has fewer than eight fields or one of the
            numeric fields is not an integer.
    """
    components = line.strip().split(' ')
    if len(components) < 8:
        raise ValueError(f"expected at least 8 fields, got {len(components)}")

    return {
        'word_id': components[0],
        'segmentation_result': components[1],
        'gray_level': int(components[2]),
        'bounding_box': tuple(map(int, components[3:7])),
        'grammatical_tag': components[7],
        # Everything after the tag is the transcription; it may be absent,
        # in which case the join yields an empty string.
        'transcription': ' '.join(components[8:]),
    }


def _word_image_path(data_path, word_id):
    """Build the image path for a word ID.

    For ``a01-000u-00-00`` this is
    ``<data_path>/a01/a01-000u/a01-000u-00-00.png``.

    Raises:
        ValueError: if ``word_id`` has fewer than two ``-`` separated parts.
    """
    parts = word_id.split('-')
    if len(parts) < 2:
        raise ValueError(f"unexpected word ID {word_id!r}")
    image_subfolder = parts[0]  # a01
    subfolder = parts[0] + '-' + parts[1]  # a01-000u
    return os.path.join(data_path, image_subfolder, subfolder, f"{word_id}.png")


def load_words_data(data_path, metadata_path):
    """Load word metadata and the matching decoded images.

    Reads ``metadata_path`` line by line. Lines starting with ``#`` and blank
    lines are ignored; lines that cannot be parsed are reported and skipped
    instead of aborting the whole load. For every remaining line the PNG
    under ``data_path`` is read and decoded with TensorFlow.

    Args:
        data_path: Root folder containing the word images.
        metadata_path: Path of the text file describing each word.

    Returns:
        A list with one dict per successfully loaded word, holding the keys
        ``image_path``, ``word_id``, ``segmentation_result``, ``gray_level``,
        ``bounding_box``, ``grammatical_tag``, ``transcription`` and
        ``image_array`` (the tensor returned by ``tf.image.decode_png``).
        Words whose image is missing or cannot be decoded are reported on
        stdout and left out.
    """
    data = []

    with open(metadata_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            if line.startswith("#") or not line.strip():
                continue

            try:
                record = _parse_words_line(line)
                image_path = _word_image_path(data_path, record['word_id'])
            except ValueError as error:
                print(f"Skipping malformed metadata line {line_number}: {error}")
                continue

            word_id = record['word_id']
            if not os.path.exists(image_path):
                print(f"Image not found for word ID: {word_id} at {image_path}")
                continue

            try:
                img = tf.image.decode_png(tf.io.read_file(image_path))
            except tf.errors.InvalidArgumentError:
                # The file exists but is not a readable PNG (e.g. empty or
                # corrupted), so report it as a decode problem.
                print(f"Could not decode image for word ID: {word_id} at {image_path}")
                continue

            data.append({'image_path': image_path, **record, 'image_array': img})

    return data

# Load every word listed in the metadata file whose image could be read.
words_data = load_words_data(WORD_DATA_PATH, METADATA_PATH)

# Sanity check: report how many words were loaded and preview the first five.
print(f"Loaded {len(words_data)} words.")
for entry in words_data[:5]:
    print(f"  {entry['word_id']}: {entry['transcription']}; image shape: {entry['image_array'].shape}")



In [None]:
# def preprocess_data(words_data, img_width, img_height):
#     X = []
#     y = []

#     for entry in words_data:
#         # img resize
#         resized_img = np.resize(entry['image_array'], [img_height, img_width])
#         X.append(resized_img)

#         # TODO: ground truth = transcription ? 
#         y.append(entry['transcription'])

#     # normalize pixel values
#     X = np.array(X) / 255.0

#     return np.array(X), np.array(y)

from tensorflow.keras.preprocessing.image import img_to_array, load_img

def preprocess_data(words_data, img_width, img_height):
    """Turn loaded word entries into model inputs and string labels.

    Args:
        words_data: Iterable of dicts providing at least the keys
            ``image_path`` and ``transcription``.
        img_width: Target image width in pixels.
        img_height: Target image height in pixels.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` stacks the resized
        grayscale images divided by 255; ``y`` holds the transcription of
        each image, in the same order. Both are empty 1-D arrays when
        ``words_data`` is empty.
    """
    X = []
    y = []

    for entry in words_data:
        # load_img stretches the image to exactly (img_height, img_width);
        # the original aspect ratio is NOT preserved.
        image = load_img(entry['image_path'], color_mode='grayscale', target_size=(img_height, img_width))
        X.append(img_to_array(image))
        y.append(entry['transcription'])

    # Divide by 255 so that 8-bit pixel values end up in [0, 1].
    X = np.array(X) / 255.0

    # X is already an ndarray here; returning it directly avoids a second
    # full copy of the image data.
    return X, np.array(y)

img_width, img_height = 128, 128  # TODO: global variables, to be adjusted
X, y = preprocess_data(words_data, img_width, img_height)

# Split into training and testing sets (20 % held out, fixed seed so the
# split is reproducible between runs)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

# Print the resulting array shapes as a quick sanity check
print (f"X_train shape: {X_train.shape}")
print (f"X_test shape: {X_test.shape}")
print (f"y_train shape: {y_train.shape}")
print (f"y_test shape: {y_test.shape}")


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on the labels of both splits so that every transcription
# seen in either of them receives an integer id.
label_encoder = LabelEncoder()
combined_y = np.concatenate([y_train, y_test])
combined_y_encoded = label_encoder.fit_transform(combined_y)

# One class per distinct transcription
num_classes = len(label_encoder.classes_)

# Undo the concatenation: the first len(y_train) ids belong to the training
# split, the remainder to the test split.
split_at = len(y_train)
y_train_encoded, y_test_encoded = np.split(combined_y_encoded, [split_at])

# One-hot encode both splits with the same number of classes
y_train_categorical, y_test_categorical = (
    tf.keras.utils.to_categorical(encoded, num_classes)
    for encoded in (y_train_encoded, y_test_encoded)
)

print(f"Number of classes: {num_classes}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout


# Small CNN classifier: two convolution blocks followed by a dense head with
# one softmax output per class.
model = Sequential([
    # Convolution block 1
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_height, img_width, 1)),
    tf.keras.layers.BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Convolution block 2
    Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Dense head
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    tf.keras.layers.BatchNormalization(),

    # Output layer
    Dense(num_classes, activation='softmax'),
])


In [None]:
# Adam optimizer with categorical cross-entropy loss; accuracy is tracked as
# the reported metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# TODO: constants
batch_size = 32
epochs = 1

# Train on the training split; the test split is passed as validation data,
# so the per-epoch validation metrics end up in `history`.
history = model.fit(X_train, y_train_categorical, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test_categorical))


In [None]:
import matplotlib.pyplot as plt

# One panel per metric: (train key, validation key, panel title, y-axis label)
panels = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)

plt.figure(figsize=(12, 5))

# Draw the panels side by side, train and test curves on the same axes
for position, (train_key, test_key, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label='Train')
    plt.plot(history.history[test_key], label='Test')
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend()

plt.show()