
# Botanist
An academic project to recognise and classify handwriting from botanists

Github repo: [github.com/satche/botanist](https://github.com/satche/botanist/)

## Initialization

### Settings
First, let's define some settings and parameters

In [None]:
# --- Google Colab ---
# Whether this notebook runs on Google Colab (guards the Google Drive mount).
GOOGLE_COLAB = True
# Working folder inside Google Drive.
COLAB_WORKING_PATH = "/content/drive/My Drive/Colab/Botanist"

# --- Paths ---
# Folder that contains "herbier.zip".
DATASET_ZIP_PATH = COLAB_WORKING_PATH
# Folder the archive is unzipped into.
DATASET_PATH = "/content/data/"

In [None]:
# Make Google Drive available under /content/drive/ (only when on Colab).
# force_remount=True is passed so the mount is redone on every run of the cell.
if GOOGLE_COLAB:
    from google.colab import drive

    drive.mount('/content/drive/', force_remount=True)

### Imports

In [None]:
# global packages
import os
import numpy as np
import cv2
import sys

"""
import pickle
from math import exp
import torch
from tqdm import tqdm
from torch import optim
from torch import nn
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.optim.lr_scheduler import LambdaLR
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
"""

from PIL import Image
import imghdr

# OCR
!pip install paddleocr --upgrade
!pip install paddlepaddle
from paddleocr import PaddleOCR,draw_ocr

## Get the data

First, we'll unzip the raw data from the notebooks of several botanists. There are a lot of images, so run it and go grab a coffee. We'll connect to your Google Drive so we can save some results and outputs. Make sure to change the directory according to your folder structure.

*Note: the training data won't be stored in your drive as it's heavy*

In [None]:
# Create our data folder, unzip the data
!mkdir $DATASET_PATH
!unzip "$DATASET_ZIP_PATH/herbier.zip" -d $DATASET_PATH

# OCR: handwriting detection

In [None]:
# Need to run only once to download and load model into memory
ocr = PaddleOCR(use_angle_cls=True, lang='en')
!wget -c https://github.com/PaddlePaddle/PaddleOCR/raw/release/2.7/doc/fonts/french.ttf -O "$DATASET_PATH/herbier/fonts/french.ttf"

In [None]:
DETECTION_DATASET_PATH = os.path.join(DATASET_PATH, "herbier", "data_neuchatel", "Image Chaillet pour reconnaissance écriture")
NEW_HEIGHT = 50  # Height (in pixels) of every saved crop; the width follows the aspect ratio


def _bounding_box(boundary):
    """Return the axis-aligned (left, upper, right, lower) box enclosing the points of *boundary*."""
    xs = [point[0] for point in boundary]
    ys = [point[1] for point in boundary]
    return (min(xs), min(ys), max(xs), max(ys))


def _binarize(img_crop, white_balancer):
    """White-balance, grayscale and threshold an RGB PIL image; return a black & white PIL image."""
    # PIL -> OpenCV
    img_cv = cv2.cvtColor(np.array(img_crop), cv2.COLOR_RGB2BGR)

    # White balancing
    img_wb = white_balancer.balanceWhite(img_cv)

    # Grayscale
    img_gray = cv2.cvtColor(img_wb, cv2.COLOR_BGR2GRAY)

    # Thresholding
    _, img_thresh = cv2.threshold(img_gray, 127, 255, cv2.THRESH_BINARY)

    # OpenCV -> PIL
    return Image.fromarray(img_thresh)


def _resize_to_height(img, new_height):
    """Resize *img* to *new_height* pixels high, keeping the aspect ratio (width is at least 1)."""
    width, height = img.size
    new_width = max(1, int((new_height / height) * width))
    return img.resize((new_width, new_height))


# The white balancer does not depend on the image: build it once, not once per crop
white_balancer = cv2.xphoto.createSimpleWB()

# Iterate over all images in the dataset
for root, _dirs, files in os.walk(DETECTION_DATASET_PATH):
    for file in files:

        img_path = os.path.join(root, file)
        if imghdr.what(img_path) is None:
            # Not an image, nothing to do
            continue

        # Detect all elements in the current image
        result = ocr.ocr(img_path, cls=True)

        # Folder with the same name as the image (extension removed) to store the crops.
        # splitext handles extensions of any length (".jpeg", ".tiff", ...).
        img_folder, extension = os.path.splitext(img_path)
        if not extension:
            # Without an extension the folder would clash with the image file itself
            img_folder += "_crops"

        # Open the image once per file (not once per detected line) and close it right away.
        # Converting to RGB lets grayscale / RGBA / palette images go through the
        # RGB -> BGR conversion used below.
        with Image.open(img_path) as img:
            img_rgb = img.convert("RGB")

        for res in result:

            # If nothing was detected, ignore and continue
            if not res:
                print(f"Could not detect anything in {img_path}")
                continue

            os.makedirs(img_folder, exist_ok=True)

            # Each element detected has a boundary
            for i, line in enumerate(res):

                # Convert the boundary into a format suitable for Image.crop()
                crop_boundary = _bounding_box(line[0])
                img_crop = img_rgb.crop(crop_boundary)

                # A degenerate box gives an empty crop that can neither be processed nor resized
                if img_crop.width == 0 or img_crop.height == 0:
                    print(f"Skipping empty detection {i} in {img_path}")
                    continue

                img_bw = _binarize(img_crop, white_balancer)

                # Resize by height (keep ratio)
                img_resized = _resize_to_height(img_bw, NEW_HEIGHT)

                # Final cropped image output
                img_resized.save(f"{img_folder}/crop_{i}.jpg")