
# Botanist
An academic project to recognise and classify writings from botanists

Github repo: [github.com/satche/botanist](https://github.com/satche/botanist/)

# Parameters
First, let's define some settings and parameters

In [None]:
# GOOGLE COLAB
GOOGLE_COLAB = True  # Are you using Google Colab?
COLAB_WORKING_PATH = "/content/drive/My Drive/Colab/Botanist"  # Path to folder in Google Drive

# PATHS
DATASET_ZIP_PATH = COLAB_WORKING_PATH  # Folder that contains "herbier.zip"
DATASET_PATH = "/content/data/"  # Where the unzipped data should land
# These must be f-strings: without the `f` prefix the text "{DATASET_PATH}"
# stays in the path literally instead of being replaced by the value above.
WORD_DATA_PATH = f"{DATASET_PATH}/data_public/words/"
METADATA_PATH = f"{DATASET_PATH}/data_public/ascii/words.txt"

## Get the data

First, we'll unzip the raw data from different botanists' notebooks. There are a lot of images, so run it and go grab a coffee. We'll connect to your Google Drive so we can save some results and outputs. Make sure to change the directory according to your folder structure.

*Note: the training data won't be stored in your drive as it's heavy*

In [None]:
# global packages
import os
import numpy as np
from PIL import Image

# tensorflow packages
import tensorflow as tf
import tensorflow.keras as keras
import sklearn

# OCR
!pip install paddleocr
from paddleocr import PaddleOCR,draw_ocr

In [None]:
# Attach Google Drive to the runtime (only when running inside Colab).
# The import lives inside the guard because it is only executed when
# GOOGLE_COLAB is true.
if GOOGLE_COLAB:
    from google.colab import drive

    # force_remount=True is passed so the mount is redone on every run.
    drive.mount('/content/drive/', force_remount=True)

In [None]:
# Create our data folder, unzip the data
!mkdir $DATASET_PATH
!unzip "$DATASET_ZIP_PATH/herbier.zip" -d $DATASET_PATH
!cd "$DATASET_PATH/herbier"

# OCR: handwriting recognition

In [None]:
# Need to run only once to download and load model into memory
ocr = PaddleOCR(use_angle_cls=True, lang='en')
!wget -c https://github.com/PaddlePaddle/PaddleOCR/raw/release/2.7/doc/fonts/french.ttf -O "$DATASET_PATH/herbier/fonts/french.ttf"

In [None]:
# Recognition: run OCR on one sample image, save an annotated copy, then
# save one cropped image per detected text region.
import math

img_path = f"{DATASET_PATH}/herbier/data_neuchatel/Image Chaillet pour reconnaissance écriture/Douteux/NEU000006521.JPG"


def _quad_to_crop_box(quad):
    """Return the axis-aligned box enclosing a sequence of (x, y) points.

    The result is a (left, upper, right, lower) tuple of integers, which is
    the rectangle format `Image.crop` expects. Minimums are floored and
    maximums are ceiled so the box always fully contains every point.
    """
    xs = [point[0] for point in quad]
    ys = [point[1] for point in quad]
    return (
        math.floor(min(xs)),
        math.floor(min(ys)),
        math.ceil(max(xs)),
        math.ceil(max(ys)),
    )


result = ocr.ocr(img_path, cls=True)

# Collect the region outline (first item of every detection) of all results.
boundaries = []
for res in result:
    # An entry may be empty/None when no text was found; skip it instead of
    # crashing on iteration.
    if not res:
        continue
    for line in res:
        print(line)
        boundaries.append(line[0])

# Draw result
# Only the first entry is drawn (a single image was submitted); fall back to
# an empty list when nothing was detected.
result = (result[0] if result else None) or []
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path=f'{DATASET_PATH}/herbier/fonts/french.ttf')
im_show = Image.fromarray(im_show)
im_show.save(f'{DATASET_PATH}/herbier/data_neuchatel/result.jpg')

# Crop
# Open the source image once (and close it afterwards) rather than once per
# region. Each outline is a list of corner points, so it is converted to the
# (left, upper, right, lower) rectangle that `crop` requires.
with Image.open(img_path) as im:
    for i, boundary in enumerate(boundaries):
        im_crop = im.crop(_quad_to_crop_box(boundary))
        im_crop.save(f"{DATASET_PATH}/herbier/data_neuchatel/crop_{i}.jpg")