# Extraction des données avec Tesseract

In [27]:
# Environment setup (shell commands run from the notebook).
# System packages: the Tesseract OCR engine and the poppler command-line tools
# (presumably the native back-ends of pytesseract and pdf2image installed below).
!apt install tesseract-ocr
!apt-get install poppler-utils
# Python packages imported later in this notebook.
!pip install pdf2image
!pip install pytesseract
!pip install opencv-python

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 64 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 2s (2,934 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 155503 files and directories currently installed.)
Preparing to unpack .../tesserac

In [3]:
# Mount Google Drive under /content/gdrive so the PDF archive stored there
# can be copied into the working directory by the next cell.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Copy the PDF archive from the mounted Drive into the current directory and unpack it.
!cp "/content/gdrive/My Drive/Colab Notebooks/Data_challenge_IA_PAU/pdf.zip" .
!unzip pdf.zip
# Rename the two extracted folders so their names no longer contain spaces;
# the pipeline below expects pdf/pdf_no_zoi and pdf/pdf_with_zoi.
!mv pdf/pdf\ no\ zoi pdf/pdf_no_zoi
!mv pdf/pdf\ with\ zoi pdf/pdf_with_zoi

Archive:  pdf.zip
   creating: pdf/
   creating: pdf/pdf no zoi/
  inflating: pdf/pdf no zoi/not_zoi_1.pdf  
  inflating: pdf/pdf no zoi/not_zoi_10.pdf  
  inflating: pdf/pdf no zoi/not_zoi_100.pdf  
  inflating: pdf/pdf no zoi/not_zoi_11.pdf  
  inflating: pdf/pdf no zoi/not_zoi_12.pdf  
  inflating: pdf/pdf no zoi/not_zoi_13.pdf  
  inflating: pdf/pdf no zoi/not_zoi_14.pdf  
  inflating: pdf/pdf no zoi/not_zoi_15.pdf  
  inflating: pdf/pdf no zoi/not_zoi_16.pdf  
  inflating: pdf/pdf no zoi/not_zoi_17.pdf  
  inflating: pdf/pdf no zoi/not_zoi_18.pdf  
  inflating: pdf/pdf no zoi/not_zoi_19.pdf  
  inflating: pdf/pdf no zoi/not_zoi_2.pdf  
  inflating: pdf/pdf no zoi/not_zoi_20.pdf  
  inflating: pdf/pdf no zoi/not_zoi_21.pdf  
  inflating: pdf/pdf no zoi/not_zoi_22.pdf  
  inflating: pdf/pdf no zoi/not_zoi_23.pdf  
  inflating: pdf/pdf no zoi/not_zoi_24.pdf  
  inflating: pdf/pdf no zoi/not_zoi_25.pdf  
  inflating: pdf/pdf no zoi/not_zoi_26.pdf  
  inflating: pdf/pdf no zoi/not_zoi_

In [15]:
from pdf2image import convert_from_path
import os
import pytesseract
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
try:
    from PIL import Image
except ImportError:
    import Image
import re

### Transformation des fichiers pdf en Images

In [6]:
def creation_images(dossier_traitement, dossier_destination, name="zoi"):
    """Render the first page of every PDF in a folder as a JPEG image.

    PDFs are processed in sorted file-name order and written to
    ``dossier_destination`` as ``"<name> <index>.jpeg"``, where the index
    starts at 0 and increases by one per converted PDF.

    Args:
        dossier_traitement: Folder containing the source PDF files.
        dossier_destination: Folder receiving the JPEG files (created if missing).
        name: Prefix used in the generated file names.
    """
    os.makedirs(dossier_destination, exist_ok=True)

    # os.listdir returns entries in arbitrary order: sort so the numbering is
    # reproducible, and ignore anything that is not a PDF (stray archive
    # metadata, sub-folders, ...) instead of failing on it.
    pdf_names = sorted(
        entry for entry in os.listdir(dossier_traitement)
        if entry.lower().endswith(".pdf")
    )

    for compteur, pdf_name in enumerate(pdf_names):
        # Only the first page is saved, so only that page is rendered.
        pages = convert_from_path(
            os.path.join(dossier_traitement, pdf_name),
            first_page=1,
            last_page=1,
        )
        destination = os.path.join(dossier_destination, f"{name} {compteur}.jpeg")
        pages[0].save(destination, "JPEG")



### Extraction de petites images contenant des portions de texte

In [7]:
def decomposition_img(lien_img):
    """Split a page image into grayscale crops of its largest detected regions.

    The page is converted to grayscale, blurred, thresholded, edge-detected
    and dilated; contours are then extracted from the result and the ten
    largest (by contour area) are considered. For each of them the
    axis-aligned bounding box is cropped out of the grayscale page. Boxes
    whose vertical or horizontal extent (max - min coordinate) is 30 pixels
    or less are dropped.

    Args:
        lien_img: Path of the image file to read.

    Returns:
        A list of 2-D grayscale arrays, ordered by decreasing contour area.

    Raises:
        OSError: If OpenCV cannot read ``lien_img``.
    """
    img = cv.imread(lien_img)
    if img is None:
        # cv.imread reports a missing/unreadable file by returning None
        # rather than raising; fail here with an explicit message.
        raise OSError(f"Unable to read image: {lien_img}")

    # Transformation phase (parameters kept exactly as tuned originally)
    img_gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    img_blur = cv.blur(img_gray, (33,33), 0)
    _, thresh = cv.threshold(img_blur, 220, 255, cv.THRESH_BINARY)
    img_canny = cv.Canny(thresh,125,175)
    dilated = cv.dilate(img_canny, (15,15), iterations=65)
    dilated_blur = cv.blur(dilated, (15,15))

    # Contour detection, largest areas first
    contours, _ = cv.findContours(dilated_blur, cv.RETR_LIST, cv.CHAIN_APPROX_NONE)
    contours = sorted(contours, key=cv.contourArea, reverse=True)

    list_img = []
    for contour in contours[:10]:
        # The bounding box of the contour points is the same box the filled
        # contour mask would span, without allocating a full-size mask.
        topx, topy, width, height = cv.boundingRect(contour)

        # width/height count pixels, so (max - min) is width - 1 / height - 1.
        if (height - 1 > 30) and (width - 1 > 30):
            # Cropping the grayscale page equals masking the colour page with
            # the rectangle, cropping it and converting the crop to grayscale.
            crop = img_gray[topy:topy + height, topx:topx + width].copy()
            list_img.append(crop)

    return list_img

### Extraction des données textuelles des images

In [16]:
def image_to_csv_tesseract(dossier_traitement, label, headerLine = True):
    """OCR every image of a folder and return the result as ';'-separated CSV text.

    Each file yields one row ``text;length;label``. In the OCR text, runs of
    ';' are replaced by a space (it is the column separator) and runs of
    whitespace are collapsed to a single space, so each row stays on one line.
    Files are processed in sorted name order so the output is reproducible.
    Progress is printed every 10 files.

    Args:
        dossier_traitement: Folder containing the images to read.
        label: Value written in the ``label`` column of every row.
        headerLine: When true, the output starts with the column header line.

    Returns:
        The CSV content as a single string.
    """
    rows = ["Text;nbcaracteres;label\n"] if headerLine else []
    docs = sorted(os.listdir(dossier_traitement))

    total = len(docs)
    for done, doc in enumerate(docs):
        # Close the image file as soon as the OCR is done instead of leaving
        # one open handle per processed file.
        with Image.open(os.path.join(dossier_traitement, doc)) as page:
            text = pytesseract.image_to_string(page)
        text = re.sub(r";+", " ", text)
        text = re.sub(r"\s+", " ", text)
        rows.append(f"{text};{len(text)};{label}\n")
        if done % 10 == 0:
            print(f"{done}/{total}", end="=========")

    return "".join(rows)
        

In [46]:
def image_to_csv_opencv(dossier_traitement, label, headerLine = True):
    """OCR the regions extracted from every image of a folder, as CSV text.

    Each image is first split into crops by ``decomposition_img``; the OCR
    text of every crop is concatenated (each followed by a space) to form the
    text of the image. Each file yields one row ``text;length;label``. Runs
    of ';' are replaced by a space (it is the column separator) and runs of
    whitespace are collapsed to a single space. Files are processed in sorted
    name order so the output is reproducible. Progress is printed every
    10 files.

    Args:
        dossier_traitement: Folder containing the images to read.
        label: Value written in the ``label`` column of every row.
        headerLine: When true, the output starts with the column header line.

    Returns:
        The CSV content as a single string.
    """
    rows = ["Text;nbcaracteres;label\n"] if headerLine else []
    docs = sorted(os.listdir(dossier_traitement))

    total = len(docs)
    for done, doc in enumerate(docs):
        crops = decomposition_img(os.path.join(dossier_traitement, doc))
        text = "".join(pytesseract.image_to_string(crop) + " " for crop in crops)
        text = re.sub(r";+", " ", text)
        text = re.sub(r"\s+", " ", text)
        rows.append(f"{text};{len(text)};{label}\n")
        if done % 10 == 0:
            print(f"{done}/{total}", end="=========")

    return "".join(rows)
        

## Pipeline de traitement

In [20]:
def extraction_tesseract(pdf_files_dir = "../data/pdf", img_files_dir = "../data/img", csv_file_path = "../data/dataset_tesseract.csv", headerLine = True):
    """Build the whole-page Tesseract dataset: PDFs -> images -> OCR -> CSV file.

    PDFs are read from ``<pdf_files_dir>/pdf_with_zoi`` and
    ``<pdf_files_dir>/pdf_no_zoi`` and rendered into
    ``<img_files_dir>/img_with_zoi`` and ``<img_files_dir>/img_no_zoi``.
    Rendering is skipped for an image folder that already contains files, so
    a previous conversion is reused. Images from the "with zoi" folder are
    labelled 1, the others 0.

    Args:
        pdf_files_dir: Folder holding the two PDF sub-folders.
        img_files_dir: Folder holding (or receiving) the two image sub-folders.
        csv_file_path: Path of the CSV file to write (overwritten if present).
        headerLine: When true, the CSV starts with the column header line.
    """
    zoi_path = os.path.join(img_files_dir, "img_with_zoi")
    no_zoi_path = os.path.join(img_files_dir, "img_no_zoi")

    # PDF -> image conversion. Each image folder is checked on its own:
    # testing only the parent folder would skip the conversion whenever the
    # parent exists, even if a sub-folder is missing or empty.
    conversions = (
        (zoi_path, "pdf_with_zoi", "zoi"),
        (no_zoi_path, "pdf_no_zoi", "no_zoi"),
    )
    for img_path, pdf_subdir, prefix in conversions:
        if not os.path.isdir(img_path) or not os.listdir(img_path):
            os.makedirs(img_path, exist_ok=True)
            creation_images(os.path.join(pdf_files_dir, pdf_subdir), img_path, prefix)

    # Only the first part may carry the header; the second is appended to it.
    csv_data_tesseract = image_to_csv_tesseract(zoi_path, 1, headerLine)
    csv_data_tesseract += image_to_csv_tesseract(no_zoi_path, 0, False)

    # Explicit encoding: OCR output may contain non-ASCII characters.
    with open(csv_file_path, "w", encoding="utf-8") as f:
        f.write(csv_data_tesseract)

In [28]:
# Run the whole-page OCR pipeline on the unpacked ./pdf folder; images are
# written under ./img and the dataset to ./dataset_tesseract.csv.
extraction_tesseract("pdf", "img", "dataset_tesseract.csv")

In [32]:
def extraction_opencv(pdf_files_dir = "../data/pdf", img_files_dir = "../data/img", csv_file_path = "../data/dataset_opencv.csv"):
    """Build the region-based OCR dataset: PDFs -> images -> crops -> OCR -> CSV file.

    PDFs are read from ``<pdf_files_dir>/pdf_with_zoi`` and
    ``<pdf_files_dir>/pdf_no_zoi`` and rendered into
    ``<img_files_dir>/img_with_zoi`` and ``<img_files_dir>/img_no_zoi``.
    Rendering is skipped for an image folder that already contains files, so
    a previous conversion is reused. Images from the "with zoi" folder are
    labelled 1, the others 0. The CSV always starts with the header line.

    Args:
        pdf_files_dir: Folder holding the two PDF sub-folders.
        img_files_dir: Folder holding (or receiving) the two image sub-folders.
        csv_file_path: Path of the CSV file to write (overwritten if present).
    """
    zoi_path = os.path.join(img_files_dir, "img_with_zoi")
    no_zoi_path = os.path.join(img_files_dir, "img_no_zoi")

    # PDF -> image conversion. Each image folder is checked on its own:
    # testing only the parent folder would skip the conversion whenever the
    # parent exists, even if a sub-folder is missing or empty.
    conversions = (
        (zoi_path, "pdf_with_zoi", "zoi"),
        (no_zoi_path, "pdf_no_zoi", "no_zoi"),
    )
    for img_path, pdf_subdir, prefix in conversions:
        if not os.path.isdir(img_path) or not os.listdir(img_path):
            os.makedirs(img_path, exist_ok=True)
            creation_images(os.path.join(pdf_files_dir, pdf_subdir), img_path, prefix)

    csv_data_opencv = image_to_csv_opencv(zoi_path, 1)
    csv_data_opencv += image_to_csv_opencv(no_zoi_path, 0, False)

    # Explicit encoding: OCR output may contain non-ASCII characters.
    with open(csv_file_path, "w", encoding="utf-8") as f:
        f.write(csv_data_opencv)

In [47]:
# Run the region-based (OpenCV crops + OCR) pipeline on the same folders;
# the dataset is written to ./dataset_opencv.csv.
extraction_opencv("pdf", "img", "dataset_opencv.csv")

