# Setting up the environment

**Important steps:**

1.   Installing required packages
2.   Importing required libraries
3. Setting up the working directory
3. Importing the images and ground truth data
4. Cloning CRAFT github repo to use the model


In [0]:
##Installing required packages
!sudo apt install tesseract-ocr
! apt install tesseract-ocr
! apt install libtesseract-dev
! pip install Pillow
! pip install pytesseract
!pip install jiwer

In [0]:
# importing required libraries
import urllib.request
import zipfile
import numpy as np
import cv2
from imutils.object_detection import non_max_suppression
import pytesseract
from matplotlib import pyplot as plt
import pandas as pd
import sys
import time
import matplotlib
import matplotlib.pylab as plt
import os
from os.path import exists, join, basename, splitext
from jiwer import wer
import re
from statistics import mean

In [0]:
#Setting up working directory
! mkdir -p /content/Text_detection_and_extraction

#Using the url to download data
url = 'https://s3.amazonaws.com/tech-interview/text_detection.zip'
urllib.request.urlretrieve(url, '/content/Text_detection_and_extraction/text_detection.zip')

#Extracting the data from zipped folder
with zipfile.ZipFile("/content/Text_detection_and_extraction/text_detection.zip", 'r') as zip_ref:
  zip_ref.extractall("/content/Text_detection_and_extraction")

In [0]:
## Clonining git repo to download CRAFT model

git_repo_url = 'https://github.com/clovaai/CRAFT-pytorch.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  # clone and install
  !git clone -q {git_repo_url}
  #!cd {project_name} && pip install -q -r requirements.txt

sys.path.append(project_name)
plt.rcParams["axes.grid"] = False

In [0]:
def download_from_google_drive(file_id, file_name):
  # download a file from the Google Drive link
  !rm -f ./cookie
  !curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id={file_id}" > /dev/null
  confirm_text = !awk '/download/ {print $NF}' ./cookie
  confirm_text = confirm_text[0]
  !curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm={confirm_text}&id={file_id}" -o {file_name}
  
pretrained_model = 'craft_mlt_25k.pth'
if not exists(pretrained_model):
  # download the pretrained model
  !wget -q -O {pretrained_model} 'https://drive.google.com/uc?authuser=0&id=1Jk4eGD7crsqCCg9C9VjCLkMN3ze8kutZ&export=download'

# Using CRAFT model to predict bounding boxes for the images

**Note**: Change the np.array(img) to np.ascontiguousarray(img) at line 43 in the file_utils.py file in the CRAFT-pytorch folder. (This is required for the model to run)

**Output:**

1. Images with bounding boxes (.jpg files)
2. Bounding box coordinates for each image (.txt files)

In [0]:
## Running the CRAFT model on the images 
!cd {project_name} && python test.py --trained_model=../{pretrained_model} --test_folder=/content/Text_detection_and_extraction/text_detection/images

# Using tesseract model to extract text in the images

**Process to extract data from one image:**

1. Iterate over the bounding boxes for the image
2. Crop the image region based on the bbox 
3. Input the image into the tesseract model to extract text in that region

In [0]:
# Get list of images/files for prediction
list_names = os.listdir('/content/Text_detection_and_extraction/text_detection/ground_truth/')

In [0]:
column_names = ["x1","y1","x2","y2","x3","y3","x4","y4"]
bbox_text_prediction = pd.DataFrame()

for j,name in enumerate(list_names):

  name_split = name.split("_")

  if len(name_split) == 3:
    name = name.split("_")
    name_bbox = name[1] + "_" + name[2]
    name_img = name_bbox.split(".")
    name_img = name_img[0]+".jpg"
  else:
    name = name.split("_")
    name_bbox = name[1]
    name_img = name_bbox.split(".")
    name_img = name_img[0]+".jpg"

  results_path = "/content/CRAFT-pytorch/result"
  coordinates = pd.read_csv(join(results_path,"res_"+name_bbox),names = column_names)
  image = cv2.imread(join('/content/Text_detection_and_extraction/text_detection/images',name_img))

  ## Converting the output coordinates from CRAFT model to StartX,StartY,EndX,EndY

  coordinates['StartX'] = coordinates[['x1','x2','x3','x4']].min(axis = 1)
  coordinates['StartY'] = coordinates[['y1','y2','y3','y4']].min(axis = 1)
  coordinates['EndX'] = coordinates[['x1','x2','x3','x4']].max(axis = 1)
  coordinates['EndY'] = coordinates[['y1','y2','y3','y4']].max(axis = 1)

  boxes = (coordinates[['StartX','StartY','EndX','EndY']])
  boxes[boxes < 0] = 0
  boxes_list = boxes.values.tolist()
  boxes['name'] = name_bbox
  boxes["text"] = ' '

## Loop to iterate over the bounding boxes -> Crop the image -> Extract the text in the cropped image

  for i, (startX,startY,endX,endY) in enumerate(boxes_list):
    ## Cropping the image to input to the tesseract model for prediction

    r = image[startY:endY, startX:endX]

    ## Using the necessary configuration for tesseract model

    '''
    OCR engine mode - Neural nets LSTM engine
    Page segmentation mode - Treat the image as a single word

    '''
    configuration = ("--oem 1 --psm 8")
    
    ## Using tesseract to predict the text in the cropped image

    text = pytesseract.image_to_string(r, config = configuration)
    boxes.loc[i,'text'] = text

  ### Appedning all the predictions to the dataframe, which can be exported
  bbox_text_prediction = bbox_text_prediction.append(boxes,ignore_index = True)

# Evaluation of the models

**Bounding box prediction by CRAFT model:**

*   Intersection over union is used as the evaluation metric for bbox predictions
*   Based on the threshold for IOU Average precision and recall are calculated as the final metrics

**Text extraction by Tesseract:**

*   Word error rate (WER) is calculated for the True positive boxes* 

*Note*: Character error rate is not calculated at the moment



In [0]:
### Funtion to calculate Intersection over union of Predicted bbox and ground truth bbox

def bb_intersection_over_union(boxA, boxB):
	# determine the (x, y)-coordinates of the intersection rectangle
	xA = max(boxA[0], boxB[0])
	yA = max(boxA[1], boxB[1])
	xB = min(boxA[2], boxB[2])
	yB = min(boxA[3], boxB[3])
 
	# compute the area of intersection rectangle
	interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
 
	# compute the area of both the prediction and ground-truth
	# rectangles
	boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
	boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
 
	# compute the intersection over union by taking the intersection
	# area and dividing it by the sum of prediction + ground-truth
	# areas - the interesection area
	iou = interArea / float(boxAArea + boxBArea - interArea)
 
	# return the intersection over union value
	return iou

In [68]:
column_names = ["StartX","StartY","EndX","EndY","text"]
iou_threshold = 0.6

precision_list = list()
recall_list = list()
wer_list = list()

## Iterating over all the files in the ground truth directory

for names in list_names:
  
  name_split = names.split("_")
  if len(name_split) == 3:
    name = names.split("_")
    name_bbox = name[1] + "_" + name[2]
  else:
    name = names.split("_")
    name_bbox = name[1]
  
  actual_bbox = pd.read_csv(join('/content/Text_detection_and_extraction/text_detection/ground_truth',"gt_"+ name_bbox), sep=' ', names = column_names,index_col=None)
  predicted_bbox_table = bbox_text_prediction[bbox_text_prediction["name"] == name_bbox]
  predicted_bbox_table = predicted_bbox_table[column_names]

  if(len(predicted_bbox_table) == 0):
    precision = 0.0
    recall = 0.0
    precision_list.extend([precision])
    recall_list.extend([recall])

  else:
    actual_bbox_table_points = actual_bbox[["StartX","StartY","EndX","EndY"]]
    actual_bbox_table_points[["StartX","StartY","EndX","EndY"]] = actual_bbox_table_points[["StartX","StartY","EndX","EndY"]].replace(",","",regex = True)
    actual_bbox_table_points[["StartX","StartY","EndX","EndY"]] = actual_bbox_table_points[["StartX","StartY","EndX","EndY"]].astype(float)

    actual_bbox_list = (actual_bbox_table_points).values.tolist()
    predicted_bbox_list = predicted_bbox_table[["StartX","StartY","EndX","EndY"]].values.tolist()

    no_of_actual_boxes = len(actual_bbox_list)
    no_of_pred_boxes = len(predicted_bbox_list)

    iou_matrix = np.empty([no_of_pred_boxes,no_of_actual_boxes])

    for i,pred_box in enumerate(predicted_bbox_list):
      for j,actual_box in enumerate(actual_bbox_list):
        iou_matrix[i,j] = bb_intersection_over_union(pred_box,actual_box)
    
    ## Extracting bounding boxes mased on IOU threshold
    result = np.where(iou_matrix > iou_threshold)
    pred_box_indices,actual_box_indices = result

    ## Calculating the word error rate for the true positives

    if(len(pred_box_indices)>0):
      for j in range(0,len(pred_box_indices)):
        pred_index = pred_box_indices[j]
        actual_index = actual_box_indices[j]
        predicted_bbox_text = re.sub('[^A-Za-z0-9]+', '', predicted_text[pred_index])
        actual_bbox_text = re.sub('[^A-Za-z0-9]+', '', actual_text[actual_index])
        error = wer(predicted_bbox_text, actual_bbox_text)
        wer_list.extend([error])

    ## Calculating true positives, false positives, false negatives, precision and recall
    tp = len(np.unique(pred_box_indices))
    fp = no_of_pred_boxes - tp
    fn = no_of_actual_boxes - len(np.unique(actual_box_indices))
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    precision_list.extend([precision])
    recall_list.extend([recall])

print ("Average precision for bbox detection:", mean(precision_list))
print ("Average recall for bbox detection:", mean(recall_list))
print ("Word error rate:", (sum(wer_list)/len(wer_list)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Average precision for bbox detection: 0.884392395827702
Average recall for bbox detection: 0.9094777282601958
Word error rate: 0.6418685121107266
