## CTPN

In [None]:
from __future__ import print_function

import cv2
import glob
import os
import shutil
import sys

import numpy as np
import tensorflow as tf

sys.path.append('/Users/sibylhe/text-detection-ctpn') # path to ctpn
from lib.networks.factory import get_network
from lib.fast_rcnn.config import cfg, cfg_from_file
from lib.fast_rcnn.test import test_ctpn
from lib.utils.timer import Timer
from lib.text_connector.detectors import TextDetector
from lib.text_connector.text_connect_cfg import Config as TextLineCfg

In [2]:
# Folder holding the images to process. Keep the trailing slash: the pipeline
# cell builds every image path by plain string concatenation with this value.
PATH_TO_TEST_IMAGES_DIR = '/Users/sibylhe/Documents/DR/image_extraction/image/test181019/'

# Every entry of that folder except the '.DS_Store' file.
image_names = [entry for entry in os.listdir(PATH_TO_TEST_IMAGES_DIR)
               if entry != '.DS_Store']

In [3]:
# Display the file names the pipeline loop will iterate over.
image_names

['11280+Olympic+Blvd.%2C+Los+Angeles%2C+CA.jpg']

In [4]:
def resize_im(im, scale, max_scale=None):
    """Resize an image so that its shorter side becomes `scale` pixels.

    If `max_scale` is given and the longer side would exceed it after that
    resize, the factor is lowered so the longer side equals `max_scale`
    instead. Both axes always use the same factor, so the aspect ratio is kept.

    Args:
        im: image array; only `im.shape[0]` and `im.shape[1]` are read here
            before the array is handed to `cv2.resize`.
        scale: target length of the shorter side.
        max_scale: optional upper bound for the longer side (None = no bound).

    Returns:
        Tuple `(resized_image, f)` where `f` is the factor applied to both axes.
    """
    side_a, side_b = im.shape[0], im.shape[1]
    short_side = min(side_a, side_b)
    long_side = max(side_a, side_b)

    f = float(scale) / short_side
    # Identity test instead of `!= None`: it cannot be hijacked by an
    # overloaded comparison operator and is the idiomatic None check.
    if max_scale is not None and f * long_side > max_scale:
        f = float(max_scale) / long_side
    return cv2.resize(im, None, None, fx=f, fy=f, interpolation=cv2.INTER_LINEAR), f

def format_text_boxes(boxes, img_w, img_h):
    """Turn 4-corner text boxes into normalised axis-aligned boxes.

    Args:
        boxes: rows laid out as [x, y, x, y, x, y, x, y, score]: x values at
            the even positions 0..6, y values at the odd positions 1..7 and
            the score at position 8.
        img_w: divisor applied to every x value (the image width).
        img_h: divisor applied to every y value (the image height).

    Returns:
        Array of shape (N, 5) with rows [min_y, min_x, max_y, max_x, score].
        An empty input gives an array of shape (0, 5), so the result can
        always be stacked with other 5-column box arrays.
    """
    boxes = np.asarray(boxes)
    if boxes.size == 0:
        # np.array([]) would be 1-D with shape (0,), which cannot be
        # concatenated with (N, 5) arrays later on.
        return np.empty((0, 5))

    boxes = np.atleast_2d(boxes)
    # float() keeps this a true division even under Python 2 with int input.
    xs = boxes[:, 0:8:2] / float(img_w)
    ys = boxes[:, 1:8:2] / float(img_h)
    return np.column_stack((ys.min(axis=1), xs.min(axis=1),
                            ys.max(axis=1), xs.max(axis=1),
                            boxes[:, 8]))

def filter_box_size(boxes, threshold=0.004):
    """Drop boxes whose area is smaller than `threshold`.

    Args:
        boxes: array with rows [min_y, min_x, max_y, max_x, ...]; only the
            first four columns are used to compute the area.
        threshold: minimum area (max_x - min_x) * (max_y - min_y) a box needs
            in order to be kept. Original author's note: 0.0035 is the
            threshold to filter out google watermarks; threshold to be defined.

    Returns:
        Array holding the qualifying rows, in their original order. The result
        is always 2-D: when nothing qualifies, or the input is empty, an array
        with zero rows is returned (5 columns if the input carried no column
        count), so it can still be concatenated with other box arrays.
    """
    boxes = np.asarray(boxes)
    if boxes.size == 0:
        n_cols = boxes.shape[1] if boxes.ndim == 2 else 5
        return np.empty((0, n_cols), dtype=boxes.dtype)

    boxes = np.atleast_2d(boxes)
    areas = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    # Boolean-mask indexing keeps the column count even when no row survives.
    return boxes[areas >= threshold]


def ctpn(sess, net, image_name):
    """Detect text lines in one image with CTPN.

    The image is read from disk, resized with `resize_im`, run through
    `test_ctpn`, and the proposals are merged by `TextDetector.detect`. The
    merged boxes are then normalised and small ones are discarded.

    Args:
        sess: TensorFlow session, passed through to `test_ctpn`.
        net: network object, passed through to `test_ctpn`.
        image_name: path of the image file to read.

    Returns:
        Array with rows [min_y, min_x, max_y, max_x, score]. y values are
        fractions of the image height and x values fractions of the image
        width, so a caller converting back to pixels must multiply x by the
        width and y by the height. Boxes with an area below 0.004 are dropped.

    Raises:
        IOError: if the image file cannot be read.
    """
    timer = Timer()
    timer.tic()

    img = cv2.imread(image_name)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError('Could not read image: {}'.format(image_name))
    img, _ = resize_im(img, scale=TextLineCfg.SCALE, max_scale=TextLineCfg.MAX_SCALE)
    # Image arrays are indexed (row, column): shape[0] is the height and
    # shape[1] the width. These were previously assigned the other way round.
    img_h, img_w = img.shape[:2]
    scores, boxes = test_ctpn(sess, net, img)

    textdetector = TextDetector()
    boxes = textdetector.detect(boxes, scores[:, np.newaxis], img.shape[:2])

    # text_boxes rows: [min_y, min_x, max_y, max_x, score]
    text_boxes = format_text_boxes(boxes, img_w, img_h)
    text_boxes = filter_box_size(text_boxes, threshold=0.004)

    timer.toc()
    print(('Detection took {:.3f}s for '
           '{:d} object proposals').format(timer.total_time, text_boxes.shape[0]))
    return text_boxes

In [5]:
# Load the CTPN configuration; cfg.TEST.checkpoints_path used below comes from it.
cfg_from_file('/Users/sibylhe/text-detection-ctpn/ctpn/text.yml') #path to ctpn/text.yml

# init session
#tf.get_variable_scope().reuse_variables() #comment out if 1st time run
config = tf.ConfigProto(allow_soft_placement=True)
sess = tf.Session(config=config)
# load network
net = get_network("VGGnet_test")
# load model
print(('Loading network {:s}... '.format("VGGnet_test")), end=' ')
saver = tf.train.Saver()

# The checkpoint directory is configured in
#   /Users/sibylhe/text-detection-ctpn/ctpn/text.yml
#   line 37: checkpoints_path: /Users/sibylhe/text-detection-ctpn/checkpoints/
ckpt = tf.train.get_checkpoint_state(cfg.TEST.checkpoints_path)
if ckpt is None or not ckpt.model_checkpoint_path:
    # Raise a real exception with a usable message. The previous
    # `raise '<string>'` inside a bare `except:` is itself a TypeError on
    # Python 3 and hid the actual cause of the failure.
    raise IOError('Check your pretrained model: no checkpoint found in {}'.format(
        cfg.TEST.checkpoints_path))
print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ')
# Errors raised while restoring are left to propagate with their own traceback.
saver.restore(sess, ckpt.model_checkpoint_path)
print('done')

# Run the network twice on a uniform grey 300x300 image and discard the
# results (presumably a warm-up so the first real image is not slower).
im = 128 * np.ones((300, 300, 3), dtype=np.uint8)
for i in range(2):
    _, _ = test_ctpn(sess, net, im)

Tensor("Placeholder:0", shape=(?, ?, ?, 3), dtype=float32)
Tensor("conv5_3/conv5_3:0", shape=(?, ?, ?, 512), dtype=float32)
Tensor("rpn_conv/3x3/rpn_conv/3x3:0", shape=(?, ?, ?, 512), dtype=float32)
Tensor("lstm_o/Reshape_2:0", shape=(?, ?, ?, 512), dtype=float32)
Tensor("lstm_o/Reshape_2:0", shape=(?, ?, ?, 512), dtype=float32)
Tensor("rpn_cls_score/Reshape_1:0", shape=(?, ?, ?, 20), dtype=float32)
Tensor("rpn_cls_prob:0", shape=(?, ?, ?, ?), dtype=float32)
Tensor("Reshape_2:0", shape=(?, ?, ?, 20), dtype=float32)
Tensor("rpn_bbox_pred/Reshape_1:0", shape=(?, ?, ?, 40), dtype=float32)
Tensor("Placeholder_1:0", shape=(?, 3), dtype=float32)
Loading network VGGnet_test...  Restoring from /Users/sibylhe/text-detection-ctpn/checkpoints/VGGnet_fast_rcnn_iter_50000.ckpt... INFO:tensorflow:Restoring parameters from /Users/sibylhe/text-detection-ctpn/checkpoints/VGGnet_fast_rcnn_iter_50000.ckpt
done


## mask_rcnn_inception_v2_coco

Make sure to follow the [installation instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md) before you start.

Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_FROZEN_GRAPH` to point to a new .pb file.  

See the [detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) for a list of other models that can be run with varying speeds and accuracies.

In [None]:
from distutils.version import StrictVersion
import numpy as np
import six.moves.urllib as urllib
import tarfile
import zipfile

from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
import time
%matplotlib inline

sys.path.append("/anaconda3/lib/python3.6/site-packages/tensorflow/models/research") #parent folder of object-detection
from object_detection.utils import ops as utils_ops

if StrictVersion(tf.__version__) < StrictVersion('1.9.0'):
    raise ImportError('Please upgrade your TensorFlow installation to v1.9.* or later!')

# imports from the object detection module
from utils import label_map_util

from utils import visualization_utils as vis_util

In [7]:
def load_image_into_numpy_array(image):
    """Convert a PIL image into a (height, width, 3) uint8 array.

    The image is converted to RGB first, so grayscale, palette and RGBA
    images also produce three channels. Reshaping the raw pixel data to
    three channels, as done previously, fails for those modes.

    Args:
        image: PIL image (anything offering `convert('RGB')` whose result
            NumPy can turn into an array).

    Returns:
        uint8 array indexed [row, column, channel].
    """
    # Let NumPy read the pixel buffer directly instead of going through
    # image.getdata(), which builds a Python object per pixel.
    return np.array(image.convert('RGB'), dtype=np.uint8)

def run_inference_for_single_image(image, graph):
    """Run the detection graph on a single image and return its outputs.

    A new `tf.Session` is opened on `graph` for every call and closed again
    when the call returns.

    Args:
        image: image array; `image.shape[0]` and `image.shape[1]` are passed
            to the mask reframing helper, and the array is fed to the graph
            with a leading batch dimension added.
        graph: `tf.Graph` containing an `image_tensor:0` input and the
            detection output tensors looked up by name below.

    Returns:
        Dict with the batch dimension removed from every entry:
        `num_detections` (int), `detection_classes` (uint8 array),
        `detection_boxes`, `detection_scores`, and `detection_masks` when the
        graph provides that tensor.
        NOTE(review): boxes are consumed downstream as normalised
        [min_y, min_x, max_y, max_x] rows -- confirm against the model export.
    """
    with graph.as_default():
        with tf.Session() as sess:
            # Names of every tensor produced by any op in the graph.
            ops = tf.get_default_graph().get_operations()
            all_tensor_names = {output.name for op in ops for output in op.outputs}
            # Collect only those known outputs that this graph actually has.
            tensor_dict = {}
            for key in ['num_detections', 'detection_boxes', 
                        'detection_scores','detection_classes', 'detection_masks']:
                tensor_name = key + ':0'
                if tensor_name in all_tensor_names:
                    tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(tensor_name)
            if 'detection_masks' in tensor_dict:
                # The following processing is only for single image
                detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
                # Keep only the first `num_detections` boxes and masks.
                real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(detection_masks, detection_boxes, image.shape[0], image.shape[1])
                # Binarise the reframed masks at 0.5.
                detection_masks_reframed = tf.cast(tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                # Follow the convention by adding back the batch dimension
                tensor_dict['detection_masks'] = tf.expand_dims(detection_masks_reframed, 0)
            image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
            # Run inference on a batch containing just this image
            output_dict = sess.run(tensor_dict,feed_dict={image_tensor: np.expand_dims(image, 0)})
                        
            # all outputs are float32 numpy arrays, so convert types as appropriate
            # Index [0] strips the batch dimension from each output.
            output_dict['num_detections'] = int(output_dict['num_detections'][0])
            output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
            output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
            output_dict['detection_scores'] = output_dict['detection_scores'][0]
            if 'detection_masks' in output_dict:
                output_dict['detection_masks'] = output_dict['detection_masks'][0]
    return output_dict

In [None]:
# 1-st time (need to download model)
# What model to download.
MODEL_NAME = 'mask_rcnn_inception_v2_coco_2018_01_28'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'

# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'

# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')

# Download and unpack only when the frozen graph is not on disk yet, so this
# cell can be re-run (or run as part of "Run All") without fetching again.
if not os.path.exists(PATH_TO_FROZEN_GRAPH):
    # urlretrieve replaces the deprecated URLopener().retrieve().
    urllib.request.urlretrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
    # The with-block closes the archive even if extraction fails.
    with tarfile.open(MODEL_FILE) as tar_file:
        for member in tar_file.getmembers():
            # Only the frozen graph is needed from the archive.
            if 'frozen_inference_graph.pb' in os.path.basename(member.name):
                tar_file.extract(member, os.getcwd())

In [8]:
# 2nd time and after (model has been downloaded)
# Only names and paths are defined here; nothing is fetched.
MODEL_NAME = 'mask_rcnn_inception_v2_coco_2018_01_28'
# Frozen inference graph inside the model folder.
PATH_TO_FROZEN_GRAPH = '{}/frozen_inference_graph.pb'.format(MODEL_NAME)
# Label map file used to build the category index.
PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')

In [9]:
# Read the frozen TensorFlow graph from disk and import it into its own Graph.
detection_graph = tf.Graph()
with detection_graph.as_default():
    od_graph_def = tf.GraphDef()
    with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
        serialized_graph = fid.read()
    od_graph_def.ParseFromString(serialized_graph)
    tf.import_graph_def(od_graph_def, name='')

# Build the category index from the label map.
NUM_CLASSES = 90
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(
    label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)

# Classes treated as positive, and the ids of the categories carrying those
# names (collected in the order the names are listed).
positive_name = ['stop sign','traffic light','clock','bench', 'potted plant', 'fire hydrant', 'parking meter', 'toilet']
positive_id = [category['id']
               for wanted in positive_name
               for category in category_index.values()
               if category['name'] == wanted]
# positive_id = [13, 10, 85, 15, 64, 11, 14, 70]

In [10]:
# Display the category ids matched for the names in positive_name.
positive_id

[13, 10, 85, 15, 64, 11, 14, 70]

In [11]:
def filter_neg_boxes(output_dict):
    """Split detections into positive and negative boxes by class id.

    A detection is positive when its class id is listed in the module-level
    `positive_id`; every other detection is negative. Only the first
    `num_detections` entries of each output are considered.

    Args:
        output_dict: dict with the keys 'num_detections', 'detection_boxes',
            'detection_classes' and 'detection_scores'.

    Returns:
        Tuple `(pos_boxes, neg_boxes)`:
        pos_boxes has shape (P, 5), rows [min_y, min_x, max_y, max_x, score];
        neg_boxes has shape (N, 4), rows [min_y, min_x, max_y, max_x].
        Both stay 2-D when empty, i.e. (0, 5) and (0, 4), so they can be
        concatenated with other box arrays.
    """
    num_detections = output_dict['num_detections']
    detection_boxes = np.asarray(output_dict['detection_boxes'][:num_detections]).reshape(-1, 4)
    detection_classes = np.asarray(output_dict['detection_classes'][:num_detections])
    detection_scores = np.asarray(output_dict['detection_scores'][:num_detections])

    is_positive = np.isin(detection_classes, positive_id)
    # column_stack turns the 1-D score vector into a fifth column and gives
    # shape (0, 5) when there is no positive detection.
    pos_boxes = np.column_stack((detection_boxes[is_positive],
                                 detection_scores[is_positive]))
    neg_boxes = detection_boxes[~is_positive]
    return pos_boxes, neg_boxes

In [12]:
def filter_neg_text_boxes(text_boxes, neg_boxes, threshold=0.5):
    """Remove text boxes that are largely covered by a negative box.

    A text box is eliminated when, for at least one negative box, the
    overlapping area is >= `threshold` times the text box's own area.

    Args:
        text_boxes: rows [min_y, min_x, max_y, max_x, ...].
        neg_boxes: rows [min_y, min_x, max_y, max_x].
        threshold: minimum covered fraction of a text box for it to be removed.

    Returns:
        Array with the surviving text boxes, in their original order.
    """
    text_boxes = np.asarray(text_boxes)
    neg_boxes = np.asarray(neg_boxes)
    if text_boxes.size == 0 or neg_boxes.size == 0:
        return text_boxes

    keep = []
    for t in text_boxes:
        t_size = (t[3] - t[1]) * (t[2] - t[0])
        covered = False
        # A degenerate (zero-area) text box cannot be "covered" by a fraction
        # of itself; skipping it here also avoids a division by zero.
        if t_size > 0:
            for n in neg_boxes:
                # Width and height of the intersection. A non-positive value
                # on either axis means the two boxes do not overlap. This
                # replaces the earlier `notOverlapped is False` test, which
                # was never true for a NumPy boolean and had one inequality
                # reversed, so no box was ever removed.
                o_w = min(t[3], n[3]) - max(t[1], n[1])
                o_h = min(t[2], n[2]) - max(t[0], n[0])
                if o_w > 0 and o_h > 0 and (o_w * o_h) / t_size >= threshold:
                    covered = True
                    break
        keep.append(not covered)

    return text_boxes[np.array(keep, dtype=bool)]

In [13]:
def filter_top10_size(boxes):
    """Return the ten largest boxes, largest first.

    The area |(b[2]-b[0]) * (b[3]-b[1])| of every box is appended as an extra
    last column, and that appended column (index 5) is the sort key, so the
    returned rows have one more column than the input rows.
    """
    boxes = np.asarray(boxes)
    areas = np.abs((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]))
    with_area = np.column_stack((boxes, areas))
    ascending = np.argsort(with_area[:, 5])
    return with_area[ascending][::-1][:10]

In [14]:
def filter_top10_score(boxes):
    """Return the ten boxes with the highest score (column 4), best first."""
    descending = np.argsort(boxes[:, 4])[::-1]
    return boxes[descending[:10]]

In [15]:
# Ratio boxes -> boxes in absolute coordinates.
def format_abs_boxes(boxes, h, w):
    """Scale ratio boxes by `h` and `w` and reorder them x-first.

    Every input row [min_y, min_x, max_y, max_x, score] becomes
    [min_x*w, min_y*h, max_x*w, max_y*h, score].
    """
    scaled_rows = [
        np.array([row[1] * w, row[0] * h, row[3] * w, row[2] * h, row[4]])
        for row in boxes
    ]
    return np.array(scaled_rows)

In [16]:
def draw_boxes(image_path, image_name, boxes, outpath):
    """Draw box outlines on an image and save the result.

    Args:
        image_path: path of the image to read.
        image_name: file name appended to `outpath` for the saved image.
        boxes: rows [min_x, min_y, max_x, max_y, score] in pixels.
        outpath: prefix of the output path (a directory ending with a path
            separator); the image is written to `outpath + image_name`.

    Boxes narrower or shorter than 5 pixels are skipped. The outline colour
    (OpenCV colour tuples are in B, G, R order) depends on the score:
    (0, 255, 0) for >= 0.9, (255, 0, 0) for >= 0.8, (0, 0, 255) otherwise.

    Raises:
        IOError: if the image cannot be read or the result cannot be written.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError('Could not read image: {}'.format(image_path))

    for box in boxes:
        # Skip tiny boxes. Width and height are what must be measured; the
        # earlier test compared unrelated coordinates (min_x with min_y and
        # max_y with min_x).
        if box[2] - box[0] < 5 or box[3] - box[1] < 5:
            continue

        if box[4] >= 0.9:
            color = (0, 255, 0)
        elif box[4] >= 0.8:
            color = (255, 0, 0)
        else:
            # Previously no colour was assigned here, so a low-score box
            # either raised NameError or reused the previous box's colour.
            color = (0, 0, 255)

        x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        # Connect each corner to the next one, closing the rectangle.
        for start, end in zip(corners, corners[1:] + corners[:1]):
            cv2.line(img, start, end, color, 2)

    # cv2.imwrite returns False instead of raising when it cannot write.
    if not cv2.imwrite(outpath+image_name, img):
        raise IOError('Could not write image: {}'.format(outpath+image_name))

## Model Pipeline

In [20]:
# Folder the annotated images are written to (trailing slash required:
# draw_boxes appends the file name by string concatenation).
OUTPUT_DIR = '/Users/sibylhe/Documents/DR/image_extraction/image/result181019/'

start = time.time()

for image_name in image_names:
    image_path = PATH_TO_TEST_IMAGES_DIR+image_name
    
    # CTPN: text boxes as [min_y, min_x, max_y, max_x, score] ratios
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(('Demo for {:s}'.format(image_path)))
    text_boxes = ctpn(sess, net, image_path)
    
    # Mask RCNN
    # The with-block releases the image file handle once the pixels are copied.
    with Image.open(image_path) as image:
        image_np = load_image_into_numpy_array(image)
    # image_np is indexed (row, column, channel): shape[0] is the original
    # height and shape[1] the original width. These were previously swapped,
    # which mis-scaled the Mask R-CNN boxes on non-square images.
    h, w = image_np.shape[:2]
    output_dict = run_inference_for_single_image(image_np, detection_graph)
    
    # Eliminate negative boxes and too small positive boxes
    pos_boxes, neg_boxes = filter_neg_boxes(output_dict)
    pos_boxes = filter_box_size(pos_boxes, threshold=0.002)
    
    # Eliminate text boxes overlapping with negative boxes
    text_boxes = filter_neg_text_boxes(text_boxes, neg_boxes, threshold=0.5)
    
    # When nothing was detected a helper may return a flat zero-length array;
    # reshaping both inputs to (N, 5) lets them be stacked in every case.
    output_boxes = np.concatenate((np.reshape(text_boxes, (-1, 5)),
                                   np.reshape(pos_boxes, (-1, 5))), axis=0)
    # Filter top10 boxes by size/score
    if len(output_boxes) > 10:
        output_boxes = filter_top10_size(output_boxes)   # by size
        #output_boxes = filter_top10_score(output_boxes)  # by score
    
    # Convert ratio boxes to absolute pixel coordinates. format_abs_boxes
    # multiplies x ratios by w and y ratios by h, so every box passed in must
    # be normalised as x/width and y/height.
    abs_boxes = format_abs_boxes(output_boxes, h, w)
    
    # Draw the boxes on the image and save it to OUTPUT_DIR
    draw_boxes(image_path, image_name, abs_boxes, outpath=OUTPUT_DIR)
    
end = time.time()
print('Elaspe: ',end-start,'s')
# 19+ sec per image on 8GB RAM CPU 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Demo for /Users/sibylhe/Documents/DR/image_extraction/image/test181019/11280+Olympic+Blvd.%2C+Los+Angeles%2C+CA.jpg
Detection took 5.404s for 3 object proposals
Elaspe:  19.419419765472412 s
