In [2]:
# USAGE
# python text_detection_video.py --east frozen_east_text_detection.pb

# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
from imutils.object_detection import non_max_suppression
import numpy as np
import argparse
import imutils
import time
import cv2
import pytesseract
import math

In [3]:

def decode_predictions(scores, geometry):
    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < args["min_confidence"]:
                continue

            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)

In [4]:
# construct the argument parser and parse the arguments
# ap = argparse.ArgumentParser()
# ap.add_argument("-east", "--east", type=str, required=True,
# 	help="path to input EAST text detector")
# ap.add_argument("-v", "--video", type=str,
# 	help="path to optinal input video file")
# ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
# 	help="minimum probability required to inspect a region")
# ap.add_argument("-w", "--width", type=int, default=320,
# 	help="resized image width (should be multiple of 32)")
# ap.add_argument("-e", "--height", type=int, default=320,
# 	help="resized image height (should be multiple of 32)")
# args = vars(ap.parse_args())

min_confidence = 0.2
width = 640
height = 640
args = { 'east' : './frozen_east_text_detection.pb', 
        'video' : './videos/faster.mp4',
        'min_confidence' : min_confidence,
        'width' : width, 
        'height' : height}

In [5]:
# initialize the original frame dimensions, new frame dimensions,
# and ratio between the dimensions
(W, H) = (None, None)
(newW, newH) = (args["width"], args["height"])
(rW, rH) = (None, None)

# define the two output layer names for the EAST detector model that
# we are interested -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = [
	"feature_fusion/Conv_7/Sigmoid",
	"feature_fusion/concat_3"]

In [6]:
# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet(args["east"])

[INFO] loading EAST text detector...


In [7]:
# if a video path was not supplied, grab the reference to the web cam
if not args.get("video", False):
	print("[INFO] starting video stream...")
	vs = VideoStream(src=0).start()
	time.sleep(1.0)

# otherwise, grab a reference to the video file
else:
	vs = cv2.VideoCapture(args["video"])

In [10]:
# start the FPS throughput estimator
fps = FPS().start()

# loop over frames from the video stream
while True:
    # grab the current frame, then handle if we are using a
    # VideoStream or VideoCapture object
    frame = vs.read()
    frame = frame[1] if args.get("video", False) else frame

    # check to see if we have reached the end of the stream
    if frame is None:
        break

    # resize the frame, maintaining the aspect ratio
    frame = imutils.resize(frame, width=1000)
    orig = frame.copy()

    # if our frame dimensions are None, we still need to compute the
    # ratio of old frame dimensions to new frame dimensions
    if W is None or H is None:
        (H, W) = frame.shape[:2]
        rW = W / float(newW)
        rH = H / float(newH)

    # resize the frame, this time ignoring aspect ratio
    frame = cv2.resize(frame, (newW, newH))

    # construct a blob from the frame and then perform a forward pass
    # of the model to obtain the two output layer sets
    blob = cv2.dnn.blobFromImage(frame, 1.0, (newW, newH),
        (123.68, 116.78, 103.94), swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    # decode the predictions, then  apply non-maxima suppression to
    # suppress weak, overlapping bounding boxes
    (rects, confidences) = decode_predictions(scores, geometry)
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    # loop over the bounding boxes
    #---Tesseract 
    tesseractOutputImage = np.zeros_like(orig)
    origTesseract = orig.copy()
    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates based on the respective
        # ratios
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        #----tessaract
        extraPixelsX = 0.99
        extraPixelsY = 0.90
        croppedImage = origTesseract[int(startY*extraPixelsY):int(endY/extraPixelsY), int(startX*extraPixelsX):int(endX/extraPixelsX)]
        config = ('-l eng --oem 3 --psm 6')
        pytesseract.pytesseract.tesseract_cmd = r'C:\Users\Roy\AppData\Local\Tesseract-OCR\tesseract.exe'
        text = pytesseract.image_to_string(croppedImage, config=config).encode('utf-8')
        print(startX, startY, endX, endY)
        textScale = (2.0*(endX-startX)/200.0)/(len(text)+0.002)*5
        textThickness = max(4*(endX-startX)/200/(len(text)+1)*4, 0)
        print("Text Scale: ", textScale)
        print("Text Thickness: ", textThickness)
        cv2.putText(tesseractOutputImage, str(text), (startX,endY), cv2.FONT_HERSHEY_SIMPLEX, textScale, (0, 255, 0), int(textThickness), lineType = cv2.LINE_AA)
    #         plt.imshow(croppedImage)
    #         plt.show()
        print("Text:", str(text))


    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates based on the respective
        # ratios
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # draw the bounding box on the frame
        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

    # update the FPS counter
    fps.update()

    # show the output frame
    cv2.imshow("Text Detection", orig)
    cv2.imshow("Text recognition", tesseractOutputImage)
    key = cv2.waitKey(1) & 0xFF

    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
        break

# stop the timer and display FPS information
fps.stop()
print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

# if we are using a webcam, release the pointer
if not args.get("video", False):
    vs.stop()

# otherwise, release the file pointer
else:
    vs.release()

# close all windows
cv2.destroyAllWindows()

643 94 723 110
Text Scale:  3.992015968063872
Text Thickness:  3.2
Text: b'\x0c'
725 94 807 111
Text Scale:  0.3726595164515542
Text Thickness:  0.5466666666666666
Text: b'. EDITION\n\x0c'
535 390 657 418
Text Scale:  0.5544446464279221
Text Thickness:  0.8133333333333334
Text: b'ace\nUSIVE\n\x0c'
704 391 810 419
Text Scale:  1.3243378310844578
Text Thickness:  1.6960000000000002
Text: b'To\n\x0c'
465 430 590 450
Text Scale:  0.48069527764959236
Text Thickness:  0.7142857142857143
Text: b'PEAGCLUSIVE\n\x0c'
298 86 384 109
Text Scale:  0.18694026606382055
Text Thickness:  0.2866666666666667
Text: b'eannuenneneent\neT HIF\n\x0c'
292 432 384 450
Text Scale:  0.21902675935625182
Text Thickness:  0.33454545454545453
Text: b'AKSHA\nsai) Ta ot\nes\n\x0c'
223 36 693 71
Text Scale:  0.9790850762436465
Text Thickness:  1.504
Text: b'@ #AkshavSpeaksToArnab\n\x0c'
400 87 589 108
Text Scale:  0.7268112598061837
Text Thickness:  1.0799999999999998
Text: b'NEWS) Aaa 1\n\x0c'
364 391 528 420
Text Scal

Text: b'\x0c'
192 85 234 112
Text Scale:  0.5247376311844077
Text Thickness:  0.6719999999999999
Text: b'it\n\x0c'
515 125 778 187
Text Scale:  13.12375249500998
Text Thickness:  10.52
Text: b'\x0c'
650 390 703 417
Text Scale:  0.37846329620108543
Text Thickness:  0.53
Text: b'a\nTOs\n\x0c'
643 95 723 111
Text Scale:  3.992015968063872
Text Thickness:  3.2
Text: b'\x0c'
725 94 807 111
Text Scale:  0.31533610213813257
Text Thickness:  0.4685714285714285
Text: b'. EDITION |\n\x0c'
535 390 657 418
Text Scale:  0.5544446464279221
Text Thickness:  0.8133333333333334
Text: b'ace\nUSIVE\n\x0c'
704 391 810 419
Text Scale:  1.3243378310844578
Text Thickness:  1.6960000000000002
Text: b'Pi\n\x0c'
465 430 590 450
Text Scale:  0.5207465422429595
Text Thickness:  0.7692307692307693
Text: b'pEACLUSIVE\n\x0c'
292 432 384 450
Text Scale:  0.2090719025543133
Text Thickness:  0.32
Text: b'AKSHA\nsai 0) Todt\nes\n\x0c'
228 36 693 71
Text Scale:  1.0567221161712572
Text Thickness:  1.6173913043478263
Text

231 36 693 71
Text Scale:  0.9624197983501376
Text Thickness:  1.4784
Text: b'@ tAkshaySpeaksToArnab\n\x0c'
396 87 589 109
Text Scale:  0.5675802846723915
Text Thickness:  0.8577777777777778
Text: b'NEWSY INTERVIEW\n\x0c'
237 86 295 110
Text Scale:  2.894211576846307
Text Thickness:  2.32
Text: b'\x0c'
192 84 234 112
Text Scale:  0.29991431019708653
Text Thickness:  0.42
Text: b'THE\na\n\x0c'
542 129 768 188
Text Scale:  2.2590963614554176
Text Thickness:  3.013333333333333
Text: b'_ |\n\x0c'
592 89 632 110
Text Scale:  0.6662225183211192
Text Thickness:  0.8
Text: b'a\n\x0c'
643 95 723 111
Text Scale:  0.5712653527563554
Text Thickness:  0.8
Text: b'SPS |\n\x0c'
295 432 385 449
Text Scale:  0.44991001799640074
Text Thickness:  0.6545454545454545
Text: b'Taro\nee,\n\x0c'
725 94 807 111
Text Scale:  0.31533610213813257
Text Thickness:  0.4685714285714285
Text: b'. EDITION |\n\x0c'
387 430 470 449
Text Scale:  0.8296681327469012
Text Thickness:  1.1066666666666667
Text: b'\xe2\x80\x94\n\

557 131 745 186
Text Scale:  1.0442123972450565
Text Thickness:  1.504
Text: b'_\nre\nTT\n\x0c'
643 94 725 111
Text Scale:  4.091816367265469
Text Thickness:  3.28
Text: b'\x0c'
723 93 806 111
Text Scale:  0.3772041447009634
Text Thickness:  0.5533333333333333
Text: b'L EDITION\n\x0c'
290 432 384 450
Text Scale:  0.2937132858392701
Text Thickness:  0.44235294117647056
Text: b'SUPERS\noad\nez,\n\x0c'
226 36 693 71
Text Scale:  0.9728355970335806
Text Thickness:  1.4944
Text: b'@ #AkshaySpeaksToArnab\n\x0c'
465 430 592 450
Text Scale:  0.5290784869188468
Text Thickness:  0.7815384615384615
Text: b'PEACLUSIVE\n\x0c'
296 86 384 110
Text Scale:  0.338409475465313
Text Thickness:  0.5028571428571429
Text: b'nome\nCANDID\n\x0c'
398 86 587 108
Text Scale:  1.8892443022790884
Text Thickness:  2.52
Text: b'sea\n\x0c'
387 429 470 449
Text Scale:  0.34577570404932506
Text Thickness:  0.5107692307692308
Text: b'STAR A\n\xe2\x80\x94\n\x0c'
273 389 456 419
Text Scale:  0.653478074560777
Text Thicknes

400 86 587 108
Text Scale:  1.1684578855286178
Text Thickness:  1.6622222222222223
Text: b'MAM aa\n\x0c'
225 36 693 71
Text Scale:  0.9749187567702691
Text Thickness:  1.4975999999999998
Text: b'a tAkshaySpeaksToArnab\n\x0c'
467 429 596 448
Text Scale:  0.5862570441737865
Text Thickness:  0.86
Text: b'EXCLUSIVE\n\x0c'
296 86 384 109
Text Scale:  0.628391888031991
Text Thickness:  0.88
Text: b'ae\nyy\n\x0c'
385 429 470 449
Text Scale:  0.42491501699660067
Text Thickness:  0.6181818181818182
Text: b'EGE\n\xe2\x80\x94)\n\x0c'
275 389 457 419
Text Scale:  0.6065857885615251
Text Thickness:  0.91
Text: b'> a\nDEE\nSUPER\n\x0c'
456 389 610 419
Text Scale:  0.34996818471048086
Text Thickness:  0.5356521739130435
Text: b'a4 \xe2\x80\x9cSe\nR EACLUSIVE\n\x0c'
234 86 293 111
Text Scale:  2.944111776447106
Text Thickness:  2.36
Text: b'\x0c'
192 85 234 112
Text Scale:  0.2624343914021494
Text Thickness:  0.3733333333333333
Text: b'THE\nsl\n\x0c'
553 127 729 182
Text Scale:  0.676818950930626
Text

387 429 470 449
Text Scale:  0.34577570404932506
Text Thickness:  0.5107692307692308
Text: b'STAR A\n\xe2\x80\x94\n\x0c'
456 389 607 419
Text Scale:  0.41939784468392405
Text Thickness:  0.6357894736842106
Text: b'7 Ve\nR EACLUSIVE\n\x0c'
273 389 457 419
Text Scale:  0.5749281339832522
Text Thickness:  0.8658823529411765
Text: b"> |e\n| '\nSUPER\n\x0c"
239 86 293 110
Text Scale:  2.694610778443114
Text Thickness:  2.16
Text: b'\x0c'
192 85 234 112
Text Scale:  0.4198320671731307
Text Thickness:  0.5599999999999999
Text: b'THE\n\x0c'
556 126 756 183
Text Scale:  0.9089256498818397
Text Thickness:  1.3333333333333333
Text: b'i\nOO _.\nq\n\x0c'
643 95 725 111
Text Scale:  4.091816367265469
Text Thickness:  3.28
Text: b'\x0c'
725 94 807 111
Text Scale:  0.31533610213813257
Text Thickness:  0.4685714285714285
Text: b'. EDITION |\n\x0c'
692 390 810 418
Text Scale:  5.888223552894212
Text Thickness:  4.72
Text: b'\x0c'
290 432 384 450
Text Scale:  0.2937132858392701
Text Thickness:  0.4423529

235 86 293 111
Text Scale:  2.894211576846307
Text Thickness:  2.32
Text: b'\x0c'
189 84 232 112
Text Scale:  0.42982806877249097
Text Thickness:  0.5733333333333334
Text: b'THE\n\x0c'
562 129 757 180
Text Scale:  3.2478347768154565
Text Thickness:  3.9
Text: b'|\n\x0c'
642 94 723 111
Text Scale:  4.041916167664671
Text Thickness:  3.24
Text: b'\x0c'
721 93 807 111
Text Scale:  0.42991401719656064
Text Thickness:  0.6254545454545455
Text: b'LEDITION\n\x0c'
692 390 810 418
Text Scale:  5.888223552894212
Text Thickness:  4.72
Text: b'\x0c'
290 432 384 450
Text Scale:  0.3614828487924934
Text Thickness:  0.5371428571428571
Text: b'ais\noad\nez,\n\x0c'
403 85 571 107
Text Scale:  0.4940595224091283
Text Thickness:  0.7466666666666666
Text: b'NEWSY INTERVIE\\\n\x0c'
628 391 690 418
Text Scale:  0.38740314921269675
Text Thickness:  0.5511111111111111
Text: b'\xe2\x80\x94\nim\n\x0c'
298 86 382 110
Text Scale:  0.24702976120456416
Text Thickness:  0.3733333333333333
Text: b'ormemnenn\nwhiny\n\

KeyboardInterrupt: 