In [1]:
# Pipeline configuration
#   video          : path to the video file
#   east           : path to the EAST scene text detector model file
#   min_confidence : probability threshold used to decide that a region is text
#   width          : resized frame width (must be a multiple of 32)
#   height         : resized frame height (must be a multiple of 32)
#   padding        : amount of padding to add to each border of the ROI
args = {
	'video': "source_hq.mp4",
	'min_confidence': 0.6,
	'east': 'frozen_east_text_detection.pb',
	'width': 1280,
	'height': 704,
	'padding': 0,
}

In [2]:
# Import the required packages
import argparse
import re
import shutil
import time

import cv2
import imutils
import numpy as np
import pytesseract
from imutils.object_detection import non_max_suppression
from imutils.video import FPS
from imutils.video import VideoStream

# Locate the Tesseract-OCR executable: prefer a `tesseract` binary found on
# PATH so the notebook is not tied to one machine, and fall back to the
# default Windows install location when none is found
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def decode_predictions(scores, geometry, min_confidence=None):
	"""Turn the detector's raw output volumes into boxes and confidences.

	Parameters
	----------
	scores : array indexed as ``scores[0, 0, y, x]``
		Text probability for every cell of the output feature map.
	geometry : array indexed as ``geometry[0, c, y, x]``
		Channels 0-3 hold the four distances that are summed into the box
		height (0 + 2) and width (1 + 3); channel 4 holds the rotation angle.
	min_confidence : float, optional
		Cells scoring below this value are skipped. When omitted, the
		notebook-level ``args["min_confidence"]`` is used, which is the
		behaviour of the original two-argument call.

	Returns
	-------
	(rects, confidences) : tuple of lists
		``rects`` holds ``(startX, startY, endX, endY)`` integer tuples in
		detector-input coordinates; ``confidences`` holds the matching scores.
	"""
	# fall back to the global configuration so existing callers keep working
	if min_confidence is None:
		min_confidence = args["min_confidence"]
	# grab the number of rows and columns from the scores volume, then
	# initialize our set of bounding box rectangles and corresponding
	# confidence scores
	(numRows, numCols) = scores.shape[2:4]
	rects = []
	confidences = []
	# loop over the number of rows
	for y in range(0, numRows):
		# extract the scores (probabilities), followed by the
		# geometrical data used to derive potential bounding box
		# coordinates that surround text
		scoresData = scores[0, 0, y]
		xData0 = geometry[0, 0, y]
		xData1 = geometry[0, 1, y]
		xData2 = geometry[0, 2, y]
		xData3 = geometry[0, 3, y]
		anglesData = geometry[0, 4, y]
		# loop over the number of columns
		for x in range(0, numCols):
			# if our score does not have sufficient probability,
			# ignore it
			if scoresData[x] < min_confidence:
				continue
			# compute the offset factor as our resulting feature
			# maps will be 4x smaller than the input image
			(offsetX, offsetY) = (x * 4.0, y * 4.0)
			# extract the rotation angle for the prediction and
			# then compute the sin and cosine
			angle = anglesData[x]
			cos = np.cos(angle)
			sin = np.sin(angle)
			# use the geometry volume to derive the width and height
			# of the bounding box
			h = xData0[x] + xData2[x]
			w = xData1[x] + xData3[x]
			# compute both the starting and ending (x, y)-coordinates
			# for the text prediction bounding box
			endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
			endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
			startX = int(endX - w)
			startY = int(endY - h)
			# add the bounding box coordinates and probability score
			# to our respective lists
			rects.append((startX, startY, endX, endY))
			confidences.append(scoresData[x])
	# return a tuple of the bounding boxes and associated confidences
	return (rects, confidences)

In [3]:
# frame geometry: the assumed source resolution, the detector input
# resolution taken from the configuration, and the per-axis ratios
# between the two
(W, H) = (1280, 720)
newW = args["width"]
newH = args["height"]
rW = W / float(newW)
rH = H / float(newH)
# names of the two EAST output layers we read back -- the first yields
# the text probabilities, the second the geometry from which the
# bounding box coordinates are derived
layerNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]
# read the pre-trained EAST text detector from disk
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet(args["east"])


[INFO] loading EAST text detector...


In [4]:
# start Video Capture on the file named in the configuration, so the path
# is defined in exactly one place
vs = cv2.VideoCapture(args["video"])
# fail loudly when the source cannot be opened; otherwise the frame loop
# would end on its first read and silently produce no results
if not vs.isOpened():
	raise IOError("Could not open video source: {}".format(args["video"]))
# request 10 FPS from the capture
# NOTE(review): whether this property is honoured depends on the capture
# backend; the return value of set() is not checked here
vs.set(cv2.CAP_PROP_FPS, 10)
# start the FPS throughput estimator
fps = FPS().start()
# initialize the list of results; each entry is appended by the frame loop
# as ((startX, startY, endX, endY), text)
res = []

In [5]:
# loop over frames from the video stream; the try/finally guarantees that
# the FPS timer is stopped and the capture and display windows are released
# even when the loop is interrupted (e.g. by a KeyboardInterrupt)
try:
	while True:
		# grab the current frame of the VideoCapture object; read()
		# returns a (grabbed, frame) pair
		(grabbed, frame) = vs.read()
		# check to see if we have reached the end of the stream
		if not grabbed or frame is None:
			break
		orig = frame.copy()
		(origH, origW) = frame.shape[:2]
		# ratios that map detector coordinates back onto this frame,
		# derived from the frame's real size rather than an assumed one
		scaleX = origW / float(args["width"])
		scaleY = origH / float(args["height"])
		# resize the frame
		frame = cv2.resize(frame, (args["width"], args["height"]))
		# construct a blob from the image and then perform a forward pass of
		# the model to obtain the two output layer sets
		blob = cv2.dnn.blobFromImage(frame, 1.0, (args["width"], args["height"]),(123.68, 116.78, 103.94), swapRB=True, crop=False)
		net.setInput(blob)
		(scores, geometry) = net.forward(layerNames)
		# decode the predictions, then  apply non-maxima suppression to
		# suppress weak, overlapping bounding boxes
		(rects, confidences) = decode_predictions(scores, geometry)
		boxes = non_max_suppression(np.array(rects), probs=confidences)

		# loop over the bounding boxes
		for (startX, startY, endX, endY) in boxes:
			# scale the bounding box coordinates based on the respective
			# ratios
			startX = int(startX * scaleX)
			startY = int(startY * scaleY)
			endX = int(endX * scaleX)
			endY = int(endY * scaleY)
			# in order to obtain a better OCR of the text we can potentially
			# apply a bit of padding surrounding the bounding box -- here we
			# are computing the deltas in both the x and y directions
			dX = int((endX - startX) * args["padding"])
			dY = int((endY - startY) * args["padding"])
			# apply padding and clip the box to the frame
			# NOTE(review): the end coordinates grow by twice the delta while
			# the start coordinates shrink by one delta -- kept as-is; confirm
			# whether the asymmetry is intended
			startX = max(0, startX - dX)
			startY = max(0, startY - dY)
			endX = min(origW, endX + (dX * 2))
			endY = min(origH, endY + (dY * 2))
			# skip boxes that collapse to nothing after clipping; an empty
			# ROI carries no pixels to OCR
			if endX <= startX or endY <= startY:
				continue
			# extract the actual padded ROI
			roi = orig[startY:endY, startX:endX]

			# Tesseract configuration: (1) the language, (2) `--oem 1` to
			# select the LSTM neural net engine, and (3) `--psm 7` to treat
			# the ROI as a single line of text
			config = ("-l eng --oem 1 --psm 7")
			text = pytesseract.image_to_string(roi, config=config)
			# draw the bounding box on the frame
			cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
			# add the bounding box coordinates and OCR'd text to the list
			# of results
			res.append(((startX, startY, endX, endY), text))
		# update the FPS counter
		fps.update()
		# show the output image
		cv2.imshow("Text Detection", orig)
		key = cv2.waitKey(1) & 0xFF
		# if the `q` key was pressed, break from the loop
		if key == ord("q"):
			break
finally:
	# stop the timer and display FPS information
	fps.stop()
	print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
	print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))
	# Release the pointer
	vs.release()
	# close all windows
	cv2.destroyAllWindows()

KeyboardInterrupt: 

In [7]:
# sort the results by the top edge of each bounding box (top to bottom)
res = sorted(res, key=lambda r:r[0][1])

# a token is accepted only when it contains none of the listed symbols.
# The pattern is written as ONE raw string: it has the same value as before,
# but no longer relies on invalid escape sequences (`\[`, `\]`, `\/`) inside
# a non-raw string literal
token_pattern = re.compile(r'[^-$%^&*()«_®+|~=.{}<>\[\]:";`\/]*$')

# clean the results obtained by Tesseract OCR
# `filtered` is the set of accepted words (used for de-duplication);
# `ordered_words` keeps the same words in first-seen, top-to-bottom order so
# the printed and saved output is deterministic
filtered = set()
ordered_words = []
# Remove duplicate words, words with unexpected symbols
for (_box, text) in res:
	tokens = text.strip().split()
	clean_tokens = [t for t in tokens if token_pattern.match(t)]
	clean_s = ' '.join(clean_tokens)
	# keep only results longer than one character that contain no whitespace
	# (i.e. a single surviving token)
	if len(clean_s) > 1 and not re.search(r"\s", clean_s) and clean_s not in filtered:
		filtered.add(clean_s)
		ordered_words.append(clean_s)

# display the text OCR'd by Tesseract
print("OCR TEXT")
print("========")
# Write results obtained to a text file; the context manager closes the file
# even if a write fails, and the explicit encoding keeps non-ASCII words
# writable regardless of the platform default
with open('results.txt', 'w', encoding='utf-8') as file1:
	for word in ordered_words:
		print("{}\n".format(word))
		file1.write(word)
		file1.write("\n")

OCR TEXT
We

Event?

we

FV

PAV

Wil

buals

Vey

the

Ws

Missed

