/
process_image.py
539 lines (448 loc) · 21.2 KB
/
process_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
"""
process_image.py : This file contains the class for recognition of the sudoku from image.
File created by : Shashank Goyal
Last commit done by : Shashank Goyal
Last commit date : 4th September
"""
# import the opencv module for image processing
import cv2.cv2 as cv2
# import numpy module for operations on image matrices
import numpy as np
# import torch module
import torch
# import load_model method in order to classify the images
from Image_Processing.classifier import load_model
# Path of the saved classifier model; get_matrix() hands it to load_model().
# The path is relative, so it presumably has to resolve from the directory the
# program is started in - confirm against how load_model opens the file.
model_file = './Image_Processing/char74k-cnn.pth'
class SudokuImageProcessing:
    """
    Recognise a sudoku puzzle in a picture and draw its solution back onto it.

    The pipeline is: `get_grid` isolates the largest contour of the picture,
    `get_warped` straightens it into a top-down view, `get_dimensions` counts
    the thick sub-grid borders and `get_matrix` classifies every cell.
    `plot_on_image` writes solved digits into the empty cells, e.g. for
    augmented reality. In every matrix handled here 0 marks an empty cell.
    """

    # a contour must cover at least this area (pixels squared) to count as the puzzle
    _MIN_GRID_AREA = 250 * 250
    # thickness in pixels of the frame that is whitened to hide outer borders
    _BORDER_PX = 10
    # puzzle sizes accepted by get_dimensions; extend this tuple to allow others
    _VALID_SIZES = (4, 6, 8, 9)

    def __init__(self, image: np.ndarray = None, fname: str = None):
        """
        Store the picture that is going to be analysed.

        :param image: picture as an array; used as is when given. get_grid
                      converts it with COLOR_BGR2GRAY, so a BGR image is expected.
        :param fname: path read with cv2.imread when `image` is None.
        :raises AssertionError: when neither argument is given, or when the
                                file cannot be read.
        """
        if image is None:
            # Raised explicitly instead of using `assert` so the checks survive
            # `python -O`; the exception type stays the one callers already see.
            if fname is None:
                raise AssertionError("file name not entered")
            image = cv2.imread(fname)
            if image is None:
                raise AssertionError("unable to open file: {}".format(fname))
        self.image = image
        # the three values below are filled in by get_matrix
        self.game_size = None
        self.box_rows = None
        self.box_cols = None

    @staticmethod
    def _largest_contour(gray):
        """Return the contour with the largest area in a gray image, or None if there is none."""
        blur = cv2.GaussianBlur(gray, (5, 5), 0)
        thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY_INV, 11, 2)
        # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
        # (image, contours, hierarchy) in 3.x; index -2 is the contour list in all.
        contours = cv2.findContours(thresh, cv2.RETR_TREE,
                                    cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(contours) == 0:
            return None
        return max(contours, key=cv2.contourArea)

    @staticmethod
    def _perspective_transform(contour):
        """
        Build the matrix that maps the corners of `contour` onto an upright rectangle.

        :return: (transform_matrix, (width, height)), or (None, None) when the
                 contour does not provide four usable corners.
        """
        peri = cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, 0.015 * peri, True)
        # reshape instead of squeeze: a single vertex must still give a 2-D array
        pts = approx.reshape(-1, 2)
        if len(pts) < 4:
            return None, None
        # plain ints, because they are used as the output size of warpPerspective
        box_width = int(pts[:, 0].max() - pts[:, 0].min())
        box_height = int(pts[:, 1].max() - pts[:, 1].min())
        if box_width < 2 or box_height < 2:
            return None, None
        # x + y is smallest at the top-left and largest at the bottom-right corner;
        # y - x is smallest at the top-right and largest at the bottom-left corner
        sum_pts = pts.sum(axis=1)
        diff_pts = np.diff(pts, axis=1)
        corners = np.array([pts[np.argmin(sum_pts)],
                            pts[np.argmin(diff_pts)],
                            pts[np.argmax(sum_pts)],
                            pts[np.argmax(diff_pts)]], dtype=np.float32)
        dst = np.array([[0, 0],
                        [box_width - 1, 0],
                        [box_width - 1, box_height - 1],
                        [0, box_height - 1]], dtype=np.float32)
        return cv2.getPerspectiveTransform(corners, dst), (box_width, box_height)

    def get_grid(self):
        """
        Detect the outer grid of the sudoku and return the area inside this grid.

        :return: (gray image that is white everywhere outside the grid, grid contour),
                 or (None, None) when no contour of at least 250 ** 2 pixels sq. exists.
        """
        gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
        max_cnt = self._largest_contour(gray)
        if max_cnt is None or cv2.contourArea(max_cnt) < self._MIN_GRID_AREA:
            return None, None
        # white inside the contour, black outside and along the contour line itself
        mask = np.zeros(gray.shape, np.uint8)
        cv2.drawContours(mask, [max_cnt], 0, 255, -1)
        cv2.drawContours(mask, [max_cnt], 0, 0, 2)
        # copy the puzzle area onto a white canvas
        out = np.full_like(gray, 255)
        inside = mask == 255
        out[inside] = gray[inside]
        return out, max_cnt

    def get_warped(self):
        """
        Apply a perspective transformation so that the sudoku becomes the complete image.

        :return: (warped gray image, transformation matrix), or (None, None) when
                 no grid was found or its corners could not be determined.
        """
        img, max_cnt = self.get_grid()
        if img is None:
            return None, None
        transform_matrix, size = self._perspective_transform(max_cnt)
        if transform_matrix is None:
            return None, None
        warped_img = cv2.warpPerspective(img, transform_matrix, size)
        return warped_img, transform_matrix

    @staticmethod
    def _count_rising_edges(flags):
        """Return 1 + the number of False -> True steps in a boolean vector."""
        flags = np.asarray(flags, dtype=bool)
        return 1 + int(np.count_nonzero(~flags[:-1] & flags[1:]))

    def _dimensions_from_warped(self, img):
        """
        Work out the puzzle size from an already warped gray image.

        A column (row) is treated as part of a thick border when more than 0.66
        of its pixels are dark after Otsu thresholding. Every False -> True step
        in that per-column (per-row) flag vector is one inner border, e.g. the
        flags 0001100011000 contain two borders and therefore three bands.

        :return: (game_size, (v_lines, h_lines))
        :raises RuntimeError: when game_size is not one of 4, 6, 8, 9.
        """
        blur = cv2.GaussianBlur(img, (5, 5), 0)
        # after this call every pixel is either 0 or 255
        thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        # whiten the frame so the surrounding border is not counted as a line
        b = self._BORDER_PX
        thresh[:b, :] = 255
        thresh[:, :b] = 255
        thresh[-b:, :] = 255
        thresh[:, -b:] = 255
        dark = thresh == 0
        # number of dark pixels per column / per row
        vertical_sum = dark.sum(axis=0)
        horizontal_sum = dark.sum(axis=1)
        min_height = int(0.66 * img.shape[0])
        min_width = int(0.66 * img.shape[1])
        # v_lines is the number of rows in a sub grid, h_lines the number of columns
        v_lines = self._count_rising_edges(vertical_sum > min_height)
        h_lines = self._count_rising_edges(horizontal_sum > min_width)
        # the game size is the product of the sub grid dimensions
        game_size = v_lines * h_lines
        if game_size not in self._VALID_SIZES:
            raise RuntimeError("Improper Grid Size, expected in {{4,6,8,9}}, got {}".format(game_size))
        return game_size, (v_lines, h_lines)

    def get_dimensions(self):
        """
        Get the dimensions of the sudoku, i.e. detect one of the following -
        game_size | game_dim
        ----------------------------------
        4 | 2 X 2
        6 | 2 X 3
        6 | 3 X 2
        8 | 2 X 4
        8 | 4 X 2
        9 | 3 X 3
        Note: this only works when a thicker border is used to distinguish
        between the sub-grids of the puzzle.

        :return: (game_size, (box_rows, box_cols)), or (None, (None, None)) when
                 no puzzle could be located in the image.
        :raises RuntimeError: when the detected size is not one of 4, 6, 8, 9.
        """
        img, _ = self.get_warped()
        if img is None:
            return None, (None, None)
        return self._dimensions_from_warped(img)

    @staticmethod
    def preprocess_digit(digit_img):
        """
        Helper method to clear borders and darken the digit of an individual cell of the puzzle.

        :param digit_img: gray image of a single cell.
        :return: 28 x 28 image for the classifier, or None when the cell is empty.
        """
        # enlarge and smooth the cell
        digit_img = cv2.resize(digit_img, (112, 112),
                               interpolation=cv2.INTER_CUBIC)
        digit_img = cv2.GaussianBlur(digit_img, (5, 5), 0)
        # cap values at 135, then turn everything from 110 upwards white
        digit_img = cv2.threshold(digit_img, 135, 255, cv2.THRESH_TRUNC)[1]
        digit_img[digit_img >= 110] = 255
        # whiten a 10 pixel frame to remove what is left of the cell border
        digit_img[0:10, :] = 255
        digit_img[:, 0:10] = 255
        digit_img[-10:, :] = 255
        digit_img[:, -10:] = 255
        digit_img = cv2.resize(digit_img, (28, 28),
                               interpolation=cv2.INTER_CUBIC)
        # empty cell: total darkness is below that of 10 fully black pixels
        if np.sum(np.bitwise_not(digit_img)) < 255 * 10:
            return None
        # scale pixels with value less than 150 to 3/4th the value
        pos = digit_img < 150
        digit_img[pos] = 3 * (digit_img[pos] // 4)
        # NOTE(review): for a uint8 image this doubling wraps modulo 256
        # (255 -> 254, 200 -> 144) instead of saturating at 255. It is kept as
        # is because it changes what the classifier sees - confirm the intent.
        digit_img[np.bitwise_not(pos)] = 2 * digit_img[np.bitwise_not(pos)]
        return digit_img

    @staticmethod
    def _predict_scores(model, cell_img):
        """Run the classifier on one pre-processed 28 x 28 cell; return its scores as a flat float32 array."""
        # map the pixel values to 0-1 and add the batch and channel axes
        batch = np.array(cell_img / 255).reshape((1, 1, 28, 28))
        tensor = torch.tensor(batch, dtype=torch.float)
        # inference only, so no gradient bookkeeping is needed
        with torch.no_grad():
            out = model(tensor)
        return np.array(out.detach(), dtype=np.float32).flatten()

    def _find_duplicate(self, matrix, i, j, element):
        """
        Look for a cell that already holds `element` and clashes with cell (i, j).

        The row is searched first, then the column, then the sub grid.

        :return: (row, column) of the first clashing cell, or None.
        """
        row_hits = np.flatnonzero(matrix[i, :] == element)
        if row_hits.size:
            return i, int(row_hits[0])
        col_hits = np.flatnonzero(matrix[:, j] == element)
        if col_hits.size:
            # the hit is a row index inside column j
            return int(col_hits[0]), j
        # top-left corner of the sub grid that contains (i, j)
        sub_r = i - (i % self.box_rows)
        sub_c = j - (j % self.box_cols)
        sub_matrix = matrix[sub_r:sub_r + self.box_rows, sub_c:sub_c + self.box_cols]
        sub_hits = np.argwhere(sub_matrix == element)
        if sub_hits.size:
            return sub_r + int(sub_hits[0][0]), sub_c + int(sub_hits[0][1])
        return None

    def get_matrix(self):
        """
        Returns the puzzle matrix from the image.

        Also sets `game_size`, `box_rows` and `box_cols` on the instance.

        Sanity check: two cells of one row, column or sub grid must not hold the
        same value. When a prediction clashes with an earlier cell, the cell with
        the higher score for that label keeps it and the other cell takes its
        own next best label.

        :return: (game_size x game_size) integer matrix with 0 for empty cells,
                 or None when no puzzle could be located in the image.
        :raises RuntimeError: when the detected size is not one of 4, 6, 8, 9.
        """
        img, _ = self.get_warped()
        if img is None:
            return None
        # reuse the warped image instead of locating and warping the grid again
        self.game_size, (self.box_rows, self.box_cols) = self._dimensions_from_warped(img)
        size = self.game_size
        matrix = np.zeros((size, size), dtype=int)
        # load the classifier and switch it to evaluation mode
        model = load_model(model_file)
        model.eval()
        # height and width of a single cell
        cube_h, cube_w = np.array(img.shape) / size
        # untouched prediction scores of every non-empty cell, keyed by (row, column)
        scores = {}
        for i in range(size):
            y_start, y_end = int(i * cube_h), int((i + 1) * cube_h)
            for j in range(size):
                x_start, x_end = int(j * cube_w), int((j + 1) * cube_w)
                digit_img = self.preprocess_digit(img[y_start:y_end, x_start:x_end].copy())
                if digit_img is None:
                    matrix[i, j] = 0
                    continue
                digit_img_out = self._predict_scores(model, digit_img)
                scores[(i, j)] = digit_img_out.copy()
                # digit is the index of the max value in predicted outputs
                element = int(np.argmax(digit_img_out))
                if element == 0:
                    # 0 is also the marker of blank and not yet visited cells, so
                    # it must never be searched for as a duplicate; store a blank
                    matrix[i, j] = 0
                    continue
                clash = self._find_duplicate(matrix, i, j, element)
                if clash is None:
                    matrix[i, j] = element
                    continue
                x, y = clash
                cached = scores.get((x, y))
                if cached is None:
                    # no scores known for the other cell: keep the value here
                    matrix[i, j] = element
                    matrix[x, y] = 0
                    continue
                duplicate_out = cached.copy()
                if digit_img_out[element] > duplicate_out[element]:
                    # current cell wins; the other cell takes its next best label
                    matrix[i, j] = element
                    duplicate_out[element] = -np.inf
                    matrix[x, y] = int(np.argmax(duplicate_out))
                else:
                    # other cell wins; the current cell takes its next best label
                    matrix[x, y] = element
                    digit_img_out[element] = -np.inf
                    matrix[i, j] = int(np.argmax(digit_img_out))
        return matrix

    @staticmethod
    def plot_on_image(image, matrix, matrix_sol, game_dim):
        """
        Plot the answer on the initial image.

        :param image: colour picture that contains the puzzle.
        :param matrix: detected puzzle, 0 for empty cells.
        :param matrix_sol: solved puzzle of the same shape.
        :param game_dim: (rows, columns) of one sub grid.
        :return: copy of `image` with the missing digits drawn in black, or
                 `image` itself when nothing could be drawn.
        """
        if matrix is None or matrix_sol is None:
            return image
        game_size = int(np.prod(game_dim))
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        max_cnt = SudokuImageProcessing._largest_contour(gray)
        if max_cnt is None:
            return image
        transform_matrix, size = SudokuImageProcessing._perspective_transform(max_cnt)
        if transform_matrix is None:
            return image
        try:
            # inverse of the transform that brings the sudoku to frame
            inverse_matrix = np.linalg.inv(transform_matrix)
        # in case the transformation matrix is singular
        except np.linalg.LinAlgError:
            return image
        box_width, box_height = size
        # height and width of a single cell in the warped view
        cube_h = box_height / game_size
        cube_w = box_width / game_size
        # black mask with the shape of the warped puzzle; digits are drawn in white
        template = np.zeros((box_height, box_width, image.shape[2]))
        # font_size = (14 - game_size) / 5 was found by experimenting with
        # various font sizes on different game sizes
        font_size = (14 - game_size) / 5
        for i in range(game_size):
            y_start = int(i * cube_h) + int(cube_h / 1.25)
            for j in range(game_size):
                # only the cells that were empty in the detected puzzle get a digit
                if matrix[i, j] != 0:
                    continue
                x_start = int(j * cube_w) + int(cube_w / 4)
                cv2.putText(template, str(matrix_sol[i, j]), (x_start, y_start),
                            cv2.FONT_HERSHEY_SIMPLEX, font_size,
                            (255, 255, 255), 3, cv2.LINE_AA)
        img_height, img_width = image.shape[:2]
        # project the mask back into the perspective of the original picture
        unwarped_mask = cv2.warpPerspective(template, inverse_matrix,
                                            (img_width, img_height))
        image_masked = image.copy()
        # blacken the pixels where the mask is fully white
        image_masked[unwarped_mask == 255] = 0
        return image_masked