In [1]:
import os
from os.path import join
import numpy as np
import cv2
import shutil
import imutils

In [2]:
def draw_color_mask(img, borders, color=(0, 0, 0)):
    """Paint the outer borders of an image with a solid color.

    The bands are drawn directly onto ``img`` with ``cv2.rectangle`` and the
    same image is returned.

    Args:
        img: Image array; ``img.shape[0]`` is the height and ``img.shape[1]``
            the width.
        borders: Four band widths, each a percentage of the corresponding
            image dimension, in the order (left, top, right, bottom).
        color: Fill color handed to ``cv2.rectangle``.

    Returns:
        The image with the four border bands filled.
    """
    h, w = img.shape[:2]

    x_min = int(borders[0] * w / 100)
    x_max = w - int(borders[2] * w / 100)
    y_min = int(borders[1] * h / 100)
    y_max = h - int(borders[3] * h / 100)

    # cv2.rectangle includes BOTH corner points, so every band ends one pixel
    # before its exclusive limit, and an empty band (0 %) is skipped entirely
    # instead of blackening a one-pixel strip along the left/top edge.
    if x_min > 0:
        img = cv2.rectangle(img, (0, 0), (x_min - 1, h - 1), color, -1)
    if y_min > 0:
        img = cv2.rectangle(img, (0, 0), (w - 1, y_min - 1), color, -1)
    if x_max < w:
        img = cv2.rectangle(img, (x_max, 0), (w - 1, h - 1), color, -1)
    if y_max < h:
        img = cv2.rectangle(img, (0, y_max), (w - 1, h - 1), color, -1)

    return img

In [3]:
def preprocess_image_change_detection(img, gaussian_blur_radius_list=(13, 13), black_mask=(5, 10, 5, 0)):
    """Prepare a BGR frame for change detection.

    The frame is converted to grayscale, optionally smoothed with one
    Gaussian blur per entry of ``gaussian_blur_radius_list``, and finally its
    borders are masked with ``draw_color_mask``.

    Args:
        img: Input image; it is converted with ``cv2.COLOR_BGR2GRAY``.
        gaussian_blur_radius_list: Iterable of kernel sizes; each value ``r``
            applies a blur with an ``(r, r)`` kernel. Pass ``None`` to skip
            blurring. The default is an immutable tuple so it can never be
            altered between calls.
        black_mask: Border percentages forwarded to ``draw_color_mask``.

    Returns:
        The preprocessed single-channel image.
    """
    # cvtColor returns a freshly allocated image, so the caller's frame is
    # left untouched without needing an extra copy beforehand.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    if gaussian_blur_radius_list is not None:
        for radius in gaussian_blur_radius_list:
            gray = cv2.GaussianBlur(gray, (radius, radius), 0)

    gray = draw_color_mask(gray, black_mask)

    return gray

In [4]:
def _as_grayscale(frame):
    """Return ``frame`` as a single-channel image, converting from BGR only when needed."""
    if frame.ndim == 2:
        # Already single-channel (e.g. output of a grayscale preprocessing step).
        return frame
    return cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)


def compare_frames_change_detection(prev_frame, next_frame, min_contour_area,
                                    diff_threshold=45, dilate_iterations=2):
    """Measure how much two frames differ.

    Both frames are reduced to grayscale, their absolute difference is
    thresholded and dilated, and the external contours of the resulting
    binary image are collected. Contours whose area is below
    ``min_contour_area`` are ignored.

    Args:
        prev_frame: First frame, either BGR or already 2-D grayscale.
        next_frame: Second frame, either BGR or already 2-D grayscale.
        min_contour_area: Contours with a smaller area are discarded.
        diff_threshold: Threshold applied to the absolute difference image.
        dilate_iterations: Number of dilation passes applied to the
            thresholded image before contour extraction.

    Returns:
        A tuple ``(score, res_cnts, thresh)``: the summed area of the kept
        contours, the kept contours, and the dilated binary difference image.

    Raises:
        ValueError: If either frame is ``None`` (which is what ``cv2.imread``
            returns when a file cannot be read).
    """
    if prev_frame is None or next_frame is None:
        raise ValueError('prev_frame and next_frame must be images, got None '
                         '(cv2.imread returns None for unreadable files)')

    prev_gray = _as_grayscale(prev_frame)
    next_gray = _as_grayscale(next_frame)

    frame_delta = cv2.absdiff(prev_gray, next_gray)
    thresh = cv2.threshold(frame_delta, diff_threshold, 255, cv2.THRESH_BINARY)[1]

    thresh = cv2.dilate(thresh, None, iterations=dilate_iterations)
    cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
                            cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)

    score = 0
    res_cnts = []
    for c in cnts:
        # Compute the area once and reuse it for both the filter and the score.
        area = cv2.contourArea(c)
        if area < min_contour_area:
            continue

        res_cnts.append(c)
        score += area

    return score, res_cnts, thresh

#### Calling compare_frames_change_detection function to detect and remove similar images

In [5]:
# Base path
base_path = join(os.getcwd(), 'c23')
# All frames in a list. os.listdir() returns names in arbitrary order, so sort
# them: comparing a frame with its predecessor only makes sense when
# neighbouring list entries are neighbouring captures.
all_frames = sorted(os.listdir(base_path))

# Minimum contour area handed to compare_frames_change_detection
MIN_CONTOUR_AREA = 50

# Create directory to copy similar images into
similar_imgs_dir = join(os.getcwd(), 'similar_images')
# Start from an empty directory on every run: drop whatever is left over from
# a previous run, then ALWAYS (re)create it. Without the re-creation
# shutil.copy would write a single file called 'similar_images' instead of
# copying into a directory.
if os.path.isdir(similar_imgs_dir):
    # Allowing permission to write
    os.chmod(similar_imgs_dir, 0o777)
    shutil.rmtree(similar_imgs_dir)
elif os.path.exists(similar_imgs_dir):
    # A plain file with that name is in the way
    os.remove(similar_imgs_dir)
os.makedirs(similar_imgs_dir)

# Create list of similar images indexes
similar_indexes = list()

# Iterate over all frames, comparing each one with the previously read frame.
# Every image is decoded only once and then reused as the "previous" frame.
prev_frame = None
for index, frameName in enumerate(all_frames):
    next_frame = cv2.imread(join(base_path, frameName))

    if next_frame is None:
        # Not an image, or a corrupt file: leave it alone and keep the last
        # readable frame as the reference for the next comparison.
        print('Skipping unreadable file: {}'.format(frameName))
        continue

    # Frames of different resolution cannot be diffed pixel by pixel; they
    # are treated as different and therefore kept.
    if prev_frame is not None and prev_frame.shape == next_frame.shape:
        # Call frame_changes_detection_function to compare frames
        score, _, _ = compare_frames_change_detection(prev_frame, next_frame,
                                                      min_contour_area=MIN_CONTOUR_AREA)

        if score == 0:
            # Copy similar looking images into the similar_images directory
            shutil.copy(join(base_path, frameName), similar_imgs_dir)
            # Appending indexes of similar images to a list to remove later
            similar_indexes.append(index)

            print('score_difference: {}, index: {}, frameName: {}'.format(score, index, frameName))

    prev_frame = next_frame

print('--------------------------- REMOVING SIMILAR IMAGES ------------------------------')
# Remove similar images from directory based on their indexes
for index in similar_indexes:
    os.remove(join(base_path, all_frames[index]))
    print('Removing frame: {} at index: {}'.format(all_frames[index], index))
print('--------------------------- TOTAL SIMILAR IMAGES FOUND ----------------------------')
print('Total removed similar images were: {}'.format(len(similar_indexes)))

score_difference: 0, index: 26, frameName: c23-1616696584520.png
score_difference: 0, index: 33, frameName: c23-1616698981729.png
score_difference: 0, index: 34, frameName: c23-1616699324009.png
score_difference: 0, index: 35, frameName: c23-1616699666493.png
score_difference: 0, index: 36, frameName: c23-1616700009093.png
score_difference: 0, index: 37, frameName: c23-1616700351574.png
score_difference: 0, index: 38, frameName: c23-1616700693937.png
score_difference: 0, index: 39, frameName: c23-1616701036262.png
score_difference: 0, index: 42, frameName: c23-1616702063501.png
score_difference: 0, index: 43, frameName: c23-1616702405906.png
score_difference: 0, index: 44, frameName: c23-1616702748470.png
score_difference: 0, index: 45, frameName: c23-1616703091109.png
score_difference: 0, index: 46, frameName: c23-1616703433669.png
score_difference: 0, index: 49, frameName: c23-1616704460947.png
score_difference: 0, index: 50, frameName: c23-1616704802950.png
score_difference: 0, inde

score_difference: 0, index: 350, frameName: c23-1616807507626.png
score_difference: 0, index: 351, frameName: c23-1616807849826.png
score_difference: 0, index: 352, frameName: c23-1616808191946.png
score_difference: 0, index: 353, frameName: c23-1616808534030.png
score_difference: 0, index: 354, frameName: c23-1616808876270.png
score_difference: 0, index: 355, frameName: c23-1616809218514.png
score_difference: 0, index: 356, frameName: c23-1616809560594.png
score_difference: 0, index: 357, frameName: c23-1616809902599.png
score_difference: 0, index: 358, frameName: c23-1616810244560.png
score_difference: 0, index: 359, frameName: c23-1616810586997.png
score_difference: 0, index: 360, frameName: c23-1616810929079.png
score_difference: 0, index: 361, frameName: c23-1616811271324.png
score_difference: 0, index: 362, frameName: c23-1616811613406.png
score_difference: 0, index: 366, frameName: c23-1616812981329.png
score_difference: 0, index: 367, frameName: c23-1616813323212.png
score_diff

## Additional required information    

##### Q:1 What parameters did you decide to use for the provided example dataset?

###### Ans: I used only one function, "compare_frames_change_detection", to compute the similarity score of each pair of consecutive frames, passing all of its required parameters (prev_frame, next_frame and min_contour_area).

##### Q:2 How did you find those values?

###### Ans: prev_frame and next_frame were taken from the given dataset based on their indexes, whereas the final parameter, "min_contour_area", was tuned based on the areas of the grabbed contours. Several minimum values were possible candidates, but I decided to go with 50, which worked better for finding similar images.

##### Q:3 How many duplicates did the script find with these parameters?

###### Ans: I was able to find 151 duplicate frames with a score difference of zero (0).

##### Q:4 What would you suggest improving to make data collection of unique cases better?

###### Ans: In my opinion, it would be better to deploy online solutions to handle duplication of data. For instance, an already trained model, or another program chosen according to the case studies and data requirements, could be deployed to handle redundancy in the data online, which would be helpful and time-saving in the future.

##### Q:5 Any other comments about imaging_interview.py or your solution?

###### Ans: I was a bit confused about the first two functions in the script, so I skipped them while implementing my solution. I think the most important function is "compare_frames_change_detection", which detects differences between frames; I added just two extra lines of code to it to convert the frames from BGR to grayscale. After doing this, I considered the other two functions less important than the one I used to remove similar images.
###### Last but not least, it would be a great help if the given script had some comments about the functions.