In [4]:
import os
import cv2
import numpy as np
from sklearn.cluster import KMeans
from numpy import linalg, abs
import csv
from time import localtime, strftime

index = 0

def extract_frames(video_path, sampling_frequency):
    if not os.path.exists(video_path):
        print(f"{video_path}: File not found.")
        exit()

    # Load video
    cap = cv2.VideoCapture(video_path)

    # define resolution
    # cap.set(3, 1280)
    # cap.set(4, 720)

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    ret, frame = cap.read()
    # height, width, channels = frame.shape
    # print(f"{width}, {height}")

    # Extracting frames, chronologically forwards
    frames = []
    count = 0
    frame_index = 0
    while frame_index <= cap.get(cv2.CAP_PROP_FRAME_COUNT):
        ret, frame = cap.read()
        if not ret:
            break  # End of video
        # frames.append(frame)
        frames.append(cv2.resize(frame, (1280, 720)))
        
        # Increment frame_index
        count += 1
        cap.set(cv2.CAP_PROP_POS_FRAMES, int((count * fps) / sampling_frequency))

    cap.release()

    return frames, count

# numpy array param
def k_means_cluster_image(frame, k):
    global index
    pixel_vals = frame.reshape((-1,3))
    k_means = KMeans(n_clusters=k, random_state=42)
    k_means.fit(pixel_vals)

    centers = np.int16(k_means.cluster_centers_)
    # print(index, ": ")
    # print(centers)
    index += 1
    labels = k_means.labels_
    segmented_data = centers[labels.flatten()]

    # reshape data into the original image dimensions
    segmented_image = segmented_data.reshape((frame.shape))

    background_color = centers[0]
    for center in centers:
        if linalg.norm([255, 255, 255] - center) < linalg.norm([255, 255, 255] - background_color):
            background_color = center

    
    return segmented_image, background_color

def k_means_cluster_video(frames, k):
    segmented_images = []
    background_colors = []
    for frame in frames:
        # segmented_images.append(k_means_cluster_image(frame, k))
        segmented_image, background_color = k_means_cluster_image(frame, k)
        segmented_images.append(segmented_image)
        background_colors.append(background_color)
    return segmented_images, background_colors

# so janky
def remove_background(frame, background_color, null_color, height, width):
    # difference = frame[70][110] - background_color
    # dist = linalg.norm(difference)
    # print(f"frame[70][110]: {frame[70][110]}")
    # print(f"background: {background_color}")
    # print(difference)
    # print(dist)
    # comparison = frame[70][110].copy() if dist < 50 else background_color
    # print(comparison)

    for row in range(height):
            for col in range(width):
                if (frame[row][col] == background_color).all():
                    frame[row][col] = null_color # might add a transparency channel instead

    return frame

def remove_all_backgrounds(frames_array, background_colors, null_color, height, width):
    frames_array = np.array(frames_array, dtype=np.int16)
    for i, frame in enumerate(frames_array):
        remove_background(frame, background_colors[i], null_color, height, width)
    return frames_array

def subtract_two_masks(curr_frame, next_frame, height, width, null_color):
    for row in range(height):
        for col in range(width): 
            curr_pixel = curr_frame[row][col]
            next_pixel = next_frame[row][col]
            dist = linalg.norm(curr_pixel - next_pixel)
            if (dist < 50):
                curr_frame[row][col] = null_color                

def subtract_all_masks(frames_array, len, height, width, null_color):
    frames_array = np.array(frames_array, dtype=np.int16)
    
    for index in range(len - 1): 
        curr_frame = frames_array[index]
        next_frame = frames_array[index + 1]
        subtract_two_masks(curr_frame, next_frame, height, width, null_color)
    
    return frames_array

def count_remaining(frames_array, output_array, height, width, null_color):
    for frame in frames_array:
        count = 0
        for row in range(height):
            for col in range(width): 
                if (frame[row][col] == null_color).all():
                    continue
                count += 1
        output_array.append(count)

def output_to_csv(array, output_csv, fps):
    with open(output_csv, 'a', newline='') as csvfile:
        fieldnames = ['index', 'frame', 'value']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        index = 0
        for value in array:
            writer.writerow({'index' : index, 'frame' : int((index * fps) / sampling_frequency), 'value' : value})
            index += 1

def save_frames(frames, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        exit()
    index = 0
    for frame in frames:
        img_file = os.path.join(output_dir, f"scene_{index}.jpg")
        cv2.imwrite(img_file, frame)
        index += 1

def print_time():
    return strftime("%H:%M:%S", localtime())

def run_mask_segmentation(video_path, output_csv, output_dir, sampling_frequency):
    # Load video
    cap = cv2.VideoCapture(video_path)

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)

    # frames NumPy array
    print(f"Extracting frames {print_time()}")
    frames_array, count = extract_frames(video_path, sampling_frequency)
    frames_array = np.array(frames_array, dtype=np.int16)
    height, width, channels = frames_array[0].shape
    print(f"Finished extracting frames {print_time()}")

    print("height: ", height)
    print("width: ", width)
    print("fps: ", fps)

    # K-means cluster
    k = 2                                                      # hardcoded value
    print("K-means cluster")
    frames_array, background_colors = k_means_cluster_video(frames_array, k)
    print(f"Finished k-means cluster {print_time()}")

    # Assuming white background
    background_color = [255, 255, 255] 
    null_color = [50, 205, 50]

    # Removing background pixels, creating masks
    print("Removing backgrounds")
    frames_array = remove_all_backgrounds(frames_array, background_colors, null_color, height, width)
    # temp_arr = []
    # temp_arr.append(remove_background(frames_array[0], background_color, null_color, height, width))
    # temp_arr.append(remove_background(frames_array[1], background_color, null_color, height, width))    
    print(f"Finished reomving backgrounds {print_time()}")

    # Subtracting adjacent frames
    print("Subtracting masks")
    frames_array = subtract_all_masks(frames_array, count, height, width, null_color)
    print(f"Finished subtracting masks {print_time()}")

    # Checking emptiness of frame
    # print("Counting remaining pixels")
    # frame_pixel_count = []
    # count_remaining(frames_array, frame_pixel_count, height, width, null_color)
    # print(f"Finished counting remaining pixels {print_time()}")

    # Output to csv
    # print("Outputting to csv")
    # output_to_csv(frame_pixel_count, output_csv, fps)
    # print(f"Finished outputting to csv {print_time()}")

    # frames_array to image for visualization
    print("Saving frames")
    save_frames(frames_array, output_dir)
    # save_frames(processed_frames, output_dir)
    print(f"Finished saving frames {print_time()}")

    return frames_array

# Video, output csv, output dir path
video_path = 'data/videos/test4_ie300_8_24_2020.mp4'
output_csv = 'output/txt/annotations.csv'
output_dir = "output/image/k_means_clustering7"
sampling_frequency = 1  # seconds between samples

result = run_mask_segmentation(video_path, output_csv, output_dir, sampling_frequency)
# frame = result[0]

Extracting frames 15:09:47
Finished extracting frames 15:10:51
height:  720
width:  1280
fps:  29.97002997002997
K-means cluster
Finished k-means cluster 15:12:16
Removing backgrounds
Finished reomving backgrounds 15:28:56
Subtracting masks
Finished subtracting masks 16:03:36
Saving frames
Finished saving frames 16:03:39
