# Load libraries

In [1]:
import glob
import os

import cv2
import numpy as np
import yaml

# Global initializations

In [2]:
DATASET_FOLDER = '../theatre_dataset/'
VIDEO_FOLDER = 'braunfels_1'
OFFSET_EYE_GAZE = 2 # seconds (the video-loading cell later overwrites this with a frame count)

# Shot files are named '<video>-<actor>-...-<actor>-<MS|FS>.txt'.
# Only the file name is inspected, never the directory part, so that a '-',
# 'MS' or 'FS' occurring in DATASET_FOLDER cannot distort the parsing.
def _count_actors(shot_path):
    """Return the number of actors encoded in a shot file name (its dash count minus one)."""
    return os.path.basename(shot_path).count("-") - 1

def _keep_shot(shot_path):
    """Return False for 1-actor shots tagged 'FS' and for multi-actor shots tagged 'MS'."""
    file_name = os.path.basename(shot_path)
    no_of_actors = _count_actors(shot_path)
    if no_of_actors == 1 and file_name.count("FS") == 1:
        return False
    if no_of_actors > 1 and file_name.count("MS") == 1:
        return False
    return True

# Kept shot files in sorted order; a shot's position in this list is its shot number
LIST_FILENAMEPATH_SHOTS = [
    shot_path
    for shot_path in sorted(glob.glob(DATASET_FOLDER + VIDEO_FOLDER + "/shots/*.txt"))
    if _keep_shot(shot_path)
]
DICT_FILENAMEPATH_SHOTNO = {shot_path: shot_no for shot_no, shot_path in enumerate(LIST_FILENAMEPATH_SHOTS)}

# key = no of actors in a shot, value = list of indices where these shots are present in the LIST_FILENAMEPATH_SHOTS
DICT_NOACTORS_SHOTFILE = dict()
for shot_no, shot_path in enumerate(LIST_FILENAMEPATH_SHOTS):
    DICT_NOACTORS_SHOTFILE.setdefault(_count_actors(shot_path), []).append(shot_no)

# Actor name = track file name without the '<video>' prefix, the separator character
# that follows it, and the 4-character extension
LIST_ACTORNAMES = [
    os.path.basename(track_path)[len(VIDEO_FOLDER)+1:-4]
    for track_path in sorted(glob.glob(DATASET_FOLDER + VIDEO_FOLDER + "/tracks/*.txt"))
]
DICT_ACTORNAME_ACTORNO = {actor_name: actor_no for actor_no, actor_name in enumerate(LIST_ACTORNAMES)}

# Register every 1-actor shot under its 'FS' spelling as well, pointing at the same
# shot number, so that look-ups by an 'FS' file path resolve to the kept 'MS' shot.
# The directory prefix is copied verbatim; only the file name is rewritten.
for shot_no in DICT_NOACTORS_SHOTFILE[1]:
    shot_path = LIST_FILENAMEPATH_SHOTS[shot_no]
    file_name = os.path.basename(shot_path)
    directory_prefix = shot_path[:len(shot_path) - len(file_name)]
    DICT_FILENAMEPATH_SHOTNO[directory_prefix + file_name.replace("MS", "FS")] = shot_no
DICT_NOACTORS_SHOTFILE, LIST_FILENAMEPATH_SHOTS, DICT_FILENAMEPATH_SHOTNO, LIST_ACTORNAMES, DICT_ACTORNAME_ACTORNO

({1: [0, 4, 6], 2: [1, 3, 5], 3: [2]},
 ['../theatre_dataset/braunfels_1/shots/braunfels_1-p1-MS.txt',
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p1-p2-FS.txt',
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p1-p2-p3-FS.txt',
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p1-p3-FS.txt',
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p2-MS.txt',
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p2-p3-FS.txt',
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p3-MS.txt'],
 {'../theatre_dataset/braunfels_1/shots/braunfels_1-p1-MS.txt': 0,
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p1-p2-FS.txt': 1,
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p1-p2-p3-FS.txt': 2,
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p1-p3-FS.txt': 3,
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p2-MS.txt': 4,
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p2-p3-FS.txt': 5,
  '../theatre_dataset/braunfels_1/shots/braunfels_1-p3-MS.txt': 6,
  '../theatre

# Load files

In [3]:
# Read the input video's properties; the capture is released even if a step below fails
file_video_input = cv2.VideoCapture(DATASET_FOLDER + VIDEO_FOLDER + "/" + VIDEO_FOLDER + ".mp4")
try:
    if not file_video_input.isOpened():
        # Without this check every property below would silently read as 0
        raise IOError("Could not open input video: " + DATASET_FOLDER + VIDEO_FOLDER + "/" + VIDEO_FOLDER + ".mp4")
    VIDEO_INPUT_CODEC = int(file_video_input.get(cv2.CAP_PROP_FOURCC)) # not working
    VIDEO_INPUT_FRAME_SIZE = (int(file_video_input.get(cv2.CAP_PROP_FRAME_WIDTH)), int(file_video_input.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    VIDEO_INPUT_FPS = file_video_input.get(cv2.CAP_PROP_FPS)
    # Number of usable frames = shortest of the video itself and every shot file (one row per frame)
    VIDEO_INPUT_FRAMES_COUNT = min(
        [int(file_video_input.get(cv2.CAP_PROP_FRAME_COUNT))]
        + [np.loadtxt(shot_path).shape[0] for shot_path in LIST_FILENAMEPATH_SHOTS]
    )
finally:
    file_video_input.release()
# NOTE: this overwrites the value in seconds set in the global-initializations cell with a
# frame count, so re-running this cell without re-running that one converts the value twice.
OFFSET_EYE_GAZE = int(np.rint(OFFSET_EYE_GAZE * VIDEO_INPUT_FPS)) # index number

In [4]:
# Display the video properties read in the previous cell and the gaze offset (now an index number)
VIDEO_INPUT_CODEC, VIDEO_INPUT_FRAME_SIZE, VIDEO_INPUT_FPS, VIDEO_INPUT_FRAMES_COUNT, OFFSET_EYE_GAZE

(828601953, (3840, 2160), 29.97002997002997, 2276, 60)

In [5]:
# Coordinates of every kept shot at every frame
# dimensions = (#frames, coordinates (x1, y1, x2, y2), #shots)
shot_count = len(LIST_FILENAMEPATH_SHOTS)
all_shots_coords = np.empty(shape = (VIDEO_INPUT_FRAMES_COUNT, 4, shot_count))
for shot_no, shot_path in enumerate(LIST_FILENAMEPATH_SHOTS):
    # one row per frame; rows beyond the common frame count are dropped
    all_shots_coords[:, :, shot_no] = np.loadtxt(shot_path)[:VIDEO_INPUT_FRAMES_COUNT]
# round to the nearest integer and store as int16
all_shots_coords = np.rint(all_shots_coords).astype(np.int16)
all_shots_coords.shape

(2276, 4, 7)

In [6]:
# Center of every 1-actor shot at every frame, computed from the unrounded shot files
# dimensions = (#frames, #shots (1-actor), coordinates (x, y))
single_actor_shots = DICT_NOACTORS_SHOTFILE[1]
c_t_i = np.empty(shape = (VIDEO_INPUT_FRAMES_COUNT, len(single_actor_shots), 2))
for column, shot_no in enumerate(single_actor_shots):
    box = np.loadtxt(LIST_FILENAMEPATH_SHOTS[shot_no])[:VIDEO_INPUT_FRAMES_COUNT]
    # midpoint of (x1, y1) and (x2, y2)
    c_t_i[:, column, :] = (box[:, [0, 1]] + box[:, [2, 3]]) / 2
c_t_i.shape

(2276, 3, 2)

In [7]:
# Gaze coordinates of all the users, shifted by the gaze offset
gaze_file_path = DATASET_FOLDER + VIDEO_FOLDER + "/" + VIDEO_FOLDER + "_gaze.yml"
with open(gaze_file_path, 'r') as gaze_file:
    dict_user_gaze = yaml.safe_load(gaze_file) # key = user number, value = list of lists
# dimensions = (#frames, #users, coordinates (x, y))
g_t_k = np.empty(shape = (VIDEO_INPUT_FRAMES_COUNT, len(dict_user_gaze), 2))
gaze_window = slice(OFFSET_EYE_GAZE, OFFSET_EYE_GAZE + VIDEO_INPUT_FRAMES_COUNT)
for user_no, gaze_samples in dict_user_gaze.items():
    # the user number is used directly as the column index; only the first two values of each sample are kept
    g_t_k[:, user_no, :] = np.array(gaze_samples).astype(float)[gaze_window, :2]
# Rescale from a 1920 x 1080 coordinate system to the video frame size
# (presumably the resolution the gaze was recorded at -- TODO confirm)
for axis, gaze_extent in enumerate((1920, 1080)):
    g_t_k[:, :, axis] = np.rint(g_t_k[:, :, axis] / gaze_extent * VIDEO_INPUT_FRAME_SIZE[axis])
g_t_k.shape

(2276, 5, 2)

In [8]:
# Distance between each 1-actor shot center and the gaze, summed over all users
# dimensions = (#frames, #shots (1-actor))
# center_to_gaze has dimensions (#frames, #shots (1-actor), #users, coordinates (x, y))
center_to_gaze = c_t_i[:, :, np.newaxis, :] - g_t_k[:, np.newaxis, :, :]
d_t_i = np.linalg.norm(center_to_gaze, axis = 3).sum(axis = 2)
d_t_i.shape

(2276, 3)

In [9]:
# This is the bounding box
# Columns 0 and 2 of every track file are collected, one column per actor (files in sorted order).
# Each list starts with an empty (#frames, 0) array so the result keeps that shape when there are no track files.
bx1_columns = [np.empty(shape = (VIDEO_INPUT_FRAMES_COUNT, 0), dtype = np.int16)]
bx2_columns = [np.empty(shape = (VIDEO_INPUT_FRAMES_COUNT, 0), dtype = np.int16)]
for file_name in sorted(glob.glob(DATASET_FOLDER + VIDEO_FOLDER + "/tracks/*.txt")):
    try:
        track = np.loadtxt(file_name, delimiter = ',')
    except ValueError:
        # not comma-separated: parse again with the default whitespace delimiter.
        # Only the parse error is caught, so unrelated failures are no longer hidden.
        track = np.loadtxt(file_name)
    track = np.rint(track[:VIDEO_INPUT_FRAMES_COUNT]).astype(np.int16)
    bx1_columns.append(track[:, [0]])
    bx2_columns.append(track[:, [2]])
bx1_t_orig = np.concatenate(bx1_columns, axis = 1) # dimensions = (#frames, #actors)
bx2_t_orig = np.concatenate(bx2_columns, axis = 1) # dimensions = (#frames, #actors)
x_t = (bx1_t_orig + bx2_t_orig) // 2 # dimensions = (#frames, #actors)

# Gaze potential

In [10]:
# Gaze potential, dimensions = (#frames, #shots); -1 marks shots that are not filled in yet
Gs_t_i = np.full((VIDEO_INPUT_FRAMES_COUNT, len(LIST_FILENAMEPATH_SHOTS)), -1.0)
# 1-actor shots: inverse gaze distance, normalised so the values of each frame sum to 1
inverse_distance = 1 / d_t_i
Gs_t_i[:, DICT_NOACTORS_SHOTFILE[1]] = inverse_distance / inverse_distance.sum(axis = 1, keepdims = True)
Gs_t_i.shape

(2276, 7)

In [11]:
# Fill in the gaze potential of every multi-actor shot from two shots holding one actor fewer.
# Actor counts are visited in increasing order (2..9), so the sub-shot columns read below
# have been written either by the previous cell (1-actor shots) or by an earlier iteration.
# NOTE: this cell is not idempotent -- the last line rebinds Gs_t_i to its negative log, so
# re-running the cell without re-running the previous one works on already-transformed values.
for i in range(2, 10):
    # stop at the first actor count for which no shot exists
    if i not in DICT_NOACTORS_SHOTFILE: break
    for j in range(len(DICT_NOACTORS_SHOTFILE[i])):
        # split the shot path on '-': [path prefix, actor name, ..., actor name, suffix]
        list_shot_filename = LIST_FILENAMEPATH_SHOTS[DICT_NOACTORS_SHOTFILE[i][j]].split("-")
        # actor numbers of the actors named in this shot
        list_actors_shot = np.array([DICT_ACTORNAME_ACTORNO[k] for k in list_shot_filename[1:-1]])
        # per frame, the shot's actors ordered by ascending x_t (presumably left to right, per the name)
        actors_left_right = x_t[:, list_actors_shot].argsort(axis = 1)
        actors_left_right = list_actors_shot[actors_left_right]
        # two sub-groups per frame: everyone but the last actor in that order and everyone but the first;
        # each is re-sorted by actor number before the file name is rebuilt
        temp_1 = np.sort(actors_left_right[:, :-1], axis = 1)
        temp_2 = np.sort(actors_left_right[:, 1:], axis = 1)
        # actor numbers -> actor names
        temp_1 = np.vectorize(lambda t: LIST_ACTORNAMES[t])(temp_1)
        temp_2 = np.vectorize(lambda t: LIST_ACTORNAMES[t])(temp_2)
        # rebuild the file path of each sub-group's shot from this shot's prefix and suffix
        # NOTE(review): np.apply_along_axis appears to size its string output from the first row;
        # if the rebuilt paths can differ in length between frames, longer ones may be truncated -- confirm
        temp_1 = np.apply_along_axis(lambda t: list_shot_filename[0] + '-' + '-'.join(t) + '-' + list_shot_filename[-1], 1, temp_1)
        temp_2 = np.apply_along_axis(lambda t: list_shot_filename[0] + '-' + '-'.join(t) + '-' + list_shot_filename[-1], 1, temp_2)
        # file path -> shot number (column of Gs_t_i)
        temp_1 = np.vectorize(lambda t: DICT_FILENAMEPATH_SHOTNO[t])(temp_1)
        temp_2 = np.vectorize(lambda t: DICT_FILENAMEPATH_SHOTNO[t])(temp_2)
        # gaze potential of the two sub-shots at every frame
        Gs_t_a = Gs_t_i[range(VIDEO_INPUT_FRAMES_COUNT), temp_1]
        Gs_t_b = Gs_t_i[range(VIDEO_INPUT_FRAMES_COUNT), temp_2]
        # -1 is the "not filled in yet" marker: both sub-shots must already have a value
        assert np.all(Gs_t_a != -1) and np.all(Gs_t_b != -1)
        # a + b - |a - b| is twice the smaller of the two values
        Gs_t_i[:, DICT_NOACTORS_SHOTFILE[i][j]] = Gs_t_a + Gs_t_b - np.abs(Gs_t_a - Gs_t_b)
# every shot must have received a value before the log is taken
assert np.all(Gs_t_i != -1)
Gs_t_i = -np.log(Gs_t_i)

# Edges

In [12]:
# Accumulated edge cost, starting at zero; the cost cells below add their terms to it in place
# dimensions = (#shots (current frame), #shots (next frame), #frames - 1)
edges = np.zeros(shape = (len(LIST_FILENAMEPATH_SHOTS), len(LIST_FILENAMEPATH_SHOTS), VIDEO_INPUT_FRAMES_COUNT-1))

## Shot transition cost

In [13]:
lamb = 2
# Cost 0 for staying on the same shot (diagonal) and lamb for switching to any other shot,
# identical for every pair of consecutive frames
switch_penalty = lamb * (1 - np.eye(len(LIST_FILENAMEPATH_SHOTS)))
shot_transition_cost = np.empty(shape = edges.shape)
shot_transition_cost[:] = switch_penalty[:, :, np.newaxis]
# edges is updated in place, so running this cell twice adds the cost twice
edges += shot_transition_cost

## Overlap cost

In [14]:
# returns the IOU of the two rectangles
def bb_intersection_over_union(boxA, boxB): # here parameter box = (x1, y1, x2, y2)
    """Return the intersection-over-union of two axis-aligned rectangles.

    Coordinates are treated as inclusive, hence the ``+ 1`` in every width
    and height.

    All coordinates are widened to Python floats before any arithmetic.
    NumPy scalars of a narrow integer dtype (e.g. ``np.int16``) wrap around
    when two side lengths are multiplied, which made the areas -- and
    therefore the result -- wrong for boxes larger than roughly 181 x 181.

    Parameters
    ----------
    boxA, boxB : indexable of four numbers
        ``(x1, y1, x2, y2)`` with ``x1 <= x2`` and ``y1 <= y2``.

    Returns
    -------
    float
        Intersection area divided by union area.

    Raises
    ------
    ZeroDivisionError
        If the union area is zero (only possible for malformed boxes).
    """
    ax1, ay1, ax2, ay2 = (float(boxA[k]) for k in range(4))
    bx1, by1, bx2, by2 = (float(boxB[k]) for k in range(4))
    # width and height of the intersection rectangle, clipped at 0 when the boxes are disjoint
    inter_width = max(0.0, min(ax2, bx2) - max(ax1, bx1) + 1)
    inter_height = max(0.0, min(ay2, by2) - max(ay1, by1) + 1)
    inter_area = inter_width * inter_height
    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    # union = sum of both areas minus the part counted twice
    return inter_area / (area_a + area_b - inter_area)

In [15]:
# Thresholds on the overlap (alpha, beta) and the costs applied between / above them (mu, v)
alpha = 0.2
beta = 0.4
mu = 1
v = 1000
overlap_cost = np.empty(shape = edges.shape)
shot_count = len(LIST_FILENAMEPATH_SHOTS)
for frame in range(VIDEO_INPUT_FRAMES_COUNT-1): # iterate along the time frames
    for current_shot in range(shot_count): # iterate along the current shots
        for next_shot in range(shot_count): # iterate along the next shots
            # overlap between the current shot at this frame and the next shot at the following frame
            gamma = bb_intersection_over_union(all_shots_coords[frame, :, current_shot], all_shots_coords[frame+1, :, next_shot])
            if current_shot == next_shot or 0 <= gamma < alpha:
                # staying on the same shot, or switching between barely overlapping shots, is free
                cost = 0
            elif alpha <= gamma <= beta:
                cost = mu * gamma / alpha
            elif beta < gamma <= 1:
                cost = v
            else:
                raise Exception("Error: Invalid gamma value. Valid range of gamma: [0, 1]")
            overlap_cost[current_shot, next_shot, frame] = cost
# edges is updated in place, so running this cell twice adds the cost twice
edges += overlap_cost

## Rhythm cost

In [16]:
# TODO: the rhythm cost is not implemented yet; no rhythm term is added to `edges`

# Dynamic programming

## Forward pass

In [17]:
# graph_cost[t, s] = cheapest accumulated cost of any path that ends on shot s at frame t
# back_trace[t, s] = shot chosen at frame t-1 on that cheapest path (-1 where nothing is stored)
shot_count = len(LIST_FILENAMEPATH_SHOTS)
graph_cost = np.zeros(shape = (VIDEO_INPUT_FRAMES_COUNT, shot_count))
back_trace = np.full((VIDEO_INPUT_FRAMES_COUNT, shot_count), -1.0)
graph_cost[0] = Gs_t_i[0]
for frame in range(1, VIDEO_INPUT_FRAMES_COUNT):
    # candidates[previous shot, current shot] = cost so far + edge cost + gaze potential of the current shot
    candidates = graph_cost[frame-1][:, np.newaxis] + edges[:, :, frame-1] + Gs_t_i[frame][np.newaxis, :]
    graph_cost[frame] = candidates.min(axis = 0)
    back_trace[frame] = candidates.argmin(axis = 0)

## Backward pass

In [18]:
# Selected shot number per frame: start from the cheapest shot at the last frame
# and follow the stored predecessors back to the first frame
result_t = np.full(VIDEO_INPUT_FRAMES_COUNT, -1, dtype = np.int16)
result_t[-1] = graph_cost[-1].argmin()
for frame in reversed(range(1, VIDEO_INPUT_FRAMES_COUNT)):
    result_t[frame-1] = back_trace[frame, result_t[frame]]
result_t.shape

(2276,)

# Save the result

In [19]:
# Write the selected shot of every frame, cropped from the input video and resized, to 'gazed.mp4'
OUTPUT_VIDEO_FRAME_SIZE = (1280, 720)
file_output_video = cv2.VideoWriter(DATASET_FOLDER + VIDEO_FOLDER + "/" + 'gazed.mp4', cv2.VideoWriter_fourcc(*'mp4v'), VIDEO_INPUT_FPS, OUTPUT_VIDEO_FRAME_SIZE)
file_input_video = cv2.VideoCapture(DATASET_FOLDER + VIDEO_FOLDER + "/" + VIDEO_FOLDER + ".mp4")
try:
    for i in range(VIDEO_INPUT_FRAMES_COUNT):
        success, img_frame = file_input_video.read()
        if not success:
            # img_frame cannot be cropped when the read failed; fail with a clear message instead
            raise RuntimeError("Could not read frame %d of the input video" % i)
        temp_x1, temp_y1, temp_x2, temp_y2 = (int(coord) for coord in all_shots_coords[i, :, result_t[i]])
        # a negative start index would be interpreted as "count from the end" by the slice below
        temp_x1 = max(temp_x1, 0)
        temp_y1 = max(temp_y1, 0)
        file_output_video.write(cv2.resize(img_frame[temp_y1:temp_y2+1, temp_x1:temp_x2+1], OUTPUT_VIDEO_FRAME_SIZE))
finally:
    # release both handles even when a frame fails, so the output file is finalised and closed
    file_input_video.release()
    file_output_video.release()