In [None]:
# --- Import packages ---
import os
import json
import re
import math
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from scipy import spatial
from scipy.spatial.distance import cosine
from matplotlib.colors import ListedColormap

In [None]:

# --- Define paths and parameters (to be customized by user) ---
# Directory holding the per-participant pickle; it is joined with
# "<participant_id>_df.pkl" when the raw data is loaded.
df_raw_path_prefix = "PATH_TO_PICKLE_DIRECTORY/"
# Video the detections were extracted from; only its frame rate is read here.
video_input_path = "PATH_TO_VIDEO_FILE.mp4"
participant_id = "PARTICIPANT_ID"
# Row label at which the 10-minute analysis window starts.
start_point = 2400  # Adjust to where your analysis window starts

# --- Occlusion periods (in seconds) ---
# Times are multiplied by fps to obtain frame numbers, which are then compared
# against the row labels of the windowed data.
# NOTE(review): `occlusion` is not read anywhere in the visible notebook; only
# `occlusion_2` gates the segmentation cell -- confirm whether a
# single-occlusion path is expected.
occlusion = "yes"
start_time = 4 * 60 + 4
end_time = 5 * 60 + 0

occlusion_2 = "yes"
start_time_2 = 6 * 60 + 3
end_time_2 = 6 * 60 + 15

# --- Get video FPS ---
def get_video_fps(video_path):
    """Return the frame rate OpenCV reports for a video file.

    Parameters
    ----------
    video_path : str
        Path handed to ``cv2.VideoCapture``.

    Returns
    -------
    float
        The value of ``cv2.CAP_PROP_FPS`` for the opened capture.

    Raises
    ------
    RuntimeError
        If the capture cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video file: {video_path!r}")
        return cap.get(cv2.CAP_PROP_FPS)
    finally:
        # Release the handle on every path, including the failed-open one.
        cap.release()

fps = get_video_fps(video_input_path)

# Optional: snap known near-nominal (variable) frame rates to a whole number.
# The bounds are exclusive; anything outside the three bands is left untouched.
fps = (
    30 if 29 < fps < 31
    else 58 if 57 < fps < 59
    else 60 if 59 < fps < 61
    else fps
)

print(f"Frames per second (fps): {fps}")

# --- Load raw data ---
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
df_raw = pd.read_pickle(os.path.join(df_raw_path_prefix, f"{participant_id}_df.pkl"))

# --- Rename columns and prepare working copy ---
# Label-based slice up to the label len(df_raw).
# NOTE(review): this looks like a no-op for a default integer index -- confirm.
df_raw = df_raw.loc[:len(df_raw), :]
# reset_index() moves the old index into a column, so the three names below
# are assigned to that column plus the remaining ones; the assignment raises
# unless the result has exactly three columns.
# NOTE(review): confirm the pickle schema this relies on.
df = df_raw.reset_index()
df.columns = ["Pb1", "Pb2", "Pb3"]
df_copy = df.copy()

# --- Create empty DataFrame for cleaned output ---
# Same row labels as `df`, every cell initially NaN.
new_df = pd.DataFrame(columns=["P1", "P2", "P3"], index=range(0, len(df)))

# --- Select 10-minute segment starting from `start_point` ---
# .loc slicing is inclusive on both ends, so each slice can hold up to
# frame_window + 1 rows.
frame_window = int(10 * 60 * fps)
df_head = df_copy.loc[start_point:start_point + frame_window]
new_df = new_df.loc[start_point:start_point + frame_window]

# --- Trim to start at first non-NaN index ---
# First row label at which all three raw columns are non-null.
min_index = df_head[["Pb1", "Pb2", "Pb3"]].dropna().index.min()
df_head = df_head.loc[min_index:min_index + frame_window]
new_df = new_df.loc[min_index:min_index + frame_window]


In [None]:
def euc_distance(v1, v2):
    """Euclidean distance between ``v1`` and ``v2``, skipping zeroed entries.

    Any position where either input equals 0.0 is replaced by NaN in both
    inputs and therefore ignored by the NaN-aware sum. If nothing comparable
    remains (for instance when one input is a scalar NaN) the result is 0.0.
    """
    either_zero = np.logical_or(v1 == 0.0, v2 == 0.0)

    # Blank out the excluded positions on both sides before differencing
    delta = np.where(either_zero, np.nan, v1) - np.where(either_zero, np.nan, v2)

    return np.sqrt(np.nansum(delta ** 2))


def cosine_2d(v1, v2):
    """Cosine similarity between two flattened arrays.

    Returns 0.0 when either input carries no usable signal: a scalar
    placeholder (such as the NaN stored in an empty cell) or an array without
    a single non-zero entry. Otherwise returns
    ``1 - scipy.spatial.distance.cosine`` of the flattened inputs.

    The earlier implementation reached the scalar case only by accident:
    ``~True`` on a Python bool evaluates to ``-2``, which is truthy when used
    as a mask. A ``np.float64`` NaN produced mask ``False`` instead and fell
    through to SciPy with mismatched shapes. The masks themselves had no
    effect on the result because SciPy converts masked arrays back to plain
    data, so they are replaced here by explicit checks.
    """
    a = np.asarray(v1)
    b = np.asarray(v2)

    # A 0-d input is a placeholder (e.g. NaN), not a set of coordinates.
    if a.ndim == 0 or b.ndim == 0:
        return 0.0

    # An array with no non-zero entry has zero norm; similarity is undefined.
    if not (a != 0.0).any() or not (b != 0.0).any():
        return 0.0

    return 1 - spatial.distance.cosine(a.ravel(), b.ravel())

def is_float(value):
    """True when ``value`` is a ``float`` (or subclass such as ``np.float64``)."""
    return issubclass(type(value), float)

def is_numpy_array(value):
    """True when ``value`` is a ``numpy.ndarray`` (or a subclass of it)."""
    return issubclass(type(value), np.ndarray)

def make_whateversnan_nan(df_copy, new_df, i):
    """Write NaN into the tracked cells of row ``i`` that cannot be filled.

    The decision is based on how many raw cells (``Pb1``..``Pb3``) of row
    ``i`` are floats rather than arrays, and on which tracked cells
    (``P1``..``P3``) currently hold arrays. ``new_df`` is modified in place;
    nothing is returned.
    """
    raw_float_count = sum(
        is_float(df_copy.loc[i, col]) for col in ('Pb1', 'Pb2', 'Pb3')
    )

    P1, P2, P3 = (new_df.loc[i, col] for col in ('P1', 'P2', 'P3'))

    # NOTE(review): this counts cells that are not None. A NaN cell is not
    # None, so it counts as present here -- confirm that is the intent, since
    # it decides whether the `== 2` case below can ever be taken.
    not_none_count = sum(cell is not None for cell in (P1, P2, P3))

    exactly_one_of_first_two = (
        (is_numpy_array(P1) and is_float(P2))
        or (is_float(P1) and is_numpy_array(P2))
    )

    if not_none_count == 3 and raw_float_count == 3:
        print(f"three nans for i = {i}")
        new_df.loc[i, 'P1'] = np.nan
        new_df.loc[i, 'P2'] = np.nan
        new_df.loc[i, 'P3'] = np.nan
    elif not_none_count == 2 and is_numpy_array(P1) and raw_float_count == 2:
        print(f"two nans moved to P2 and P3 for i = {i}")
        new_df.loc[i, 'P2'] = np.nan
        new_df.loc[i, 'P3'] = np.nan
    elif (not_none_count == 3 and is_numpy_array(P1) and is_numpy_array(P2)
            and raw_float_count == 1):
        print(f"our two arrays are already filled, one nan moved to P3 for i = {i}")
        new_df.loc[i, 'P3'] = np.nan
    elif (not_none_count == 3 and is_float(P3) and exactly_one_of_first_two
            and raw_float_count == 2):
        print(f"one nan moved to P3 for i = {i}")
        new_df.loc[i, 'P3'] = np.nan
    

def calculate_coses(new_df, df_copy, i, z, P_num_new):
    """Score the three raw detections of row ``i`` against one tracked person.

    The reference is the tracked cell ``P<P_num_new>`` at row ``i - z``.

    Returns
    -------
    (all_coses, sorted_coses)
        ``all_coses`` holds ``(raw_column, similarity)`` for ``Pb1``..``Pb3``
        in column order; ``sorted_coses`` holds the two highest-scoring pairs,
        best first.
    """
    reference = new_df.loc[i - z, 'P' + str(P_num_new)]

    all_coses = [
        (raw_col, cosine_2d(reference, df_copy.loc[i, raw_col]))
        for raw_col in ('Pb1', 'Pb2', 'Pb3')
    ]

    # Rank by similarity, highest first, and keep the two best candidates
    ranked = sorted(all_coses, key=lambda pair: pair[1], reverse=True)
    return all_coses, ranked[:2]

def calculate_distances(sorted_coses, df_copy, new_df, i, z, P_num_new):
    """Distance of the two best cosine candidates to the tracked person.

    Both candidates are looked up in row ``i`` of ``df_copy`` and compared,
    via ``euc_distance``, with the tracked cell ``P<P_num_new>`` at row
    ``i - z``.

    Returns
    -------
    tuple
        ``(sum_distances_1, sum_distances_2, name_1, array_1, name_2, array_2,
        distances_1, distances_2)``.
    """
    reference_col = 'P' + str(P_num_new)

    name_1 = sorted_coses[0][0]
    name_2 = sorted_coses[1][0]
    array_1 = df_copy.loc[i, name_1]
    array_2 = df_copy.loc[i, name_2]

    distances_1 = euc_distance(array_1, new_df.loc[i - z, reference_col])
    distances_2 = euc_distance(array_2, new_df.loc[i - z, reference_col])

    return (
        np.nansum(distances_1),
        np.nansum(distances_2),
        name_1,
        array_1,
        name_2,
        array_2,
        distances_1,
        distances_2,
    )

def find_person(df_copy, new_df, i, P_num_new, sum_dist_thresh, z=1):
    """Decide which raw detection of row ``i`` continues person ``P<P_num_new>``.

    The reference is the tracked cell at row ``i - z``. A candidate is taken
    on cosine similarity alone when it wins clearly; otherwise the two best
    candidates are compared by distance; otherwise the cell is set to NaN.
    ``new_df`` is modified in place and ``None`` is returned.
    """
    if i - z < 0:
        return None

    target_col = 'P' + str(P_num_new)

    # If all three should be NaN, fill with NaN
    make_whateversnan_nan(df_copy, new_df, i)
    all_coses, sorted_coses = calculate_coses(new_df, df_copy, i, z, P_num_new)
    (sum_distances_1, sum_distances_2,
     name_1, _array_1, name_2, _array_2,
     _distances_1, _distances_2) = calculate_distances(
        sorted_coses, df_copy, new_df, i, z, P_num_new)
    print(all_coses)
    print(sorted_coses)

    best_cos = float(sorted_coses[0][1])
    # Gap between the best and the second-best similarity
    margin = sorted_coses[0][1] - sorted_coses[1][1]

    # Clear winner on cosine similarity that is also close enough
    if margin >= 0.25 and best_cos > 0.94 and sum_distances_1 < sum_dist_thresh:
        max_cos = sorted_coses[0][0]
        print("For P" + str(P_num_new) + " max_cos index is " + max_cos + ", for i = {0}".format(i))
        new_df.loc[i, target_col] = df_copy.loc[i, max_cos]
        return None

    # NOTE(review): `and` binds tighter than `or`, so the similarity floor of
    # .73 applies only to the second candidate's distance test. The grouping
    # below spells out what the expression evaluates to; confirm whether
    # "(d1 < 300 or d2 < 300) and cos > .73" was intended instead.
    compare_by_distance = (
        best_cos > 0.89
        or sum_distances_1 < 300
        or (sum_distances_2 < 300 and best_cos > .73)
    )

    if not compare_by_distance:
        print("max cosine similarity value was " + str(sorted_coses[0][1]) + " distances were " + str(sum_distances_1) + " " + str(sum_distances_2) + " for i = {0}".format(i))
        new_df.loc[i, target_col] = np.nan
        return None

    # Similarity alone was not decisive: fall back to comparing distances
    print("For P" + str(P_num_new) + " In i = {0}, ".format(i) + "Not high enough max cosine value or difference between the top two values. Comparing distances.")
    if sum_distances_1 > sum_dist_thresh and sum_distances_2 > sum_dist_thresh:
        print("both sum distances are over thresh at " + str(sum_distances_1) + " and " + str(sum_distances_2) + "so we changed to NaN for i = {0}".format(i))
        new_df.loc[i, target_col] = np.nan
    elif sum_distances_1 < sum_distances_2 and sum_distances_1 < sum_dist_thresh and sum_distances_1 != 0.0:
        print(name_1 + " distance sum is smaller at " + str(sum_distances_1) + " compared to " + str(sum_distances_2))
        new_df.loc[i, target_col] = df_copy.loc[i, name_1]
    elif sum_distances_2 < sum_distances_1 and sum_distances_2 < sum_dist_thresh and sum_distances_2 != 0.0:
        print(name_2 + " has a lower distance at " + str(sum_distances_2) + " compared to " + str(sum_distances_1))
        new_df.loc[i, target_col] = df_copy.loc[i, name_2]
    # Any other combination (ties, zero distances) leaves the cell untouched.
    return None
        
def control_fakeP1_and_P2_duplicates(i, new_df, z):
    """Blank one of two tracked cells of row ``i`` that hold the same array.

    Pairs are checked in the order (P1, P2), (P1, P3), (P2, P3) and only the
    first matching pair is handled. Within that pair, each column's cell at
    row ``i`` is compared with its own cell at row ``i - z``; the column with
    the larger distance is set to NaN, and the second column of the pair is
    blanked when the first is not strictly larger.
    """
    current = {col: new_df.loc[i, col] for col in ('P1', 'P2', 'P3')}

    for first_col, second_col in (('P1', 'P2'), ('P1', 'P3'), ('P2', 'P3')):
        if not np.array_equal(current[first_col], current[second_col]):
            continue

        drift_first = np.nansum(
            euc_distance(new_df.loc[i, first_col], new_df.loc[i - z, first_col]))
        drift_second = np.nansum(
            euc_distance(new_df.loc[i, second_col], new_df.loc[i - z, second_col]))

        dropped_col = first_col if drift_first > drift_second else second_col
        new_df.loc[i, dropped_col] = np.nan
        break


def fill_gap(i, new_df, df_copy, z, P_num_new, sum_dist_thresh):
    """Place a raw detection of row ``i`` that tracking left unassigned.

    Only acts when ``P_num_new == 3``. ``new_df`` is modified in place.

    Case A -- all three raw cells are arrays, two tracked cells are arrays and
    one is a float: the first raw array that equals neither tracked array is
    written into the empty tracked column.

    Case B -- exactly one raw cell is a float, one tracked cell is an array
    and the other two are floats: the raw array that is not yet tracked is
    compared (``euc_distance``) with the two empty columns at row ``i - z``
    and written into the nearer one, or both stay NaN when both distances
    exceed ``sum_dist_thresh``.

    Fixes relative to the previous version:
    * Case B had no effect: its decision logic was nested under a branch that
      cannot be reached when two raw cells are arrays, and its results were
      assigned to local variables instead of ``new_df``.
    * The unassigned raw array is now searched across all raw cells instead of
      stopping at the first cell that holds an array.
    * Case A no longer raises ``StopIteration`` when every raw array is
      already tracked; the row is left unchanged instead.
    * Case B is skipped when ``i - z`` would fall before the first row.

    NOTE(review): ``euc_distance`` yields 0.0 when the cell at ``i - z`` is
    NaN, so an empty column without history compares as the nearest one --
    confirm this is the wanted tie-break. Case B also uses ``is_float``
    (isinstance) where the old code tested ``type(x) == float``.
    """
    if P_num_new != 3:
        return

    tracked_cols = ('P1', 'P2', 'P3')
    raw_values = [df_copy.loc[i, col] for col in ('Pb1', 'Pb2', 'Pb3')]
    tracked = {col: new_df.loc[i, col] for col in tracked_cols}

    filled_cols = [col for col in tracked_cols if is_numpy_array(tracked[col])]
    empty_cols = [col for col in tracked_cols if is_float(tracked[col])]

    # First raw array that is not already held by a tracked column.
    leftover = next(
        (value for value in raw_values
         if is_numpy_array(value)
         and not any(np.array_equal(value, tracked[col]) for col in filled_cols)),
        None,
    )

    # --- Case A: three detections, two of them tracked ---
    if all(is_numpy_array(value) for value in raw_values):
        if len(filled_cols) == 2 and len(empty_cols) == 1 and leftover is not None:
            target = empty_cols[0]
            new_df.loc[i, target] = leftover
            print("all 3 should be there - filled in {0} with the missing value that didn't qualify tracking for i = {1}".format(target, i))
        return

    # --- Case B: two detections, one of them tracked ---
    raw_missing = sum(is_float(value) for value in raw_values)
    if raw_missing != 1 or len(filled_cols) != 1 or len(empty_cols) != 2:
        return
    if leftover is None or i - z < 0:
        return

    first_col, second_col = empty_cols
    sum_distances_1 = np.nansum(euc_distance(leftover, new_df.loc[i - z, first_col]))
    sum_distances_2 = np.nansum(euc_distance(leftover, new_df.loc[i - z, second_col]))

    if (sum_distances_1 > sum_dist_thresh and sum_distances_2 > sum_dist_thresh
            and sum_distances_1 != 0.0 and sum_distances_2 != 0.0):
        print("sumdistances of both {0} and {1} columns with the missing array were over thresh".format(first_col, second_col))
        new_df.loc[i, first_col] = np.nan
        new_df.loc[i, second_col] = np.nan
    elif sum_distances_1 < sum_distances_2:
        new_df.loc[i, first_col] = leftover
        print("filled {0} in with missing array".format(first_col))
    elif sum_distances_1 > sum_distances_2:
        new_df.loc[i, second_col] = leftover
        print("filled {0} in with missing array".format(second_col))
    # Equal distances: no basis for choosing a column, leave both empty.


def track_df(df_copy, new_df):
    """Track up to three people through a segment, row by row.

    Row 0 of ``new_df`` is expected to be seeded already; rows 1.. are filled
    in place. For every row and every person the look-back distance ``z`` is
    the number of rows to step back until that person's tracked cell contains
    no NaN. The cell is then matched, de-duplicated, gap-filled and
    de-duplicated again.
    """
    for i in tqdm(range(1, len(df_copy))):
        for P_num_new in (1, 2, 3):
            person_col = 'P' + str(P_num_new)

            # Step back until this person's cell has no NaN (or rows run out)
            z = 1
            while (i - z >= 0) and np.isnan(new_df.loc[i - z, person_col]).any():
                z += 1

            find_person(df_copy, new_df, i, P_num_new, z=z, sum_dist_thresh=320)
            control_fakeP1_and_P2_duplicates(i, new_df, z)
            fill_gap(i, new_df, df_copy, z, P_num_new, sum_dist_thresh=320)
            control_fakeP1_and_P2_duplicates(i, new_df, z)

In [None]:
if occlusion_2 == "yes":
    # --- Calculate frame numbers corresponding to the start and end times ---
    start_frame = int(start_time * fps)
    end_frame = int(end_time * fps)
    start_frame_2 = int(start_time_2 * fps)
    end_frame_2 = int(end_time_2 * fps)

    # --- Frame ranges for occlusions (both ends inclusive) ---
    occlusion_frame_range = range(start_frame, end_frame + 1)
    occlusion_frame_range_2 = range(start_frame_2, end_frame_2 + 1)

    def _seed_segment(raw_segment, tracked_segment, label):
        """Trim one segment to its first complete frame and seed the output.

        Both frames are cut so that they start at the first row in which
        Pb1, Pb2 and Pb3 are all non-null; that row is copied into P1..P3.
        The raw segment must be trimmed together with the tracked one: the
        tracking cells reset both to a positional index, so row k of one has
        to describe the same video frame as row k of the other.

        Returns (raw_segment, tracked_segment, first_complete_index).
        """
        first_complete = raw_segment[['Pb1', 'Pb2', 'Pb3']].dropna().index.min()
        if pd.isna(first_complete):
            raise ValueError(
                f"No frame with all three detections in the segment {label}; "
                "adjust the occlusion times or start_point."
            )
        raw_segment = raw_segment[raw_segment.index >= first_complete]
        # Explicit copy: the seed row is written into this frame below, and
        # writing into a bare boolean-mask slice triggers SettingWithCopy.
        tracked_segment = tracked_segment[tracked_segment.index >= first_complete].copy()
        for raw_col, tracked_col in (('Pb1', 'P1'), ('Pb2', 'P2'), ('Pb3', 'P3')):
            tracked_segment.loc[first_complete, tracked_col] = raw_segment.loc[first_complete, raw_col]
        return raw_segment, tracked_segment, first_complete

    # --- Occluded stretches (not used by the tracking cells in this notebook) ---
    df_occlusion = df_head[df_head.index.isin(occlusion_frame_range)]
    df_occlusion_2 = df_head[df_head.index.isin(occlusion_frame_range_2)]

    # --- Segment 1: before the first occlusion ---
    df_head1, new_df1, min_index1 = _seed_segment(
        df_head[df_head.index < start_frame],
        new_df[new_df.index < start_frame],
        "before the first occlusion",
    )

    # --- Segment 2: between the two occlusions ---
    between_raw = (df_head.index > end_frame) & (df_head.index < start_frame_2)
    between_tracked = (new_df.index > end_frame) & (new_df.index < start_frame_2)
    df_head2, new_df2, min_index2 = _seed_segment(
        df_head[between_raw],
        new_df[between_tracked],
        "between the two occlusions",
    )

    # --- Segment 3: after the second occlusion ---
    df_head3, new_df3, min_index3 = _seed_segment(
        df_head[df_head.index > end_frame_2],
        new_df[new_df.index > end_frame_2],
        "after the second occlusion",
    )



In [None]:
# Re-base segment 1 onto a positional 0..n-1 index, keeping only the data columns
df_head1 = df_head1[["Pb1", "Pb2", "Pb3"]].reset_index(drop=True)
new_df1 = new_df1[["P1", "P2", "P3"]].reset_index(drop=True)

# Run tracking for the first part (fills new_df1 in place)
track_df(df_head1, new_df1)

In [None]:
# Shift the positional index of segment 1 back to the original frame numbers
new_index1 = new_df1.index + min_index1
new_df1_a = new_df1.copy()
new_df1_a.index = new_index1

# Re-base segment 2 onto a positional 0..n-1 index, keeping only the data columns
df_head2 = df_head2[["Pb1", "Pb2", "Pb3"]].reset_index(drop=True)
new_df2 = new_df2[["P1", "P2", "P3"]].reset_index(drop=True)

# Run tracking for the second part (fills new_df2 in place)
track_df(df_head2, new_df2)

In [None]:
# Shift the positional index of segment 2 back to the original frame numbers
new_index2 = new_df2.index + min_index2
new_df2_a = new_df2.copy()
new_df2_a.index = new_index2

# Re-base segment 3 onto a positional 0..n-1 index, keeping only the data columns
df_head3 = df_head3[["Pb1", "Pb2", "Pb3"]].reset_index(drop=True)
new_df3 = new_df3[["P1", "P2", "P3"]].reset_index(drop=True)

# Run tracking for the third part (fills new_df3 in place)
track_df(df_head3, new_df3)

In [None]:
# Shift the positional index of segment 3 back to the original frame numbers
new_index3 = new_df3.index + min_index3
new_df3_a = new_df3.copy()
new_df3_a.index = new_index3

# Stitch the three tracked segments back together
new_df = pd.concat([new_df1_a, new_df2_a, new_df3_a])

# Re-insert the frames skipped around the occlusions as all-NaN rows so the
# index is contiguous from the first to the last tracked frame
index_range = range(new_df.index.min(), new_df.index.max() + 1)
new_df = new_df.reindex(index_range)

# Save under the participant id set in the configuration cell. The previous
# version referenced `ppt`, which is not defined anywhere in this notebook and
# raises NameError on a fresh Restart & Run All. The former
# fillna(np.nan, inplace=True) call replaced NaN with NaN and was dropped.
output_stem = f"new_df_10_mins_{participant_id}_latest_occ"
new_df.to_pickle(output_stem + ".pkl")
new_df.to_csv(output_stem + ".csv")