In [1]:
from glob import glob
from PIL import Image
import imagehash
from typing import List

def get_frame_idx(fname: str) -> int:
    """Extract the integer frame index encoded at the start of a file's base name.

    The base name is whatever follows the last "/" in ``fname``; the frame
    index is the text of that base name up to its first ".".

    For example, 'cropped/001.png' returns int 1.

    Raises ValueError (from ``int``) if that leading text is not an integer.
    """
    _, _, basename = fname.rpartition("/")
    stem, _, _ = basename.partition(".")
    return int(stem)





def create_blocks(stack: List[str]) -> List[List[int]]:
    """Group frame indices into contiguous blocks of nearby frames.

    Each pair of neighbouring file names in ``stack`` is compared by frame
    index (see ``get_frame_idx``). While the index gap between neighbours is
    smaller than MAX_FRAME_DIFF the two indices are added to the current
    block; a larger gap closes the current block and starts a new one.

    Args:
        stack: file names ordered by frame index.

    Returns:
        A list of blocks; each block is a sorted list of unique frame indices.
        Frames that have no close neighbour on either side belong to no block,
        so an empty or single-element ``stack`` yields ``[]``.
    """
    MAX_FRAME_DIFF = 5

    blocks = []
    block = set()  # a set, because every interior frame is seen in two pairs

    for prev_f, next_f in zip(stack, stack[1:]):
        p_idx = get_frame_idx(prev_f)
        n_idx = get_frame_idx(next_f)

        if (n_idx - p_idx) < MAX_FRAME_DIFF:
            block.update((p_idx, n_idx))
        else:
            # gap too large: close the running block (if any) and start afresh
            if block:
                blocks.append(sorted(block))
            block = set()

    # the last block is never followed by a gap, so flush it explicitly
    if block:
        blocks.append(sorted(block))
    return blocks



def create_snippets(video: List[str]) -> List[List[str]]:
    """Generate contiguous snippets from a list of frame file names.

    Neighbouring entries of ``video`` are kept in the same snippet while their
    frame indices (see ``get_frame_idx``) differ by fewer than MAX_FRAME_DIFF;
    a larger gap closes the current snippet and starts a new one.

    Args:
        video: frame file names ordered by frame index.

    Returns:
        A list of snippets; each snippet is a sorted list of unique file
        names. Frames with no close neighbour on either side belong to no
        snippet, so an empty or single-frame ``video`` yields ``[]`` and no
        empty snippet is ever emitted.
    """
    MAX_FRAME_DIFF = 5

    blocks = []
    block = set()  # a set, because every interior frame is seen in two pairs

    for prev_f, next_f in zip(video, video[1:]):
        p_idx = get_frame_idx(prev_f)
        n_idx = get_frame_idx(next_f)

        if (n_idx - p_idx) < MAX_FRAME_DIFF:
            block.update((prev_f, next_f))
        else:
            # gap too large: close the running snippet (if any), start afresh
            if block:
                blocks.append(sorted(block))
            block = set()

    # flush the trailing snippet, but only when it actually holds frames
    if block:
        blocks.append(sorted(block))
    return blocks


def split_files_by_actors(folder: str, label: str) -> dict:
    """Separate the images of ``folder/label`` by actor.

    A folder contains images from multiple actors. The actor id of a file is
    the integer in the second-to-last "."-separated field of its path
    (e.g. '.../001.3.png' belongs to actor 3).

    Args:
        folder: dataset folder to look in.
        label: sub-folder of ``folder`` whose files are grouped.

    Returns:
        A dict mapping actor id -> list of file paths for that actor. Paths
        are in sorted order; keys appear in order of first occurrence in that
        sorted listing. A missing or empty directory yields ``{}``.

    Raises:
        ValueError: if a file's actor field is not an integer.
        IndexError: if a file path contains no "." at all.
    """
    fnames = sorted(glob(folder + '/' + label + '/*'))

    # single pass: the directory is listed once and each file is parsed once
    fnames_by_actor = {}
    for fname in fnames:
        actor_id = int(fname.split(".")[-2])
        fnames_by_actor.setdefault(actor_id, []).append(fname)
    return fnames_by_actor


def _get_actor_ids(folder: str, label: str) -> List:
    fnames = sorted(glob(folder + '/' + label + '/*'))
    
    actor_ids = []
    for fname in fnames:
        actor_id = int(fname.split(".")[-2])
        if actor_id not in actor_ids:
            actor_ids.append(actor_id)
    return actor_ids


def remove_outliers_from_snippet(files: List[str]) -> List[str]:
    """Drop outlier frames from a snippet and return the remaining file names.

    A file is kept when its ``imagehash.average_hash`` equals that of at least
    one of its direct neighbours in ``files``; every other file is treated as
    an outlier and removed.

    Args:
        files: image file names, in snippet order.

    Returns:
        The sorted list of unique file names that were kept. With fewer than
        two files there is no neighbour to compare against, so the result is
        ``[]`` and no image is opened.
    """
    if len(files) < 2:
        return []

    # hash every image exactly once, closing each file handle right away
    hashes = []
    for fname in files:
        with Image.open(fname) as img:
            hashes.append(imagehash.average_hash(img))

    kept = set()
    for i in range(len(files) - 1):
        # keep both frames of a neighbouring pair whose hashes are identical
        if hashes[i] == hashes[i + 1]:
            kept.add(files[i])
            kept.add(files[i + 1])

    return sorted(kept)

# script

In [4]:
# get folders (sorted so that the processing order is deterministic)
folders = sorted(glob("dataset/*"))

for folder in folders:
    print(">> loading folder", folder)

    # eval static or not_static folder
    # NOTE: pass the folder of the current iteration; indexing folders[0] here
    # would evaluate the first folder over and over again.
    collection_by_actor = split_files_by_actors(folder=folder, label="not_static")

    # then evaluate for each actor
    for actor_id, videos_by_id in collection_by_actor.items():
        print(">> evaluating actor", actor_id)

        # convert to snippets (outliers not yet removed)
        snippets = create_snippets(videos_by_id)

        for snippet in snippets:
            processed_snippet = remove_outliers_from_snippet(snippet)

            # recreate snippets now that outliers have been removed, since
            # dropping frames may have opened new gaps
            final_snippets = create_snippets(processed_snippet)

            for g in final_snippets:
                # only report snippets with more than 10 frames
                if len(g) > 10:
                    print("len", len(g), g[0])

        print("-" * 50)
    print("=" * 50)

>> loading folder dataset/608832786432738882426817735212
>> evaluating actor 0
--------------------------------------------------
>> evaluating actor 1
--------------------------------------------------
>> evaluating actor 2
--------------------------------------------------
>> evaluating actor 3
--------------------------------------------------
>> evaluating actor 4
--------------------------------------------------
>> evaluating actor 5
--------------------------------------------------
>> evaluating actor 6
--------------------------------------------------
>> evaluating actor 7
--------------------------------------------------
>> evaluating actor 8
--------------------------------------------------
>> evaluating actor 9
--------------------------------------------------
>> evaluating actor 10
--------------------------------------------------
>> evaluating actor 11
--------------------------------------------------
>> loading folder dataset/709090324988773024868607674872
>> evalu