In [7]:
from glob import glob
from PIL import Image
import imagehash
from typing import List

def get_frame_idx(fname: str) -> int:
    """Return the frame index encoded at the start of a file name.

    The index is the text before the first "." in the last "/"-separated
    component of ``fname``, converted to ``int``.

    Example: 'cropped/001.png' -> 1.

    Raises:
        ValueError: if that leading text is not a valid integer.
    """
    # Keep only what follows the final "/" (the whole string when there is none).
    basename = fname.rsplit("/", 1)[-1]
    # Everything before the first "." is the zero-padded frame number.
    frame_token, _, _ = basename.partition(".")
    return int(frame_token)





def create_blocks(stack: List[str], max_frame_diff: int = 5) -> List[List[int]]:
    """Create contiguous blocks of neighbouring frame indices.

    Two consecutive entries of ``stack`` belong to the same block when the
    difference between their frame indices is strictly less than
    ``max_frame_diff``. A frame that is not close to either neighbour does not
    appear in any block, so every returned block holds at least two indices.

    Args:
        stack: frame file names, expected to be ordered by frame index
            (a negative difference also counts as "close", so unsorted input
            is not rejected).
        max_frame_diff: exclusive upper bound on the index gap inside a block.

    Returns:
        A list of blocks; each block is a list of frame indices in input order,
        with no duplicates and no empty blocks.
    """
    blocks: List[List[int]] = []
    block: List[int] = []

    # Parse every index once instead of twice per adjacent pair.
    indices = [get_frame_idx(fname) for fname in stack]

    for p_idx, n_idx in zip(indices, indices[1:]):
        if (n_idx - p_idx) < max_frame_diff:
            # Only the first pair of a block contributes its left element;
            # later pairs would otherwise add the shared frame twice.
            if not block:
                block.append(p_idx)
            block.append(n_idx)
        elif block:
            # Gap found: close the running block (never emit empty ones).
            blocks.append(block)
            block = []

    # The last block has no trailing gap to trigger the flush above.
    if block:
        blocks.append(block)

    return blocks



def create_snippets(video: List[str], max_frame_diff: int = 5) -> List[List[str]]:
    """Generate contiguous snippets from a list of frame file names.

    A snippet is a run of frames in which each consecutive pair of frame
    indices differs by strictly less than ``max_frame_diff``. A frame that is
    not close to either neighbour is left out, so every snippet holds at least
    two files.

    Args:
        video: frame file names, expected to be ordered by frame index.
        max_frame_diff: exclusive upper bound on the index gap inside a snippet.

    Returns:
        A list of snippets; each snippet is a sorted list of unique file names.
        No empty snippet is ever returned (empty or single-frame input yields
        an empty list).
    """
    snippets: List[List[str]] = []
    current = set()  # files of the snippet being built; a set absorbs shared neighbours

    for prev_f, next_f in zip(video, video[1:]):
        if (get_frame_idx(next_f) - get_frame_idx(prev_f)) < max_frame_diff:
            current.update((prev_f, next_f))
        elif current:
            # Gap found: close the running snippet.
            snippets.append(sorted(current))
            current = set()

    # Flush the trailing snippet, but only if there is one; an unconditional
    # append here would emit a spurious empty snippet.
    if current:
        snippets.append(sorted(current))

    return snippets


def split_files_by_actors(folder: str, label: str) -> dict:
    """Separate the images of one label folder by actor.

    A folder contains images from multiple actors. Every file matching
    ``<folder>/<label>/*`` is assigned to an actor id, which is the
    second-to-last "."-separated field of its path converted to ``int``
    (e.g. '.../00224.00001.jpg' -> actor 1).

    Args:
        folder: dataset folder that contains the label sub-folder.
        label: name of the sub-folder to scan (e.g. "static").

    Returns:
        A dict mapping actor id (int) to the list of that actor's file names.
        Keys appear in order of first occurrence in the sorted file list, and
        each list is in sorted file-name order. Empty when nothing matches.

    Raises:
        ValueError or IndexError: if a matched path does not carry an integer
            in its second-to-last "."-separated field.
    """
    fnames_by_actor = {}

    # Single pass: the directory is listed once and each file is bucketed
    # directly, instead of re-scanning the full file list for every actor.
    for fname in sorted(glob(folder + '/' + label + '/*')):
        actor_id = int(fname.split(".")[-2])
        fnames_by_actor.setdefault(actor_id, []).append(fname)

    return fnames_by_actor


def _get_actor_ids(folder: str, label: str) -> List:
    fnames = sorted(glob(folder + '/' + label + '/*'))
    
    actor_ids = []
    for fname in fnames:
        actor_id = int(fname.split(".")[-2])
        if actor_id not in actor_ids:
            actor_ids.append(actor_id)
    return actor_ids


def _average_hash(path: str):
    """Open one image, compute its average hash, and close the file again."""
    with Image.open(path) as img:
        return imagehash.average_hash(img)


def remove_outliers_from_snippet(files: List[str], max_hash_distance: int = 0) -> List[str]:
    """Remove images that are not similar to any neighbouring image in the batch.

    ``files`` is a list of image file names sorted by frame index. Each image
    is compared with its immediate neighbours only, and is kept when its
    average hash is within ``max_hash_distance`` bits of at least one of them.
    Long contiguous runs of similar images suggest static faces, so images
    that match neither neighbour are dropped.

    Args:
        files: image file names, sorted by frame index.
        max_hash_distance: largest Hamming distance between two average hashes
            that still counts as "similar". The default of 0 requires the
            hashes to be identical.

    Returns:
        The sorted list of unique file names that are similar to at least one
        neighbour. Inputs with fewer than two files return an empty list
        without opening anything.
    """
    # With fewer than two files there is no pair to compare.
    if len(files) < 2:
        return []

    # Hash every file exactly once; comparing adjacent pairs directly would
    # open and hash each interior image twice.
    hashes = [_average_hash(path) for path in files]

    kept = set()
    for i in range(len(files) - 1):
        # Subtracting two image hashes yields their Hamming distance.
        if (hashes[i + 1] - hashes[i]) <= max_hash_distance:
            kept.update((files[i], files[i + 1]))

    return sorted(kept)

# script

In [8]:
# collect the path of every entry directly under dataset/;
# the bare name on the last line displays the resulting list
folders = glob("dataset/*")
folders

['dataset/547538174468711490516541559363']

In [9]:
# take the first dataset folder and group the images of its "static" label by actor:
# the result is a dict mapping int actor id -> list of image file names
collection_by_actor = split_files_by_actors(folder = folders[0], label = "static")

In [10]:
videos_by_id = collection_by_actor[1]  # frame file names of actor id 1 (a flat list of files, despite the name)

In [13]:
# group actor 1's frames into snippets of neighbouring frame indices
snippets = create_snippets(videos_by_id)  # snippets without outliers removed
# display the first snippet only
snippets[0]

['dataset/547538174468711490516541559363/static/00224.00001.jpg',
 'dataset/547538174468711490516541559363/static/00228.00001.jpg',
 'dataset/547538174468711490516541559363/static/00232.00001.jpg',
 'dataset/547538174468711490516541559363/static/00236.00001.jpg',
 'dataset/547538174468711490516541559363/static/00240.00001.jpg',
 'dataset/547538174468711490516541559363/static/00244.00001.jpg',
 'dataset/547538174468711490516541559363/static/00248.00001.jpg',
 'dataset/547538174468711490516541559363/static/00252.00001.jpg',
 'dataset/547538174468711490516541559363/static/00256.00001.jpg',
 'dataset/547538174468711490516541559363/static/00260.00001.jpg',
 'dataset/547538174468711490516541559363/static/00264.00001.jpg',
 'dataset/547538174468711490516541559363/static/00268.00001.jpg']

In [15]:
# drop the frames of the first snippet whose average hash matches neither neighbour.
# NOTE: despite the plural name, the result is ONE snippet (a flat list of file names)
processed_snippets = remove_outliers_from_snippet(snippets[0])
processed_snippets

['dataset/547538174468711490516541559363/static/00228.00001.jpg',
 'dataset/547538174468711490516541559363/static/00232.00001.jpg',
 'dataset/547538174468711490516541559363/static/00236.00001.jpg',
 'dataset/547538174468711490516541559363/static/00240.00001.jpg',
 'dataset/547538174468711490516541559363/static/00244.00001.jpg',
 'dataset/547538174468711490516541559363/static/00248.00001.jpg',
 'dataset/547538174468711490516541559363/static/00252.00001.jpg',
 'dataset/547538174468711490516541559363/static/00256.00001.jpg',
 'dataset/547538174468711490516541559363/static/00260.00001.jpg',
 'dataset/547538174468711490516541559363/static/00264.00001.jpg',
 'dataset/547538174468711490516541559363/static/00268.00001.jpg']

In [16]:
# removing outliers can open new gaps between frame indices, so the surviving files are re-split
create_snippets(processed_snippets)  # recreate snippets now that outliers have been removed

[['dataset/547538174468711490516541559363/static/00228.00001.jpg',
  'dataset/547538174468711490516541559363/static/00232.00001.jpg',
  'dataset/547538174468711490516541559363/static/00236.00001.jpg',
  'dataset/547538174468711490516541559363/static/00240.00001.jpg',
  'dataset/547538174468711490516541559363/static/00244.00001.jpg',
  'dataset/547538174468711490516541559363/static/00248.00001.jpg',
  'dataset/547538174468711490516541559363/static/00252.00001.jpg',
  'dataset/547538174468711490516541559363/static/00256.00001.jpg',
  'dataset/547538174468711490516541559363/static/00260.00001.jpg',
  'dataset/547538174468711490516541559363/static/00264.00001.jpg',
  'dataset/547538174468711490516541559363/static/00268.00001.jpg']]