In [None]:
import asyncio
from collections import OrderedDict
import logging
import os
from pathlib import Path
import sys

import imageio.v3 as iio
from matplotlib import colormaps, colors
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.cluster import HDBSCAN
import umap

from mime_db import MimeDb
from pose_functions import *

In [None]:
VIDEO_FILE = "JuliusCaesar—WinterMainStage23.mp4" # Just the name of the video file, no path

video_path = Path("videos", VIDEO_FILE)

# Connect to the database
db = await MimeDb.create()

# Get video metadata
video_name = video_path.name
video_id = await db.get_video_id(video_name)
video_id = video_id[0]["id"]

video_movelets = await db.get_movelet_data_from_video(video_id)

movelets_df = pd.DataFrame.from_records(video_movelets, columns=video_movelets[0].keys())

In [None]:
# Quick numeric overview of the 'movement' column before any filtering.
movement_values = movelets_df['movement']

print("TOTAL MOVELETS:", len(movelets_df))

# Each caption is paired with the boolean mask whose True-count it reports.
movement_breakdown = [
    ("NON-MOTION MOVELETS:", movement_values.isna()),
    ("MOVELETS WITH STILL MOTION:", movement_values == 0),
    ("MOVELETS WITH MOVEMENT < 10px/sec:", (movement_values >= 0) & (movement_values < 10)),
]
for caption, mask in movement_breakdown:
    print(caption, int(mask.sum()))

# NaN-aware mean / median, so movelets without a movement value are ignored.
print("MEAN MOVEMENT PER MOVELET (norm px/sec):", np.nanmean(movement_values))
print("MEDIAN MOVEMENT PER MOVELET (norm px/sec):", np.nanmedian(movement_values))

In [None]:
# Histogram of per-movelet movement, used to estimate the modal movement value
# (`movement_mode`) that later cells use as the "frozen pose" threshold.

# Replace missing movement with the sentinel -1 so those rows still land in the
# histogram. Assign the result back instead of calling fillna(inplace=True) on
# the column selection: the chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
nonnull_movelets_df = movelets_df.copy()
nonnull_movelets_df['movement'] = nonnull_movelets_df['movement'].fillna(-1)

# Only values up to 500 are plotted, split into 300 equal-width bins.
n, bins, patches = plt.hist(
    nonnull_movelets_df.loc[nonnull_movelets_df['movement'] <= 500, 'movement'],
    bins=300,
)
plt.xlabel("Movement (normalized pixels/sec)")
plt.ylabel("# Movelets")

# The first bin is skipped when looking for the peak (it is where the -1
# sentinel rows fall when any are present). argmax() on the slice n[1:] is
# relative to that slice, so add 1 to get the index into `n` / `bins`;
# without the offset the reported bin is one bin too low.
top_bin = n[1:].argmax() + 1

# The mode is taken as the midpoint of the most populated bin.
movement_mode = (bins[top_bin] + bins[top_bin+1])/2
print('most frequent bin: (' + str(bins[top_bin]) + ',' + str(bins[top_bin+1]) + ')')
print('mode: '+ str(movement_mode))

In [None]:
# "Frozen" movelets: movement is defined (>= 0) and strictly below the modal
# movement value. reset_index() renumbers the rows from 0 and keeps the
# previous index as a column.
frozen_movelets = movelets_df.loc[
    lambda df: df['movement'].ge(0) & df['movement'].lt(movement_mode)
].reset_index()

# One entry per frozen movelet, in the same row order as `frozen_movelets`.
frozen_poses = frozen_movelets['norm'].tolist()

In [None]:
# UMAP projection of the frozen poses with library defaults; only the seed is
# pinned so the layout is repeatable.
standard_reducer = umap.UMAP(random_state=42)
standard_embedding = standard_reducer.fit_transform(frozen_poses)

# Scatter the first two embedding columns.
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], s=4)

In [None]:
# Second UMAP projection of the frozen poses, with explicit neighbourhood,
# distance and dimensionality settings; this is the embedding the clustering
# cells plot against.
clusterable_umap_params = dict(
    n_neighbors=10,
    min_dist=1.0,
    n_components=2,
    random_state=42,
)
clusterable_embedding = umap.UMAP(**clusterable_umap_params).fit_transform(frozen_poses)

plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=4)

In [None]:
# Cluster the raw frozen pose vectors with HDBSCAN and report, per cluster,
# how many poses and how many distinct tracks it contains.
print("fitting clustering model")

hdb = HDBSCAN(min_cluster_size=3, min_samples=4) # , max_cluster_size=15
hdb.fit(frozen_poses)
labels = hdb.labels_.tolist()

assigned_poses = 0

# cluster label -> positional indices into frozen_poses / frozen_movelets.
cluster_to_poses = {}
for i, cluster_id in enumerate(labels):
    cluster_to_poses.setdefault(cluster_id, []).append(i)

# One entry per real cluster: poses in the cluster / distinct tracks in it.
poses_per_track_per_cluster = []

for cluster_id in range(-1, max(labels) + 1):
    # .get() rather than [...]: label -1 (HDBSCAN's noise label) is absent when
    # every pose was assigned to a cluster, which used to raise KeyError here.
    cluster_members = cluster_to_poses.get(cluster_id, [])
    print("Poses in cluster", cluster_id, len(cluster_members))

    # track_id -> number of this cluster's poses that come from that track.
    cluster_track_poses = {}
    for movelet_id in cluster_members:
        movelet_track = frozen_movelets['track_id'].iloc[movelet_id]
        cluster_track_poses[movelet_track] = cluster_track_poses.get(movelet_track, 0) + 1

    # Noise (-1) is reported above but excluded from the assignment totals.
    if cluster_id != -1:
        assigned_poses += len(cluster_members)
        poses_per_track_per_cluster.append(len(cluster_members) / len(cluster_track_poses))

    print("Tracks in cluster", cluster_id, len(cluster_track_poses))

print("assigned", assigned_poses, "poses out of", len(labels), round(assigned_poses/len(labels),4))

# Cluster labels drawn on the 2-D UMAP coordinates (the clustering itself was
# fitted on the raw pose vectors, not on this embedding).
fig = plt.figure(figsize=(10,10))
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], c=labels, cmap='Spectral', s=4)

# Distribution of the poses-per-track ratio across clusters.
fig2 = plt.figure(figsize=(10,4))
n, bins, patches = plt.hist(poses_per_track_per_cluster, bins=30)

In [None]:
# Cluster the 2-D UMAP embedding of the frozen poses with HDBSCAN, report
# per-cluster pose/track counts, and pick one representative movelet per
# (cluster, track) pair.
print("fitting UMAP preclustered model")

hdb = HDBSCAN(min_cluster_size=3, min_samples=4) # , max_cluster_size=15
hdb.fit(clusterable_embedding)
labels = hdb.labels_.tolist()

# Restart the tally for this clustering. Without the reset the count carried
# over from whichever cell ran before (and grew on every re-run), so the
# "assigned ... out of ..." line below was wrong.
assigned_poses = 0

# cluster label -> positional indices into frozen_poses / frozen_movelets.
cluster_to_poses = {}
for i, cluster_id in enumerate(labels):
    cluster_to_poses.setdefault(cluster_id, []).append(i)

# Build an alternative, filtered movelet set that keeps just one movelet per
# track in each cluster: when a track contributes more than one pose to a
# cluster, only the first one encountered is kept. This strips out repeated
# poses that are part of the same low-motion movelet.
filtered_movelet_indices = []

# One entry per real cluster: poses in the cluster / distinct tracks in it.
poses_per_track_per_cluster = []

for cluster_id in range(-1, max(labels) + 1):
    # .get() rather than [...]: label -1 (HDBSCAN's noise label) is absent when
    # every pose was assigned to a cluster, which used to raise KeyError here.
    cluster_members = cluster_to_poses.get(cluster_id, [])
    print("Poses in cluster", cluster_id, len(cluster_members))

    # track_id -> number of this cluster's poses that come from that track.
    cluster_track_poses = {}
    for movelet_id in cluster_members:
        movelet_track = frozen_movelets['track_id'].iloc[movelet_id]
        if movelet_track in cluster_track_poses:
            cluster_track_poses[movelet_track] += 1
            continue
        # First pose seen for this track in this cluster: keep it as the
        # representative, unless the "cluster" is the noise bucket.
        # TODO(review): should non-clustered (noise) poses be included too?
        if cluster_id != -1:
            filtered_movelet_indices.append(movelet_id)
        cluster_track_poses[movelet_track] = 1

    # Noise (-1) is reported above but excluded from the assignment totals.
    if cluster_id != -1:
        assigned_poses += len(cluster_members)
        poses_per_track_per_cluster.append(len(cluster_members) / len(cluster_track_poses))

    print("Tracks in cluster", cluster_id, len(cluster_track_poses))

print("assigned", assigned_poses, "poses out of", len(labels), round(assigned_poses/len(labels),4))

# Cluster labels drawn on the embedding they were fitted on.
fig = plt.figure(figsize=(10,10))
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], c=labels, cmap='Spectral', s=4)

# Distribution of the poses-per-track ratio across clusters.
fig2 = plt.figure(figsize=(10,4))
n, bins, patches = plt.hist(poses_per_track_per_cluster, bins=30)

In [None]:
# Materialise the one-movelet-per-track-per-cluster selection as its own
# DataFrame and pose list.
print(len(filtered_movelet_indices))

# Occurrence count for every selected index.
filtered_movelet_counts = {}
for i in filtered_movelet_indices:
    if i in filtered_movelet_counts:
        filtered_movelet_counts[i] += 1
    else:
        filtered_movelet_counts[i] = 1

# The number of distinct indices equals the number of keys in the tally.
print("Filtered movelets:",len(filtered_movelet_counts))

# Select the distinct rows by position and renumber them from 0.
filtered_movelets = frozen_movelets.iloc[list(set(filtered_movelet_indices))].reset_index()

# Pose vectors for the selection, with any NaN coordinate mapped to -1.
filtered_poses = [
    np.nan_to_num(pose, nan=-1)
    for pose in filtered_movelets['norm'].tolist()
]
len(filtered_movelets)

In [None]:
# Visualise the clustering of the frozen poses:
#   1. (show_poses)  one figure per cluster showing its average pose;
#   2. (plot_images) a large scatter where every clustered point is replaced
#      by a thumbnail drawing of its pose.
print("visualizing UMAP preclustered model")
show_poses = True
plot_images = True

if show_poses:
    # Cluster ids ordered from the largest cluster to the smallest.
    ord_cluster_to_poses = res = OrderedDict(
        sorted(cluster_to_poses.items(), key=lambda item: len(item[1]), reverse=True)
    ).keys()
    for cluster_id in ord_cluster_to_poses:
        cluster_poses = []
        fig, ax = plt.subplots()
        fig.set_size_inches(UPSCALE * 100 / fig.dpi, UPSCALE * 100 / fig.dpi)
        fig.canvas.draw()
        print("CLUSTER:", cluster_id, "POSES:", len(cluster_to_poses[cluster_id]))
        for pose_index in cluster_to_poses[cluster_id]:
            # Work on a float copy: writing NaN straight into the arrays held
            # by frozen_poses would leak into later UMAP / HDBSCAN re-runs.
            cl_pose = np.array(frozen_poses[pose_index], dtype=float)
            # -1 coordinates are turned into NaN so nanmean ignores them.
            cl_pose[cl_pose == -1] = np.nan
            cluster_poses.append(cl_pose)
        cluster_average = np.nanmean(np.array(cluster_poses), axis=0).tolist()
        armature_prevalences = get_armature_prevalences(cluster_poses)
        # Regroup the flat coordinate vector into consecutive pairs.
        cluster_average = np.array_split(cluster_average, len(cluster_average) // 2)
        cluster_average_img = draw_normalized_and_unflattened_pose(
            cluster_average, armature_prevalences=armature_prevalences
        )
        ax.imshow(cluster_average_img)
        plt.show()
        # Release the figure so a long cluster list does not pile up open figures.
        plt.close(fig)


fig = plt.figure(figsize=(40,40))
ax = fig.gca()
cm = colormaps["Spectral"]
norm = colors.Normalize(vmin=-1, vmax=max(labels))

if plot_images:
    # Fully transparent scatter: nothing is drawn, but (presumably) this makes
    # the axes autoscale to the embedding before thumbnails are added.
    ax.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], alpha=0)
    for i, cluster_id in enumerate(labels):
        # Noise points get no thumbnail.
        if cluster_id == -1:
            continue
        # Float copy again, so frozen_poses itself is left untouched.
        cl_pose = np.array(frozen_poses[i], dtype=float)
        cl_pose[cl_pose == -1] = np.nan
        cluster_pose = np.array_split(cl_pose, len(cl_pose) // 2)
        cluster_pose_img = draw_normalized_and_unflattened_pose(
            cluster_pose, armature_prevalences=[1] * 19
        )
        img = cluster_pose_img
        # Shrink in place to at most 40x40 px before placing it on the plot.
        img.thumbnail((40, 40), resample=Image.Resampling.LANCZOS)
        ab = AnnotationBbox(
            OffsetImage(np.asarray(img)),
            (clusterable_embedding[i, 0], clusterable_embedding[i, 1]),
            frameon=False,
        )
        ax.add_artist(ab)
else:
    ax.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], c=labels, cmap='Spectral', s=4)

In [None]:
# UMAP projection of the filtered poses with library defaults; only the seed
# is pinned. This replaces the earlier `standard_embedding`.
standard_reducer = umap.UMAP(random_state=42)
standard_embedding = standard_reducer.fit_transform(filtered_poses)

# Scatter the first two embedding columns.
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], s=4)

In [None]:
# Second UMAP projection, now of the filtered poses, with explicit
# neighbourhood, distance and dimensionality settings. This replaces the
# earlier `clusterable_embedding`, so later cells index into filtered_poses.
clusterable_umap_params = dict(
    n_neighbors=10,
    min_dist=1.0,
    n_components=2,
    random_state=42,
)
clusterable_embedding = umap.UMAP(**clusterable_umap_params).fit_transform(filtered_poses)

plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=4)

In [None]:
# Cluster the raw filtered pose vectors with HDBSCAN and report, per cluster,
# how many poses and how many distinct tracks it contains.
print("fitting clustering model")

hdb = HDBSCAN(min_cluster_size=3, min_samples=4) # , max_cluster_size=15
hdb.fit(filtered_poses)
labels = hdb.labels_.tolist()

assigned_poses = 0

# cluster label -> positional indices into filtered_poses / filtered_movelets.
cluster_to_poses = {}
for i, cluster_id in enumerate(labels):
    cluster_to_poses.setdefault(cluster_id, []).append(i)

# One entry per real cluster: poses in the cluster / distinct tracks in it.
poses_per_track_per_cluster = []

for cluster_id in range(-1, max(labels) + 1):
    # .get() rather than [...]: label -1 (HDBSCAN's noise label) is absent when
    # every pose was assigned to a cluster, which used to raise KeyError here.
    cluster_members = cluster_to_poses.get(cluster_id, [])
    print("Poses in cluster", cluster_id, len(cluster_members))

    # track_id -> number of this cluster's poses that come from that track.
    cluster_track_poses = {}
    for movelet_id in cluster_members:
        movelet_track = filtered_movelets['track_id'].iloc[movelet_id]
        cluster_track_poses[movelet_track] = cluster_track_poses.get(movelet_track, 0) + 1

    # Noise (-1) is reported above but excluded from the assignment totals.
    if cluster_id != -1:
        assigned_poses += len(cluster_members)
        poses_per_track_per_cluster.append(len(cluster_members) / len(cluster_track_poses))

    print("Tracks in cluster", cluster_id, len(cluster_track_poses))

print("assigned", assigned_poses, "poses out of", len(labels), round(assigned_poses/len(labels),4))

# Cluster labels drawn on the 2-D UMAP coordinates (the clustering itself was
# fitted on the raw pose vectors, not on this embedding).
fig = plt.figure(figsize=(10,10))
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], c=labels, cmap='Spectral', s=4)

# Distribution of the poses-per-track ratio across clusters.
fig2 = plt.figure(figsize=(10,4))
n, bins, patches = plt.hist(poses_per_track_per_cluster, bins=30)

In [None]:
# Cluster the 2-D UMAP embedding of the filtered poses with HDBSCAN, report
# per-cluster pose/track counts, and again pick one representative movelet
# per (cluster, track) pair.
print("fitting UMAP preclustered model")

hdb = HDBSCAN(min_cluster_size=3, min_samples=4) # , max_cluster_size=15
hdb.fit(clusterable_embedding)
labels = hdb.labels_.tolist()

# Restart the tally for this clustering. Without the reset the count carried
# over from whichever cell ran before (and grew on every re-run), so the
# "assigned ... out of ..." line below was wrong.
assigned_poses = 0

# cluster label -> positional indices into filtered_poses / filtered_movelets.
cluster_to_poses = {}
for i, cluster_id in enumerate(labels):
    cluster_to_poses.setdefault(cluster_id, []).append(i)

# Build an alternative, filtered movelet set that keeps just one movelet per
# track in each cluster: when a track contributes more than one pose to a
# cluster, only the first one encountered is kept. This strips out repeated
# poses that are part of the same low-motion movelet.
# Note: the indices collected here are positions in `filtered_movelets`.
filtered_movelet_indices = []

# One entry per real cluster: poses in the cluster / distinct tracks in it.
poses_per_track_per_cluster = []

for cluster_id in range(-1, max(labels) + 1):
    # .get() rather than [...]: label -1 (HDBSCAN's noise label) is absent when
    # every pose was assigned to a cluster, which used to raise KeyError here.
    cluster_members = cluster_to_poses.get(cluster_id, [])
    print("Poses in cluster", cluster_id, len(cluster_members))

    # track_id -> number of this cluster's poses that come from that track.
    cluster_track_poses = {}
    for movelet_id in cluster_members:
        movelet_track = filtered_movelets['track_id'].iloc[movelet_id]
        if movelet_track in cluster_track_poses:
            cluster_track_poses[movelet_track] += 1
            continue
        # First pose seen for this track in this cluster: keep it as the
        # representative, unless the "cluster" is the noise bucket.
        # TODO(review): should non-clustered (noise) poses be included too?
        if cluster_id != -1:
            filtered_movelet_indices.append(movelet_id)
        cluster_track_poses[movelet_track] = 1

    # Noise (-1) is reported above but excluded from the assignment totals.
    if cluster_id != -1:
        assigned_poses += len(cluster_members)
        poses_per_track_per_cluster.append(len(cluster_members) / len(cluster_track_poses))

    print("Tracks in cluster", cluster_id, len(cluster_track_poses))

print("assigned", assigned_poses, "poses out of", len(labels), round(assigned_poses/len(labels),4))

# Cluster labels drawn on the embedding they were fitted on.
fig = plt.figure(figsize=(10,10))
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], c=labels, cmap='Spectral', s=4)

# Distribution of the poses-per-track ratio across clusters.
fig2 = plt.figure(figsize=(10,4))
n, bins, patches = plt.hist(poses_per_track_per_cluster, bins=30)

In [None]:
# Visualise the clustering of the filtered poses:
#   1. (show_poses)  one figure per cluster showing its average pose;
#   2. (plot_images) a large scatter where every point is replaced by a
#      thumbnail drawing of its pose and annotated with its cluster id.
print("visualizing UMAP preclustered model")
show_poses = True
plot_images = True

if show_poses:
    # Cluster ids ordered from the largest cluster to the smallest.
    ord_cluster_to_poses = res = OrderedDict(
        sorted(cluster_to_poses.items(), key=lambda item: len(item[1]), reverse=True)
    ).keys()
    for cluster_id in ord_cluster_to_poses:
        fig, ax = plt.subplots()
        fig.set_size_inches(UPSCALE * 100 / fig.dpi, UPSCALE * 100 / fig.dpi)
        fig.canvas.draw()

        cluster_poses = []
        print("CLUSTER:", cluster_id, "POSES:", len(cluster_to_poses[cluster_id]))
        for pose_index in cluster_to_poses[cluster_id]:
            # Work on a float copy: writing NaN straight into the arrays held
            # by filtered_poses would leak into later UMAP / HDBSCAN re-runs.
            cl_pose = np.array(filtered_poses[pose_index], dtype=float)
            # -1 coordinates are turned into NaN so nanmean ignores them.
            cl_pose[cl_pose == -1] = np.nan
            cluster_poses.append(cl_pose)
        cluster_average = np.nanmean(np.array(cluster_poses), axis=0).tolist()
        armature_prevalences = get_armature_prevalences(cluster_poses)
        # Regroup the flat coordinate vector into consecutive pairs.
        cluster_average = np.array_split(cluster_average, len(cluster_average) // 2)
        cluster_average_img = draw_normalized_and_unflattened_pose(
            cluster_average, armature_prevalences=armature_prevalences
        )
        ax.imshow(cluster_average_img)
        plt.show()
        # Release the figure so a long cluster list does not pile up open figures.
        plt.close(fig)


fig = plt.figure(figsize=(40,40))
ax = fig.gca()
cm = colormaps["Spectral"]
norm = colors.Normalize(vmin=-1, vmax=max(labels))

if plot_images:
    # Fully transparent scatter: nothing is drawn, but (presumably) this makes
    # the axes autoscale to the embedding before thumbnails are added.
    ax.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], alpha=0)
    # Unlike the frozen-pose view, noise points (label -1) are drawn here too.
    for i, cluster_id in enumerate(labels):
        # `labels` and `clusterable_embedding` were fitted on filtered_poses,
        # so position i must be looked up there. Indexing frozen_poses (as
        # this loop used to) draws an unrelated pose at each point.
        cl_pose = np.array(filtered_poses[i], dtype=float)
        cl_pose[cl_pose == -1] = np.nan
        cluster_pose = np.array_split(cl_pose, len(cl_pose) // 2)
        cluster_pose_img = draw_normalized_and_unflattened_pose(
            cluster_pose, armature_prevalences=[1] * 19
        )
        img = cluster_pose_img
        # Shrink in place to at most 40x40 px before placing it on the plot.
        img.thumbnail((40, 40), resample=Image.Resampling.LANCZOS)
        ab = AnnotationBbox(
            OffsetImage(np.asarray(img)),
            (clusterable_embedding[i, 0], clusterable_embedding[i, 1]),
            frameon=False,
        )
        # Write the cluster id next to the thumbnail.
        ax.text(clusterable_embedding[i, 0], clusterable_embedding[i, 1], cluster_id)

        ax.add_artist(ab)
else:
    ax.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], c=labels, cmap='Spectral', s=4)