In [None]:
%matplotlib inline

import subprocess
from pathlib import Path
from itertools import chain
from datetime import timedelta

import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Directories that are searched for source videos.
VIDEO_SRC = [
    r'D:\downloads\test_videos',
    r'D:\downloads\poi',
]
# CSV file in which the collected per-video metadata is stored.
VIDEO_CSV = '../data/video_metadata.csv'

In [None]:
def get_video_data(video: Path, prefix='original-'):
    """Probe a video file with ffprobe and return its basic metadata.

    Args:
        video: Path of the video file to inspect.
        prefix: Leading part of the file stem that is stripped from the
            returned name when present.

    Returns:
        Tuple ``(name, width, height, area, duration)`` where ``area`` is
        ``width * height`` and ``duration`` is the container duration in
        seconds (``0.0`` when ffprobe reports none).

    Raises:
        ValueError: If ffprobe's output does not contain a width and a
            height (e.g. the file is unreadable or has no video stream).
    """
    name = video.stem
    # Restrict the probe to the first video stream so that additional
    # streams (cover art, secondary video tracks) cannot add extra
    # width/height values to the output.
    p = subprocess.run(['ffprobe', '-v', 'quiet', '-select_streams', 'v:0',
                        '-of', 'default=nw=1:nk=1',
                        '-show_entries', 'format=duration:stream=width,height',
                        str(video)], capture_output=True)
    # Output is whitespace-separated values: width, height, then duration.
    fields = p.stdout.split()
    if len(fields) not in (2, 3):
        raise ValueError(
            f'ffprobe returned no usable width/height/duration for {video} '
            f'(exit code {p.returncode})')
    # A missing duration token is treated the same as an explicit N/A.
    raw_duration = fields[2] if len(fields) == 3 else b'N/A'
    name = name[len(prefix):] if name.startswith(prefix) else name
    # int()/float() accept the ASCII bytes produced by ffprobe directly.
    width = int(fields[0])
    height = int(fields[1])
    duration = float(raw_duration) if raw_duration != b'N/A' else 0.0
    return name, width, height, width*height, duration

# Build the metadata table once and cache it as CSV; later runs reload the
# cached file instead of probing every video again.
video_src = Path(VIDEO_CSV)
column_types = [('video_id', str), ('width', int), ('height', int), ('area', int), ('duration', float)]
if video_src.exists():
    df = pd.read_csv(video_src, index_col=0, dtype=dict(column_types))
else:
    # rglob('*.mp4') searches each source directory recursively with a
    # pattern that does not depend on the platform's path separator.
    videos = chain.from_iterable(Path(p).rglob('*.mp4') for p in VIDEO_SRC)
    df = pd.DataFrame((get_video_data(video) for video in tqdm(videos)),
                      columns=[n for n, t in column_types])
    # Make sure the target directory exists so the probing work is not lost
    # to a failed write.
    video_src.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(video_src)
df.head()

In [None]:
# Summary statistics of the video metadata table.
df.describe()

In [None]:
# Total running time of all videos; the 'duration' column is summed and
# interpreted as seconds when formatted through timedelta.
duration = df['duration'].sum()
print(f"Total length of footage: {timedelta(seconds=duration)} ({duration} seconds)")

In [None]:
# Histogram of the per-video durations, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(9, 5))
df['duration'].plot.hist(bins=20, title='Video distribution by Length', edgecolor='black', ax=ax)
ax.set_xlabel('Video length in seconds')
plt.show()