## Basic Setup

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from fastai.vision import *
from pathlib import Path
from utils import *
import matplotlib.pyplot as plt
import cv2

In [3]:
learn = load_learner('/home/rahul/github_projects/shot-type-classifier/models/', file='shot-type-classifier.pkl');

In [4]:
path = Path('/home/rahul/Desktop/movies/the-social-network-splits/')

## Reading in Video with `cv2`

Somehow, when reading in videos with `cv2`, the last 5 frames always get skipped.

<h3> OpenCV Properties with Indices </h3>

0. `CV_CAP_PROP_POS_MSEC` Current position of the video file in milliseconds.
1. `CV_CAP_PROP_POS_FRAMES` 0-based index of the frame to be decoded/captured next.
2. `CV_CAP_PROP_POS_AVI_RATIO` Relative position of the video file
3. `CV_CAP_PROP_FRAME_WIDTH` Width of the frames in the video stream.
4. `CV_CAP_PROP_FRAME_HEIGHT` Height of the frames in the video stream.
5. `CV_CAP_PROP_FPS` Frame rate.
6. `CV_CAP_PROP_FOURCC` 4-character code of codec.
7. `CV_CAP_PROP_FRAME_COUNT` Number of frames in the video file.
8. `CV_CAP_PROP_FORMAT` Format of the Mat objects returned by retrieve() .
9. `CV_CAP_PROP_MODE` Backend-specific value indicating the current capture mode.
10. `CV_CAP_PROP_BRIGHTNESS` Brightness of the image (only for cameras).
11. `CV_CAP_PROP_CONTRAST` Contrast of the image (only for cameras).
12. `CV_CAP_PROP_SATURATION` Saturation of the image (only for cameras).
13. `CV_CAP_PROP_HUE` Hue of the image (only for cameras).
14. `CV_CAP_PROP_GAIN` Gain of the image (only for cameras).
15. `CV_CAP_PROP_EXPOSURE` Exposure (only for cameras).
16. `CV_CAP_PROP_CONVERT_RGB` Boolean flags indicating whether images should be converted to RGB.
17. `CV_CAP_PROP_WHITE_BALANCE` Currently unsupported
18. `CV_CAP_PROP_RECTIFICATION` Rectification flag for stereo cameras (note: only supported by DC1394 v 2.x backend currently)

In [198]:
to_img = lambda x: Image(pil2tensor(x, np.float32).div_(255))

get_shot_idx  = lambda path: int(
    str(path)            ## /home/...../shot#0002.mp4
        .rsplit('/')[-1] ## shot#0002.mp4
        .split('.')[0]   ## shot#0002
        .split('#')[-1]  ## 0002 --> 2
)

# f = str(path/'shot#0002.mp4')
# get_shot_idx(f)
# get_shot_idx(os.listdir(path)[0])

def get_key_pred(df):
    i = df.groupby(['shot_no'])['prediction'].transform(max) == df['prediction']
    return df[i]

def predict(frame, frame_idx, fname, fps=fps, num_frames=num_frames):
    classes = ['Close-Up', 'Extreme Close-Up', 'Extreme Wide',
               'Long', 'Medium', 'Medium Close-Up']
    shot_idx = get_shot_idx(fname)

    # get preds
    pred, _, pred_ps = learn.predict(to_img(frame))

    # form data-frame
    df = pd.DataFrame(list(zip(classes, pred_ps.numpy())), columns=['shot-type', 'prediction'])

    # replace shot-types with acronyms
    df['shot-type'].replace({    'Extreme Wide': 'EWS',
                                         'Long': 'LS',
                                       'Medium': 'MS',
                              'Medium Close-Up': 'MCU',
                                     'Close-Up': 'CU',
                             'Extreme Close-Up': 'ECU'}, inplace=True)
    # set sorting order for shot-types and sort
    df['shot-type'] = pd.Categorical(df['shot-type'], ['EWS', 'LS', 'MS', 'MCU', 'CU', 'ECU'])
    df = df.sort_values('shot-type').reset_index(drop=True)

    # add more info
    df['prediction'] *= 100
    df['frame_no']    = frame_idx+1
    df['shot_no']     = shot_idx
    df['shot_length'] = num_frames / fps
    
    print(f'Shot #{shot_idx:04d}, Frame #{frame_idx+1:04d}: {pred}')
    return df, get_key_pred(df)

In [206]:
preds_all, preds_key = [], []
os.chdir(path)

for fname in sorted(os.listdir(path))[3:5]:
    # prep video and its properties
    cap = cv2.VideoCapture(fname)
    fps = cap.get(cv2.CAP_PROP_FPS)
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    # get frame indices (targets) for video
    frame_stride=5
    target_frames = np.arange(num_frames)[1::frame_stride]

    # grab the targeted frames
    frames=[]
    for i in target_frames:
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret==True: frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        else: break
    cap.release()

    # get predictions with multiple levels of info
    for i,frame in zip(target_frames,frames):
        pred_all, pred_key = predict(frame, i, fname, fps, num_frames)
        preds_all.append(pred_all); preds_key.append(pred_key)
        
preds_all = pd.concat(preds_all, ignore_index=True)
preds_key = pd.concat(preds_key, ignore_index=True)

True

True

True

True

True

True

True

Shot #0004, Frame #0002: Close-Up
Shot #0004, Frame #0007: Close-Up
Shot #0004, Frame #0012: Close-Up
Shot #0004, Frame #0017: Close-Up
Shot #0004, Frame #0022: Close-Up
Shot #0004, Frame #0027: Close-Up
Shot #0004, Frame #0032: Close-Up


True

True

True

True

True

True

Shot #0005, Frame #0002: Close-Up
Shot #0005, Frame #0007: Close-Up
Shot #0005, Frame #0012: Close-Up
Shot #0005, Frame #0017: Close-Up
Shot #0005, Frame #0022: Close-Up
Shot #0005, Frame #0027: Close-Up


In [207]:
preds_all

Unnamed: 0,shot-type,prediction,frame_no,shot_no,shot_length
0,EWS,0.002569,2,4,1.334667
1,LS,0.001226,2,4,1.334667
2,MS,0.003324,2,4,1.334667
3,MCU,1.149294,2,4,1.334667
4,CU,98.827076,2,4,1.334667
...,...,...,...,...,...
73,LS,0.001502,27,5,1.251250
74,MS,0.012961,27,5,1.251250
75,MCU,23.090650,27,5,1.251250
76,CU,76.863283,27,5,1.251250


In [208]:
preds_key

Unnamed: 0,shot-type,prediction,frame_no,shot_no,shot_length
0,CU,98.827076,2,4,1.334667
1,CU,98.840779,7,4,1.334667
2,CU,97.967166,12,4,1.334667
3,CU,98.168874,17,4,1.334667
4,CU,96.972692,22,4,1.334667
5,CU,99.18859,27,4,1.334667
6,CU,97.409058,32,4,1.334667
7,CU,80.196357,2,5,1.25125
8,CU,69.539183,7,5,1.25125
9,CU,76.74737,12,5,1.25125


## Reading in Video with `ffmpeg-python`

Not feasible for larger video files as the entire video is read into memory at once, as opposed to `OpenCV`'s selective frame reading‘

In [9]:
def predict(x):
    preds = learn.predict(x)
    return f'{preds[0]} ({"{:.3f}".format(preds[2][preds[1]])})'

np_to_img = lambda x: Image(pil2tensor(x, np.float32).div_(255))

In [None]:
os.chdir(path)
i=5

for f in sorted(os.listdir(path))[:3]:
    clip = [np_to_img(x) for x in read_video_tensor(f,0)]
    [predict(x) for x in clip[0::i]]

In [6]:
vid = read_video_tensor(path/'shot#1.mp4')
vid = [np_to_img(x) for x in vid]

In [7]:
vid[0::10]

[Image (3, 720, 1280), Image (3, 720, 1280), Image (3, 720, 1280)]

In [8]:
i=5
[predict(x) for x in vid[0::i]]

['Medium Close-Up (0.968)',
 'Medium Close-Up (0.973)',
 'Medium Close-Up (0.963)',
 'Medium Close-Up (0.968)',
 'Medium Close-Up (0.971)']