# Right Left up down, motion vectors v4

## Setup

In [1]:
!pip install -r requirements.txt

## only needed to save model arcitecture
# import os
# os.environ["PATH"] += os.pathsep + '/path/to/graphviz/bin'

Collecting google-cloud
  Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)
Collecting imutils
  Downloading imutils-0.5.4.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting gensim
  Downloading gensim-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy==1.24.2
  Downloading numpy-1.24.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting google-cloud-storage
  Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 kB[0m 

In [2]:
import os
from pathlib import Path
import pickle

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

from frame_generator_mv_v4 import FrameGenerator
from model_mv_v4 import create_R2Plus1D_mv_model

In [3]:
DATA_PATH = "data_right_left_up_down_1200"
NOTEBOOK_NAME = "2_2d_plus_1_rlud_1200_mv_v4"
RESULTS_PATH = DATA_PATH + "/" + NOTEBOOK_NAME

index_df = pd.read_csv(f'{DATA_PATH}/indx_df.csv')

# PARAMS
# number of frames taken from each video
n_frames = 74
# number of motion vectors taken from each video
#  ????
batch_size = 8
# Define the dimensions of one frame in the set of frames created
HEIGHT=420
WIDTH = 10
MOTION_CHANNELS=1 # or 10 

assert(os.path.isdir(DATA_PATH ))

if not os.path.isdir(RESULTS_PATH):
    os.mkdir(RESULTS_PATH)

index_df = pd.read_csv(f'{DATA_PATH}/indx_df.csv')

print(f"classes being compared {index_df['category'].unique()}")

classes being compared ['Pushing something from left to right'
 'Pushing something from right to left' 'Moving something up'
 'Moving something down']


## Preprocess video data

Load something something data tf.data.Dataset

In [4]:
subset_paths = {
    "test": Path(f'{DATA_PATH}/test'),
    "train": Path(f'{DATA_PATH}/train'),
    "val": Path(f'{DATA_PATH}/validation'),
}

output_signature = (
    tf.TensorSpec(shape = (n_frames, HEIGHT, WIDTH), dtype = tf.float32),
    tf.TensorSpec(shape = (), dtype = tf.int16)
)
train_ds = tf.data.Dataset.from_generator(
    FrameGenerator(subset_paths['train'],
        n_frames=n_frames,
        height=HEIGHT,
        index_df=index_df,
        training=True
    ),
    output_signature = output_signature
)
# Batch the data
train_ds = train_ds.batch(batch_size)

val_ds = tf.data.Dataset.from_generator(
    FrameGenerator(subset_paths['train'],
        n_frames=n_frames,
        height=HEIGHT,
        index_df=index_df,
        training=True
    ),
    output_signature = output_signature
)
# Batch the data
val_ds = val_ds.batch(batch_size)

test_ds = tf.data.Dataset.from_generator(
    FrameGenerator(subset_paths['train'],
        n_frames=n_frames,
        height=HEIGHT,
        index_df=index_df,
        training=True
    ),
    output_signature = output_signature
)
print(type(test_ds))

# Batch the data
test_ds = test_ds.batch(batch_size)

print(type(test_ds))


<class 'tensorflow.python.data.ops.dataset_ops.FlatMapDataset'>
<class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>


## Model Creation


Given your motion vector data shape of (74, 420, 10), the values for your 3D CNN parameters would be:

n_frames: This represents the depth of your data in the context of a 3D CNN. In your case, this would be 74.

height and width: These usually correspond to the spatial dimensions of the data. For your motion vectors, height would be 420, and width would be 10.

motion_channels: This is typically the number of channels or features in each frame. In standard image processing, this might correspond to color channels like RGB. For motion vectors, if each element in your width (10) represents a different feature or channel of motion data, then you have 10 motion channels. However, if the 10 elements are part of a single motion vector, you might consider this as a single channel with a vector of length 10.

So, for your 3D CNN, you would set these parameters as n_frames=74, height=420, width=10, and motion_channels=1 or 10 depending on how you interpret the motion vector data.

In [5]:
input_shape = (n_frames, HEIGHT, WIDTH, MOTION_CHANNELS)
model = create_R2Plus1D_mv_model(input_shape, num_classes=4)

## Validate Model and Data

In [6]:
# testing tensor is setup correct
iter(train_ds)

<tensorflow.python.data.ops.iterator_ops.OwnedIterator at 0x7f2e4c589910>

## Build Model

In [7]:
frames, label = next(iter(train_ds))

In [8]:
model.build(frames)

## Visualise Model

In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 74, 420, 10  0           []                               
                                , 1)]                                                             
                                                                                                  
 time_distributed (TimeDistribu  (None, 74, 420, 10,  640        ['input_1[0][0]']                
 ted)                            64)                                                              
                                                                                                  
 time_distributed_1 (TimeDistri  (None, 74, 420, 10,  256        ['time_distributed[0][0]']       
 buted)                          64)                                                          

## Load the Model

Using BinaryCrossentropy as it is more effective for binary data

from_logits is false because final layer includes a sigmoid activation,

In [16]:
previous_runs = 3

In [17]:
previous_runs += 1

model.compile(loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer = keras.optimizers.Adam(learning_rate = 0.0001),
              metrics=[
                    'accuracy',
                    # tf.keras.metrics.Precision(),
                    # tf.keras.metrics.Recall()
                    # tf.keras.metrics.Precision(class_id=0, name='precision_neg'),
                    # tf.keras.metrics.Precision(class_id=1, name='precision_pos'),
                    # tf.keras.metrics.Recall(class_id=0, name='recall_neg'),
                    # tf.keras.metrics.Recall(class_id=1, name='recall_pos')
                ]
            )

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    RESULTS_PATH + '/model-runs-' + str(previous_runs) + '-cp-{epoch:02d}-{val_loss:.2f}.ckpt',
    save_best_only=True,  # Save only the best model based on a monitored metric (e.g., val_loss), will only replace saved value if it is better
    monitor='val_loss',
    mode='min',  # 'min' for loss, 'max' for accuracy
    save_weights_only=True
)


early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,  # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True
)

In [18]:
latest = tf.train.latest_checkpoint(RESULTS_PATH)

if latest is not None:
    print(f"loading model from weights: {latest}")
    model.load_weights(latest)

loading model from weights: data_right_left_up_down_1200/2_2d_plus_1_rlud_1200_mv_v4/model-runs-3-cp-07-0.19.ckpt


## Train the Model

In [19]:
previously_run_epochs = 0 + 16 + 15 + 16

history = model.fit(
        x=train_ds,
        epochs = 50 - previously_run_epochs,
        validation_data=val_ds,
        callbacks=[checkpoint_callback, early_stopping],
    )

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Analyse results

## Analyse results

In [14]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

true_labels = []
predictions = []

# Iterate over the batched test dataset
for batch in test_ds:
    x, y = batch  # x is the batch of features, y is the batch of labels
    true_labels.extend(y.numpy())  # Store true labels
    preds = model.predict(x)  # Generate predictions for the batch
    preds = softmax(preds, axis=1)  # Apply softmax to convert logits to probabilities
    preds = np.argmax(preds, axis=1)  # Get the class with the highest probability
    predictions.extend(preds)

# Convert lists to numpy arrays
true_labels = np.array(true_labels)
predictions = np.array(predictions)



In [15]:
fg = FrameGenerator(subset_paths['train'],
        n_frames=n_frames,
        height=HEIGHT,
        index_df=index_df,
        training=True
)
class_id_value = {
    fg.class_ids_for_name[x]: x for x in fg.class_ids_for_name.keys()
 }

# Convert lists to numpy arrays if they aren't already
true_labels = np.array(true_labels)
predictions = np.array(predictions)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)

# Calculate precision, recall, and F1-score for each class
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average=None)

# Print accuracy and F1-scores for each class
print(f"Overall Accuracy: {accuracy}")
for i, (prec, rec, f1) in enumerate(zip(precision, recall, f1_score)):
    print(f"Class {class_id_value[i]}: Precision: {prec}, Recall: {rec}, F1 Score: {f1}")

Overall Accuracy: 0.94453125
Class Pushing something from left to right: Precision: 0.9681190223166843, Recall: 0.9489583333333333, F1 Score: 0.9584429247764334
Class Pushing something from right to left: Precision: 0.96723044397463, Recall: 0.953125, F1 Score: 0.9601259181532004
Class Moving something up: Precision: 0.8838095238095238, Recall: 0.9666666666666667, F1 Score: 0.9233830845771145
Class Moving something down: Precision: 0.9667774086378738, Recall: 0.909375, F1 Score: 0.9371980676328503
