In [None]:
# !pip install pandas
# !pip install tensorflow

# CNN RNN
По этому примеру: https://huggingface.co/keras-io/video-classification-cnn-rnn/blob/main/main.ipynb

In [19]:
from tensorflow import keras

import pandas as pd
import numpy as np
import os
import cv2
from datetime import datetime
from pathlib import Path
import imageio

In [9]:
annotations_path = 'SLOVO_DATAFRAME.tsv'
video_path = 'animals'
current_dir = os.getcwd()

IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 12

MAX_SEQ_LENGTH=20
NUM_FEATURES=2048

In [10]:
annot_df = pd.read_csv(annotations_path, sep='\t')
annot_df.head()

Unnamed: 0,attachment_id,text,user_id,height,width,length,begin,end
0,44e8d2a0-7e01-450b-90b0-beb7400d2c1e,Ё,185bd3a81d9d618518d10abebf0d17a8,640,360,156.0,36,112
1,df5b08f0-41d1-4572-889c-8b893e71069b,А,185bd3a81d9d618518d10abebf0d17a8,640,360,150.0,36,76
2,17f53df4-c467-4aff-9f48-20687b63d49a,Р,185bd3a81d9d618518d10abebf0d17a8,640,360,133.0,40,97
3,e3add916-c708-4339-ad98-7e2740be29e9,Е,185bd3a81d9d618518d10abebf0d17a8,640,360,144.0,43,107
4,bd7272ed-1850-48f1-a2a8-c8fed523dc37,Ч,185bd3a81d9d618518d10abebf0d17a8,640,360,96.0,20,70


In [21]:
# Возьмем для начала уменьшенный датасет из 30 животных и поделим датасет на train и test
selected_animals = [
    'пингвин','жираф', 'лягушка', 'бегемот', 'козел',
    'лиса', 'динозавр', 'кролик', 'собака', 'обезьяна',
    'корова', 'свинья', 'медуза', 'курица', 'павлин',
    'дельфин', 'слон', 'медведь', 'лебедь', 'орел',
    'бык', 'змея', 'птица', 'лось', 'пчела',
    'лев', 'тигр', 'мышь', 'паук', 'бабочка'
    ]
train_df = pd.DataFrame(columns=['attachment_id', 'text', 'begin', 'end'])
test_df = pd.DataFrame(columns=['attachment_id', 'text', 'begin', 'end'])
for animal in selected_animals:
    train_df = pd.concat([train_df, annot_df[annot_df.text==animal][['attachment_id', 'text', 'begin', 'end']][:18]], ignore_index=True)
    test_df = pd.concat([test_df, annot_df[annot_df.text==animal][['attachment_id', 'text', 'begin', 'end']][18:]], ignore_index=True)
    
test_df.head(8)
    

Unnamed: 0,attachment_id,text,begin,end
0,e61c12a9-f727-441d-86e8-a9b854decd3e,пингвин,32,111
1,2eca9c1e-103c-4a23-98c8-1e994fe76762,пингвин,18,65
2,78c4c85b-5b75-42e7-bf3b-ee323b05c573,жираф,62,126
3,8ee72ac2-cd61-4995-93eb-4e9e5a43f873,жираф,15,56
4,a912411f-d2b8-46f3-9741-c326897a08c8,лягушка,5,55
5,98eedd68-135e-498e-9704-3cde2655a480,лягушка,10,61
6,2fe89f0b-eb56-4225-865d-3f55cf9059be,бегемот,28,109
7,2e133e33-aff4-44c1-b8f0-30ae2cdb03b7,бегемот,24,75


In [25]:
# The following two methods are taken from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub


def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, begin, end, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    frame_index=begin
    try:
        while True and frame_index <= end:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)
            frame_index+=1

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [13]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

2023-10-19 20:56:55.907382: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [14]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["text"])
)
print(label_processor.get_vocabulary())

['бабочка', 'бегемот', 'бык', 'дельфин', 'динозавр', 'жираф', 'змея', 'козел', 'корова', 'кролик', 'курица', 'лебедь', 'лев', 'лиса', 'лось', 'лягушка', 'медведь', 'медуза', 'мышь', 'обезьяна', 'орел', 'павлин', 'паук', 'пингвин', 'птица', 'пчела', 'свинья', 'слон', 'собака', 'тигр']


In [27]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["attachment_id"].values.tolist()
    begins = df["begin"].values.tolist()
    ends =df["end"].values.tolist()
    labels = df["text"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(''.join([root_dir, path, '.mp4']), begin=begins[idx], end=ends[idx])
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels


train_data, train_labels = prepare_all_videos(train_df, video_path)
test_data, test_labels = prepare_all_videos(test_df, video_path)

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Frame features in train set: (540, 20, 2048)
Frame masks in train set: (540, 20)


In [28]:
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.GRU(16, return_sequences=True)(
        frame_features_input, mask=mask_input
    )
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return rnn_model

# logdir = str(Path(os.path.join(
#     current_dir,
#     "logs",
#     "scalars",
#     datetime.now().strftime('%Y%m%d-%H%M%S'),
# )))+'\\'
logdir = f"logs/scalars/{datetime.now().strftime('%Y%m%d-%H%M%S')}/"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

def run_experiment():
    filepath = Path.cwd()
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        batch_size=BATCH_SIZE,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[
            checkpoint,
            tensorboard_callback,
        ],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model

In [29]:
history, sequence_model = run_experiment()

Epoch 1/12
Epoch 1: val_loss improved from inf to 3.40747, saving model to /home/jupyter/work/resources
Epoch 2/12
Epoch 2: val_loss did not improve from 3.40747
Epoch 3/12
Epoch 3: val_loss did not improve from 3.40747
Epoch 4/12
Epoch 4: val_loss did not improve from 3.40747
Epoch 5/12
Epoch 5: val_loss did not improve from 3.40747
Epoch 6/12
Epoch 6: val_loss did not improve from 3.40747
Epoch 7/12
Epoch 7: val_loss did not improve from 3.40747
Epoch 8/12
Epoch 8: val_loss did not improve from 3.40747
Epoch 9/12
Epoch 9: val_loss did not improve from 3.40747
Epoch 10/12
Epoch 10: val_loss did not improve from 3.40747
Epoch 11/12
Epoch 11: val_loss did not improve from 3.40747
Epoch 12/12
Epoch 12: val_loss did not improve from 3.40747
Test accuracy: 3.33%


In [42]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path, begin, end):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path), begin, end)
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


# This utility is for visualization.
# Referenced from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return embed.embed_file("animation.gif")


test_video = np.random.choice(test_df["attachment_id"].values.tolist())
begin = int(test_df[test_df.attachment_id==test_video]['begin'])
end = int(test_df[test_df.attachment_id==test_video]['end'])
test_video_path = f"animals/{test_video}.mp4"
print(f"Test video path: {test_video_path}")
test_frames = sequence_prediction(test_video_path, begin, end)
#to_gif(test_frames[:MAX_SEQ_LENGTH])

Test video path: animals/5c6dd45e-9224-4527-94fb-ca36308f46ba.mp4
  слон:  3.35%
  бегемот:  3.35%
  бык:  3.35%
  медуза:  3.35%
  корова:  3.35%
  козел:  3.35%
  свинья:  3.35%
  кролик:  3.35%
  лебедь:  3.35%
  медведь:  3.35%
  дельфин:  3.35%
  орел:  3.34%
  жираф:  3.34%
  пингвин:  3.34%
  динозавр:  3.34%
  лиса:  3.34%
  лягушка:  3.34%
  обезьяна:  3.33%
  курица:  3.33%
  павлин:  3.33%
  собака:  3.32%
  змея:  3.31%
  тигр:  3.31%
  лось:  3.31%
  лев:  3.31%
  мышь:  3.31%
  паук:  3.31%
  птица:  3.31%
  пчела:  3.31%
  бабочка:  3.31%


In [43]:
test_df[test_df.attachment_id=='5c6dd45e-9224-4527-94fb-ca36308f46ba']

Unnamed: 0,attachment_id,text,begin,end
11,5c6dd45e-9224-4527-94fb-ca36308f46ba,лиса,20,54
