In [None]:
### Imports general ###
#######################

import numpy as np
import pandas as pd
import cv2
import shutil
import os
from tqdm import tqdm

In [None]:
### Set data directory
##################

# connect to drive (requires the Google Colab runtime; the mount point is /content/drive)
from google.colab import drive
drive.mount('/content/drive')

# set data directory: Drive folder from which the Video_<name>.zip / Audio_<name>.zip
# archives are copied in the next cell
data_dir = '/content/drive/MyDrive/0_Masterarbeit/5_Pipelines/Data'

Mounted at /content/drive


In [None]:
### Upload video and audio files ###
####################################

# name of the data set; selects the Video_<name>.zip / Audio_<name>.zip archives
# and is reused for the name of the output csv file
data_file = 'salomon'

# copy zip files from the data directory into the working directory
shutil.copy(os.path.join(data_dir, f'Video_{data_file}.zip'), './')
shutil.copy(os.path.join(data_dir, f'Audio_{data_file}.zip'), './')

# create folders to unpack zip files to
# (exist_ok: re-running this cell must not fail with FileExistsError)
os.makedirs('./Video', exist_ok = True)
os.makedirs('./Audio', exist_ok = True)

# unpack zip files
shutil.unpack_archive(f'./Video_{data_file}.zip', extract_dir = './Video')
shutil.unpack_archive(f'./Audio_{data_file}.zip', extract_dir = './Audio')

### Action detection

In [None]:
### Installation scene detection ###
####################################

!pip install --upgrade scenedetect[opencv]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scenedetect[opencv]
  Downloading scenedetect-0.6.1-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scenedetect
Successfully installed scenedetect-0.6.1


In [None]:
### Clone git repo ###
######################

# clone the repository into ./moments_models; it provides the `models` module and the
# category file used by the action detection cells below
# NOTE(review): no commit is pinned, so a re-run uses whatever the default branch contains
!git clone https://github.com/metalbubble/moments_models.git

Cloning into 'moments_models'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 139 (delta 53), reused 43 (delta 43), pack-reused 79[K
Receiving objects: 100% (139/139), 58.78 KiB | 19.59 MiB/s, done.
Resolving deltas: 100% (75/75), done.


In [None]:
### Imports for action detection ###
####################################

# scene detection
from scenedetect import detect, ContentDetector

# action detection model
from moments_models import models
import torch
from torch.nn import functional as F
from torchvision import transforms

In [None]:
### Load model ###
##################

# action recognition network from the cloned moments_models repository
model = models.load_model('multi_resnet3d50')

# class names belonging to the model outputs (one name per line of the category file)
categories = models.load_categories('./moments_models/category_multi_momentsv2.txt')

# per-frame preprocessing: array -> PIL image -> 224x224 -> tensor -> channel-wise normalisation
# (mean/std are presumably the usual ImageNet statistics - confirm against the model repo)
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(size = (224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean = [0.485, 0.456, 0.406],
                         std = [0.229, 0.224, 0.225]),
])

In [None]:
### Function to apply action detection model to individual scenes ###
#####################################################################

def action_detection(video_file):
  """Average the action-class probabilities of `model` over sampled scenes of one video.

  The video is cut into scenes with `detect(video_file, ContentDetector())` and
  every third scene is kept. If no scene is returned, the video is split into
  9 equal parts and every third part is kept (6 parts / every second part if the
  video has fewer than 144 frames). From each kept scene of at least 16 frames,
  16 evenly spaced frames are read, converted to RGB, passed through `transform`,
  stacked into one clip and fed to `model`.

  If none of the kept scenes is long enough, the whole video is used as a single
  clip, so that short videos still get a prediction. Clips for which a frame
  cannot be read are skipped.

  Args:
    video_file: path of the video file.

  Returns:
    torch tensor of class probabilities (softmax over dim 1 of the model output,
    averaged over dim 0 and over all analysed clips).

  Raises:
    ValueError: if not a single complete clip of 16 frames could be read.
  """

  # number of frames per clip passed to the model
  clip_len = 16

  ### Select sample frames for each video ###
  ###########################################

  # detect scenes
  scene_list = detect(video_file, ContentDetector())

  cap = cv2.VideoCapture(video_file)
  try:
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # if no scenes detected -> eg. whole video in one shot
    if len(scene_list) == 0:

      # 9 parts, every third one analysed; short videos: 6 parts, every second one
      n_parts, step = (9, 3) if num_frames >= 144 else (6, 2)
      equal_parts = [int(b) for b in np.linspace(0, num_frames, n_parts + 1, dtype = int)]

      # (first frame, end frame) of every kept part
      scenes = list(zip(equal_parts, equal_parts[1:]))[::step]

    else: # scenes detected

      # (first frame, end frame) of every third scene
      scenes = [(s[0].get_frames(), s[1].get_frames()) for s in scene_list[::3]]

    # 16 evenly spaced frame indices per scene; scenes shorter than 16 frames are dropped
    sample_frames_per_scene = [
      np.linspace(start, stop - 1, clip_len, dtype = int)
      for start, stop in scenes
      if stop - start >= clip_len
    ]

    # fallback: no scene is long enough -> treat the whole video as one clip
    if not sample_frames_per_scene and num_frames >= clip_len:
      sample_frames_per_scene = [np.linspace(0, num_frames - 1, clip_len, dtype = int)]

    ### Read each clip, predict, and sum the class probabilities ###
    ################################################################

    prob_sum = None
    num_clips = 0

    for frame_ids in sample_frames_per_scene:

      rgb_frames = []
      for f in frame_ids:
        # set to position of respective sample frame and read it
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(f))
        ret, frame = cap.read()

        # the reported frame count can be larger than what is decodable
        if not ret:
          break

        # convert frame array to RGB format
        rgb_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

      # skip clips that could not be read completely
      if len(rgb_frames) < clip_len:
        continue

      # stack the transformed frames along dim 1 and add a batch dimension
      clip = torch.stack([transform(fr) for fr in rgb_frames], 1).unsqueeze(0)

      with torch.no_grad():
        logits = model(clip) # extract logits
        h_x = F.softmax(logits, 1).mean(dim=0) # convert logits to class probabilities

      # sum class probabilities over all clips
      prob_sum = h_x if prob_sum is None else prob_sum + h_x
      num_clips += 1

  finally:
    # release the capture even if reading or prediction fails
    cap.release()

  if num_clips == 0:
    raise ValueError(f"no clip of {clip_len} frames could be read from '{video_file}'")

  # average class probabilities by number of analysed clips
  return prob_sum / num_clips

In [None]:
### Extract for each video ###
##############################

video_id = []
action_probs = []
for video_file in tqdm(os.listdir('./Video')):

  # file name without extension; splitext also handles extensions that are not
  # exactly three characters long (the former `[:-4]` did not)
  video_id.append(os.path.splitext(video_file)[0])
  action_probs.append(action_detection(os.path.join('./Video', video_file)).tolist())

  0%|          | 0/48 [00:00<?, ?it/s]INFO:pyscenedetect:Detecting scenes...
  2%|▏         | 1/48 [00:12<09:58, 12.74s/it]INFO:pyscenedetect:Downscale factor set to 2, effective resolution: 360 x 640
INFO:pyscenedetect:Detecting scenes...
  4%|▍         | 2/48 [00:39<16:01, 20.90s/it]INFO:pyscenedetect:Detecting scenes...
  6%|▋         | 3/48 [00:49<11:57, 15.94s/it]INFO:pyscenedetect:Downscale factor set to 2, effective resolution: 360 x 640
INFO:pyscenedetect:Detecting scenes...
  8%|▊         | 4/48 [01:18<15:38, 21.34s/it]INFO:pyscenedetect:Downscale factor set to 2, effective resolution: 360 x 640
INFO:pyscenedetect:Detecting scenes...
 10%|█         | 5/48 [01:34<13:44, 19.18s/it]INFO:pyscenedetect:Downscale factor set to 2, effective resolution: 360 x 640
INFO:pyscenedetect:Detecting scenes...
 12%|█▎        | 6/48 [02:06<16:32, 23.63s/it]INFO:pyscenedetect:Downscale factor set to 2, effective resolution: 360 x 640
INFO:pyscenedetect:Detecting scenes...
 15%|█▍        | 7/48 [

In [None]:
### Create final dataframe for action detection ###
###################################################

# positional column index -> "p_action_<category>"
action_name_dict = dict(enumerate(f"p_action_{c}" for c in categories))

# one row per video: named probability columns followed by the video id
action_df = (
    pd.DataFrame(action_probs)
    .rename(columns = action_name_dict)
    .assign(video_id = video_id)
)

### Face expression detection ###

In [None]:
### Installations face expression detection ###
###############################################

!pip install face_detection
!pip install deepface

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting face_detection
  Downloading face_detection-0.2.2.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: face_detection
  Building wheel for face_detection (setup.py) ... [?25l[?25hdone
  Created wheel for face_detection: filename=face_detection-0.2.2-py3-none-any.whl size=25560 sha256=2103a9396fdfc6bf5f13576c03f46fb0d498ec3dd6d3f7ef2cf4bfcab409482f
  Stored in directory: /root/.cache/pip/wheels/f9/14/a1/617e184738e71e46c1e75f068f67a911917ae5d02faeabc4e4
Successfully built face_detection
Installing collected packages: face_detection
Successfully installed face_detection-0.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepface
  Downloading deepface-0.0.79-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[

In [None]:
### Imports face expression detection ###
#########################################

import face_detection
from deepface import DeepFace

Directory  /root /.deepface created
Directory  /root /.deepface/weights created


In [None]:
### Load model ###
##################

# face detector; detections below the confidence threshold are discarded and
# overlapping boxes are merged by non-maximum suppression at the given IoU
face_detector = face_detection.build_detector(
    'RetinaNetResNet50',
    confidence_threshold = 0.8,
    nms_iou_threshold = 0.3,
)

Downloading: "https://folk.ntnu.no/haakohu/RetinaFace_ResNet50.pth" to /root/.cache/torch/hub/checkpoints/RetinaFace_ResNet50.pth
100%|██████████| 104M/104M [00:10<00:00, 10.8MB/s]


In [None]:
### Function to detect faces in video and apply expression recognition  ###
###########################################################################

def face_exp_detection(video_file):
    """Average facial-expression scores over the faces found in sampled frames of a video.

    Every `2 * fps`-th frame is passed to `face_detector`; each detected face is
    cropped from the frame and classified with `DeepFace.analyze`.

    Args:
        video_file: path of the video file.

    Returns:
        List of 7 values ordered as angry, disgust, fear, happy, sad, surprise,
        neutral: the DeepFace emotion scores averaged over all analysed faces and
        divided by 100. A list of seven zeros if no face was analysed.
    """

    # fixed output order; looking the scores up by name keeps the columns stable
    # regardless of the key order of the dict returned by DeepFace
    emotion_labels = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]

    # initialize video capturing object
    cap = cv2.VideoCapture(video_file)

    expression_list = []

    try:
        # extract fps to set interval between frames to be considered
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        # frame interval -> every n = 2 seconds, a frame is considered in prediction;
        # at least 1 so that a missing fps value cannot cause a modulo by zero
        frame_interval = max(1, 2 * fps)

        counter = 0

        # loop through video
        while True:
            ret, frame = cap.read()

            if not ret:
                break

            counter += 1

            if counter % frame_interval != 0:
                continue

            # detect faces in frame; OpenCV delivers BGR, the detector is documented
            # for RGB input
            det = face_detector.detect(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            height, width = frame.shape[:2]

            # crop faces from frame and apply emotion classification
            # (nothing happens if no face was detected)
            for bbox in det:

                # clamp the box to the image: boxes may extend past the border, and a
                # negative start index would otherwise be read as "from the end"
                xmin, ymin, xmax, ymax, _ = bbox
                x0 = min(max(int(xmin), 0), width)
                x1 = min(max(int(xmax), 0), width)
                y0 = min(max(int(ymin), 0), height)
                y1 = min(max(int(ymax), 0), height)

                face = frame[y0:y1, x0:x1]

                # degenerate box -> nothing to classify
                if face.size == 0:
                    continue

                # apply emotion detection on the (BGR) crop
                expression_det = DeepFace.analyze(face, actions = ['emotion'], enforce_detection = False, silent = True)

                expression_dict = expression_det[0]['emotion']

                expression_list.append([expression_dict[emo] for emo in emotion_labels])

    finally:
        # release the capture even if detection or classification fails
        cap.release()

    # if no face detected in the entire video -> return expression list of all 0's
    if len(expression_list) == 0:
        return [0,0,0,0,0,0,0]

    # otherwise take the mean over all faces and divide by 100 (scores are scaled by 100)
    return list(np.array(expression_list).mean(0) / 100)

In [None]:
### Extract for each video ###
##############################

video_id = []
exp_video_list = []

for video_file in tqdm(os.listdir('./Video')):

  # file name without extension; splitext also handles extensions that are not
  # exactly three characters long (the former `[:-4]` did not)
  video_id.append(os.path.splitext(video_file)[0])
  exp_video_list.append(face_exp_detection(os.path.join('./Video', video_file)))

  0%|          | 0/48 [00:00<?, ?it/s]

facial_expression_model_weights.h5 will be downloaded...


Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facial_expression_model_weights.h5
To: /root/.deepface/weights/facial_expression_model_weights.h5

  0%|          | 0.00/5.98M [00:00<?, ?B/s][A
 18%|█▊        | 1.05M/5.98M [00:00<00:00, 10.2MB/s][A
 35%|███▌      | 2.10M/5.98M [00:00<00:00, 6.56MB/s][A
100%|██████████| 5.98M/5.98M [00:00<00:00, 10.6MB/s]
100%|██████████| 48/48 [02:46<00:00,  3.48s/it]


In [None]:
### Create final dataframe for face expression detection ###
############################################################

# expression classes in the order of the values returned per video
exp_classes = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]

# positional column index -> "p_face_<class>"
exp_name_dict = dict(enumerate(f"p_face_{c}" for c in exp_classes))

# one row per video: named score columns followed by the video id
face_exp_df = (
    pd.DataFrame(exp_video_list)
    .rename(columns = exp_name_dict)
    .assign(video_id = video_id)
)

### Scene Detection ###

In [None]:
### Installations scene detection ###
#####################################

# replace the preinstalled transformers package by a fixed version
# NOTE(review): the reason for 4.28.0 is not visible in this notebook - presumably the
# version the fine-tuned scene/sound models were saved with; confirm before changing
!pip uninstall -y transformers
!pip install transformers==4.28.0

[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 tran

In [None]:
### Imports scene detection ###
###############################

from transformers import ViTForImageClassification, ViTFeatureExtractor

In [None]:
### Load Model ###
##################

# location of the fine-tuned scene classification model on Drive
scene_model_path = '/content/drive/MyDrive/0_Masterarbeit/5_Pipelines/Models/best_scene_detection_model'

# scene classes; the list position is used as the class id
scene_classes = [
    'airport', 'alley', 'athlectic_field', 'auditorium', 'bar', 'basketball_court',
    'bathroom', 'beach', 'bedroom', 'bistro', 'canyon', 'computer_room',
    'desert', 'discotheque', 'factory', 'field', 'forest', 'gym',
    'harbor', 'highway', 'hill', 'kitchen', 'lake', 'library',
    'living_room', 'locker_room', 'market', 'mountain', 'ocean', 'office',
    'park', 'raceway', 'river', 'skatepark', 'snowfield', 'stadium',
    'street', 'swimming_pool', 'tennis_court',
]

# preprocessing that belongs to the model
scene_feature_extractor = ViTFeatureExtractor.from_pretrained(scene_model_path)

# classifier with the id <-> class name mapping attached
scene_model = ViTForImageClassification.from_pretrained(
    scene_model_path,
    num_labels = len(scene_classes),
    id2label = dict((str(i), c) for i, c in enumerate(scene_classes)),
    label2id = dict((c, str(i)) for i, c in enumerate(scene_classes)),
)

In [None]:
### Function to apply scene detection to video ###
##################################################

def scene_detection(video_file):
    """Average the scene-class probabilities of `scene_model` over sampled frames of a video.

    Every `2 * fps`-th frame is preprocessed with `scene_feature_extractor` and
    classified with `scene_model`. If the video is shorter than one sampling
    interval, its first frame is classified instead, so that short videos still
    get a prediction.

    Args:
        video_file: path of the video file.

    Returns:
        List of class probabilities (softmax of the model logits), averaged over
        all analysed frames.

    Raises:
        ValueError: if no frame could be read from the video.
    """

    def _predict(frame_bgr):
        # class probabilities for one BGR frame as delivered by OpenCV

        # feature extraction on the frame with the channel order reversed (BGR -> RGB)
        inp = scene_feature_extractor(frame_bgr[:,:,::-1], return_tensors='pt')

        # get model prediction as logits
        with torch.no_grad():
            logits = scene_model(inp['pixel_values'])['logits']

        # convert to class probabilities
        return logits.softmax(dim = -1)[0].tolist()

    # initialize video capturing object
    cap = cv2.VideoCapture(video_file)

    scene_list = []
    first_frame = None

    try:
        # extract fps to set interval between frames to be considered
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        # frame interval -> every n = 2 seconds, a frame is considered in prediction;
        # at least 1 so that a missing fps value cannot cause a modulo by zero
        frame_interval = max(1, 2 * fps)

        counter = 0

        # loop through video frame by frame
        while True:
            ret, frame = cap.read()

            if not ret:
                break

            counter += 1

            # remember the first frame as fallback for very short videos
            if first_frame is None:
                first_frame = frame.copy()

            # only consider one frame per interval
            if counter % frame_interval != 0:
                continue

            scene_list.append(_predict(frame))

    finally:
        # release the capture even if prediction fails
        cap.release()

    # video shorter than one interval -> classify its first frame
    if not scene_list and first_frame is not None:
        scene_list.append(_predict(first_frame))

    if not scene_list:
        raise ValueError(f"no frame could be read from '{video_file}'")

    # return mean of class probabilities over all considered frames
    return list(np.array(scene_list).mean(0))

In [None]:
### Apply scene detection to all videos ###
###########################################

video_id = []
scene_video_list = []

for video_file in tqdm(os.listdir('./Video')):

  # file name without extension; splitext also handles extensions that are not
  # exactly three characters long (the former `[:-4]` did not)
  video_id.append(os.path.splitext(video_file)[0])
  scene_video_list.append(scene_detection(os.path.join('./Video', video_file)))

100%|██████████| 48/48 [07:44<00:00,  9.68s/it]


In [None]:
### Create final dataframe for scene detection ###
##################################################

# positional column index -> "p_scene_<class>"
scene_name_dict = dict(enumerate(f"p_scene_{c}" for c in scene_classes))

# one row per video: named probability columns followed by the video id
scene_df = (
    pd.DataFrame(scene_video_list)
    .rename(columns = scene_name_dict)
    .assign(video_id = video_id)
)

### Sound detection

In [None]:
### Import sound detection ###
##############################

import librosa
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

In [None]:
### Load model ###
##################

# location of the fine-tuned sound classification model on Drive
sound_model_path = '/content/drive/MyDrive/0_Masterarbeit/5_Pipelines/Models/best_sound_detection_model'

# sound classes; used as column names for the model outputs
sound_classes = [
    'airplane', 'angry voice', 'breathing', 'brushing_teeth', 'calm voice', 'can_opening',
    'car_horn', 'cat', 'chainsaw', 'chirping_birds', 'church_bells', 'clapping',
    'clock_alarm', 'clock_tick', 'coughing', 'cow', 'crackling_fire', 'crickets',
    'crow', 'crying_baby', 'dog', 'door_wood_creaks', 'door_wood_knock', 'drinking_sipping',
    'engine', 'fireworks', 'footsteps', 'frog', 'glass_breaking', 'hand_saw',
    'happy voice', 'helicopter', 'hen', 'insects', 'keyboard_typing', 'laughing',
    'mouse_click', 'pig', 'pouring_water', 'rain', 'rooster', 'sad voice',
    'sea_waves', 'sheep', 'siren', 'sneezing', 'snoring', 'thunderstorm',
    'toilet_flush', 'train', 'vacuum_cleaner', 'washing_machine', 'water_drops', 'wind',
]

# preprocessing that belongs to the model
sound_feature_extractor = AutoFeatureExtractor.from_pretrained(sound_model_path)

# audio classifier
sound_model = AutoModelForAudioClassification.from_pretrained(sound_model_path)

In [None]:
### Function to apply sound detection to video ###
##################################################

def sound_detection(audio_file, target_sr = 16000, n_parts = 9, part_ids = (0, 3, 5, 7)):
  """Average the sound-class probabilities of `sound_model` over selected parts of an audio file.

  The audio is loaded at `target_sr`, split into `n_parts` parts of equal length,
  and the parts listed in `part_ids` are preprocessed with
  `sound_feature_extractor` and classified with `sound_model`.

  Args:
    audio_file: path of the audio file.
    target_sr: sampling rate the audio is loaded at and that is announced to the
      feature extractor.
    n_parts: number of equal-length parts the audio is split into.
    part_ids: indices (0-based, below `n_parts`) of the parts to analyse.

  Returns:
    List of class probabilities (softmax of the model logits), averaged over the
    analysed parts.

  Raises:
    ValueError: if the file contains no audio samples to analyse.
  """

  # load audio data directly at the target rate; this resamples once from the
  # file's native rate instead of going through librosa's default rate first
  a_rs, _ = librosa.load(audio_file, sr = target_sr)

  # boundaries of the equal-length parts (n_parts + 1 boundaries -> n_parts parts)
  parts = np.linspace(0, len(a_rs), n_parts + 1)

  # loop to consider each selected part individually
  audio_list = []

  for i in part_ids:

    segment = a_rs[int(parts[i]): int(parts[i+1])]

    # very short files can produce empty parts; there is nothing to classify in them
    if len(segment) == 0:
      continue

    # turn the raw samples of the part into model input
    inp = sound_feature_extractor(segment, sampling_rate = target_sr, return_tensors = "pt")

    # extract class probabilities from output logits
    with torch.no_grad():
      class_probs = sound_model(**inp).logits.softmax(dim = -1)[0]

    audio_list.append(class_probs.tolist())

  if not audio_list:
    raise ValueError(f"no audio samples to analyse in '{audio_file}'")

  # return mean class probabilities over all audio parts
  return list(np.array(audio_list).mean(0))

In [None]:
### Apply sound detection to all audio files ###
################################################

video_id = []
sound_video_list = []

for audio_file in tqdm(os.listdir('./Audio')):

  # file name without extension; splitext also handles extensions that are not
  # exactly three characters long (the former `[:-4]` did not)
  video_id.append(os.path.splitext(audio_file)[0])
  sound_video_list.append(sound_detection(os.path.join('./Audio', audio_file)))

100%|██████████| 48/48 [13:52<00:00, 17.34s/it]


In [None]:
### Create final dataframe for sound detection ###
##################################################

# positional column index -> "p_sound_<class>"
sound_name_dict = dict(enumerate(f"p_sound_{c}" for c in sound_classes))

# one row per audio file: named probability columns followed by the video id
sound_df = (
    pd.DataFrame(sound_video_list)
    .rename(columns = sound_name_dict)
    .assign(video_id = video_id)
)

In [None]:
### Merge all 4 dataframes into one ###
#######################################

# join the feature tables one after the other on the shared video id
# (default inner join: only ids present in every table are kept)
mid_level_features = action_df.merge(face_exp_df, on='video_id')
mid_level_features = mid_level_features.merge(scene_df, on='video_id')
mid_level_features = mid_level_features.merge(sound_df, on = 'video_id')

# take the id column out of the frame and put it back in front
first_column = mid_level_features.pop('video_id')
mid_level_features.insert(0, 'video_id', first_column)

In [None]:
### Save as csv file ###
########################
save_dir = '/content/drive/MyDrive/0_Masterarbeit/5_Pipelines/Feature_outputs'

# make sure the target folder exists; otherwise shutil.copy would write a plain
# file named like the folder instead of copying into it
os.makedirs(save_dir, exist_ok = True)

# write locally first, then copy to Drive
# (the dataframe index is written as an unnamed first column)
mid_level_features.to_csv(f'./mid_level_features_{data_file}.csv')
shutil.copy(f'./mid_level_features_{data_file}.csv', save_dir)

'/content/drive/MyDrive/0_Masterarbeit/5_Pipelines/Feature_outputs/mid_level_features_salomon.csv'