<a href="https://colab.research.google.com/github/sebi061/VideoAdEngagement/blob/main/1_Extracted_Data/4_High_level_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Imports general ###
#######################

import numpy as np
import pandas as pd
import cv2
import torch
import shutil
import os
from tqdm import tqdm

In [None]:
### Set data directory
##################

# connect to drive
# (mounts Google Drive at /content/drive; all paths below live under that mount point)
from google.colab import drive
drive.mount('/content/drive')

# set data directory
# data_dir_video / data_dir_audio: presumably the folders holding the raw video / audio zip archives (verify against the Drive layout)
# save_dir:  destination folder for the CSV written at the end of the notebook
# model_dir: folder the trained intent, memorability and audio-mood models are loaded from
#            NOTE: the folder name contains a space; it is passed as a plain Python string, so no quoting is needed
data_dir_video = '/content/drive/MyDrive/VideoAdEngagement/0_Downloaded_Data/1_Raw_Video_Data'
data_dir_audio = '/content/drive/MyDrive/VideoAdEngagement/0_Downloaded_Data/2_Raw_Audio_Data'
save_dir =       '/content/drive/MyDrive/VideoAdEngagement/1_Extracted_Data/Extracted_Features'
model_dir =      '/content/drive/MyDrive/VideoAdEngagement/2_Training_feature extraction models'

Mounted at /content/drive


In [None]:
### Upload video and audio files ###
####################################

# name of the data set to process; selects the zip archives and names the output csv
data_file = 'ferrari'

# copy zip files
# (video and audio archives are read from their own Drive folders; the
#  previously used name `data_dir` is not defined anywhere in this notebook)
shutil.copy(os.path.join(data_dir_video, f'Video_{data_file}.zip'), './')
shutil.copy(os.path.join(data_dir_audio, f'Audio_{data_file}.zip'), './')

# create folders to unpack zip files to
# exist_ok=True lets this cell be re-run in the same runtime without a FileExistsError.
# NOTE: when switching `data_file` within one runtime, clear ./Video and ./Audio first,
# otherwise files of the previous data set are processed as well.
os.makedirs('./Video', exist_ok=True)
os.makedirs('./Audio', exist_ok=True)

# unpack zip files
shutil.unpack_archive(f'./Video_{data_file}.zip', extract_dir = './Video')
shutil.unpack_archive(f'./Audio_{data_file}.zip', extract_dir = './Audio')

### Intent detection

In [None]:
### Installations intetn detection ###
######################################

!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-0_s7ql30
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-0_s7ql30
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369501 sha256=173408078595bd9c9c3bd816be6badc4e9775ed28da36cebd5b11df3332a9a63
  Stored in directory: /tmp/pip-ephem-wheel-cache-m3o_sn3n/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Inst

In [None]:
### Imports for intent detection ###
####################################

from PIL import Image
import torch
import clip

In [None]:
### Load Model ###
##################

# set device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load model and preprocessor
intent_model, intent_preprocess = clip.load("ViT-B/32",device=device)

# load the trained weights from the checkpoint
# map_location remaps tensors that were saved from a GPU onto the selected device,
# so the cell also works on a CPU-only runtime (the "cpu" fallback above)
checkpoint = torch.load(os.path.join(model_dir, 'best_intentonomy_model.pt'), map_location=device)
intent_model.load_state_dict(checkpoint['model_state_dict'])

# classes
# NOTE: the spelling of the labels is kept as is because they are used verbatim in the
# text prompts below and in the output column names
labels_intent = ['virtue','self-fulfill', 'openness to experience', 'security and belonging',
          'power', 'health', 'familiy', 'ambition and ability', 'financial and occupational success']

# tokenize class labels (one text prompt per class)
labels_intent_tok = clip.tokenize([f"The picture represents {l}" for l in labels_intent]).to(device)

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 150MiB/s]


In [None]:
### Function to apply intent detection to individual frames of the video ###
############################################################################

def intent_detection(video_file):
    """Average the intent class probabilities over frames sampled from a video.

    Every (2 * fps)-th frame is converted from BGR to RGB, preprocessed with
    `intent_preprocess` and scored by `intent_model` against the tokenized
    class prompts `labels_intent_tok`; the softmax outputs of all sampled
    frames are averaged per class.

    Args:
        video_file: path of the video file, passed to cv2.VideoCapture.

    Returns:
        A list with one mean probability per entry of `labels_intent`. If no
        frame could be sampled (file unreadable or shorter than one sampling
        interval) a warning is issued and a list of NaN values of the same
        length is returned, so one bad file does not abort a batch run.
    """
    import warnings

    # initialize video capturing object
    cap = cv2.VideoCapture(video_file)

    try:
        # extract fps to set interval between frames to be considered
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        # frame interval -> every n = 2 seconds, a frame is considered in prediction;
        # a reported fps of 0 would make the modulo below divide by zero, so the
        # interval is clamped to 1 (= every frame) in that case
        frame_interval = max(1, 2 * fps)

        # initialize counter and list of per-frame class probabilities
        counter = 0
        intent_list = []

        # loop through video
        while True:
            ret, frame = cap.read()

            counter += 1

            if not ret:
                break

            if counter % frame_interval != 0:
                continue

            # feature extraction (OpenCV delivers BGR, the preprocessor gets an RGB PIL image)
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_pre_process = intent_preprocess(Image.fromarray(img)).unsqueeze(0).to(device)

            # prediction
            with torch.no_grad():
                logits, _ = intent_model(img_pre_process, labels_intent_tok)
                p = logits.softmax(dim=-1)

            intent_list.append(p[0].tolist())
    finally:
        # release the video capture object even if a frame fails to process
        # (no GUI windows are opened here, so there is nothing else to close)
        cap.release()

    if not intent_list:
        warnings.warn(f"No frames sampled from {video_file}; returning NaN probabilities")
        return [float('nan')] * len(labels_intent)

    return list(np.array(intent_list).mean(0))

In [None]:
### Apply to all videos ###
###########################

video_id = []
intent_video_list = []

for video_file in tqdm(os.listdir('./Video')):

  # the video id is the file name without its extension; splitext handles
  # extensions of any length ('.mp4', '.webm', ...) instead of assuming 3 characters
  video_id.append(os.path.splitext(video_file)[0])
  intent_video_list.append(intent_detection(os.path.join('./Video', video_file)))

100%|██████████| 182/182 [05:17<00:00,  1.75s/it]


In [None]:
### Create final dataframe for intent detection ###
###################################################

# one probability column per intent class, followed by the id of the video
intent_columns = [f"p_intent_{l}" for l in labels_intent]
intent_df = pd.DataFrame(intent_video_list, columns=intent_columns).assign(video_id=video_id)

### Memorability classification

In [None]:
### Installations memorability classification ###
#################################################

# Replace the preinstalled transformers package with the pinned release 4.28.0,
# the version the recorded run of this notebook installed.
# NOTE(review): the explicit uninstall looks redundant, since installing a pinned
# version replaces an existing one — confirm before removing it.
!pip uninstall -y transformers
!pip install transformers==4.28.0

[0mCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 tokenizers-0.13.3 transformers-4.28.0


In [None]:
### Imports memorability classification ###
###########################################

from transformers import ViTForImageClassification, ViTFeatureExtractor

In [None]:
### Load model ###
##################

# directory both the feature extractor and the model are loaded from
mem_model_path = os.path.join(model_dir, 'best_memorability_model')

# classes (the position in this list is the class index used by the model head)
labels_mem = ['high_mem', 'low_mem', 'medium_mem']

# feature extractor
mem_feature_extractor = ViTFeatureExtractor.from_pretrained(mem_model_path)

# model, with index <-> label lookups derived from the class list
mem_model = ViTForImageClassification.from_pretrained(
    mem_model_path,
    num_labels=len(labels_mem),
    id2label=dict((str(idx), name) for idx, name in enumerate(labels_mem)),
    label2id=dict((name, str(idx)) for idx, name in enumerate(labels_mem)),
)



In [None]:
### Function to apply memorability classification to individual frames of the video ###
#######################################################################################

def mem_detection(video_file):
    """Compute a scalar memorability score for a video from sampled frames.

    Every (2 * fps)-th frame is flipped from BGR to RGB channel order, passed
    through `mem_feature_extractor` and classified by `mem_model`. The softmax
    outputs of all sampled frames are averaged per class and collapsed into one
    number by weighting the classes in model output order with [2, 0, 1]
    (with `labels_mem` = high, low, medium this is high=2, low=0, medium=1)
    and dividing by 3.

    Args:
        video_file: path of the video file, passed to cv2.VideoCapture.

    Returns:
        The weighted score as a float. If no frame could be sampled (file
        unreadable or shorter than one sampling interval) a warning is issued
        and NaN is returned, so one bad file does not abort a batch run.
    """
    import warnings

    # initialize video capturing object
    cap = cv2.VideoCapture(video_file)

    try:
        # extract fps to set interval between frames to be considered
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        # frame interval -> every n = 2 seconds, a frame is considered in prediction;
        # a reported fps of 0 would make the modulo below divide by zero, so the
        # interval is clamped to 1 (= every frame) in that case
        frame_interval = max(1, 2 * fps)

        # initialize counter and list of per-frame class probabilities
        counter = 0
        mem_list = []

        # loop through video
        while True:
            ret, frame = cap.read()

            counter += 1

            if not ret:
                break

            if counter % frame_interval != 0:
                continue

            # feature extraction (reversing the last axis turns OpenCV's BGR into RGB)
            inp = mem_feature_extractor(frame[:, :, ::-1], return_tensors='pt')

            # prediction
            with torch.no_grad():
                logits = mem_model(inp['pixel_values'])['logits']

            mem_list.append(logits.softmax(dim=-1)[0].tolist())
    finally:
        # release the video capture object even if a frame fails to process
        # (no GUI windows are opened here, so there is nothing else to close)
        cap.release()

    if not mem_list:
        warnings.warn(f"No frames sampled from {video_file}; returning NaN memorability score")
        return float('nan')

    # NOTE(review): with weights [2, 0, 1] and a divisor of 3 the score tops out at 2/3;
    # kept unchanged so scores stay comparable with earlier runs — confirm the intended scale.
    return (np.array(mem_list).mean(0) @ np.array([2, 0, 1])) / 3

In [None]:
### Apply to all videos ###
###########################

video_id = []
mem_video_list = []

for video_file in tqdm(os.listdir('./Video')):

  # the video id is the file name without its extension; splitext handles
  # extensions of any length ('.mp4', '.webm', ...) instead of assuming 3 characters
  video_id.append(os.path.splitext(video_file)[0])
  mem_video_list.append(mem_detection(os.path.join('./Video', video_file)))

100%|██████████| 182/182 [19:40<00:00,  6.49s/it]


In [None]:
### Create final dataframe for memorability classification ###
##############################################################

# one memorability score per video, next to the id of the video
mem_df = pd.DataFrame({'mem_score': mem_video_list, 'video_id': video_id})

### Audio mood classification

In [None]:
### Imports audio mood classification ###
#########################################

import librosa
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

In [None]:
### Load model ###
##################

# directory both the feature extractor and the model are loaded from
audio_mood_model_path = os.path.join(model_dir, 'best_audio_mood_model')

# audio mood classes (used later to name the probability columns)
audio_mood_classes = ['Q1', 'Q2', 'Q3', 'Q4']

# Load feature extractor
sound_feature_extractor = AutoFeatureExtractor.from_pretrained(audio_mood_model_path)

# Load model
sound_model = AutoModelForAudioClassification.from_pretrained(audio_mood_model_path)

In [None]:
### Function to apply audio mood detection to audio files ###
#############################################################

def audio_mood_detection(audio_file):
  """Average the audio-mood class probabilities over four excerpts of a file.

  The file is loaded at 16 kHz and cut into 9 segments of equal length; the
  segments starting at boundaries 0, 3, 5 and 7 are classified by
  `sound_model` and the softmax outputs are averaged per class.

  Args:
      audio_file: path of the audio file, passed to librosa.load.

  Returns:
      A list with one mean probability per model output class.
      (Presumably ordered like `audio_mood_classes` — verify against the model config.)
  """

  # sampling rate handed to the feature extractor
  target_sr = 16000

  # load audio data directly at the target rate: librosa resamples once on load,
  # instead of loading at its 22050 Hz default and resampling a second time
  waveform, _ = librosa.load(audio_file, sr=target_sr)

  # 10 evenly spaced boundaries -> 9 segments of equal length
  parts = np.linspace(0, len(waveform), 10)

  # loop to consider each selected segment individually
  audio_list = []

  for i in [0, 3, 5, 7]:

    # turn the segment between boundary i and i+1 into model input
    segment = waveform[int(parts[i]): int(parts[i+1])]
    inp = sound_feature_extractor(segment, sampling_rate=target_sr, return_tensors="pt")

    # extract class probabilities from output logits
    with torch.no_grad():
      class_probs = sound_model(**inp).logits.softmax(dim = -1)[0]

    audio_list.append(class_probs.tolist())

  # return mean class probabilities over all considered segments
  return list(np.array(audio_list).mean(0))

In [None]:
### Apply sound detection to all audio files ###
################################################

video_id = []
audio_mood_video_list = []

for audio_file in tqdm(os.listdir('./Audio')):

  # the id is the file name without its extension; splitext handles extensions of
  # any length, so the ids stay comparable with those derived from the video files
  video_id.append(os.path.splitext(audio_file)[0])
  audio_mood_video_list.append(audio_mood_detection(os.path.join('./Audio', audio_file)))

100%|██████████| 182/182 [33:47<00:00, 11.14s/it]


In [None]:
### Create final dataframe for sound detection ###
##################################################

# map the positional column index of each class to its output column name
audio_mood_name_dict = {idx: f"p_audio_mood_{name}" for idx, name in enumerate(audio_mood_classes)}

# build the frame, name the probability columns and append the video_id column
audio_mood_df = (
    pd.DataFrame(audio_mood_video_list)
    .rename(columns=audio_mood_name_dict)
    .assign(video_id=video_id)
)

In [None]:
### Merge all 3 dataframes into one ###
#######################################

# merge on video_id (pandas' default inner join: only ids present in all three frames are kept)
merged_features = intent_df.merge(mem_df, on='video_id')
merged_features = merged_features.merge(audio_mood_df, on='video_id')

# move video_id column to beginning, all other columns keep their order
ordered_columns = ['video_id'] + [col for col in merged_features.columns if col != 'video_id']
high_level_features = merged_features[ordered_columns]

In [None]:
### Save as csv file ###
########################

# write the csv into the working directory first, then copy it to the Drive output folder
# NOTE(review): to_csv is called with its defaults, so the row index is written as an
# unnamed first column — confirm downstream readers expect that.
csv_path = f'./high_level_features_{data_file}.csv'
high_level_features.to_csv(csv_path)
shutil.copy(csv_path, save_dir)

'/content/drive/MyDrive/0_Masterarbeit/2_Pipelines/Feature_outputs/high_level_features_ferrari.csv'