In [41]:
import yt_dlp
import os
import gc

import cv2
import torch
import torchvision.transforms as transforms
from torchvision.models import inception_v3
import numpy as np
from sklearn.decomposition import PCA

import torchaudio
from torchaudio.prototype.pipelines import VGGISH

import ffmpeg
from pydub import AudioSegment
from moviepy import AudioFileClip
import tqdm
import argparse

from transformers import EncodecModel, AutoProcessor
import librosa

In [42]:
def delete_file(filename):
    if os.path.exists(filename):
        os.remove(filename)
        print(f"The file {filename} has been deleted.")
    else:
        print(f"The file {filename} does not exist.")


def extract_video_features(video_path, max_frames=360):

    # Define preprocessing transformations
    preprocess = transforms.Compose([
      transforms.ToPILImage(),
      transforms.Resize((299, 299)),
      transforms.ToTensor(),
      # normalize based on mean and standard deviation of imagenet dataset
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Open video file
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)

    features = []
    frame_count = 0

    with torch.no_grad():  # No gradients needed for inference
      while cap.isOpened() and frame_count < max_frames*fps:
          ret, frame = cap.read()
          if not ret:
              break

          # Process one frame per second
          if frame_count % int(fps) == 0:
              # Preprocess the frame
              frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
              input_tensor = preprocess(frame).unsqueeze(0)  # Add batch dimension

              # Extract features using the InceptionV3 model
              feature_vector = inception(input_tensor)
              features.append(feature_vector.squeeze(0).cpu().numpy())  # Convert to numpy array

          frame_count += 1

    cap.release()
    return np.array(features)


def extract_audio_features(audio_path):   
    # Load and preprocess audio
    waveform, sample_rate = torchaudio.load(audio_path)
    
    # Resample if necessary
    if sample_rate != VGGISH.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, VGGISH.sample_rate)

    waveform = waveform.mean(dim=0)
    
    # Process input
    input_batch = vggish_input_processor(waveform)
    
    # Extract features
    with torch.no_grad():
        features = vggish(input_batch)
    
    return features.numpy()



In [3]:
video_url = 'https://www.youtube.com/watch?v=9bZkp7q19f0'

vid = video_url.split('=')[1]
vid = '9bZkp7q19f0'

In [13]:
ydl_opts = {
    'format': 'bestvideo+bestaudio/best',  # Download best video + best audio
    'outtmpl': f'{vid}.%(ext)s',  # Customize output filename
    'noplaylist': True,  # Ensure only the video itself is downloaded, not a playlist
    'postprocessor_args': [
        '-ss', '00:00:00',  # Start from the beginning of the video
        '-t', '360',  # Limit to 360 seconds (6 minutes)
    ]
}

# download youtube video
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])


# once we download all training features, we have to do pca whitening
#video_feats = extract_video_features(f'/{vid}.mp4')

[youtube] Extracting URL: https://www.youtube.com/watch?v=9bZkp7q19f0
[youtube] 9bZkp7q19f0: Downloading webpage
[youtube] 9bZkp7q19f0: Downloading tv client config
[youtube] 9bZkp7q19f0: Downloading player 20830619
[youtube] 9bZkp7q19f0: Downloading tv player API JSON
[youtube] 9bZkp7q19f0: Downloading ios player API JSON




[youtube] 9bZkp7q19f0: Downloading m3u8 information
[info] 9bZkp7q19f0: Downloading 1 format(s): 616+234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 48
[download] Destination: 9bZkp7q19f0.f616.mp4
[download] 100% of  134.21MiB in 00:00:28 at 4.78MiB/s                  
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 48
[download] Destination: 9bZkp7q19f0.f234.mp4
[download] 100% of    3.92MiB in 00:00:12 at 330.43KiB/s                
[Merger] Merging formats into "9bZkp7q19f0.mp4"
Deleting original file 9bZkp7q19f0.f616.mp4 (pass -k to keep)
Deleting original file 9bZkp7q19f0.f234.mp4 (pass -k to keep)


In [None]:
# load the model + processor (for pre-processing the audio)
encodec = EncodecModel.from_pretrained("facebook/encodec_24khz")
codebook_size = encodec.config.codebook_size


In [None]:
audio = AudioFileClip(f'{vid}.mp4')

# Write audio to MP3 file
audio.write_audiofile( f'{vid}.mp3')

In [14]:
# load the model + processor (for pre-processing the audio)
encodec = EncodecModel.from_pretrained("facebook/encodec_24khz")
codebook_size = encodec.config.codebook_size

processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")
audio_sample, sample_rate = librosa.load(f'{vid}.mp4', sr=processor.sampling_rate, duration=10)

  audio_sample, sample_rate = librosa.load(f'{vid}.mp4', sr=processor.sampling_rate, duration=10)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [9]:
# pre-process the audio inputs
inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

# explicitly encode then decode the audio inputs
encoder_outputs = encodec.encode(inputs["input_values"], inputs["padding_mask"])


In [12]:
# Save flattened audio codes
encoder_outputs.audio_codes.flatten().numpy()

array([ 62,  62,  62, ..., 688, 101, 365])

In [16]:
import IPython.display as ipd
ipd.Audio(audio_sample, rate=processor.sampling_rate)

### Video Feature Extraction

In [8]:
video_path = r'9bZkp7q19f0.mp4'
inception = inception_v3(pretrained=True, transform_input=False)
inception.fc = torch.nn.Identity()  # Remove the classification layer (we only need features)
inception.eval()  # Set the model to evaluation mode
print('Loaded InceptionV3')




Loaded InceptionV3


In [12]:
import cv2
import torch
import numpy as np
from torchvision import transforms

# Define preprocessing transformations for InceptionV3
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Open video file
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
num_seconds = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) // fps)
num_seconds = 30

# Initialize variables
features = np.zeros((num_seconds, 2048))  # Assuming InceptionV3 features are of size 2048
optical_flow_features = np.zeros(num_seconds)
frame_count = 0
prev_gray = None
optical_flow_accumulated = []  # To accumulate optical flow magnitudes for the current second
current_second = 0

# Calculate step to compute optical flow 4 times per second (for 1/4th intervals)
optical_flow_step = int(fps / 4)  # Compute optical flow every 1/4th of a second

with torch.no_grad():  # No gradients needed for inference
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the frame to grayscale for optical flow calculation
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Compute optical flow if there is a previous frame, but only for 4 intervals per second
        if prev_gray is not None and (frame_count % optical_flow_step == 0):
            flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)

            # Compute the magnitude of optical flow
            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])

            # Accumulate the magnitude of optical flow
            optical_flow_accumulated.append(np.mean(magnitude))  # Store the average magnitude for the current frame

        # Process one frame per second for feature extraction
        if frame_count % int(fps) == 0:
            # Preprocess the frame for InceptionV3
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
            input_tensor = preprocess(frame_rgb).unsqueeze(0)  # Add batch dimension

            # Extract features using the InceptionV3 model (assuming 'inception' is already defined)
            feature_vector = inception(input_tensor)
            features[current_second] = feature_vector.squeeze(0).cpu().numpy()  # Store feature vector for the second

            # If we've accumulated optical flow data for the previous second, store it
            if len(optical_flow_accumulated) >= 4:  # After 4 intervals (1/4th per second)
                optical_flow_features[current_second] = np.mean(optical_flow_accumulated)  # Average over 4 intervals
                optical_flow_accumulated = []  # Reset for the next second interval
                current_second += 1  # Move to the next second

        # Update the previous grayscale frame
        prev_gray = gray
        frame_count += 1

        # Stop if we've processed the maximum number of seconds
        if current_second >= num_seconds:
            break

cap.release()

# Now `features` contains the InceptionV3 feature vectors for each second, and
# `optical_flow_features` contains the average optical flow magnitude for each second.


In [22]:
def extract_video_features(video_path, num_seconds=60):
  # Define preprocessing transformations for InceptionV3
  preprocess = transforms.Compose([
      transforms.ToPILImage(),
      transforms.Resize((299, 299)),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

  # Open video file
  cap = cv2.VideoCapture(video_path)
  fps = cap.get(cv2.CAP_PROP_FPS)
  

  # Initialize variables
  features = [] 
  optical_flow_features = []
  frame_count = 0
  prev_gray = None
  optical_flow_accumulated = []  # To accumulate optical flow magnitudes for the current second
  current_second = 0

  # Calculate step to compute optical flow 4 times per second (for 1/4th intervals)
  optical_flow_step = int(fps / 4)  # Compute optical flow every 1/4th of a second

  with torch.no_grad():  # No gradients needed for inference
      while cap.isOpened():
          ret, frame = cap.read()
          if not ret:
              break

          # Convert the frame to grayscale for optical flow calculation
          gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

          # Compute optical flow if there is a previous frame, but only for 4 intervals per second
          if prev_gray is not None and (frame_count % optical_flow_step == 0):
              flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)

              # Compute the magnitude of optical flow
              magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])

              # Accumulate the magnitude of optical flow
              optical_flow_accumulated.append(np.mean(magnitude))  # Store the average magnitude for the current frame

          # Process one frame per second for feature extraction
          if frame_count % int(fps) == 0:
              # Preprocess the frame for InceptionV3
              frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
              input_tensor = preprocess(frame_rgb).unsqueeze(0)  # Add batch dimension

              # Extract features using the InceptionV3 model (assuming 'inception' is already defined)
              feature_vector = inception(input_tensor)
              features.append(feature_vector.squeeze(0).cpu().numpy())  # Store feature vector for the second

              # If we've accumulated optical flow data for the previous second, store it
              if len(optical_flow_accumulated) >= 4:  # After 4 intervals (1/4th per second)
                  optical_flow_features.append(np.mean(optical_flow_accumulated))  # Average over 4 intervals
                  optical_flow_accumulated = []  # Reset for the next second interval
                  current_second += 1  # Move to the next second

          # Update the previous grayscale frame
          prev_gray = gray
          frame_count += 1

          # Stop if we've processed the maximum number of seconds
          if current_second >= num_seconds:
              break

  cap.release()


  return features, optical_flow_features

In [23]:
features, optical_flow_features = extract_video_features(video_path, num_seconds=10)

In [27]:
np.array(features).shape

(11, 2048)

In [29]:
np.array(optical_flow_features).shape

(10,)

In [31]:
optical_flow_features[0]

0.14966156

In [32]:
features[0]

array([0.08472932, 0.44540223, 0.28709668, ..., 0.03923059, 0.40963376,
       0.23256062], dtype=float32)

In [35]:
  with open('V2M-20k.txt', 'r') as file:
      content = file.read()  # Read the entire content of the file
  youtube_ids = content.split('\n')



In [38]:
vid = youtube_ids[0]

In [39]:
ydl_opts = {
    'format': 'bestvideo+bestaudio/best',  # Download best video + best audio
    'outtmpl': f'{vid}.%(ext)s',  # Customize output filename
    'noplaylist': True,  # Ensure only the video itself is downloaded, not a playlist
    'postprocessor_args': [
        '-ss', '00:00:00',  # Start from the beginning of the video
        '-t', '360',  # Limit to 360 seconds (6 minutes)
    ]
}

# download youtube video
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])


# once we download all training features, we have to do pca whitening
#video_feats = extract_video_features(f'/{vid}.mp4')

[youtube] Extracting URL: https://www.youtube.com/watch?v=9bZkp7q19f0
[youtube] 9bZkp7q19f0: Downloading webpage
[youtube] 9bZkp7q19f0: Downloading tv client config
[youtube] 9bZkp7q19f0: Downloading player 20830619
[youtube] 9bZkp7q19f0: Downloading tv player API JSON
[youtube] 9bZkp7q19f0: Downloading ios player API JSON




[youtube] 9bZkp7q19f0: Downloading m3u8 information
[info] 9bZkp7q19f0: Downloading 1 format(s): 616+234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 48
[download] Destination: ng9x47T3c_E.f616.mp4
[download] 100% of  134.21MiB in 00:00:24 at 5.47MiB/s                  
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 48
[download] Destination: ng9x47T3c_E.f234.mp4
[download] 100% of    3.92MiB in 00:00:03 at 1.07MiB/s                   
[Merger] Merging formats into "ng9x47T3c_E.mp4"
Deleting original file ng9x47T3c_E.f234.mp4 (pass -k to keep)
Deleting original file ng9x47T3c_E.f616.mp4 (pass -k to keep)
