In [None]:
# Colab setup cell: mounts Google Drive and moves into the dataset folder.
# NOTE(review): OneHotEncoder is not used anywhere in the visible cells — confirm it is still needed.
from sklearn.preprocessing import OneHotEncoder

from google.colab import drive
# Make Google Drive available under /content/drive.
drive.mount('/content/drive')
# Every relative path used later in the notebook is resolved against this folder.
%cd /content/drive/MyDrive/dataset/
# Step into the folder of unprocessed videos, delete any .jpeg files in it
# (destructive, no confirmation), then return to the dataset folder.
%cd not_trained_videos
%rm -rf -d *.jpeg
%cd ..

In [None]:
# Video tooling: MoviePy (pinned to 1.0.3) for audio extraction and
# PySceneDetect for scene-cut detection.
# NOTE(review): scenedetect is not pinned, so a fresh runtime may pull a different version.
!pip install moviepy==1.0.3
!pip install scenedetect

In [None]:
# Install the audio stack: Spleeter (source separation), PyTorch, madmom
# (DBN beat tracking) and the Beat-Transformer repository with its checkpoints.
# NOTE(review): OneHotEncoder is not used anywhere in the visible cells — confirm it is still needed.
#Spleeter
from sklearn.preprocessing import OneHotEncoder

!apt install ffmpeg
!pip install spleeter
#Pytorch (wheels taken from the CUDA 11.3 index)
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
#madmom (Cython and mido are installed first, presumably as build/runtime prerequisites)
!pip install Cython
!pip install mido
!pip install madmom
#Beat Transformer model (cloned into ./Beat-Transformer; checkpoints are read from ./Beat-Transformer/checkpoint later)
!git clone --branch=main https://github.com/zhaojw1998/Beat-Transformer

# NOTE(review): a later cell does `from DilatedTransformer import ...`. With the
# moves below commented out, that import presumably relies on the two module
# files already sitting in the working directory — confirm on a fresh runtime.
#import shutil
#shutil.move('./Beat-Transformer/code/DilatedTransformer.py', './DilatedTransformer.py') 
#shutil.move('./Beat-Transformer/code/DilatedTransformerLayer.py', './DilatedTransformerLayer.py') 

In [None]:
from __future__ import print_function
import os
from os import listdir
from os.path import isfile, join
from scenedetect import open_video, SceneManager, StatsManager, AdaptiveDetector
from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.frame_timecode import FrameTimecode
from scenedetect.stats_manager import StatsManager
from scenedetect.detectors import ContentDetector, AdaptiveDetector

#Load models and dependencies


In [None]:
#General dependencies
from google.colab import files
import IPython.display as ipd
import numpy as np 
import librosa
import torch
#Models to load
from spleeter.audio.adapter import AudioAdapter
from spleeter.separator import Separator
from DilatedTransformer import Demixed_DilatedTransformerModel
from madmom.features.beats import DBNBeatTrackingProcessor
from madmom.features.downbeats import DBNDownBeatTrackingProcessor

# --- Spleeter: demixing front-end -------------------------------------------
# 5-stem separator plus the audio loader used to read files for it.
separator = Separator('spleeter:5stems')
# Mel filterbank, transposed so a spectrogram can be right-multiplied by it.
mel_f = librosa.filters.mel(
    sr=44100, n_fft=4096, n_mels=128, fmin=30, fmax=11000
).T
audio_loader = AudioAdapter.default()

# --- Beat Transformer: (down-)beat activations from the demixed input --------
model_config = dict(
    attn_len=5, instr=5, ntoken=2,
    dmodel=256, nhead=8, d_hid=1024,
    nlayers=9, norm_first=True,
)
model = Demixed_DilatedTransformerModel(**model_config)

# Checkpoint file for each of the eight folds shipped with the repository.
PARAM_PATH = {
    0: "./Beat-Transformer/checkpoint/fold_0_trf_param.pt",
    1: "./Beat-Transformer/checkpoint/fold_1_trf_param.pt",
    2: "./Beat-Transformer/checkpoint/fold_2_trf_param.pt",
    3: "./Beat-Transformer/checkpoint/fold_3_trf_param.pt",
    4: "./Beat-Transformer/checkpoint/fold_4_trf_param.pt",
    5: "./Beat-Transformer/checkpoint/fold_5_trf_param.pt",
    6: "./Beat-Transformer/checkpoint/fold_6_trf_param.pt",
    7: "./Beat-Transformer/checkpoint/fold_7_trf_param.pt",
}

# --- madmom DBN trackers: turn activations into beat / downbeat times --------
# Both trackers share the same tempo range, frame rate and DBN settings.
dbn_settings = dict(
    min_bpm=55.0, max_bpm=215.0, fps=44100/1024,
    transition_lambda=100, observation_lambda=6,
    num_tempi=None, threshold=0.2,
)
beat_tracker = DBNBeatTrackingProcessor(**dbn_settings)
# The downbeat tracker additionally considers bars of 3 or 4 beats.
downbeat_tracker = DBNDownBeatTrackingProcessor(beats_per_bar=[3, 4], **dbn_settings)

## Scene change detection

In [None]:
from scenedetect import SceneManager, open_video, ContentDetector
from scenedetect import open_video, ContentDetector, SceneManager, StatsManager, AdaptiveDetector
from scenedetect.backends import VideoCaptureAdapter
import os
import shutil
from os import listdir
from os.path import isfile, join
import moviepy.editor as mp

# A video is kept only when it has MORE than this many detected scenes.
min_required_scenes = 5
def find_scenes(mypath):
  """Detect scene cuts in every mp4 file of a folder and sort the videos.

  Each regular file in `mypath` whose name ends with "mp4" is analysed with
  PySceneDetect's AdaptiveDetector. Depending on the result the video is:

  * kept - more than `min_required_scenes` scenes and an audio track: the
    audio is written to ./audios/<name>.mp3, the scene start times are added
    to the result, and the video is moved to ./already_trained_videos/;
  * discarded - too few scenes, or no audio track: the video is moved to
    ./trash/.

  Files are processed in sorted name order so the returned list has a
  deterministic order (os.listdir itself returns names in arbitrary order).

  Args:
    mypath: directory containing the videos, with or without a trailing
      path separator.

  Returns:
    A list with one entry per kept video, each entry being the list of scene
    start times in seconds.
  """
  all_scenes = []
  files = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))
  print(files)
  for vid in files:
    if not vid.endswith("mp4"):
      continue
    # Build the path with os.path.join so `mypath` need not end with a slash.
    video_path = os.path.join(mypath, vid)
    print("Extracting audio and Analyzing video: "+ vid)

    stats_manager = StatsManager()
    # Pass StatsManager to SceneManager to accelerate computing time
    scene_manager = SceneManager(stats_manager)
    # Add AdaptiveDetector algorithm (each detector's constructor
    # takes detector options, e.g. threshold).
    scene_manager.add_detector(AdaptiveDetector())
    video = open_video(video_path, backend = 'opencv')
    scene_manager.detect_scenes(video)
    detected = scene_manager.get_scene_list()
    # Keep only the start time (in seconds) of every detected scene.
    formatted_scenes = [scene[0].get_seconds() for scene in detected]

    keep = len(formatted_scenes) > min_required_scenes
    if keep:
      # Open the clip only when its audio is actually needed, and always
      # close it again before the file is moved.
      my_clip = mp.VideoFileClip(video_path)
      try:
        if my_clip.audio is None:
          # No audio track: nothing to run beat tracking on.
          print("no audio track found in " + vid)
          keep = False
        else:
          os.makedirs('./audios', exist_ok=True)
          my_clip.audio.write_audiofile('./audios/' + vid+'.mp3')
      finally:
        my_clip.close()

    if keep:
      all_scenes.append(formatted_scenes)
      os.makedirs('./already_trained_videos', exist_ok=True)
      shutil.move(video_path, './already_trained_videos/'+vid)
    else:
      os.makedirs('./trash', exist_ok=True)
      shutil.move(video_path, './trash/'+vid)
      print("moved to trash")
  return all_scenes

In [None]:
# Analyse every video waiting in ./not_trained_videos/. Note that this call
# also moves each analysed video out of that folder, so re-running the cell
# only processes videos that are still there.
scenes = find_scenes('./not_trained_videos/')

## Load test music audio

In [None]:
# Paths of all extracted audio files. os.listdir() returns names in arbitrary
# order, and later cells pair per-video lists by position, so the list is
# sorted to give it a stable, reproducible order.
AUDIO_DIRS = sorted('./audios/'+f for f in listdir('./audios/') if isfile(join('./audios/', f)))
print(AUDIO_DIRS)
# Load the first 15 seconds of the first file and render an audio player for it.
audio, sr = librosa.load(AUDIO_DIRS[0], duration=15)
ipd.Audio(audio, rate=sr)

['./audios/selindr___1675882791_7197861408682413318.mp4.mp3', './audios/selindr___1675792677_7197474315426696453.mp4.mp3', './audios/selindr___1675539561_7196387102307765509.mp4.mp3', './audios/selindr___1675367911_7195650078491561222.mp4.mp3', './audios/selindr___1675276520_7195257329837821190.mp4.mp3', './audios/selindr___1675182656_7194854647067135238.mp4.mp3', './audios/selindr___1675105626_7194523527746227462.mp4.mp3', './audios/selindr___1675021612_7194162512407923974.mp4.mp3', './audios/selindr___1674934392_7193788019218877702.mp4.mp3', './audios/selindr___1674676779_7192681380923378949.mp4.mp3', './audios/selindr___1674586741_7192294782499163397.mp4.mp3', './audios/selindr___1674502235_7191931957855423750.mp4.mp3', './audios/selindr___1674330392_7191193703493537030.mp4.mp3', './audios/selindr___1674243381_7190820119101164805.mp4.mp3', './audios/selindr___1674068294_7190068294311775493.mp4.mp3', './audios/selindr___1674044661_7189966249613430021.mp4.mp3', './audios/selindr___167

In [None]:
# Sanity check: number of audio files versus number of scene lists.
# Later cells pair these two collections by index.
print(len(AUDIO_DIRS))
print(len(scenes))

## Beat and downbeat tracking


In [None]:
import librosa
import numpy as np
# Sample rate requested when loading audio for the separation / beat-tracking stage.
dbn_sampling_rate = 44100
# STFT hop size and window size (in samples) for the per-beat spectrogram features.
hop_length = 512
n_fft = 2048

def get_spectrogram_value(waveform, S_db, time, sr):
    """Return the spectrogram column that corresponds to a point in time.

    Args:
        waveform: audio samples the spectrogram was computed from.
        S_db: 2-D spectrogram indexed as [frequency_bin, frame], computed
            with the module-level `hop_length`.
        time: position in the audio, in seconds.
        sr: sample rate of `waveform` in Hz.

    Returns:
        A 1-D numpy array holding column `int(sr * time / hop_length)` of
        `S_db`, or the one-element list `[-1]` when `time` is negative, lies
        beyond the end of the audio, or maps to a frame outside `S_db`.
    """
    # A negative time would turn into a negative frame index and silently
    # read a column from the END of the spectrogram.
    if time < 0:
        return [-1]

    # Pass the real sample rate: without it librosa assumes 22050 Hz and the
    # duration is wrong for audio loaded at any other rate.
    audio_duration = librosa.get_duration(y=waveform, sr=sr)
    if time > audio_duration:
        return [-1]

    # Index of the STFT frame that contains the requested time.
    frame_index = int(sr * time / hop_length)
    if frame_index >= S_db.shape[1]:
        return [-1]

    return np.array(S_db[:, frame_index])

In [None]:
# Per-video results, in the same order as AUDIO_DIRS:
#   beats_encoding_per_vid[k] - list of feature vectors, one per encoded beat
#   beats_times_per_vid[k]    - time of each encoded beat (same length)
beats_encoding_per_vid = []
beats_times_per_vid = []
# After the loop these hold the raw tracker output of the LAST audio file.
dbn_beat_pred = []
dbn_downbeat_pred = []

# Loading the checkpoint and placing the model on a device does not depend on
# the audio file, so it is done once instead of on every iteration.
FOLD = 4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(torch.load(PARAM_PATH[FOLD], map_location=torch.device('cpu'))['state_dict'])
model.to(device)
model.eval()

for audio_sample in AUDIO_DIRS:
  # --- Demix and build the per-stem mel spectrogram fed to the model ---
  waveform, _ = audio_loader.load(audio_sample, sample_rate = dbn_sampling_rate)
  x = separator.separate(waveform)
  x = np.stack([np.dot(np.abs(np.mean(separator._stft(x[key]), axis=-1))**2, mel_f) for key in x])
  x = np.transpose(x, (0, 2, 1))
  x = np.stack([librosa.power_to_db(x[i], ref=np.max) for i in range(len(x))])
  x = np.transpose(x, (0, 2, 1))

  # --- Beat / downbeat activations ---
  with torch.no_grad():
    model_input = torch.from_numpy(x).unsqueeze(0).float().to(device)
    activation, _ = model(model_input)
  beat_activation = torch.sigmoid(activation[0, :, 0]).detach().cpu().numpy()
  downbeat_activation = torch.sigmoid(activation[0, :, 1]).detach().cpu().numpy()

  # Beat positions decoded from the beat activation by the DBN tracker.
  dbn_beat_pred = beat_tracker(beat_activation)

  # The downbeat tracker takes two columns per frame:
  # max(beat - downbeat, 0) and the downbeat activation.
  combined_act = np.concatenate((np.maximum(beat_activation - downbeat_activation,
                                            np.zeros(beat_activation.shape)
                                            )[:, np.newaxis],
                                 downbeat_activation[:, np.newaxis]
                                 ), axis=-1)   #(T, 2)
  dbn_downbeat_pred = downbeat_tracker(combined_act)
  # Keep the first column of the rows whose second column equals 1.
  dbn_downbeat_pred = dbn_downbeat_pred[dbn_downbeat_pred[:, 1]==1][:, 0]

  # --- Spectrogram features ---
  # Reload the audio with librosa's defaults for the STFT features.
  waveform, sr = librosa.load(audio_sample)
  S = librosa.stft(waveform, n_fft = n_fft, hop_length=hop_length)
  S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
  # Normalise each frequency bin (row) of the dB spectrogram.
  stft_db_normalized = librosa.util.normalize(S_db, axis=1)

  beats_encoding = []
  kept_beat_times = []
  for beat_time in dbn_beat_pred:
    spectrogram_instance = get_spectrogram_value(waveform, stft_db_normalized, beat_time, sr)
    # The "no value" sentinel is the one-element [-1]. The length is checked
    # too, because a genuine normalised column may also start with -1.0.
    if len(spectrogram_instance) == 1 and spectrogram_instance[0] == -1:
      continue
    # The first feature is the beat type: 2 for a downbeat, 1 for any other beat.
    # NOTE(review): this relies on exact float equality between the outputs of
    # two separate trackers — confirm downbeats are not being missed.
    beat_label = 2 if beat_time in dbn_downbeat_pred else 1
    beats_encoding.append(np.insert(spectrogram_instance, 0, beat_label))
    kept_beat_times.append(beat_time)

  # The extracted audio file is deleted once processed, so this cell cannot
  # be re-run without regenerating the audio.
  os.remove(audio_sample)
  # Store only the beats that were actually encoded, so beat times (and the
  # labels derived from them) stay the same length as the feature list.
  beats_times_per_vid.append(np.array(kept_beat_times))
  beats_encoding_per_vid.append(beats_encoding)

In [None]:
# Spot check: feature vector of the fifth encoded beat of the first audio file
# (beat-type label first, followed by the spectrogram column).
beats_encoding_per_vid[0][4]

In [None]:
# Both variables are overwritten on every loop iteration above, so this shows
# the beat and downbeat predictions of the last processed audio file only.
print(dbn_beat_pred)
print(dbn_downbeat_pred)

## Create dataset

In [None]:
# Number of videos for which scene start times were collected.
len(scenes)

In [None]:
def create_Y(scene_times,dbn_beat_pred, threshold = 0.5):
  """Label each beat with 1 if a scene change falls close to it, else 0.

  Scene times and beat times are walked through together, so both are
  expected in ascending order. For every scene time the first not-yet-passed
  beat strictly inside (time - threshold, time + threshold) is labelled 1.
  A scene time with no such beat leaves all labels untouched.

  Args:
    scene_times: scene-change times in seconds.
    dbn_beat_pred: beat times in seconds.
    threshold: half-width in seconds of the matching window around each
      scene time.

  Returns:
    A numpy array with one label per beat. It is empty when there are no beats.
  """
  n_beats = len(dbn_beat_pred)
  Y = [0] * n_beats
  j = 0

  for time_value in scene_times:
    # Skip the beats that lie entirely before this scene's window.
    while j < n_beats and dbn_beat_pred[j] < time_value - threshold:
      j += 1
    # All beats are used up (or there were none): nothing left to label.
    # Without this bound check the lookup below raises IndexError.
    if j >= n_beats:
      break

    beat = dbn_beat_pred[j]
    if beat > time_value + threshold:
      # The next beat is already past the window: this scene has no match.
      continue
    if time_value - threshold < beat < time_value + threshold:
      Y[j] = 1
      continue
    # The beat sits exactly on a window edge: it is not a match, move past it.
    j += 1
  return np.array(Y)

In [None]:
# Spot check on the first video. Scenes and beats must come from the SAME
# video index, otherwise the labels are meaningless (and a hard-coded high
# index fails when fewer videos were processed). The first scene start is
# dropped via [1:], as in the dataset-building cell.
create_Y(scenes[0][1:],beats_times_per_vid[0],0.5)

In [None]:
# Sanity check before pairing by index: the number of scene lists should
# match the number of beat-time arrays.
print(len(scenes))
print(len(beats_times_per_vid))

In [None]:
# Build the target side of the dataset: one 0/1 label vector per video,
# obtained by matching that video's scene changes against its beat times.
# The first scene start of every video is dropped ([1:]) before matching
# (presumably because it marks the start of the video rather than a cut).
scenes_per_vid = [
  create_Y(scenes[video_idx][1:], beats_times_per_vid[video_idx], 0.5)
  for video_idx in range(len(beats_encoding_per_vid))
]
len(scenes_per_vid)

In [None]:
# Turn every feature value into a plain Python float (video -> beat -> value).
beats_encoding_per_vid = [
  [list(map(float, beat_vector)) for beat_vector in video_beats]
  for video_beats in beats_encoding_per_vid
]
# Number of beat times recorded for each video.
lengths = [len(beat_times) for beat_times in beats_times_per_vid]

In [None]:
import pandas as pd
# Wrap each per-video collection in a DataFrame (the names are rebound, so
# from here on they refer to DataFrames rather than lists).
beats_encoding_per_vid = pd.DataFrame(beats_encoding_per_vid)
scenes_per_vid = pd.DataFrame(scenes_per_vid)
lengths = pd.DataFrame(lengths)

# Write the three tables as header-less, index-less CSV files, overwriting
# any existing file of the same name.
for frame, csv_path in (
    (beats_encoding_per_vid, './beats_encoding_per_vid_normalized_05.csv'),
    (scenes_per_vid, './scenes_per_vid_normalized_05.csv'),
    (lengths, './lengths_normalized.csv'),
):
    frame.to_csv(csv_path, index=False, mode='w', header=False)