In [4]:
import librosa
import numpy as np
import pandas as pd
import os
import cv2
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

In [6]:
def extract_audio_features(audio_path, sr=22050, n_mfcc=13):
    """Summarise an audio file as one mean value per MFCC coefficient.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``. The rate that
            ``load`` reports back is the one forwarded to the MFCC call.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The MFCC matrix averaged along axis 1, i.e. one mean value per row
        of the matrix returned by ``librosa.feature.mfcc``.
    """
    signal, sample_rate = librosa.load(audio_path, sr=sr)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    return coefficients.mean(axis=1)

In [37]:
def extract_video_features(video_path):
    """Compute one mean optical-flow value per pair of consecutive frames.

    Every frame read from the video is converted to grayscale, dense
    Farneback optical flow is computed against the previous grayscale frame,
    and the mean of the resulting flow array is recorded.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.

    Returns:
        A 1-D NumPy array with one entry per consecutive frame pair. The
        array is empty when the capture cannot be opened or fewer than two
        frames can be read.

    Raises:
        Whatever OpenCV raises while converting a frame or computing flow;
        the capture handle is released before the exception propagates.
    """
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    motion_vectors = []

    # try/finally guarantees the capture handle is released even when a frame
    # fails to convert or the flow computation raises part-way through.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            if prev_gray is not None:
                # Positional arguments after the two frames, in the order of
                # the documented OpenCV signature: flow=None, pyr_scale=0.5,
                # levels=3, winsize=15, iterations=3, poly_n=5,
                # poly_sigma=1.2, flags=0.
                flow = cv2.calcOpticalFlowFarneback(
                    prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0
                )
                # NOTE(review): this averages every element of the flow
                # array. Per the OpenCV docs the array holds signed (dx, dy)
                # components, so opposite motions can cancel out; confirm
                # that a signed mean (rather than mean magnitude) is intended.
                motion_vectors.append(np.mean(flow))

            prev_gray = gray
    finally:
        cap.release()

    return np.array(motion_vectors)


In [25]:
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

def compute_similarity(audio_features, video_features):
    """Return the FastDTW distance between two scalar feature sequences.

    Both inputs are flattened and compared element by element with the
    Euclidean distance, so a lower value means a closer match.

    Args:
        audio_features: Array-like of numbers; flattened before comparison.
        video_features: Array-like of numbers; flattened before comparison.

    Returns:
        The approximate DTW distance reported by ``fastdtw``, or
        ``float("inf")`` when either sequence is empty, so that an empty
        feature set can never be selected as the best match.
    """
    # Flatten each input into a column of shape (n, 1). With plain 1-D input
    # fastdtw hands 0-d scalars to `dist`, and recent SciPy releases reject
    # those in `euclidean` ("Input vector should be 1-D."). A column makes
    # every element a length-1 vector, giving the same |a - b| value.
    audio_seq = np.asarray(audio_features, dtype=float).reshape(-1, 1)
    video_seq = np.asarray(video_features, dtype=float).reshape(-1, 1)

    # An empty sequence cannot be aligned; report "infinitely far" instead of
    # letting fastdtw fail in the middle of a long matching run.
    if audio_seq.size == 0 or video_seq.size == 0:
        return float("inf")

    distance, _ = fastdtw(audio_seq, video_seq, dist=euclidean)
    return distance


In [11]:


audio_folder = "audio_only"
video_folder = "video_only"


def _list_media_files(folder):
    """Return the sorted names of the regular, non-hidden files in ``folder``."""
    return sorted(
        name
        for name in os.listdir(folder)
        if not name.startswith(".") and os.path.isfile(os.path.join(folder, name))
    )


# Only regular, visible files are kept: os.listdir also returns
# sub-directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store),
# which are not media files and must not reach the feature extractors.
audio_files = _list_media_files(audio_folder)
video_files = _list_media_files(video_folder)

# Accumulator for (audio_file, video_file) pairs filled by the matching cell.
results = []





In [12]:
# Audio features keyed by filename, one extract_audio_features call per file.
audio_features_dict = dict(
    (audio, extract_audio_features(os.path.join(audio_folder, audio)))
    for audio in audio_files
)

In [15]:
from tqdm import tqdm

# Video features keyed by filename, with a progress bar over the files.
# This is the slow step: the recorded run took about 85 minutes for 45 videos.
video_features_dict = dict(
    (video, extract_video_features(os.path.join(video_folder, video)))
    for video in tqdm(video_files, desc="Extracting video features")
)


Extracting video features: 100%|████████████████████████████████████████████████████| 45/45 [1:24:59<00:00, 113.32s/it]


In [None]:

# Match every audio file to the video with the smallest DTW distance.
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every row to a list left over from an earlier run.
results = []

for audio in tqdm(audio_files, desc="Matching audio to video"):
    # Loop-invariant lookup, hoisted out of the inner loop.
    audio_vector = audio_features_dict[audio]
    best_match = None
    best_score = float("inf")

    # Compare with each video file
    for video in video_files:
        score = compute_similarity(audio_vector, video_features_dict[video])

        # Strict '<' keeps the first video (in sorted order) on ties and
        # leaves best_match as None when no finite score is found.
        if score < best_score:
            best_score = score
            best_match = video

    results.append((audio, best_match))

# Save results as CSV
df = pd.DataFrame(results, columns=["audio_file", "video_file"])
df.to_csv("matched_audio_video.csv", index=False)
