# Audio Syncronizer

We have built a program which synchronises an audio clip to a video clip based on the tempo in the audio. The basic principle is, when the tempo in the audio clip is high, the video speeds up proportionally, similarly when the tempo is low, the video slows down proportionally. Additionally, we add a transition at the moment a beat occurs. This creates a symphony between the users auditory and visual sensory input, giving a more immersive experience of the output.

In [1]:
# Importing the required libraries

import os, sys
import numpy as np
import librosa
from scipy import signal
from scipy.interpolate import interp1d
from scipy.ndimage import filters
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import IPython.display as ipd
from numba import jit


import libfmp.b
import libfmp.c2
import libfmp.c3
import libfmp.c6
import wave
import contextlib

from moviepy.editor import *

%matplotlib inline
sys.path.append('..')

Now, we import the audio file and the video file in the  to be synchronised. We also specify the maximum allowed duration of the final clip (in seconds).

In [446]:
audio_clip = r"C:\Users\Suhayl Mahek\Downloads\test_8.wav"
video_clip = r"C:\Users\Suhayl Mahek\Downloads\lake.mp4"
duration = 20

Computing Fourier-based tempogram [FMP, Section 6.2.2]

    (Reference Notebook: C6/C6S2_TempogramFourier.ipynb)

    The function has follwing arguments (with type):
        x (np.ndarray): Input signal
        Fs (scalar): Sampling rate
        N (int): Window length
        H (int): Hop size
        Theta (np.ndarray): Set of tempi (given in BPM) (Default value = np.arange(30, 601, 1))

    THe function returns:
        X (np.ndarray): Tempogram
        T_coef (np.ndarray): Time axis (seconds)
        F_coef_BPM (np.ndarray): Tempo axis (BPM)

In [447]:
def compute_tempogram_fourier(x, Fs, N, H, Theta=np.arange(30, 601, 1)):
    win = np.hanning(N)
    N_left = N // 2
    L = x.shape[0]
    L_left = N_left
    L_right = N_left
    L_pad = L + L_left + L_right
    x_pad = np.concatenate((np.zeros(L_left), x, np.zeros(L_right)))
    t_pad = np.arange(L_pad)
    M = int(np.floor(L_pad - N) / H) + 1
    K = len(Theta)
    X = np.zeros((K, M), dtype=np.complex_)

    for k in range(K):
        omega = (Theta[k] / 60) / Fs
        exponential = np.exp(-2 * np.pi * 1j * omega * t_pad)
        x_exp = x_pad * exponential
        for n in range(M):
            t_0 = n * H
            t_1 = t_0 + N
            X[k, n] = np.sum(win * x_exp[t_0:t_1])
        T_coef = np.arange(M) * H / Fs
        F_coef_BPM = Theta
    return X, T_coef, F_coef_BPM

In [449]:
def plot_function_peak_positions(nov, Fs_nov, peaks, title='', figsize=(8,2)):
    peaks_sec = peaks/Fs_nov
    fig, ax, line = libfmp.b.plot_signal(nov, Fs_nov, figsize=figsize, color='k', title=title);
    plt.vlines(peaks_sec, 0, 1.1, color='r', linestyle=':', linewidth=1);


In [None]:
fn_wav = os.path.join('..', 'data', 'C6', audio_clip)
# Sampling Frequency
Fs = 22050

# audio_clip
x = librosa.load(fn_wav) 

#Novelty curve
nov, Fs_nov = libfmp.c6.compute_novelty_spectrum(x[0], Fs=Fs, N=2048, H=512, gamma=100, M=10, norm=True)
nov, Fs_nov = libfmp.c6.resample_signal(nov, Fs_in=Fs_nov, Fs_out=100)

N = 500 #corresponding to 5 seconds (Fs_nov = 100)
H = 10
Theta = np.arange(30, 601)
X, T_coef, F_coef_BPM = compute_tempogram_fourier(nov, Fs_nov, N=N, H=H, Theta=Theta)
tempogram = np.abs(X)

fig, ax = plt.subplots(2, 2, gridspec_kw={'width_ratios': [1, 0.05], 
                                          'height_ratios': [1, 2]}, figsize=(8,5))        
libfmp.b.plot_signal(nov, Fs_nov, ax=ax[0,0], color='k', title='Novelty function')
ax[0,1].set_axis_off()
libfmp.b.plot_matrix(tempogram, T_coef=T_coef, F_coef=F_coef_BPM, ax=[ax[1,0], ax[1,1]], 
                     title='Fourier tempogram', ylabel='Tempo (BPM)', colorbar=True);
plt.tight_layout()


In [None]:
# Prominence is adjustable
prominence = 0.3
peaks = signal.find_peaks(nov, prominence=prominence)[0]
title='Scipy peak picking (prominence=%.2f)'%prominence
plot_function_peak_positions(nov, Fs_nov, peaks, title)

In [451]:
print(peaks)
# Peak position in milliseconds
peak=[]
for i in range(len(peaks)):
    peak.append(peaks[i]/100)

[ 32  54  65  76 109 121 163 207 218 230 251 295 339 383 425 471 492 513
 557 569 579 601 646 667 689 733]


In [452]:
max_d=0
a=0
tempo=[]
for j in range(len(T_coef)):
    tempo.append(F_coef_BPM[a])
    max_d=(tempogram[0][j])
    i=0
    while i<(len(F_coef_BPM)):
        if tempogram[i][j]>=max_d:
            max_d=tempogram[i][j]
            a=i
            i+=1
        else:
            i+=1

In [None]:
import matplotlib.pyplot as plt
plt.plot(T_coef, tempo)  # Plot the chart
for i in range(len(peaks)):
    plt.vlines(peak[i],0,300,'red')
plt.show()

In [454]:
peak.insert(0,0.0)

In [455]:
# Calculating Areas between peaks. The area corresponds to number of beats

count=0
k=0
j=0
area=0
areas=[]
while  j<(len(peak)):
    
    area=0
    while k<=peak[j]:
        area+=tempo[int(k*10)]*0.1
        k+=0.1
    #print(area)
    j+=1
    areas.append(area)
#areas.append(area)

In [456]:
print(areas)
print("No of sections = ", len(areas))

[3.0, 165.0, 109.80000000000001, 54.900000000000006, 54.900000000000006, 164.70000000000002, 109.80000000000001, 219.20000000000002, 219.20000000000002, 54.800000000000004, 54.800000000000004, 164.4, 219.20000000000002, 218.8, 273.90000000000003, 219.20000000000002, 274.0, 109.60000000000001, 109.60000000000001, 219.20000000000002, 54.800000000000004, 54.800000000000004, 164.4, 219.20000000000002, 109.60000000000001, 109.60000000000001, 274.0]
No of sections =  27


In [457]:
total_area=sum(areas)
s = 0
for i in areas:
    s = s + i
s

4004.3999999999996

In [458]:
percentages=[]
for i in areas:
    percentages.append(round((i/total_area),4))
ddd

In [459]:
percentages

[0.0007,
 0.0412,
 0.0274,
 0.0137,
 0.0137,
 0.0411,
 0.0274,
 0.0547,
 0.0547,
 0.0137,
 0.0137,
 0.0411,
 0.0547,
 0.0546,
 0.0684,
 0.0547,
 0.0684,
 0.0274,
 0.0274,
 0.0547,
 0.0137,
 0.0137,
 0.0411,
 0.0547,
 0.0274,
 0.0274,
 0.0684]

In [460]:
clip = VideoFileClip(video_clip)

In [461]:
t=0.3

In [462]:
####
duration = min((clip.duration-len(peaks)*t),peak[-1],duration)
print(clip.duration)
print(clip.duration-len(peaks)*t)
    

9.19
1.3899999999999997


In [463]:
print(duration)

1.3899999999999997


In [464]:
audio_ranges=[]
count=0.0
#audio_ranges.append([0.0,peak[0]])
for i in range(len(peak)-1):
    audio_ranges.append([round(peak[i],4), round(peak[i+1],4)])
    count+=peak[i]
    

In [465]:
video_duration = clip.duration

In [466]:
#######
video_markers=[]
for percent in percentages:
    video_markers.append(round(percent*video_duration,4))

In [467]:
print(video_markers)

[0.0064, 0.3786, 0.2518, 0.1259, 0.1259, 0.3777, 0.2518, 0.5027, 0.5027, 0.1259, 0.1259, 0.3777, 0.5027, 0.5018, 0.6286, 0.5027, 0.6286, 0.2518, 0.2518, 0.5027, 0.1259, 0.1259, 0.3777, 0.5027, 0.2518, 0.2518, 0.6286]


In [468]:
sum(video_markers)

9.188099999999999

In [469]:
len(video_markers)

27

In [470]:

video_ranges=[]
count=0.0
#t=(duration_audio*0.3)/len(peaks)
for i in range(1, len(video_markers)):
    video_ranges.append([round(count,1), round(count+video_markers[i],1)]) 
    count+=(video_markers[i]+t)
  


In [471]:
print(len(video_ranges))
print(video_ranges)
print(len(audio_ranges))
print(audio_ranges)
print(count)

26
[[0.0, 0.4], [0.7, 0.9], [1.2, 1.4], [1.7, 1.8], [2.1, 2.5], [2.8, 3.0], [3.3, 3.8], [4.1, 4.6], [4.9, 5.0], [5.3, 5.5], [5.8, 6.1], [6.4, 6.9], [7.2, 7.8], [8.1, 8.7], [9.0, 9.5], [9.8, 10.4], [10.7, 11.0], [11.3, 11.5], [11.8, 12.3], [12.6, 12.7], [13.0, 13.2], [13.5, 13.8], [14.1, 14.6], [14.9, 15.2], [15.5, 15.8], [16.1, 16.7]]
26
[[0.0, 0.32], [0.32, 0.54], [0.54, 0.65], [0.65, 0.76], [0.76, 1.09], [1.09, 1.21], [1.21, 1.63], [1.63, 2.07], [2.07, 2.18], [2.18, 2.3], [2.3, 2.51], [2.51, 2.95], [2.95, 3.39], [3.39, 3.83], [3.83, 4.25], [4.25, 4.71], [4.71, 4.92], [4.92, 5.13], [5.13, 5.57], [5.57, 5.69], [5.69, 5.79], [5.79, 6.01], [6.01, 6.46], [6.46, 6.67], [6.67, 6.89], [6.89, 7.33]]
16.9817


In [472]:

f = 0.0
clips=[]
for i in range(len(video_ranges)):
    tempclip = clip.subclip(video_ranges[i][0],video_ranges[i][1])    
    f = round((audio_ranges[i][1]-audio_ranges[i][0])/(video_ranges[i][1]-video_ranges[i][0]),2)
    print(f)
    a=f*f
    #if a> 1:
    #    a*=1.5
    #elif a<1:
    #    a*=0.5
    #elif a==1:
    #    a*=1
    tempclip = tempclip.fx(vfx.speedx,1/f)
    clips.append(tempclip)
    if(video_ranges[i+1][0]>video_duration):
        break
    #final = concatenate_videoclips([final,tempclip])

0.8
1.1
0.55
1.1
0.83
0.6
0.84
0.88
1.1
0.6
0.7
0.88
0.73
0.73
0.84


In [473]:
len(clips)

15

In [474]:
difference=0
for j in range(len(clips)):
    clip_time=clips[j].duration
    audio_time=(audio_ranges[j][1]-audio_ranges[j][0])
    difference+=abs(audio_time-clip_time)
print(difference)

0.0060000000000023646


In [475]:
final = concatenate_videoclips(clips)
audio = AudioFileClip(audio_clip)
duration=final.duration
#final.audio = CompositeAudioClip([audio])

final = final.set_audio(audio)
#final = final.subclip(0, duration)


In [476]:
print(duration)

4.247999999999999


In [477]:
end_clip=final.subclip(0,duration)

In [None]:
end_clip.ipython_display()

In [445]:
end_clip.write_videofile("edited_nebula.mp4")

                                                                                                                       

Moviepy - Building video edited_nebula.mp4.
MoviePy - Writing audio in edited_nebulaTEMP_MPY_wvf_snd.mp3
MoviePy - Done.
Moviepy - Writing video edited_nebula.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready edited_nebula.mp4
