In [241]:
import os
import time

import cv2
import numpy as np
from scipy.io.wavfile import write
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import load_model
import librosa, librosa.display
import IPython.display as ipd
from IPython.display import Audio
import sounddevice as sd
import ffmpeg

In [243]:
# music onset detection
# Loads the track, trims it, and derives the beat grid (`onset_times`) that the
# rest of the notebook uses; the grid is also saved as a text beatmap.
audio_file = './audio/hungarian_dance_no5.mp3'
xt, sr = librosa.load(audio_file)
x, index = librosa.effects.trim(xt)

# possible way to get beat, usually more variance
# code from https://www.freecodecamp.org/news/use-python-to-detect-music-onsets/
# The audio is passed as `y=` because recent librosa releases make it keyword-only.
onset_frames = librosa.onset.onset_detect(y=x, sr=sr, wait=1, pre_avg=1, post_avg=1, pre_max=1, post_max=1)
# Pass sr explicitly so the frame -> time conversion follows the loaded sample rate
# instead of silently relying on the library default.
onset_times = librosa.frames_to_time(onset_frames, sr=sr)
# NOTE: this counts detected onsets (not tracked beats); the webcam cell uses
# num_beats[0] as the upper bound on recorded checkpoint hits.
num_beats = onset_frames.shape

# different way to get beat, more steady (preferred)
tempo, beats = librosa.beat.beat_track(y=x, sr=sr)
beat_times = librosa.frames_to_time(beats, sr=sr)
# From here on `onset_times` is the beat-tracked grid, not the onset-detected one.
onset_times = beat_times

output_name = 'beatmaps/hungarian_dance_beatmap.txt'
# Create the output folder on a fresh checkout so the write below cannot fail.
os.makedirs(os.path.dirname(output_name), exist_ok=True)
with open(output_name, 'wt') as f:
    f.write('\n'.join(['%.4f' % onset_time for onset_time in onset_times]))
ipd.Audio(x, rate=sr)

In [244]:
# sound with beats
# Overlay a click on every entry of `onset_times` so the beat grid can be
# checked by ear; the mix is saved and played back.
# `times=` is given by keyword: recent librosa releases no longer accept it positionally.
clicks = librosa.clicks(times=onset_times, sr=sr, length=len(x))
beat_mix = x + clicks  # computed once, reused for the file and the player
write('hungarian_beat.wav', sr, beat_mix)
ipd.Audio(beat_mix, rate=sr)

In [245]:
# time stretch sanity check
# `rate` is passed by keyword: recent librosa releases make it keyword-only.
x_fast = librosa.effects.time_stretch(x, rate=2.0)

# splitting each beat into separate arrays
# The split points are derived from `onset_times` (the grid the stretch factors
# are later computed from) rather than from `onset_frames`, so the audio
# segments and the beat intervals always describe the same grid.
beat_samples = librosa.time_to_samples(onset_times, sr=sr)
# np.array_split returns the audio *before* the first split point as element 0;
# dropping it makes arr[i] the audio between onset_times[i] and onset_times[i + 1],
# i.e. the segment whose duration is onset_times[i + 1] - onset_times[i].
arr = np.array_split(x, beat_samples)[1:]

# play a single beat
ipd.Audio(arr[2], rate=sr)

In [246]:
# initialize mediapipe
# Drawing helper used to overlay the detected hand on each frame.
mpDraw = mp.solutions.drawing_utils

# Hand-landmark solution; a single tracker instance is shared by the webcam loop.
mpHands = mp.solutions.hands
hands = mpHands.Hands(
    max_num_hands=1,               # only one hand is tracked
    min_detection_confidence=0.7,  # detection threshold passed to mediapipe
)

In [247]:
# Load the gesture recognizer model
# hand model recognition gotten from https://techvidvan.com/tutorials/hand-gesture-recognition-tensorflow-opencv/ 
model = load_model('mp_hand_gesture')

# Load class names
# One gesture label per line; the position in the list is the class id the
# webcam loop obtains from np.argmax on the model prediction.
# `with` guarantees the file handle is closed even if reading fails.
with open('gesture.names', 'r') as f:
    classNames = f.read().split('\n')
print(classNames)

['okay', 'peace', 'thumbs up', 'thumbs down', 'call me', 'stop', 'rock', 'live long', 'fist', 'smile']


In [248]:
def create_checkpoints(frame):
    """Draw the three checkpoint circles on ``frame`` and return their centres.

    The centres are fixed pixel coordinates (the webcam cell records at
    1280x720); the frame is modified in place by the drawing calls.

    Args:
        frame: image passed straight through to ``cv2.circle``.

    Returns:
        Tuple ``(first_circle_pos, second_circle_pos, third_circle_pos)`` of
        ``(x, y)`` pixel centres, in the order the circles are drawn.
    """
    # create checkpoints on screen and return positions
    circle_positions = ((640, 200), (426, 400), (853, 400))
    for center in circle_positions:
        # radius 50, colour (0, 255, 1), 2px outline
        cv2.circle(frame, center, 50, (0, 255, 1), thickness=2, lineType=8, shift=0)
    first_circle_pos, second_circle_pos, third_circle_pos = circle_positions
    return first_circle_pos, second_circle_pos, third_circle_pos

In [249]:
# Initialize the webcam
cap = cv2.VideoCapture(0)

# record video, adjust fps to fps of webcam, varies
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
# parameters: (output_name, fourcc, fps, size of frame)
# NOTE(review): the writer is opened for 1280x720 and the checkpoint/text
# positions below assume that size — confirm the webcam delivers it.
output = cv2.VideoWriter("web_video.mp4", fourcc, 9.0, (1280,720))

HIT_RADIUS = 50      # max per-axis distance (px) between fingertip and checkpoint centre

index = 0            # number of checkpoint hits recorded so far
num_circle = 1       # which checkpoint (1, 2 or 3) must be touched next
time_start = time.time()
time_begin = -1.0    # time of the first hit; stays negative until one happens
times_list = []      # time of every checkpoint hit
stop = 0             # 1 while the 'stop' gesture is being held
stop_list = []       # alternating [start, end, start, end, ...] times of stop gestures
stop_time = time.time()

try:
    while True:
        # Read each frame from the webcam; stop cleanly if the camera gives nothing
        grabbed, frame = cap.read()
        if not grabbed or frame is None:
            break

        # Dedicated names so the notebook-level audio array `x` is not overwritten
        frame_h, frame_w, _ = frame.shape

        # Flip the frame horizontally (mirror view)
        frame = cv2.flip(frame, 1)
        framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Get hand landmark prediction
        result = hands.process(framergb)

        className = ''
        string = ''
        index_fing_str = ''
        index_fing = (0, 0)

        # post process the result
        if result.multi_hand_landmarks:
            landmarks = []
            frame_pos = []
            for handslms in result.multi_hand_landmarks:
                for lm in handslms.landmark:
                    # Model input: scaling kept exactly as before (x by frame height,
                    # y by frame width) — presumably what the gesture model expects; confirm.
                    lmx = int(lm.x * frame_h)
                    lmy = int(lm.y * frame_w)
                    landmarks.append([lmx, lmy])

                    # flipped for location of checkpoints (x by width, y by height)
                    framex = int(lm.x * frame_w)
                    framey = int(lm.y * frame_h)
                    frame_pos.append([framex, framey])

                # Drawing landmarks on frames
                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

                # Predict gesture
                prediction = model.predict([landmarks])
                classID = np.argmax(prediction)
                className = classNames[classID]

                # getting location of index finger to see if hit checkpoint
                string = str(frame_pos[4])
                index_fing = frame_pos[8]
                index_fing_str = str(frame_pos[8])

        # show the prediction on the frame
        overlay_texts = (
            (string, (10, 50)),
            (index_fing_str, (10, 500)),
            (str(num_circle), (10, 700)),
            (className, (900, 700)),
        )
        for text, origin in overlay_texts:
            cv2.putText(frame, text, origin, cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0,0,255), 2, cv2.LINE_AA)

        # checks if hand class is stop, will pause/stop beat
        if className == 'stop':
            if stop == 0:
                stop = 1
                stop_list.append(time.time())
        elif stop == 1:
            # gesture released: second timestamp gives the length of time stop was held
            stop = 0
            stop_list.append(time.time())

        # checking if checkpoint was hit, records time and adjusts num_circle
        # num_circle is which circle to play the next beat
        first_pos, second_pos, third_pos = create_checkpoints(frame)
        target_pos = (first_pos, second_pos, third_pos)[num_circle - 1]
        touching = (abs(target_pos[0] - index_fing[0]) < HIT_RADIUS
                    and abs(target_pos[1] - index_fing[1]) < HIT_RADIUS)
        if touching and index < num_beats[0]:
            hit_time = time.time()
            if time_begin < 0.0:
                time_begin = hit_time
            times_list.append(hit_time)
            index += 1
            num_circle = num_circle % 3 + 1   # 1 -> 2 -> 3 -> 1

        # Show the final output
        cv2.imshow("Output", frame)
        output.write(frame)

        # press q to stop recording and exit
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Close a stop gesture that was still held when recording ended, so
    # stop_list always holds complete (start, end) pairs.
    if stop == 1:
        stop = 0
        stop_list.append(time.time())

    # release the webcam and destroy all active windows, even after an error
    cap.release()
    output.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

-1

In [250]:
# Pair every stop gesture with the first checkpoint hit that follows it.
# stop_list alternates [start, end, start, end, ...], so gesture starts sit at
# even positions; each entry of stop_indices is
# (index into times_list of the next hit, index into stop_list of the stop start).
stop_indices = []
stop_i = 0
if stop_list:
    for i, hit_time in enumerate(times_list):
        # Consume *every* stop that began before this hit. Taking only one per
        # hit would attribute a second stop in the same gap to the following
        # hit, which later produces a negative interval.
        while stop_i < len(stop_list) and hit_time > stop_list[stop_i]:
            # pushing back index in times_list and stop_index
            stop_indices.append((i, stop_i))
            stop_i += 2
        if stop_i >= len(stop_list):
            break

    print(stop_indices)

[(9, 0)]


In [251]:
print('times list: ', times_list)
print('stop list: ', stop_list)

# Time between consecutive checkpoint hits. When a stop gesture was paired
# with the later hit of a pair, the interval ends at the start of that stop
# instead of at the hit itself.
diff_list = []
for hit_idx in range(1, len(times_list)):
    previous_hit = times_list[hit_idx - 1]
    # first stop paired with this hit, if any
    stop_start_idx = next(
        (stop_pos for paired_hit, stop_pos in stop_indices if paired_hit == hit_idx),
        None,
    )
    if stop_start_idx is None:
        interval_end = times_list[hit_idx]
    else:
        interval_end = stop_list[stop_start_idx]
    diff_list.append(interval_end - previous_hit)
print(diff_list)

times list:  [1639368395.091979, 1639368396.360062, 1639368397.127433, 1639368398.155815, 1639368398.54267, 1639368399.488055, 1639368399.989829, 1639368400.5427468, 1639368401.289563, 1639368404.9780421, 1639368405.514045, 1639368406.453102, 1639368406.860745, 1639368407.352798, 1639368407.894378, 1639368408.600227, 1639368409.000196, 1639368409.477437, 1639368411.4841259, 1639368411.8231611]
stop list:  [1639368402.143432, 1639368404.734894]
[1.268082857131958, 0.7673711776733398, 1.0283818244934082, 0.3868551254272461, 0.9453849792480469, 0.5017740726470947, 0.5529177188873291, 0.7468161582946777, 0.8538689613342285, 0.5360028743743896, 0.9390571117401123, 0.40764284133911133, 0.4920530319213867, 0.5415799617767334, 0.7058491706848145, 0.39996886253356934, 0.47724103927612305, 2.0066888332366943, 0.3390352725982666]


In [253]:
# onset_diff is the difference between beats
onset_diff = [onset_times[i] - onset_times[i - 1] for i in range(1, len(onset_times))]

hand_diff = np.array(diff_list)   # seconds between consecutive checkpoint hits
beat_diff = np.array(onset_diff)  # seconds between consecutive beats in the track

size = min(len(hand_diff), len(beat_diff))
fits = []

# finds the factor put into time_stretch
# (beat interval / hand interval; it is passed to time_stretch as `rate`)
scale = [beat_diff[i] / hand_diff[i] for i in range(size)]

# adding silence of beginning
if time_begin < 0.0:
    # time_begin keeps its negative sentinel when no checkpoint was ever hit;
    # fail with a clear message instead of np.zeros' "negative dimensions" error.
    raise ValueError('no checkpoint hit was recorded; nothing to render')
delay = 0.000
pad_time = (time_begin - time_start - delay)
beg_silence = np.zeros(max(0, int(pad_time * sr)))

# concatenate adjusted beats together
# The output starts from the leading silence only: seeding it with np.empty()
# would prepend uninitialised samples to the audio. Pieces are collected in a
# list and joined once, instead of re-copying the whole result on every step.
pieces = [beg_silence]
for i in range(size):
    modified_tempo = librosa.effects.time_stretch(arr[i], rate=scale[i])
    pieces.append(modified_tempo)

    # stop_indices stores the index of the hit that FOLLOWS a stop, and
    # diff_list shortens the interval ending at that hit (interval hit - 1).
    # The pause therefore belongs right after segment i when the stop is
    # paired with hit i + 1.
    for _, stop_start_idx in (item for item in stop_indices if item[0] == i + 1):
        if stop_start_idx + 1 >= len(stop_list):
            # stop gesture was never released, so it has no end time
            continue
        # NOTE(review): the pause lasts as long as the gesture was held; any gap
        # between releasing the gesture and the next hit is not added.
        silence_time = stop_list[stop_start_idx + 1] - stop_list[stop_start_idx]
        print(silence_time)
        pieces.append(np.zeros(int(silence_time * sr)))

end_result = np.concatenate(pieces)

# make sure the output folder exists on a fresh checkout
os.makedirs('result_audio', exist_ok=True)
write('result_audio/hungarian_stop.wav', sr, end_result)
ipd.Audio(end_result, rate=sr)

2.5914621353149414


In [254]:
# TODO:try clipping video to right when finger touches first checkpoint
# if gives an error, file already exists, have to change name
# Mux the recorded webcam video with the re-timed audio into one file.
input_video = ffmpeg.input('web_video.mp4')
# Read the audio from where the rendering cell writes it (result_audio/),
# not from the notebook's working directory.
input_audio = ffmpeg.input('result_audio/hungarian_stop.wav')
# make sure the output folder exists on a fresh checkout
os.makedirs('results', exist_ok=True)
ffmpeg.concat(input_video, input_audio, v = 1, a = 1).output('results/hungarian_stop.mp4').run()

ffmpeg version 4.0 Copyright (c) 2000-2018 the FFmpeg developers
  built with clang version 4.0.1 (tags/RELEASE_401/final)
  configuration: --prefix=/Users/rebeccaang/opt/miniconda3/envs/cs445 --cc=x86_64-apple-darwin13.4.0-clang --disable-doc --enable-shared --enable-static --enable-zlib --enable-pic --enable-gpl --enable-version3 --disable-nonfree --enable-hardcoded-tables --enable-avresample --enable-libfreetype --disable-openssl --disable-gnutls --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --disable-libx264
  libavutil      56. 14.100 / 56. 14.100
  libavcodec     58. 18.100 / 58. 18.100
  libavformat    58. 12.100 / 58. 12.100
  libavdevice    58.  3.100 / 58.  3.100
  libavfilter     7. 16.100 /  7. 16.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  1.100 /  5.  1.100
  libswresample   3.  1.100 /  3.  1.100
  libpostproc    55.  1.100 / 55.  1.100
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'web_video.mp4':
  Metadata:
    major_brand     :

Error: ffmpeg error (see stderr output for detail)