<a href="https://colab.research.google.com/github/satvik-venkatesh/audio-seg-data-synth/blob/main/models/doMusicAndSpeechDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import soundfile as sf
import argparse
import numpy as np
import librosa
import tensorflow as tf
import math

# Functions

In [None]:
def smooth_output(output, min_speech=1.3, min_music=3.4, max_silence_speech=0.4, max_silence_music=0.6):
    """Smooth binary frame-wise predictions in place and return them.

    ``output`` is indexed as ``output[row, frame]``. Row 0 is processed with
    the speech thresholds and row 1 with the music thresholds. One frame is
    taken to last ``220 / 22050`` seconds.

    Two passes are applied to each row:

    1. Gap filling: when two frames equal to 1 are more than one frame apart
       and no more than ``max_silence_*`` seconds apart, everything between
       them is set to 1.
    2. Short-event removal: when two frames equal to 0 are more than one
       frame apart and no more than ``min_*`` seconds apart, everything
       between them is set to 0. The last frame closes a pending run in the
       same way, whatever its own value is.

    Both passes start from a sentinel position of -1000, so with thresholds
    well below ~10 s a run that touches the very first frame is left alone.

    Args:
        output: 2-D array of 0/1 values with at least two rows; modified in place.
        min_speech: shortest speech event (seconds) that is kept.
        min_music: shortest music event (seconds) that is kept.
        max_silence_speech: longest speech gap (seconds) that is bridged.
        max_silence_music: longest music gap (seconds) that is bridged.

    Returns:
        The same ``output`` object after smoothing.
    """
    frame_seconds = 220 / 22050
    total = output.shape[1]
    final = total - 1

    # Pass 1: bridge short gaps between active frames.
    for row, longest_gap in ((0, max_silence_speech), (1, max_silence_music)):
        last_on = -1000
        for idx in range(total):
            if output[row, idx] != 1:
                continue
            span = idx - last_on
            if span > 1 and span * frame_seconds <= longest_gap:
                output[row, last_on:idx] = 1
            last_on = idx

    # Pass 2: erase events that are too short to be kept.
    for row, shortest in ((0, min_speech), (1, min_music)):
        last_off = -1000
        for idx in range(total):
            is_final = idx == final
            if not is_final and output[row, idx] != 0:
                continue
            span = idx - last_off
            if span > 1 and span * frame_seconds <= shortest:
                # On the final frame the slice is extended to include it.
                stop = idx + 1 if is_final else idx
                output[row, last_off:stop] = 0
            if not is_final:
                last_off = idx

    return output

"""
This function converts the predictions made by the neural network into a format that is understood by sed_eval.
"""

def preds_to_se(p, audio_clip_length = 8.0):
    """Convert frame-wise predictions into a list of timed audio events.

    Args:
        p: 2-D array indexed as ``p[frame, class]``. Column 0 is reported as
            "speech" and column 1 as "music". A frame is active when its
            value equals 1 and inactive when it equals 0.
        audio_clip_length: End time in seconds given to any event that is
            still active on the last frame.

    Returns:
        A list of ``(start_time, stop_time, label)`` tuples sorted by start
        time. An event that ends inside the clip stops at the time of its
        last active frame. An empty ``p`` yields an empty list.
    """
    audio_events = []
    n_frames = p.shape[0]

    # Without this guard the p[0, ...] look-ups below raise IndexError.
    if n_frames == 0:
        return audio_events

    labels = ("speech", "music")
    no_onset = -100  # sentinel: no event currently open for that class

    # An event may already be running on the very first frame.
    onsets = [0 if p[0, col] == 1 else no_onset for col in range(len(labels))]

    for i in range(n_frames - 1):
        # Speech is handled before music on every frame so that events are
        # appended in the same order as before; the final sort is stable.
        for col, label in enumerate(labels):
            current, following = p[i, col], p[i + 1, col]
            if current == 0 and following == 1:
                onsets[col] = i + 1
            elif current == 1 and following == 0:
                audio_events.append(
                    (frames_to_time(onsets[col]), frames_to_time(i), label))
                onsets[col] = no_onset

    # Close events that run until the end of the clip.
    for col, label in enumerate(labels):
        if onsets[col] != no_onset:
            audio_events.append(
                (frames_to_time(onsets[col]), audio_clip_length, label))

    audio_events.sort(key=lambda event: event[0])
    return audio_events

def frames_to_time(f, sr = 22050.0, hop_size = 220):
    """Convert a frame index to a time in seconds.

    Args:
        f: Frame index.
        sr: Sampling rate in samples per second.
        hop_size: Number of samples between consecutive frames.

    Returns:
        ``f * hop_size / sr``.
    """
    sample_offset = f * hop_size
    return sample_offset / sr

def get_log_melspectrogram(audio, sr = 22050, hop_length = 220, n_fft = 1024, n_mels = 128, fmin = 64, fmax = 8000):
    """Return the log-scaled Mel bands of an audio signal.

    Args:
        audio: Audio samples, passed to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop between analysis frames, in samples.
        n_fft: FFT size.
        n_mels: Number of Mel bands.
        fmin: Lowest Mel filter frequency in Hz.
        fmax: Highest Mel filter frequency in Hz.

    Returns:
        The Mel spectrogram from ``librosa.feature.melspectrogram`` after
        conversion with ``librosa.core.power_to_db`` (``amin=1e-7``).
    """
    mel_power = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        hop_length=hop_length,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        dtype=np.float32,
    )
    log_mel = librosa.core.power_to_db(mel_power, amin=1e-7)
    return log_mel


"""
Make predictions for full audio.
"""
def mk_preds_fa(audio_path, hop_size = 6.0, discard = 1.0, win_length = 8.0, sampling_rate = 22050):
    """Make frame-level predictions for a whole audio file.

    The file is read, mixed down to mono, resampled to ``sampling_rate`` and
    zero-padded so it splits into overlapping windows of ``220 * 802 - 1``
    samples taken every ``220 * 602 - 1`` samples. Each window is normalised,
    turned into a log-Mel spectrogram (802 frames x 128 bands) and passed to
    the module-level ``model``. The outputs are thresholded at 0.5 and the
    windows are stitched together, dropping 100 frames at every internal
    window edge so that each frame appears once.

    Args:
        audio_path: Path of the audio file to analyse.
        hop_size: Accepted for interface compatibility; currently ignored
            because the hop is fixed at ``220 * 602 - 1`` samples.
        discard: Accepted for interface compatibility; currently ignored
            because 100 frames are always discarded at window edges.
        win_length: Accepted for interface compatibility; currently ignored
            because the window is fixed at ``220 * 802 - 1`` samples.
        sampling_rate: Rate the audio is resampled to. The spectrogram helper
            is called with its own default rate (22050), so other values are
            not consistent with it.

    Returns:
        Array of shape ``(n_frames, 2)`` holding 0.0/1.0 decisions, one row
        per 220-sample frame. Frames that fall in the zero padding after the
        end of the audio are included.
    """
    in_signal, in_sr = sf.read(audio_path)

    # Convert to mono if needed. soundfile returns multi-channel audio as
    # (frames, channels), so the channels are averaged along axis 1.
    if in_signal.ndim > 1:
        in_signal = np.mean(in_signal, axis=1)

    # Resample the audio file.
    in_signal = librosa.resample(in_signal, orig_sr=in_sr, target_sr=sampling_rate)

    audio_clip_length_samples = in_signal.shape[0]
    print('audio_clip_length_samples is {}'.format(audio_clip_length_samples))

    # Window geometry is fixed by the model input: 802 spectrogram frames per
    # window, 602 new frames per hop, 220 samples per frame.
    hop_size_samples = 220 * 602 - 1
    win_length_samples = 220 * 802 - 1

    # Always analyse at least one window; very short clips would otherwise
    # give zero windows and fail when the predictions are indexed.
    n_preds = max(
        1,
        int(math.ceil((audio_clip_length_samples - win_length_samples) / hop_size_samples)) + 1,
    )

    # Long enough for the last window to be complete. This equals
    # n_preds * hop_size_samples + 200 * 220.
    in_signal_pad = np.zeros((n_preds - 1) * hop_size_samples + win_length_samples)
    in_signal_pad[0:audio_clip_length_samples] = in_signal

    mss_in = np.zeros((n_preds, 802, 128))

    for i in range(n_preds):
        start = i * hop_size_samples
        seg = in_signal_pad[start:start + win_length_samples]
        seg = librosa.util.normalize(seg)

        # The spectrogram comes back as (bands, frames); the model input is
        # filled as (frames, bands).
        mss_in[i, :, :] = get_log_melspectrogram(seg).T

    # np.float was removed from NumPy; np.float64 is the dtype it aliased.
    preds = (model.predict(mss_in) >= (0.5, 0.5)).astype(np.float64)

    # A single window has no neighbours to overlap with. Stitching it as both
    # "first" and "last" window would emit frames 100..701 twice.
    if n_preds == 1:
        return preds[0]

    first_part = preds[0, 0:702, :]
    middle_part = preds[1:-1, 100:702, :].reshape(-1, 2)
    last_part = preds[-1, 100:, :]

    # oa stands for overall predictions
    oa_preds = np.concatenate((first_part, middle_part, last_part), axis=0)

    return oa_preds

In [None]:
# Command-line interface: two required positional arguments.
parser = argparse.ArgumentParser(
    description="Music and speech detection on a given audio and output as txt file",
)
parser.add_argument("input_path", help="Input wav file path")
parser.add_argument("output_path", help="Output txt file path")

# Parsed once at module level; later cells read args.input_path / args.output_path.
args = parser.parse_args()

In [None]:
# Path of the audio file to analyse, taken from the `input_path` CLI argument.
test_audio = args.input_path

In [None]:
# Trained Keras model file, resolved relative to the working directory.
m = 'model 772_3.h5'

# Google Drive evaluation folder; nothing else in this script reads it.
eval_path = "/content/drive/My Drive/ICASSP 2021/OpenBMAT/eval-2"

# mk_preds_fa() looks the network up under the module-level name `model`.
model = tf.keras.models.load_model(m)

# The file is read here only to measure its duration. Keep the file's own
# sample rate: dividing the sample count by a fixed 22050 gives the wrong
# length for audio recorded at any other rate.
ss, ss_sr = sf.read(test_audio)
oop = mk_preds_fa(test_audio)

# smooth_output() expects (class, frame); preds_to_se() expects (frame, class).
p_smooth = smooth_output(oop.T, min_speech=1.3, min_music=3.4, max_silence_speech=0.4, max_silence_music=0.6)
p_smooth = p_smooth.T
see = preds_to_se(p_smooth, audio_clip_length=ss.shape[0] / float(ss_sr))

n_label = args.output_path

# One event per line: start,stop,label with times rounded to 5 decimals.
with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in see))

['/content/drive/My Drive/ICASSP 2021/e1/Models/model 128_1.h5']
audio_clip_length_samples is 238140000
Accuracy: 0.9377648704779312
