# Importing libraries

In [None]:
import json
import math
import os

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
from scipy import stats

# Getting access to drive

In [None]:
# Mount Google Drive in the Colab runtime; the dataset paths used further down live under /content/drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
def find_homogeneous_segments(data, threshold):
    """Split `data` into consecutive runs that stay close to the run's first value.

    A run starts at some index and keeps growing while each following value
    differs from the run's first value by at most `threshold` (absolute
    difference). The first value that deviates more opens a new run.

    Args:
        data: Indexable, sized sequence of numbers.
        threshold: Largest allowed absolute deviation from the first value of a run.

    Returns:
        List of inclusive `(start, end)` index pairs, in order, that together
        cover every index of `data` exactly once. Empty list for empty input.
    """
    homogeneous_segments = []
    if len(data) == 0:
        return homogeneous_segments

    start = 0
    for i in range(1, len(data)):
        if abs(data[i] - data[start]) > threshold:
            # i > start always holds here, so the closed run is never empty
            homogeneous_segments.append((start, i - 1))
            start = i

    # Always close the trailing run, including a single-element one; this keeps
    # it consistent with single-element runs found in the middle of the data.
    homogeneous_segments.append((start, len(data) - 1))
    return homogeneous_segments


# Voice Activity Detection

In [None]:
def VAD(audio):
  """Trim an audio array with a simple energy-based voice activity detector.

  The magnitude STFT of `audio` is summed over frequency to obtain one energy
  value per frame. Frames whose energy exceeds 40% of the mean frame energy
  are considered active, and the audio is sliced between offsets derived from
  the first and the last active frame.

  Args:
    audio: Array of audio samples (assumed mono / 1-D; the slicing below is
      done along the first axis).

  Returns:
    Tuple `(speech_audio, n_active, active_frames)` where
      speech_audio: slice of `audio` between the computed start and end samples,
      n_active: number of frames above the energy threshold,
      active_frames: array with the indices of those frames.
    When no frame is above the threshold (e.g. all-zero input) the result is
    `(audio[0:0], 0, empty index array)` instead of an IndexError.
  """
  # Framing parameters
  frame_length = 2048
  hop_length = 512

  # Energy of each frame: sum of STFT magnitudes over the frequency axis
  energy = np.sum(np.abs(librosa.stft(audio, n_fft=frame_length, hop_length=hop_length)), axis=0)

  # A frame is active when its energy exceeds 40% of the mean frame energy
  threshold_energy = np.mean(energy) * 0.4

  # Indices of the active frames
  speech_onsets = np.flatnonzero(energy > threshold_energy)

  # Nothing above the threshold: there is no first/last frame to index
  if speech_onsets.size == 0:
    return audio[0:0], 0, speech_onsets

  # Start and end sample indices in the audio array.
  # NOTE(review): frame_length is added to both boundaries, so the slice begins
  # frame_length samples after the hop offset of the first active frame. Kept
  # unchanged because the extracted features depend on it - confirm intent.
  speech_start_index = hop_length * speech_onsets[0] + frame_length
  speech_end_index = hop_length * speech_onsets[-1] + frame_length

  # Slicing up the audio array
  speech_audio = audio[speech_start_index:speech_end_index]

  return speech_audio, speech_onsets.shape[0], speech_onsets


# Pitch extraction

In [None]:
def pitchExtractor(audio, sr):
  """Estimate the per-frame fundamental frequency (f0) of `audio` with pYIN.

  Args:
    audio: Array of audio samples passed to `librosa.pyin`.
    sr: Sampling rate of `audio`.

  Returns:
    The f0 array returned by `librosa.pyin` (search range C2 to C7, frame
    length 2048, hop length 512) with every NaN entry replaced by 0.
  """
  # Framing parameters
  frame_length = 2048
  hop_length = 512

  # Pitch search range
  fmin = librosa.note_to_hz('C2')
  fmax = librosa.note_to_hz('C7')

  # pYIN returns (f0, voiced_flag, voiced_probs); only f0 is used here
  f0, _voiced_flag, _voiced_probs = librosa.pyin(audio, sr=sr, fmin=fmin, fmax=fmax, frame_length=frame_length, hop_length=hop_length)

  # Replace NaN entries with 0 so downstream code gets a fully numeric contour
  f0[np.isnan(f0)] = 0

  return f0

In [None]:
def spectrogramPlotter(audio, sr, f0):
  """Plot the dB spectrogram of `audio` with the f0 contour drawn on top.

  Args:
    audio: Array of audio samples.
    sr: Sampling rate of `audio`; forwarded to librosa so the time axis of the
      contour and both axes of the spectrogram use the real rate instead of
      librosa's default.
    f0: Per-frame fundamental frequency contour to overlay.

  Returns:
    None. The figure is shown with `plt.show()`.
  """
  # Time stamp of every f0 frame, computed with the actual sampling rate
  times = librosa.times_like(f0, sr=sr)

  # Magnitude STFT converted to dB, relative to the maximum amplitude
  D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)

  fig, ax = plt.subplots()

  # Spectrogram with time on the x-axis and log-scaled frequency on the y-axis
  img = librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log', ax=ax)
  ax.set(title='pYIN fundamental frequency estimation')
  fig.colorbar(img, ax=ax, format="%+2.f dB")

  # f0 contour on top of the spectrogram
  ax.plot(times, f0, label='f0', color='cyan', linewidth=3)
  ax.legend(loc='upper right')

  plt.show()


# MFCC extraction

In [None]:
def mfccExtractor(audio, sr):
  """Compute 13 MFCCs of `audio` together with their first and second derivatives.

  Args:
    audio: Array of audio samples.
    sr: Sampling rate of `audio`.

  Returns:
    Tuple `(mfccs, delta_mfccs, delta2_mfccs)`: the output of
    `librosa.feature.mfcc` with `n_mfcc=13`, and the outputs of
    `librosa.feature.delta` on it with the default order and with `order=2`.
  """
  # MFCCs
  mfccs = librosa.feature.mfcc(y=audio, n_mfcc=13, sr=sr)

  # First and second derivatives of the MFCCs
  delta_mfccs = librosa.feature.delta(mfccs)
  delta2_mfccs = librosa.feature.delta(mfccs, order=2)

  # The three matrices are returned separately; callers stack them as needed
  return mfccs, delta_mfccs, delta2_mfccs


In [None]:
def mel_spectrogramPlotter(audio, sr):
    """Show the log-scaled Mel spectrogram of `audio`.

    Args:
        audio: Array of audio samples.
        sr: Sampling rate of `audio`.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    # Mel power spectrogram: 90 Mel bands, 2048-sample FFT, 512-sample hop
    mel_power = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=2048, hop_length=512, n_mels=90
    )

    # Power -> dB for a more readable picture
    mel_db = librosa.power_to_db(mel_power)

    # 7x6 inch figure holding the spectrogram and its colorbar
    fig, ax = plt.subplots(figsize=(7, 6))
    img = librosa.display.specshow(mel_db, x_axis="time", y_axis="mel", sr=sr, ax=ax)
    fig.colorbar(img, ax=ax, format="%+2.f")

    plt.show()


# Formants extraction

In [None]:
def formantsExtractor(audio, sr):
    """Derive five LPC-root frequencies per STFT frame of `audio`.

    For every frame, an order-16 LPC polynomial is fitted to that frame's
    power-spectrum column, the polynomial roots with non-negative imaginary
    part are converted to frequencies, and the five largest are kept.

    NOTE(review): the LPC fit runs on power-spectrum columns rather than on
    time-domain samples, and the five *highest* frequencies are retained.
    Both are preserved here as found - confirm this is the intended feature.

    Args:
        audio: Array of audio samples.
        sr: Sampling rate used to scale root angles to frequencies.

    Returns:
        Array of shape (n_frames, 5); each row is sorted in descending order.
    """
    n_fft = 2048
    hop = 512
    lpc_order = 16
    n_keep = 5

    # Power spectrogram: squared magnitude of the STFT
    power_spec = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop)) ** 2
    n_frames = power_spec.shape[1]

    # One row of n_keep values per frame, filled in below
    formants_matrix = np.empty((n_frames, n_keep))

    for frame_idx, column in enumerate(power_spec.T):
        # LPC polynomial of the current column and its roots
        lpc_poly = librosa.lpc(y=column, order=lpc_order)
        poly_roots = np.roots(lpc_poly)

        # Keep one root of each conjugate pair (real roots are kept as well)
        upper_roots = poly_roots[np.imag(poly_roots) >= 0]

        # Root angle -> frequency
        freqs = np.angle(upper_roots) * (sr / (2 * np.pi))

        # Largest n_keep frequencies, in descending order
        formants_matrix[frame_idx] = np.sort(freqs)[::-1][:n_keep]

    return formants_matrix


# os.walk demonstration

In [None]:
# Walk the dataset folder and print, for every directory visited, its path,
# its sub-directories, its files and the running index of the visit.
dataset_path = '/content/drive/MyDrive/Dataset'
for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
  print(
      dirpath,
      dirnames,
      filenames,
      i,
      "----------------------------------------------",
      sep="\n",
  )

/content/drive/MyDrive/Dataset
['Others', 'Me']
[]
0
----------------------------------------------
/content/drive/MyDrive/Dataset/Others
['.ipynb_checkpoints']
['7al la barriere (1).wav', '7al la barriere (2).wav', '7al la barriere (3).wav', '7al la barriere (4).wav', '7al la barriere (6).wav', '7al la barriere (5).wav', '7al la barriere (7).wav', '7al la barriere (9).wav', '7al la barriere (8).wav', '7al la barriere (10).wav', '7al la barriere (11).wav', '7al la barriere (12).wav', '7al la barriere (13).wav', '7al lbab (1).wav', '7al lbab (2).wav', '7al lbab (3).wav', '7al lbab (4).wav', '7al lbab (5).wav', '7al lbab (6).wav', '7al lbab (8).wav', '7al lbab (7).wav', '7al lbab (9).wav', '7al lbab (10).wav', '7al lbab (11).wav', '7al lbab (12).wav', '7al lbab (13).wav', 'aghla9 la barriere (3).wav', 'aghla9 la barriere (1).wav', 'aghla9 la barriere (2).wav', 'aghla9 la barriere (4).wav', 'aghla9 la barriere (5).wav', 'aghla9 la barriere (6).wav', 'aghla9 la barriere (7).wav', 'aghla9 l

# Dataset extraction to a JSON file

In [None]:
def _fit_frames(values, n_frames):
  """Return `values` truncated or zero-padded along axis 0 to exactly `n_frames` rows."""
  values = np.asarray(values)[:n_frames]
  missing = n_frames - values.shape[0]
  if missing > 0:
    # Pad only at the end of the first axis; other axes are left untouched
    pad_width = [(0, missing)] + [(0, 0)] * (values.ndim - 1)
    values = np.pad(values, pad_width, mode="constant")
  return values


def Dataset(dataset_path, json_path, class1, class2):
  """Extract per-file features from a two-class audio folder and save them as JSON.

  Every sub-folder of `dataset_path` whose name is `class1` or `class2` is
  scanned for '.wav' files. Each file is loaded at 22050 Hz, trimmed with
  `VAD`, and described by pitch (1), MFCCs (13), delta MFCCs (13),
  delta-delta MFCCs (13) and formants (5) per frame, i.e. 45 values. The
  per-file matrix is truncated or zero-padded to 30 frames.

  Folders with any other name (for example '.ipynb_checkpoints') are skipped,
  so a file can never inherit the label of a previously processed folder.

  Args:
    dataset_path: Root folder containing one sub-folder per class.
    json_path: Path of the JSON file to write.
    class1: Folder name stored with label 0.
    class2: Folder name stored with label 1.

  Returns:
    None. Writes a JSON object with the keys
      "mapping":  [class1, class2], so that mapping[label] is the class name,
      "features": one 30 x 45 nested list per processed file,
      "labels":   one integer label (0 or 1) per processed file.

  Raises:
    ValueError: if the extractors return feature matrices whose widths do not
      add up to 45 columns.
  """
  sr = 22050
  n_segments = 30
  n_mfcc = 13
  n_formants = 5
  n_features = 1 + 3 * n_mfcc + n_formants  # pitch + MFCC/delta/delta2 + formants = 45

  label_of = {class1: 0, class2: 1}

  # Dictionary of the dataset; the mapping index always equals the label value
  data = {
      "mapping": [class1, class2],
      "features": [],
      "labels": []
  }

  root = os.path.normpath(dataset_path)

  # Looping through the dataset folder
  for dirpath, _dirnames, filenames in os.walk(dataset_path):
    # The root itself holds no samples; compare by value, not by identity
    if os.path.normpath(dirpath) == root:
      continue

    semantic_label = os.path.basename(dirpath)
    if semantic_label not in label_of:
      print(f"\nSkipping: {dirpath} (not '{class1}' or '{class2}')")
      continue
    print(f"\nProcessing: {semantic_label}")

    # Process files
    for f in filenames:
      file_path = os.path.join(dirpath, f)
      print(file_path)

      # Only wav files are processed
      if not file_path.endswith('.wav'):
        print("not a '.wav' file")
        continue

      # Load at the same sampling rate that is handed to the extractors
      audio, _ = librosa.load(file_path, sr=sr)

      # Applying voice activity detection
      audio_clean, dim, _ = VAD(audio)
      if dim == 0 or len(audio_clean) == 0:
        print("no speech detected, skipping")
        continue

      # Per-frame features, each truncated / zero-padded to `dim` frames
      pitch = _fit_frames(pitchExtractor(audio_clean, sr), dim)
      mfcc, delta_mfcc, delta2_mfcc = [
          _fit_frames(coeffs.T, dim) for coeffs in mfccExtractor(audio_clean, sr)
      ]
      formants = _fit_frames(formantsExtractor(audio_clean, sr), dim)

      # One row per frame: [pitch | mfcc | delta | delta2 | formants]
      frame_features = np.hstack(
          (pitch[:, np.newaxis], mfcc, delta_mfcc, delta2_mfcc, formants)
      )
      if frame_features.shape != (dim, n_features):
        raise ValueError(
            f"unexpected feature shape {frame_features.shape} for {file_path}; "
            f"expected {(dim, n_features)} "
            f"(mfcc {mfcc.shape}, delta {delta_mfcc.shape}, "
            f"delta2 {delta2_mfcc.shape}, formants {formants.shape})"
        )

      # Fixed-size matrix for the model: first n_segments frames, zero-padded
      features = _fit_frames(frame_features, n_segments)

      data["features"].append(features.tolist())
      data["labels"].append(label_of[semantic_label])

  # Saving to JSON file
  with open(json_path, 'w') as fp:
    json.dump(data, fp, indent=4)










In [None]:
# Input folder and output JSON path for the Dataset() call in the next cell.
# Rebinds dataset_path, which the os.walk demonstration cell set to another folder.
dataset_path = '/content/drive/MyDrive/Dataset3'
dataset_json_path = '/content/drive/MyDrive/Dataset3_kolch.json'

In [None]:
# Extract the features of Dataset3 into its JSON file; "Others" gets label 0 and "Me" gets label 1
Dataset(dataset_path, dataset_json_path, "Others", "Me")


Processing: Me
/content/drive/MyDrive/Dataset3/Me/Aghle9-lbab-1.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-13.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-12.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-11.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-10.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-9.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-8.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-7.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi-do-6.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi do 1.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi do 2.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi do 3.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi do 4.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Dataset3/Me/Tafi do 5.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/D

In [None]:
# Input folder and output JSON path for the Dataset() call in the next cell (Test2 folder)
testset_path = '/content/drive/MyDrive/Test2'
testset_json_path = '/content/drive/MyDrive/Test2_kolch.json'

In [None]:
# Extract the features of the folder in testset_path; "Others" gets label 0 and "Me" gets label 1
Dataset(testset_path, testset_json_path, "Others", "Me")


Processing: Others
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 58.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 8.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 7.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 6.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 5.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 4.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 3.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Others/Nouvel enregistrement 2.wav

aaaaaaaaaaaaaaaaa

Processing: Me
/content/drive/MyDrive/Test2/Me/Voice 262.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Me/Voice 270.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Me/Voice 268.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Me/Voice 261.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test2/Me/Voice 2

In [None]:
# Rebinds testset_path / testset_json_path (previously pointing at Test2) to the Test folder
testset_path = '/content/drive/MyDrive/Test'
testset_json_path = '/content/drive/MyDrive/Test_kolch.json'

In [None]:
# Same extraction as above, now for the folder currently held in testset_path
Dataset(testset_path, testset_json_path, "Others", "Me")


Processing: Me
/content/drive/MyDrive/Test/Me/Voice 262.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 270.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 268.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 261.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 257.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 259.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 267.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 265.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 271.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 274.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 269.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 263.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 258.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 266.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/Test/Me/Voice 260.wav

aaaaaaaaaaaaaaaaa
/content/drive/MyDrive/T