# **Installs**

In [None]:
# Install the third-party packages this notebook relies on (the first two quietly via -qq).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
# NOTE(review): speechbrain is imported below but not installed explicitly here —
# presumably it arrives as a dependency of pyannote.audio; verify on a fresh runtime.
!pip install -qq pyannote.audio
!pip install -qq rich
!pip install transformers datasets evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m898.7/898.7 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m811.0/811.0 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m4.1 MB/s[0m eta [36m0:0

# **Imports**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import librosa
import torch
from transformers import HubertModel, Wav2Vec2Processor, WhisperModel, WhisperProcessor
from speechbrain.inference.speaker import EncoderClassifier
from datasets import load_dataset
import numpy as np
import fnmatch
import re
import soundfile as sf
import librosa
import torch

import os



# **Functions**

In [None]:
def getEmbeddings(selectedDataset=None, selectedModel=None,processor=None, model=None, diarizations=None, filePaths=None, datasetIterator=None, sampleRate=16000, minNumberToProcess=0, maxNumberToProcess=3000):
    """Compute one pooled embedding per diarized segment of each audio file.

    For every index ``i`` in ``[minNumberToProcess, maxNumberToProcess)`` that
    exists in both ``filePaths`` and ``diarizations``, the file is loaded with
    librosa at ``sampleRate`` and cut into the ``(start, end)`` spans found in
    ``diarizations[i][1:]``. Spans are multiplied by ``sampleRate`` to obtain
    sample offsets, i.e. they are expressed in seconds.

    Args:
        selectedDataset: Unused; kept so existing callers keep working.
        selectedModel: ``"speechbrain"`` uses ``processor.encode_batch``;
            ``"whisper"`` mean-pools ``model.get_encoder()`` output; any other
            value mean-pools ``model(input_values).last_hidden_state``.
        processor: Feature extractor (or, for speechbrain, the encoder itself).
        model: Model used by the non-speechbrain branches.
        diarizations: Indexable collection; entry ``i`` holds a label followed
            by ``(start, end)`` pairs.
        filePaths: Indexable collection of audio paths aligned with
            ``diarizations``.
        datasetIterator: Optional iterator that is only advanced
            ``minNumberToProcess`` times; its items are never read.
        sampleRate: Rate used for loading, slicing and the debug dump.
        minNumberToProcess: First index to process (inclusive).
        maxNumberToProcess: Upper index bound (exclusive); clamped to the
            length of the inputs.

    Returns:
        list of torch tensors, one per successfully embedded segment, in
        processing order. Empty when ``filePaths`` or ``diarizations`` is None.
    """
    embeddings = []

    if datasetIterator is not None:
        for i in range(minNumberToProcess):
            print("Iterating to min: " + str(i)+"/"+str(minNumberToProcess-1))
            next(datasetIterator)

    if filePaths is None or diarizations is None:
        # Without both inputs there is nothing that can be loaded or sliced.
        print("getEmbeddings: filePaths and diarizations are both required; nothing processed")
        return embeddings

    # Clamp the range so indexes past the end of the inputs are never attempted.
    stop = min(maxNumberToProcess, len(filePaths), len(diarizations))

    for i in range(minNumberToProcess, stop):
        print("Computing embeddings, at: " + str(i)+"/"+str(stop-1))
        try:
            audio, _ = librosa.load(filePaths[i], sr=sampleRate)
            start_end = diarizations[i]
            print(filePaths[i])
            print(start_end[0])
            for start, end in start_end[1:]:
                start_sample = int(start * sampleRate)
                end_sample = int(end * sampleRate)
                segment = audio[start_sample:end_sample]
                print(str(start_sample)+" "+str(end_sample))
                try:
                    embeddings.append(_embedSegment(segment, selectedModel, processor, model, sampleRate))
                except Exception as exc:
                    # Best effort: report the real cause and keep the offending
                    # segment on disk for inspection, then move on.
                    print("ERROR: " + repr(exc))
                    sf.write("output.wav", segment, sampleRate)
        except Exception as exc:
            # A failure outside the per-segment handler (unreadable file,
            # malformed diarization entry, ...) skips the rest of this item only.
            print("Skipping item " + str(i) + ": " + repr(exc))

    return embeddings


def _embedSegment(segment, selectedModel, processor, model, sampleRate):
    """Return the embedding tensor for one audio segment using the selected backend."""
    if selectedModel == "speechbrain":
        signal = torch.tensor(segment).unsqueeze(0)
        print(signal.shape)
        embedding = torch.squeeze(processor.encode_batch(signal))
        print(embedding.shape)
        return embedding

    if selectedModel == "whisper":
        input_features = processor(segment, return_tensors="pt", sampling_rate=sampleRate).input_features
        with torch.no_grad():
            outputs = model.get_encoder()(input_features)
            return torch.mean(outputs.last_hidden_state.squeeze(), dim=0)

    input_values = processor(segment, return_tensors="pt", sampling_rate=sampleRate).input_values
    with torch.no_grad():
        outputs = model(input_values)
        return torch.mean(outputs.last_hidden_state.squeeze(), dim=0)


In [None]:
def append_to_existing_file(file_name, new_data):
    """Append rows to the array stored in a ``.npy`` file, creating it if absent.

    Args:
        file_name: Target path, with or without the ``.npy`` suffix.
        new_data: Rows to append: an ndarray or a sequence of array-like items
            (each item is converted with ``np.asarray``).

    Returns:
        None. The combined array is written back to disk; when ``new_data`` is
        empty the file is left untouched.
    """
    # np.save only adds ".npy" when it is missing, so resolve the real on-disk
    # path once and use it for both the existence check and the save.
    file_path = file_name if file_name.endswith(".npy") else file_name + ".npy"

    if isinstance(new_data, np.ndarray):
        new_array = new_data
    else:
        new_array = np.array([np.asarray(item) for item in new_data])

    if new_array.size == 0:
        # Stacking an empty batch onto existing rows raises, and saving an empty
        # array would break the next append, so do nothing instead.
        print("No new rows to append; " + file_path + " left unchanged")
        return

    if os.path.exists(file_path):
        # NOTE(review): allow_pickle=True can run code from a tampered file;
        # only load files this notebook produced.
        existing_data = np.load(file_path, allow_pickle=True)
        print(np.array(existing_data).shape)
        combined_data = np.vstack((existing_data, new_array))
    else:
        combined_data = new_array

    print(np.array(combined_data).shape)

    np.save(file_path, combined_data)

# **Initialize Models**

In [None]:
#model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
#processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")



In [None]:
# Load the Whisper "small" checkpoint: the model and the processor that goes with it.
WHISPER_CHECKPOINT = "openai/whisper-small"
model = WhisperModel.from_pretrained(WHISPER_CHECKPOINT)
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)



In [None]:
#model = None
#processor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")


# **OGI Kids Embeddings**

In [None]:
# Paths of the audio files found under the OGIKids/BarIR directory.
OGIKidsFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/BarIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        OGIKidsFilepaths.append(os.path.join(directory, filename))

In [None]:
# Load the diarization array from Drive, then embed every listed segment
# of the OGIKids files with the "whisper" backend of getEmbeddings.
OGKidsDiarizations = np.load(
    "/content/drive/MyDrive/UC_COSMOS/DIRIZATIONS/OGIKidsBar.npy",
    allow_pickle=True,
)
OGIKidsEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=OGKidsDiarizations,
    filePaths=OGIKidsFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=1100,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
870165 929295
1005165 1048365
1071855 1081845
1106415 1204965
1287585 1367505
Computing embeddings, at: 734/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/BarIR/735-OGIKids.wav
735-OGIKids.wav
6975 316125
342585 347445
425475 433575
452475 480825
515385 538875
600705 617175
771345 824535
897705 1020015
1056465 1076985
1101825 1127205
1153125 1199565
1233045 1294875
1319175 1481175
1495215 1555695
1600515 1854855
1909395 1993905
2017935 2028195
2072205 2227455
2243925 2320065
2333295 2364345
Computing embeddings, at: 735/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/BarIR/736-OGIKids.wav
736-OGIKids.wav
20205 263475
280485 322335
339615 489735
525105 553455
576135 719505
737595 764595
809415 822375
843975 863145
879615 900405
934965 942255
959805 1066185
1092105 1160145
1183095 1368855
1383435 1406655
1422585 1440405
1500345 1651545
1666935 1676115
1694205 1814895
Computing embeddings, at: 736/1099

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in OGIKidsEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/OGIKidsEmbeddingsBar",newEmbeddings)


(14167, 768)


In [None]:
# Paths of the audio files found under the OGIKids/StairwellIR directory.
OGIKidsFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/StairwellIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        OGIKidsFilepaths.append(os.path.join(directory, filename))

In [None]:

# Embed the diarized segments of the current OGIKids file list with the
# "whisper" backend, using the OGKidsDiarizations array already in memory.
OGIKidsEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=OGKidsDiarizations,
    filePaths=OGIKidsFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=1100,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
870165 929295
1005165 1048365
1071855 1081845
1106415 1204965
1287585 1367505
Computing embeddings, at: 734/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/StairwellIR/735-OGIKids.wav
735-OGIKids.wav
6975 316125
342585 347445
425475 433575
452475 480825
515385 538875
600705 617175
771345 824535
897705 1020015
1056465 1076985
1101825 1127205
1153125 1199565
1233045 1294875
1319175 1481175
1495215 1555695
1600515 1854855
1909395 1993905
2017935 2028195
2072205 2227455
2243925 2320065
2333295 2364345
Computing embeddings, at: 735/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/StairwellIR/736-OGIKids.wav
736-OGIKids.wav
20205 263475
280485 322335
339615 489735
525105 553455
576135 719505
737595 764595
809415 822375
843975 863145
879615 900405
934965 942255
959805 1066185
1092105 1160145
1183095 1368855
1383435 1406655
1422585 1440405
1500345 1651545
1666935 1676115
1694205 1814895
Computing embeddings, 

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in OGIKidsEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/OGIKidsEmbeddingsStairwell",newEmbeddings)


(14167, 768)


In [None]:
# Paths of the audio files found under the OGIKids/BathroomIR directory.
OGIKidsFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/BathroomIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        OGIKidsFilepaths.append(os.path.join(directory, filename))

In [None]:
# Embed the diarized segments of the current OGIKids file list with the
# "whisper" backend, using the OGKidsDiarizations array already in memory.
OGIKidsEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=OGKidsDiarizations,
    filePaths=OGIKidsFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=1100,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
870165 929295
1005165 1048365
1071855 1081845
1106415 1204965
1287585 1367505
Computing embeddings, at: 734/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/BathroomIR/735-OGIKids.wav
735-OGIKids.wav
6975 316125
342585 347445
425475 433575
452475 480825
515385 538875
600705 617175
771345 824535
897705 1020015
1056465 1076985
1101825 1127205
1153125 1199565
1233045 1294875
1319175 1481175
1495215 1555695
1600515 1854855
1909395 1993905
2017935 2028195
2072205 2227455
2243925 2320065
2333295 2364345
Computing embeddings, at: 735/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/BathroomIR/736-OGIKids.wav
736-OGIKids.wav
20205 263475
280485 322335
339615 489735
525105 553455
576135 719505
737595 764595
809415 822375
843975 863145
879615 900405
934965 942255
959805 1066185
1092105 1160145
1183095 1368855
1383435 1406655
1422585 1440405
1500345 1651545
1666935 1676115
1694205 1814895
Computing embeddings, at

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in OGIKidsEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/OGIKidsEmbeddingsBathroom",newEmbeddings)


(14167, 768)


In [None]:
# Paths of the audio files found under the OGIKids/SupermarketIR directory.
OGIKidsFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/SupermarketIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        OGIKidsFilepaths.append(os.path.join(directory, filename))

In [None]:
# Embed the diarized segments of the current OGIKids file list with the
# "whisper" backend, using the OGKidsDiarizations array already in memory.
OGIKidsEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=OGKidsDiarizations,
    filePaths=OGIKidsFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=1100,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
870165 929295
1005165 1048365
1071855 1081845
1106415 1204965
1287585 1367505
Computing embeddings, at: 734/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/SupermarketIR/735-OGIKids.wav
735-OGIKids.wav
6975 316125
342585 347445
425475 433575
452475 480825
515385 538875
600705 617175
771345 824535
897705 1020015
1056465 1076985
1101825 1127205
1153125 1199565
1233045 1294875
1319175 1481175
1495215 1555695
1600515 1854855
1909395 1993905
2017935 2028195
2072205 2227455
2243925 2320065
2333295 2364345
Computing embeddings, at: 735/1099
/content/drive/MyDrive/UC_COSMOS/New_Datasets/OGIKids/SupermarketIR/736-OGIKids.wav
736-OGIKids.wav
20205 263475
280485 322335
339615 489735
525105 553455
576135 719505
737595 764595
809415 822375
843975 863145
879615 900405
934965 942255
959805 1066185
1092105 1160145
1183095 1368855
1383435 1406655
1422585 1440405
1500345 1651545
1666935 1676115
1694205 1814895
Computing embeddin

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in OGIKidsEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/OGIKidsEmbeddingsSupermarket",newEmbeddings)


(14167, 768)


# **myst Embeddings**

In [None]:
# Paths of the audio files found under the myst/BarIR directory.
mystFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        mystFilepaths.append(os.path.join(directory, filename))

In [None]:
# Load the diarization array from Drive, then embed every listed segment
# of the myst files with the "whisper" backend of getEmbeddings.
mystDiarizations = np.load(
    "/content/drive/MyDrive/UC_COSMOS/DIRIZATIONS/mystBar.npy",
    allow_pickle=True,
)
mystEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=mystDiarizations,
    filePaths=mystFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Computing embeddings, at: 1842/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/1842-myst.wav
1842-myst.wav
10755 33435
Computing embeddings, at: 1843/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/1843-myst.wav
1843-myst.wav
8325 97965
Computing embeddings, at: 1844/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/1844-myst.wav
1844-myst.wav
8325 23715
Computing embeddings, at: 1845/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/1845-myst.wav
1845-myst.wav
11295 29385
Computing embeddings, at: 1846/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/1846-myst.wav
1846-myst.wav
12915 106065
Computing embeddings, at: 1847/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/1847-myst.wav
1847-myst.wav
20475 156555
188955 238095
Computing embeddings, at: 1848/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BarIR/1848-myst.wav
1848-myst.wav
19

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in mystEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/mystEmbeddingsBar",newEmbeddings)


(4080, 768)


In [None]:
# Paths of the audio files found under the myst/StairwellIR directory.
mystFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/StairwellIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        mystFilepaths.append(os.path.join(directory, filename))

In [None]:
# Load the diarization array from Drive (the mystBar.npy file), then embed every
# listed segment of the current myst file list with the "whisper" backend.
mystDiarizations = np.load(
    "/content/drive/MyDrive/UC_COSMOS/DIRIZATIONS/mystBar.npy",
    allow_pickle=True,
)
mystEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=mystDiarizations,
    filePaths=mystFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Computing embeddings, at: 1842/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/StairwellIR/1842-myst.wav
1842-myst.wav
10755 33435
Computing embeddings, at: 1843/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/StairwellIR/1843-myst.wav
1843-myst.wav
8325 97965
Computing embeddings, at: 1844/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/StairwellIR/1844-myst.wav
1844-myst.wav
8325 23715
Computing embeddings, at: 1845/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/StairwellIR/1845-myst.wav
1845-myst.wav
11295 29385
Computing embeddings, at: 1846/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/StairwellIR/1846-myst.wav
1846-myst.wav
12915 106065
Computing embeddings, at: 1847/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/StairwellIR/1847-myst.wav
1847-myst.wav
20475 156555
188955 238095
Computing embeddings, at: 1848/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in mystEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/mystEmbeddingsStairwell",newEmbeddings)


(4080, 768)


In [None]:
# Paths of the audio files found under the myst/SupermarketIR directory.
mystFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/SupermarketIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        mystFilepaths.append(os.path.join(directory, filename))

In [None]:
# Embed the diarized segments of the current myst file list with the
# "whisper" backend, using the mystDiarizations array already in memory.
mystEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=mystDiarizations,
    filePaths=mystFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Computing embeddings, at: 1842/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/SupermarketIR/1842-myst.wav
1842-myst.wav
10755 33435
Computing embeddings, at: 1843/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/SupermarketIR/1843-myst.wav
1843-myst.wav
8325 97965
Computing embeddings, at: 1844/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/SupermarketIR/1844-myst.wav
1844-myst.wav
8325 23715
Computing embeddings, at: 1845/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/SupermarketIR/1845-myst.wav
1845-myst.wav
11295 29385
Computing embeddings, at: 1846/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/SupermarketIR/1846-myst.wav
1846-myst.wav
12915 106065
Computing embeddings, at: 1847/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/SupermarketIR/1847-myst.wav
1847-myst.wav
20475 156555
188955 238095
Computing embeddings, at: 1848/2999
/content/drive/MyDrive/UC_COSMOS/New_Da

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in mystEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/mystEmbeddingsSupermarket",newEmbeddings)


(4080, 768)


In [None]:
# Paths of the audio files found under the myst/BathroomIR directory.
mystFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BathroomIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        mystFilepaths.append(os.path.join(directory, filename))

In [None]:
# Embed the diarized segments of the current myst file list with the
# "whisper" backend, using the mystDiarizations array already in memory.
mystEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=mystDiarizations,
    filePaths=mystFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Computing embeddings, at: 1842/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BathroomIR/1842-myst.wav
1842-myst.wav
10755 33435
Computing embeddings, at: 1843/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BathroomIR/1843-myst.wav
1843-myst.wav
8325 97965
Computing embeddings, at: 1844/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BathroomIR/1844-myst.wav
1844-myst.wav
8325 23715
Computing embeddings, at: 1845/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BathroomIR/1845-myst.wav
1845-myst.wav
11295 29385
Computing embeddings, at: 1846/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BathroomIR/1846-myst.wav
1846-myst.wav
12915 106065
Computing embeddings, at: 1847/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/BathroomIR/1847-myst.wav
1847-myst.wav
20475 156555
188955 238095
Computing embeddings, at: 1848/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/myst/Bathro

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
for embedding in mystEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/mystEmbeddingsBathroom",newEmbeddings)


(4080, 768)


# **Librispeech Embeddings**

In [None]:
# Paths of the audio files found under the Librispeech/BarIR directory.
librispeechFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BarIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        librispeechFilepaths.append(os.path.join(directory, filename))

In [None]:
# Load the diarization array from Drive, then embed every listed segment
# of the Librispeech files with the "whisper" backend of getEmbeddings.
librispeechDiarizations = np.load(
    "/content/drive/MyDrive/UC_COSMOS/DIRIZATIONS/LibrispeechBar.npy",
    allow_pickle=True,
)
librispeechEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=librispeechDiarizations,
    filePaths=librispeechFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5355 253215
Computing embeddings, at: 1755/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BarIR/1755-Librispeech.wav
1755-Librispeech.wav
495 205965
Computing embeddings, at: 1756/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BarIR/1756-Librispeech.wav
1756-Librispeech.wav
4545 106065
Computing embeddings, at: 1757/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BarIR/1757-Librispeech.wav
1757-Librispeech.wav
5625 250245
Computing embeddings, at: 1758/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BarIR/1758-Librispeech.wav
1758-Librispeech.wav
495 199485
Computing embeddings, at: 1759/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BarIR/1759-Librispeech.wav
1759-Librispeech.wav
495 191925
Computing embeddings, at: 1760/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BarIR/1760-Librispeech.wav
1760-Librispeech.wav
3465 228915
Co

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
print(len(librispeechEmbeddings))
for embedding in librispeechEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/LibrispeechEmbeddingsBar",newEmbeddings)

4098
(4098, 768)


In [None]:
# Paths of the audio files found under the Librispeech/StairwellIR directory.
librispeechFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/StairwellIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        librispeechFilepaths.append(os.path.join(directory, filename))

In [None]:
# Embed the diarized segments of the current Librispeech file list with the
# "whisper" backend, using the librispeechDiarizations array already in memory.
librispeechEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=librispeechDiarizations,
    filePaths=librispeechFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5355 253215
Computing embeddings, at: 1755/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/StairwellIR/1755-Librispeech.wav
1755-Librispeech.wav
495 205965
Computing embeddings, at: 1756/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/StairwellIR/1756-Librispeech.wav
1756-Librispeech.wav
4545 106065
Computing embeddings, at: 1757/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/StairwellIR/1757-Librispeech.wav
1757-Librispeech.wav
5625 250245
Computing embeddings, at: 1758/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/StairwellIR/1758-Librispeech.wav
1758-Librispeech.wav
495 199485
Computing embeddings, at: 1759/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/StairwellIR/1759-Librispeech.wav
1759-Librispeech.wav
495 191925
Computing embeddings, at: 1760/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/StairwellIR/1760-Librispeech.wav

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
print(len(librispeechEmbeddings))
for embedding in librispeechEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/LibrispeechEmbeddingsstairwell",newEmbeddings)

4098
(4098, 768)


In [None]:
# Paths of the audio files found under the Librispeech/SupermarketIR directory.
librispeechFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/SupermarketIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# List the directory in natural (numeric-aware) order and keep the audio files only.
files_in_directory = os.listdir(directory)
files_in_directory.sort(key=numerical_sort_key)
for filename in files_in_directory:
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        librispeechFilepaths.append(os.path.join(directory, filename))

In [None]:
# Embed the diarized segments of the current Librispeech file list with the
# "whisper" backend, using the librispeechDiarizations array already in memory.
librispeechEmbeddings = getEmbeddings(
    selectedModel="whisper",
    processor=processor,
    model=model,
    diarizations=librispeechDiarizations,
    filePaths=librispeechFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5355 253215
Computing embeddings, at: 1755/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/SupermarketIR/1755-Librispeech.wav
1755-Librispeech.wav
495 205965
Computing embeddings, at: 1756/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/SupermarketIR/1756-Librispeech.wav
1756-Librispeech.wav
4545 106065
Computing embeddings, at: 1757/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/SupermarketIR/1757-Librispeech.wav
1757-Librispeech.wav
5625 250245
Computing embeddings, at: 1758/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/SupermarketIR/1758-Librispeech.wav
1758-Librispeech.wav
495 199485
Computing embeddings, at: 1759/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/SupermarketIR/1759-Librispeech.wav
1759-Librispeech.wav
495 191925
Computing embeddings, at: 1760/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/SupermarketIR/1760-Lib

In [None]:
# Keep only embeddings of the expected length, then append them to the saved array.
EXPECTED_EMBEDDING_DIM = 768  # entries whose length differs are reported and dropped
newEmbeddings = []
print(len(librispeechEmbeddings))
for embedding in librispeechEmbeddings:
    try:
        embedding_dim = len(embedding)
    except TypeError as exc:
        # len() fails on entries without a length (e.g. a 0-d tensor); skip them.
        print("error: " + repr(exc))
        continue
    if embedding_dim != EXPECTED_EMBEDDING_DIM:
        print(embedding_dim)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/LibrispeechEmbeddingssupermarket",newEmbeddings)

4098
(4098, 768)


In [None]:
# Paths of the audio files found under the Librispeech/BathroomIR directory.
librispeechFilepaths = list()
# Glob patterns that identify a directory entry as an audio file.
audio_extensions = [
    '*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma',
]
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BathroomIR/"


def numerical_sort_key(filename):
    """Build a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# Get all files in the directory and sort them numerically
files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    for ext in audio_extensions:
          if fnmatch.fnmatch(filename, ext):
              librispeechFilepaths.append(os.path.join(directory,filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in
# `librispeechFilepaths`, covering loop indices 0-2999.
# NOTE(review): `librispeechDiarizations` is reused from an earlier cell -
# confirm its segments also apply to the BathroomIR files.
librispeechEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=librispeechDiarizations,
    filePaths=librispeechFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5355 253215
Computing embeddings, at: 1755/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BathroomIR/1755-Librispeech.wav
1755-Librispeech.wav
495 205965
Computing embeddings, at: 1756/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BathroomIR/1756-Librispeech.wav
1756-Librispeech.wav
4545 106065
Computing embeddings, at: 1757/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BathroomIR/1757-Librispeech.wav
1757-Librispeech.wav
5625 250245
Computing embeddings, at: 1758/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BathroomIR/1758-Librispeech.wav
1758-Librispeech.wav
495 199485
Computing embeddings, at: 1759/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BathroomIR/1759-Librispeech.wav
1759-Librispeech.wav
495 191925
Computing embeddings, at: 1760/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/Librispeech/BathroomIR/1760-Librispeech.wav
1760-

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the LibrispeechEmbeddingsBathroom file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
print(len(librispeechEmbeddings))
for i, embedding in enumerate(librispeechEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/LibrispeechEmbeddingsBathroom",newEmbeddings)

4098
(4098, 768)


# **Common Voice Embeddings**

In [None]:
# Collect every audio file in the CommonVoice/BarIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
commonVoiceFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BarIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        commonVoiceFilepaths.append(os.path.join(directory, filename))

In [None]:
# Load the array that is passed to getEmbeddings as `diarizations`, then
# compute Whisper embeddings for `commonVoiceFilepaths` (loop indices 0-2999).
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects - only
# load .npy files from a trusted source.
commonVoiceDiarizations = np.load(
    "/content/drive/MyDrive/UC_COSMOS/DIRIZATIONS/mozillaBar.npy",
    allow_pickle=True,
)
commonVoiceEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=commonVoiceDiarizations,
    filePaths=commonVoiceFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11594-CommonVoice.wav
Computing embeddings, at: 1727/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BarIR/11595-CommonVoice.wav
11595-CommonVoice.wav
Computing embeddings, at: 1728/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BarIR/11596-CommonVoice.wav
11596-CommonVoice.wav
11835 49905
Computing embeddings, at: 1729/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BarIR/11597-CommonVoice.wav
11597-CommonVoice.wav
21015 134955
Computing embeddings, at: 1730/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BarIR/11629-CommonVoice.wav
11629-CommonVoice.wav
Computing embeddings, at: 1731/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BarIR/11630-CommonVoice.wav
11630-CommonVoice.wav
495 80145
Computing embeddings, at: 1732/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BarIR/11638-CommonVoice.wav
11638-CommonVoice.wav
27765 95265
Co

In [None]:
# Keep only the embeddings that have exactly 768 entries, print the shape of
# the kept set, and append it to the CommonVoiceEmbeddingsBar file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
for i, embedding in enumerate(commonVoiceEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
print(np.array(newEmbeddings).shape)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/CommonVoiceEmbeddingsBar",newEmbeddings)

(2789, 768)
(2789, 768)


In [None]:
# Collect every audio file in the CommonVoice/StairwellIR folder, ordered so
# that numeric parts of the file names sort by value ("2-..." before "10-...").
commonVoiceFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/StairwellIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        commonVoiceFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in
# `commonVoiceFilepaths`, covering loop indices 0-2999.
# NOTE(review): `commonVoiceDiarizations` was loaded from mozillaBar.npy -
# confirm those segments also apply to the StairwellIR files.
commonVoiceEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=commonVoiceDiarizations,
    filePaths=commonVoiceFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11594-CommonVoice.wav
Computing embeddings, at: 1727/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/StairwellIR/11595-CommonVoice.wav
11595-CommonVoice.wav
Computing embeddings, at: 1728/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/StairwellIR/11596-CommonVoice.wav
11596-CommonVoice.wav
11835 49905
Computing embeddings, at: 1729/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/StairwellIR/11597-CommonVoice.wav
11597-CommonVoice.wav
21015 134955
Computing embeddings, at: 1730/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/StairwellIR/11629-CommonVoice.wav
11629-CommonVoice.wav
Computing embeddings, at: 1731/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/StairwellIR/11630-CommonVoice.wav
11630-CommonVoice.wav
495 80145
Computing embeddings, at: 1732/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/StairwellIR/11638-CommonVoice.wav


In [None]:
# Keep only the embeddings that have exactly 768 entries, print the shape of
# the kept set, and append it to the CommonVoiceEmbeddingsstairwell file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
for i, embedding in enumerate(commonVoiceEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
print(np.array(newEmbeddings).shape)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/CommonVoiceEmbeddingsstairwell",newEmbeddings)

(2789, 768)
(2789, 768)


In [None]:
# Collect every audio file in the CommonVoice/SupermarketIR folder, ordered so
# that numeric parts of the file names sort by value ("2-..." before "10-...").
commonVoiceFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/SupermarketIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        commonVoiceFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in
# `commonVoiceFilepaths`, covering loop indices 0-2999.
# NOTE(review): `commonVoiceDiarizations` was loaded from mozillaBar.npy -
# confirm those segments also apply to the SupermarketIR files.
commonVoiceEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=commonVoiceDiarizations,
    filePaths=commonVoiceFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11594-CommonVoice.wav
Computing embeddings, at: 1727/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/SupermarketIR/11595-CommonVoice.wav
11595-CommonVoice.wav
Computing embeddings, at: 1728/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/SupermarketIR/11596-CommonVoice.wav
11596-CommonVoice.wav
11835 49905
Computing embeddings, at: 1729/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/SupermarketIR/11597-CommonVoice.wav
11597-CommonVoice.wav
21015 134955
Computing embeddings, at: 1730/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/SupermarketIR/11629-CommonVoice.wav
11629-CommonVoice.wav
Computing embeddings, at: 1731/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/SupermarketIR/11630-CommonVoice.wav
11630-CommonVoice.wav
495 80145
Computing embeddings, at: 1732/2999
/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/SupermarketIR/11638-Comm

In [None]:
# Keep only the embeddings that have exactly 768 entries, print the shape of
# the kept set, and append it to the CommonVoiceEmbeddingssupermarket file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
for i, embedding in enumerate(commonVoiceEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
print(np.array(newEmbeddings).shape)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/CommonVoiceEmbeddingssupermarket",newEmbeddings)

(2789, 768)
(2789, 768)


In [None]:
# Collect every audio file in the CommonVoice/BathroomIR folder, ordered so
# that numeric parts of the file names sort by value ("2-..." before "10-...").
commonVoiceFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/CommonVoice/BathroomIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        commonVoiceFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in
# `commonVoiceFilepaths`, covering loop indices 0-2999.
# NOTE(review): `commonVoiceDiarizations` was loaded from mozillaBar.npy -
# confirm those segments also apply to the BathroomIR files.
commonVoiceEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=commonVoiceDiarizations,
    filePaths=commonVoiceFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries, print the shape of
# the kept set, and append it to the CommonVoiceEmbeddingsBathroom file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
for i, embedding in enumerate(commonVoiceEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
print(np.array(newEmbeddings).shape)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/CommonVoiceEmbeddingsBathroom",newEmbeddings)

# **CMU Kids Embeddings**

In [None]:
# Collect every audio file in the cmuKids/BarIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
cmuFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/cmuKids/BarIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        cmuFilepaths.append(os.path.join(directory, filename))

In [None]:
# Load the array that is passed to getEmbeddings as `diarizations`, then
# compute Whisper embeddings for `cmuFilepaths` (loop indices 0-2999).
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects - only
# load .npy files from a trusted source.
cmuDiarizations = np.load(
    "/content/drive/MyDrive/UC_COSMOS/DIRIZATIONS/CmuKidsBar.npy",
    allow_pickle=True,
)
cmuEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=cmuDiarizations,
    filePaths=cmuFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the cmuEmbeddings file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
# NOTE(review): unlike the other cmu output files, this name carries no room
# suffix - confirm it is the intended target for this run.
newEmbeddings = []
for i, embedding in enumerate(cmuEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/cmuEmbeddings",newEmbeddings)


In [None]:
# Collect every audio file in the cmuKids/StairwellIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
cmuFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/cmuKids/StairwellIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        cmuFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in `cmuFilepaths`,
# covering loop indices 0-2999.
# NOTE(review): `cmuDiarizations` was loaded from CmuKidsBar.npy - confirm
# those segments also apply to the StairwellIR files.
cmuEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=cmuDiarizations,
    filePaths=cmuFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the cmuEmbeddingsStairwell file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
for i, embedding in enumerate(cmuEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/cmuEmbeddingsStairwell",newEmbeddings)


In [None]:
# Collect every audio file in the cmuKids/SupermarketIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
cmuFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/cmuKids/SupermarketIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        cmuFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in `cmuFilepaths`,
# covering loop indices 0-2999.
# NOTE(review): `cmuDiarizations` was loaded from CmuKidsBar.npy - confirm
# those segments also apply to the SupermarketIR files.
cmuEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=cmuDiarizations,
    filePaths=cmuFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the cmuEmbeddingsSupermarket file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
for i, embedding in enumerate(cmuEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/cmuEmbeddingsSupermarket",newEmbeddings)


In [None]:
# Collect every audio file in the cmuKids/BathroomIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
cmuFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/cmuKids/BathroomIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        cmuFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in `cmuFilepaths`,
# covering loop indices 0-2999.
# NOTE(review): `cmuDiarizations` was loaded from CmuKidsBar.npy - confirm
# those segments also apply to the BathroomIR files.
cmuEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=cmuDiarizations,
    filePaths=cmuFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the cmuEmbeddingsBatheroom file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
# NOTE(review): "Batheroom" is spelled differently from the other Bathroom
# output files; the name is left unchanged because readers of this file may
# depend on it - confirm before renaming.
newEmbeddings = []
for i, embedding in enumerate(cmuEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/cmuEmbeddingsBatheroom",newEmbeddings)


# **VoxCeleb Embeddings**

In [None]:
# Collect every audio file in the VoxCeleb/BarIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
voxCelebFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/VoxCeleb/BarIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        voxCelebFilepaths.append(os.path.join(directory, filename))

In [None]:
# Load the array that is passed to getEmbeddings as `diarizations`, then
# compute Whisper embeddings for `voxCelebFilepaths` (loop indices 0-2999).
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects - only
# load .npy files from a trusted source.
voxCelebDiarizations = np.load(
    "/content/drive/MyDrive/UC_COSMOS/DIRIZATIONS/VoxCelebBar.npy",
    allow_pickle=True,
)
voxCelebEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=voxCelebDiarizations,
    filePaths=voxCelebFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the voxcelebEmbeddingsBar file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
print(len(voxCelebEmbeddings))
for i, embedding in enumerate(voxCelebEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/voxcelebEmbeddingsBar",newEmbeddings)

In [None]:
# Collect every audio file in the VoxCeleb/StairwellIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
voxCelebFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/VoxCeleb/StairwellIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        voxCelebFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in
# `voxCelebFilepaths`, covering loop indices 0-2999.
# NOTE(review): `voxCelebDiarizations` was loaded from VoxCelebBar.npy -
# confirm those segments also apply to the StairwellIR files.
voxCelebEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=voxCelebDiarizations,
    filePaths=voxCelebFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the voxcelebEmbeddingsStairwell file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
print(len(voxCelebEmbeddings))
for i, embedding in enumerate(voxCelebEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/voxcelebEmbeddingsStairwell",newEmbeddings)

In [None]:
# Collect every audio file in the VoxCeleb/SupermarketIR folder, ordered so
# that numeric parts of the file names sort by value ("2-..." before "10-...").
voxCelebFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/VoxCeleb/SupermarketIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        voxCelebFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in
# `voxCelebFilepaths`, covering loop indices 0-2999.
# NOTE(review): `voxCelebDiarizations` was loaded from VoxCelebBar.npy -
# confirm those segments also apply to the SupermarketIR files.
voxCelebEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=voxCelebDiarizations,
    filePaths=voxCelebFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the voxcelebEmbeddingsSupermarket file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
print(len(voxCelebEmbeddings))
for i, embedding in enumerate(voxCelebEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/voxcelebEmbeddingsSupermarket",newEmbeddings)

In [None]:
# Collect every audio file in the VoxCeleb/BathroomIR folder, ordered so that
# numeric parts of the file names sort by value ("2-..." before "10-...").
voxCelebFilepaths = []
directory = "/content/drive/MyDrive/UC_COSMOS/New_Datasets/VoxCeleb/BathroomIR/"
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.aac', '*.ogg', '*.m4a', '*.wma']


def numerical_sort_key(filename):
    """Return a natural-sort key: digit runs become ints, other text is lower-cased."""
    key = []
    for chunk in re.split(r'(\d+)', filename):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


files_in_directory = sorted(os.listdir(directory), key=numerical_sort_key)
for filename in files_in_directory:
    # A name can match at most one of the extension patterns, so one append per file.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in audio_extensions):
        voxCelebFilepaths.append(os.path.join(directory, filename))

In [None]:
# Compute Whisper embeddings for the files currently listed in
# `voxCelebFilepaths`, covering loop indices 0-2999.
# NOTE(review): `voxCelebDiarizations` was loaded from VoxCelebBar.npy -
# confirm those segments also apply to the BathroomIR files.
voxCelebEmbeddings = getEmbeddings(
    processor=processor,
    model=model,
    selectedModel="whisper",
    diarizations=voxCelebDiarizations,
    filePaths=voxCelebFilepaths,
    sampleRate=16000,
    minNumberToProcess=0,
    maxNumberToProcess=3000,
)

In [None]:
# Keep only the embeddings that have exactly 768 entries and append them to
# the voxcelebEmbeddingsBathroom file on Drive.
# 768 is presumably the embedding width of the loaded Whisper model - confirm.
newEmbeddings = []
print(len(voxCelebEmbeddings))
for i, embedding in enumerate(voxCelebEmbeddings):
    try:
        size = len(embedding)
    except Exception as exc:
        # Entries without a usable length are skipped. Catching Exception
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through
        # and the message now says which entry failed and why.
        print("error at index", i, "-", repr(exc))
        continue
    if size != 768:
        # Report the unexpected length and drop the entry.
        print(size)
    else:
        newEmbeddings.append(embedding)
append_to_existing_file("/content/drive/MyDrive/UC_COSMOS/EmbeddingsWhisper/voxcelebEmbeddingsBathroom",newEmbeddings)