# **Data Preparation**


Install Pydub Library

In [None]:
pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


Import Required Modules

In [None]:
import os
from pydub import AudioSegment
import numpy as np
import random

In [None]:
prefix_dict = {}    # dictionary to store the prefixes of the audio files for different speakers

for i in range(10270, 10310):   # speaker no - 10270 to 10310
  for j in os.listdir("drive/MyDrive/VoxCeleb/data/id" + str(i)):    # get the list of names of audio files for each speaker
    if j.endswith("10.wav"):
      break

  prefix_dict[i] = j[:-10]

In [None]:
prefix_dict

{10270: '8jEAjG6SegY',
 10271: '1gtz-CUIygI',
 10272: 'olePnztkm6U',
 10273: '0LbtndiXJC0',
 10274: 'L9q6XabEpEI',
 10275: 'CVUXDNZzcmA',
 10276: '3vWez3baO60',
 10277: 'tbh20gz_KRA',
 10278: 'Pp-rAswo4Xg',
 10279: '4Q1IvdayPR8',
 10280: '9AtE4C309P8',
 10281: '1CiBWgsHT4E',
 10282: 'U3xR3MZjEVg',
 10283: 'clznvDi-ybg',
 10284: 'RNYNkXzY5Hk',
 10285: 'FUqAFZmZJ80',
 10286: '9K2YB1d8BqY',
 10287: 'I99R40TUF1s',
 10288: 'A3ZvNuG8_oM',
 10289: '3g9CjhcNEWk',
 10290: 'FAOLWl96MY0',
 10291: 'TMCTm7GxiDE',
 10292: '3kzw8lTcUBU',
 10293: 'MHOCv7pOmG4',
 10294: '1l9B0dz7gjc',
 10295: '3tvnlmkCiTw',
 10296: 'SKwsA_nq1P8',
 10297: '6pjKcuVoHLM',
 10298: '6qFnVechX9o',
 10299: 'uaPd2Ix-wSI',
 10300: '1ZyvrJaiLQk',
 10301: 'AeRSD9jIdSg',
 10302: 'K2_D_tFdAgY',
 10303: 'LKDWhdJQFco',
 10304: 'CN6JpPYbHCQ',
 10305: '3QrLepYlH6o',
 10306: '2SaEbN8hYz4',
 10307: '120gjdqGWNQ',
 10308: 'XQWpVt5n3Ic',
 10309: '0b1inHMAr6o'}

# Select Number of Files and Number of Speakers in each file

In [None]:
files = {}    # to store the id of speakers in each file

no_of_files = 5   # number of files in DATASET
min_speakers = 2    # minimum number of speakers in each file
max_speakers = 6    # maximum number of speakers in each file

for file_no in range(no_of_files):
  no_of_speakers = random.randint(min_speakers, max_speakers)   # randomly select the number of speakers
  # print(no_of_speakers, end=' -> ')
  
  speakers = []

  for j in range(no_of_speakers):
    speakers.append(random.randint(10270, 10309))   # randomly select the id of speakers in each file
    # print(speakers[j], end=' ')
  
  files[file_no + 1] = speakers
  # print()

In [None]:
files

{1: [10275, 10273, 10287, 10290, 10308],
 2: [10275, 10279, 10275, 10276, 10303],
 3: [10290, 10290, 10278, 10276, 10288, 10292],
 4: [10273, 10306, 10309, 10302, 10298],
 5: [10293, 10307]}

# Select Number of Occurences of Each Speaker and File Number from VoxCeleb RANDOMLY

In [None]:
data = []   # store the audio file numbers for each speaker

for aud_file in files.values():
  audio = {}

  for index, speaker in enumerate(aud_file):
    aud_files = []

    no_of_occurences = random.randint(1,10)   # get number of occurences of single speaker randomly

    for occurence in range(no_of_occurences):
      file_no = random.randint(1,9)   # get the number of the audio file for the speaker
      
      if file_no not in aud_files:
        aud_files.append(file_no)

    # print(index+1, speaker, end=', ')

    audio[speaker] = aud_files

  data.append(audio)

In [None]:
data

[{10275: [6, 4, 8, 7, 1],
  10273: [8, 2, 7, 6],
  10287: [4, 3, 9, 5, 2, 8],
  10290: [6, 1, 4, 5],
  10308: [3, 1, 2, 9, 4]},
 {10275: [6],
  10279: [4, 2, 5, 1, 7, 8, 9],
  10276: [6, 9, 1, 5, 8, 7],
  10303: [7, 2, 8, 5, 4]},
 {10290: [2, 1, 6, 5],
  10278: [9, 5, 4, 1, 6, 7],
  10276: [4, 6, 2, 7, 8],
  10288: [7, 9, 8, 6, 1, 4],
  10292: [9]},
 {10273: [4, 6, 2, 1],
  10306: [4, 9, 3, 2, 5],
  10309: [6, 5, 7, 3],
  10302: [6, 5, 8, 1, 2, 3],
  10298: [6, 9, 7, 1, 4]},
 {10293: [8, 6, 7, 2], 10307: [1, 6, 8]}]

# Load the Audio Files of Each Speaker and its Duration, Assign a Number to Each Speaker

In [None]:
audio_files = []

for data_file in data:
  audio_file = {}

  for index, speaker in enumerate(data_file):

    for file_no in data_file[speaker]:

      filename = "drive/MyDrive/VoxCeleb/data/id" + str(speaker) + "/" + prefix_dict[speaker] + "_0000" + str(file_no) + ".wav"

      audio = AudioSegment.from_file(filename, format="wav")    # get the audio data file with the filename
      audio_length = round(len(audio)/1000)   # get the length of the audio file

      id = index+1, audio_length    

      audio_file[id] = audio    # store the audio file with the id of speaker and length of audio
      
  audio_files.append(audio_file)


In [None]:
audio_files[:2]

[{(1, 9): <pydub.audio_segment.AudioSegment at 0x7ffa603e7190>,
  (1, 13): <pydub.audio_segment.AudioSegment at 0x7ffa603e73a0>,
  (1, 6): <pydub.audio_segment.AudioSegment at 0x7ffa603e7250>,
  (1, 8): <pydub.audio_segment.AudioSegment at 0x7ffa603e7670>,
  (2, 5): <pydub.audio_segment.AudioSegment at 0x7ffa603e78e0>,
  (2, 24): <pydub.audio_segment.AudioSegment at 0x7ffa603e77f0>,
  (2, 9): <pydub.audio_segment.AudioSegment at 0x7ffa603e73d0>,
  (3, 8): <pydub.audio_segment.AudioSegment at 0x7ffa603e7e80>,
  (3, 4): <pydub.audio_segment.AudioSegment at 0x7ffa603e7940>,
  (3, 10): <pydub.audio_segment.AudioSegment at 0x7ffa603e7d90>,
  (3, 6): <pydub.audio_segment.AudioSegment at 0x7ffa603e79d0>,
  (4, 5): <pydub.audio_segment.AudioSegment at 0x7ffa603e7490>,
  (4, 15): <pydub.audio_segment.AudioSegment at 0x7ffa603e7a60>,
  (4, 11): <pydub.audio_segment.AudioSegment at 0x7ffa60384070>,
  (4, 12): <pydub.audio_segment.AudioSegment at 0x7ffa603d6eb0>,
  (5, 5): <pydub.audio_segment.Aud

# Merge Audio Files of different Speakers Randomly to Form a Single Audio File and Create Labels Accordingly

In [None]:
labels = []
data_files = []

for audio_file in audio_files:
  label = []
  speaker_dict = {}

  audio = AudioSegment.empty()

  list_of_speakers = list(audio_file.keys())    # get the speakers in an audio file as a list
  random.shuffle(list_of_speakers)    # shuffle audio files in the list

  index = 1
  for speaker in list_of_speakers:
    
    audio += audio_file[speaker]    # merge audio files

    if speaker[0] not in speaker_dict:
      speaker_dict[speaker[0]] = index    # assign each speaker with ascending numbers
      index = index + 1

    label += [speaker_dict[speaker[0]]] * speaker[1]

  labels.append(label)    # update labels
  data_files.append(audio)    # update dataset



In [None]:
for label in labels:
  print(label)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

In [None]:
for file in data_files:
  print(file)

<pydub.audio_segment.AudioSegment object at 0x7ffa603981c0>
<pydub.audio_segment.AudioSegment object at 0x7ffa603980d0>
<pydub.audio_segment.AudioSegment object at 0x7ffa60398190>
<pydub.audio_segment.AudioSegment object at 0x7ffa60398250>
<pydub.audio_segment.AudioSegment object at 0x7ffa60398100>


# Get TimeStamps

In [None]:
# print the timestamps
def print_label(speaker_no : int, time : int, duration : int) -> None :

  print("Speaker-" + str(speaker_no), end='  ')

  start_minutes = time // 60
  start_seconds = time % 60

  time = time + duration

  end_minutes = time // 60
  end_seconds = time % 60

  print(f"{start_minutes}:{start_seconds} - {end_minutes}:{end_seconds}")

# get the duration for each occurence of the speaker
def get_duration(labels : list, index : int) -> int :

  length = 1
  i = index

  while (i < (len(labels) - 1)) and (labels[i] == labels[i+1]):
    length = length + 1
    i = i + 1

  return length

# get the timestamps for the audio file
def get_timestamps(labels : list) -> None :   
  
  dur = get_duration(labels, 0)
  print_label(1, 0, dur)

  for i in range(1, len(labels)):

    if labels[i] != labels[i-1]:
      
      dur = get_duration(labels, i)
      print_label(labels[i], i+1, dur)
  

In [None]:
print(labels[-1])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]


In [None]:
data_files[-1]

In [None]:
get_timestamps(labels[-1])

Speaker-1  0:0 - 0:11
Speaker-2  0:12 - 0:40
Speaker-1  0:40 - 0:53
Speaker-2  0:53 - 0:57
