# Standardization
Date : 31/07/2020

Load all the datasets from the global csv. Select a fixed number of samples for every speaker (ex : 60 sec => 3s x 20 samples). You will have to load the datasets with 'standardized=True' to use the standardization saved for each dataset (ex : TCOF_test.csv => TCOF_test_standardized.csv)

In [1]:
# useful imports
from IPython.display import SVG, Audio, display
import subprocess
import pandas as pd
import numpy as np
from tqdm import tqdm
import librosa
%matplotlib inline
import os
from matplotlib import pyplot as plt

# voicemap imports
from voicemap.datasets import LibriSpeech, SpeakersInTheWild, CommonVoice, TCOF, ClassConcatDataset, SpectrogramDataset, DatasetFolder 
from config import PATH, DATA_PATH

## Parameters

In [2]:
n_seconds = 3 # Use only the samples longer than 'n_seconds'
downsampling = 4
sampling_rate = 16000
subset = "train"

## Load data

In [3]:
df_train = pd.read_csv(f"{DATA_PATH}/train_durations_per_speaker.csv")
df_test = pd.read_csv(f"{DATA_PATH}/val_durations_per_speaker.csv")
df_global = pd.read_csv(f"{DATA_PATH}/global_durations_per_speaker.csv")
if (subset == "train"):
    base_df = df_train
elif (subset == "val"):
    base_df = df_test
else:
    base_df = df_global

In [4]:
base_df

Unnamed: 0,speaker_id,filepath,seconds,dataset_name
0,TCOF_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.216000,TCOF
1,TCOF_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.547000,TCOF
2,TCOF_2,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,1.010000,TCOF
3,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.505000,TCOF
4,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,1.805000,TCOF
5,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,2.329000,TCOF
6,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.810000,TCOF
7,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,1.091000,TCOF
8,TCOF_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,2.622000,TCOF
9,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.906000,TCOF


## Prepare data
Add 'n_samples' column to count the number of samples for each speaker

In [5]:
source_df = base_df.loc[base_df['seconds'] > n_seconds]
df = source_df.loc[:, ['speaker_id', 'dataset_name']]
df = df.set_index('speaker_id')
df = df.loc[~df.index.duplicated(keep='first')]

dfGrouped = source_df.groupby(['speaker_id']).sum()
dfCountAudio = source_df.groupby(['speaker_id']).count().filepath
speakers_duration = dfGrouped.join(df)
speakers_duration = speakers_duration.join(dfCountAudio)
speakers_duration = speakers_duration.rename(columns={'filepath': 'n_samples'})
speakers_duration

Unnamed: 0_level_0,seconds,dataset_name,n_samples
speaker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CommonVoice_0003cd846a882cd7b17df3ce941508d16da430599223ef39a4e22226c640afe6ae1e8675e0b5f96175a6de4d322dcf0e67c258fa02b2b61cda05354fdddc5fbb,432.145488,CommonVoice,68
CommonVoice_000de8f59af10409a1f8e499c2f50b6a5adf2d9ad0f18395af0893345481b8311b1a094787aac8959fc50961f3e94deeacd005df8d478fd261cbf7b508602f0a,1636.423447,CommonVoice,338
CommonVoice_001b79ad83c8bb7c5591f059b31611157620e20f647eab28720c1dc40baac089bc9d1551e0ea5e0e4bdef30a97afdbb7a1f2e6d325deab3434b9848c0ad95f1e,82.440317,CommonVoice,18
CommonVoice_003f6010fddda25afa548668d296b4f4d91b8fff30ad32b1b55d1b79028cb9dc611723e109c879fcd83e273f918bf82ef438c4157652c5106e089d01663c02ce,170.040635,CommonVoice,35
CommonVoice_004a38a23c7b674dd17aa3479d50ef39fd484c38347423e241901eaec25ada7c337d0b2700dd5cd3b6c3a1862ca33336911687b283380c3bd15aa29b0406bee0,46.752154,CommonVoice,10
CommonVoice_004f7e01f66f710d82b83c614bdc2ac9aba075d95d205258cf28ef8faa2d2a4b62e25e87162c01a20d7e51be22d198b0b55611347e8937c3b679d3517e329ceb,25.536100,CommonVoice,5
CommonVoice_00511485e714ef2b6eea14ef62f03641ade85f7ce56fba114063c38d8a67208b581b027bbe385f52d51f5b91387a669fbdb5c5b2819942664b2e9852eecc288e,238.296961,CommonVoice,43
CommonVoice_006015274fb2a0e44ffd95b6624a52ebf90b65f3e9beacd08f30f03a2dfb7f04b5007740d80dcfcb7ee2916cf8836311fa9bdc5ab0636ffc36ad14407d75e51b,190.128844,CommonVoice,40
CommonVoice_0065fa8b8804fdd89c9ce2b8133b3c45a8fc49676c5547605e9286a4653a8e30599f95f4df577827e6631e52b242624995efba9b33c70da3cbbc80feb9c5ba9b,155.520499,CommonVoice,27
CommonVoice_006fc824e4e9eab126422b6e22272222dd7d17aad3fbebc55e872a6282d7dfbc3bb7b8c15f63e6f1538ed795245456cb77a3e2e3250933319324c2a55fde33ae,104.520317,CommonVoice,19


In [6]:
speakers_duration.n_samples.describe()

count    3697.000000
mean       74.078442
std       210.576436
min         1.000000
25%        18.000000
50%        42.000000
75%       110.000000
max      7495.000000
Name: n_samples, dtype: float64

Dans un premier temps on garde les locuteurs ayant plus de 15 samples de 3 secondes. Ce qui représente plus de 75% des locuteurs.

UPDATE : Avec 15 échantillons on obtient un modèle avec une perte de 1.22 et une précision de 22%. On augmente donc le nombre d'échantillons requis à 30 (en éliminant les locuteurs ayant moins de 30 échantillons, on verra plus tard un moyen de les conserver)

## Standardize the samples for each speaker

In [7]:
n_samples_min = 30

In [8]:
speakers_duration = speakers_duration.loc[speakers_duration.n_samples >= n_samples_min]
print(f"{len(speakers_duration)} speakers remain")

2199 speakers remain


In [9]:
# get the samples > n_seconds for the remaining speakers
samples_remaining_speakers = pd.merge(source_df, speakers_duration, on="speaker_id")
# sort the samples by speakers and duration
samples_remaining_speakers = samples_remaining_speakers.sort_values(['speaker_id', 'seconds_x'], ascending=[True, True])
print(f"{len(samples_remaining_speakers.loc[samples_remaining_speakers['dataset_name_x'] == 'LibriSpeech'])} samples remain in LibriSpeech")
print(f"{len(samples_remaining_speakers.loc[samples_remaining_speakers['dataset_name_x'] == 'sitw'])} samples remain in sitw")
print(f"{len(samples_remaining_speakers.loc[samples_remaining_speakers['dataset_name_x'] == 'CommonVoice'])} samples remain in CommonVoice")
print(f"{len(samples_remaining_speakers.loc[samples_remaining_speakers['dataset_name_x'] == 'TCOF'])} samples remain in TCOF")

129567 samples remain in LibriSpeech
0 samples remain in sitw
100853 samples remain in CommonVoice
21199 samples remain in TCOF


In [10]:
# Extract the 'n_samples' first samples for a given speaker
def getSpeakerFirstSamples(df_samples, speaker_id, n_samples):
    df_res = df_samples.loc[df_samples['speaker_id'] == speaker_id].head(n_samples)
    return df_res    

In [11]:
# for each speaker add his 'n_samples' longest samples to the 'filtered_samples' dataframe
filtered_samples = pd.DataFrame()
for speaker_id in tqdm(speakers_duration.index):
    speaker_samples = getSpeakerFirstSamples(samples_remaining_speakers, speaker_id, n_samples_min)
    filtered_samples = pd.concat([filtered_samples, speaker_samples], sort=False)
filtered_samples = filtered_samples.loc[:, ['speaker_id', 'filepath', 'seconds_x', 'dataset_name_x']]
filtered_samples

100%|██████████| 2199/2199 [00:38<00:00, 57.55it/s]


Unnamed: 0,speaker_id,filepath,seconds_x,dataset_name_x
162003,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,3.816009,CommonVoice
161989,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.224036,CommonVoice
161981,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.296009,CommonVoice
161987,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.296009,CommonVoice
161988,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.296009,CommonVoice
161978,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.416009,CommonVoice
161999,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.416009,CommonVoice
162037,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.416009,CommonVoice
162009,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.536009,CommonVoice
162038,CommonVoice_0003cd846a882cd7b17df3ce941508d16d...,/home/profenpoche/voicemap/data/CommonVoice/fr...,4.584036,CommonVoice


## Generate standardized csv for datasets

In [12]:
for dataset_name in ['LibriSpeech', 'CommonVoice', 'TCOF']:
    dataset_samples = filtered_samples.loc[filtered_samples['dataset_name_x'] == dataset_name]
    dataset_samples = dataset_samples.rename(columns={'seconds_x': 'seconds'})
    dataset_samples = dataset_samples.loc[:, ['speaker_id', 'filepath', 'seconds']]
    dataset_samples.to_csv(f"{DATA_PATH}/{dataset_name}/{dataset_name}_{subset}_standardized.csv", index=False)
    print(f"Saved to '{DATA_PATH}/{dataset_name}/{dataset_name}_{subset}_standardized.csv'")

Saved to '/home/profenpoche/voicemap/data/LibriSpeech/LibriSpeech_train_standardized.csv'
Saved to '/home/profenpoche/voicemap/data/CommonVoice/CommonVoice_train_standardized.csv'
Saved to '/home/profenpoche/voicemap/data/TCOF/TCOF_train_standardized.csv'


## Display to ensure it is correct

In [13]:
dataset_name = "LibriSpeech"
a = pd.read_csv(f"{DATA_PATH}/{dataset_name}/{dataset_name}_{subset}_standardized.csv")
a

Unnamed: 0,speaker_id,filepath,seconds
0,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,3.060000
1,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,3.690000
2,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,4.420000
3,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,5.150000
4,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,5.190000
5,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,5.225000
6,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,6.200000
7,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,6.450000
8,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,6.560000
9,LibriSpeech_100,/home/profenpoche/voicemap/data/LibriSpeech/tr...,6.805000


In [14]:
librispeech = LibriSpeech("train", n_seconds, downsampling, use_standardized=True, stochastic=True, pad=False)
common_voice = CommonVoice('fr', 'train', n_seconds, 12, use_standardized=True, stochastic=True, pad=False)
tcof = TCOF("train", n_seconds, downsampling, use_standardized=True, stochastic=True, pad=False)


LibriSpeech loaded from '/home/profenpoche/voicemap/data/LibriSpeech/LibriSpeech_train_standardized.csv'
In LibriSpeech there are 1168 speakers
CommonVoice loaded from '/home/profenpoche/voicemap/data/CommonVoice/CommonVoice_train_standardized.csv'
In CommonVoice train there are 715 speakers
TCOF loaded from '/home/profenpoche/voicemap/data/TCOF/TCOF_train_standardized.csv'
In TCOF  train  there are 316 speakers


In [15]:
common_voice.df.loc[common_voice.df.index == 15].filepath.values

array(['/home/profenpoche/voicemap/data/CommonVoice/fr/clips/common_voice_fr_19363050.mp3'],
      dtype=object)

In [35]:
display(Audio(data=tcof[359][0], rate=4000))