# Imports and transforms the datasets
Date : 29/07/2020

Transformation : Add the 'seconds' column to the .csv

In [1]:
import torch

In [2]:
# useful imports
from IPython.display import SVG, Audio, display
import subprocess
import pandas as pd
import numpy as np
from tqdm import tqdm
import librosa
%matplotlib inline
import os

# voicemap imports
from voicemap.datasets import LibriSpeech, SpeakersInTheWild, CommonVoice, TCOF 
from config import PATH, DATA_PATH

## Parameters

In [3]:
n_seconds = None # Use only the samples longer than 'n_seconds'
downsampling = 4
sampling_rate = 16000
recalculate_duration = False
export_datasets_csv = True

## Load data

In [4]:
librispeech_subsets = ['train-clean-100', 'train-clean-360']
unseen_subset = 'dev-clean'
sitw_unseen = 'eval'
tcof_subsets = ['train', 'test']

sampling_rate_ratio_common_voice = int(CommonVoice.base_sampling_rate / LibriSpeech.base_sampling_rate)

# Create datasets entities
librispeech = LibriSpeech(librispeech_subsets, n_seconds, downsampling, stochastic=True, pad=False)
librispeech_unseen = LibriSpeech(unseen_subset, n_seconds, downsampling, stochastic=True, pad=False)
sitw = SpeakersInTheWild('dev', 'enroll-core', n_seconds, downsampling, stochastic=True, pad=False)
sitw_unseen = SpeakersInTheWild('eval', 'enroll-core', n_seconds, downsampling, stochastic=True, pad=False)
common_voice = CommonVoice('fr', 'train', n_seconds, int(downsampling * sampling_rate_ratio_common_voice), stochastic=True, pad=False)
common_voice_unseen = CommonVoice('fr', 'test', n_seconds, int(downsampling * sampling_rate_ratio_common_voice), stochastic=True, pad=False)
tcof = TCOF(tcof_subsets, n_seconds, downsampling, stochastic=True, pad=False)
tcof_unseen = TCOF('dev', n_seconds, downsampling, stochastic=True, pad=False)
# group them by train/validation subset
train_datasets = [librispeech, sitw, common_voice, tcof]
val_datasets = [librispeech_unseen, sitw_unseen, common_voice_unseen, tcof_unseen]

In LibriSpeech there are 1172 speakers
Finished indexing data. 132553 usable files found.
In LibriSpeech there are 40 speakers
Finished indexing data. 2703 usable files found.
In SitW there are 119 speakers
In SitW there are 180 speakers
In CommonVoice train there are 1754 speakers
In CommonVoice test there are 4022 speakers
In TCOF  ['train', 'test']  there are 654 speakers
In TCOF  dev  there are 649 speakers


## Calculate (or load) duration for samples of a dataset

In [19]:
# Calculate the duration of the file at 'filepath'
def get_duration(filepath):
    if filepath.endswith('.mp3'):
        return float(librosa.get_duration(librosa.core.load(filepath)[0]))
    else:
        return float(subprocess.check_output(['sox', '--i','-D', filepath], stderr=subprocess.STDOUT).decode('utf-8').strip())
# Calculate the durations for all the samples of the dataset
def calculate_durations(dataset):
    print(f"Dataset is of type {type(dataset)}")
    dataframe_dataset = dataset.df.loc[:, ['speaker_id', 'filepath']]
    if 'seconds' in dataset.df.columns:
        dataframe_dataset['seconds'] = dataset.df['seconds']
    else:
        dataframe_dataset['seconds'] = np.nan
    
    max_range = len(dataframe_dataset)
    for i in tqdm(range(max_range)):
        if (np.isnan(dataframe_dataset.loc[i, "seconds"])):
            filepath = dataframe_dataset.loc[i].filepath
            file_duration = get_duration(filepath)
            dataframe_dataset.loc[i, "seconds"] = file_duration
    return dataframe_dataset

# Add 'dataset_name' column to identify from which dataset the sample comes from
def addDatasetColumn(dataframe):
    # dataset_name is the 6th column in '/home/profenpoche/voicemap/data/dataset_name'
    dataframe.loc[:,'dataset_name'] = dataframe['filepath'].str.split('/').str[5]
    return dataframe

# Add the 'dataset_name' in front of the 'speaker_id' to ensure it is unique
def makeIdUnique(dataframe):
    dataframe.loc[:,'speaker_id'] = dataframe['dataset_name'] + '_' + dataframe['speaker_id'].apply(str)
    return dataframe

# get train/val/global csv, or calculate it if necessary
def get_durations(subset_name="train", datasets=[]):
    csv_save_filename = f"{DATA_PATH}/{subset_name}_durations_per_speaker.csv"
    if os.path.isfile(csv_save_filename) and not recalculate_duration:
        subset_durations_per_speaker = pd.read_csv(csv_save_filename)
        print(f"Loaded from {csv_save_filename}")
    else:
        subset_durations_per_speaker = pd.DataFrame()
        for dataset in datasets:
            dataframe_dataset = calculate_durations(dataset)
            addDatasetColumn(dataframe_dataset)
            makeIdUnique(dataframe_dataset)
            subset_durations_per_speaker = pd.concat([subset_durations_per_speaker, dataframe_dataset], sort=False)
    return subset_durations_per_speaker

## Calculate train, val and global csv

In [20]:
val_durations = get_durations("val", val_datasets)
train_durations = get_durations("train", train_datasets)

Loaded from /home/profenpoche/voicemap/data/val_durations_per_speaker.csv
Loaded from /home/profenpoche/voicemap/data/train_durations_per_speaker.csv


- Merging both csv into a global csv

In [25]:
global_durations = pd.concat([train_durations, val_durations], sort=False)

## Save csv (& load)

In [22]:
# Save csv if we have recalculate durations
if recalculate_duration:
    train_durations_per_speaker.to_csv(f"{DATA_PATH}/train_durations_per_speaker.csv", index=False)
    test_durations_per_speaker.to_csv(f"{DATA_PATH}/val_durations_per_speaker.csv", index=False)
    global_durations.to_csv(f"{DATA_PATH}/global_durations_per_speaker.csv", index=False)

In [23]:
# Read csv to ensure they are correct
df_train = pd.read_csv(f"{DATA_PATH}/train_durations_per_speaker.csv")
df_test = pd.read_csv(f"{DATA_PATH}/val_durations_per_speaker.csv")
df_global = pd.read_csv(f"{DATA_PATH}/global_durations_per_speaker.csv")

In [24]:
df_global

Unnamed: 0,speaker_id,filepath,seconds,dataset_name
0,TCOF_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.216000,TCOF
1,TCOF_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.547000,TCOF
2,TCOF_2,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,1.010000,TCOF
3,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.505000,TCOF
4,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,1.805000,TCOF
5,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,2.329000,TCOF
6,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.810000,TCOF
7,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,1.091000,TCOF
8,TCOF_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,2.622000,TCOF
9,TCOF_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,0.906000,TCOF


## Add 'seconds' column into the datasets.csv

In [26]:
def addSecondsColumn(dataset_df, dataset_name=""):
    # Generate {filepath -> seconds} dataframe from df_global
    if dataset_name != "":
        df_fixed_dataset = df_global.loc[df_global['dataset_name'] == dataset_name]
    else:
        df_fixed_dataset = df_global
    df_fixed_dataset = df_fixed_dataset.loc[:, ['filepath','seconds']]
    df_fixed_dataset = df_fixed_dataset.set_index('filepath')
    # Merge the dataset with the above dataframe to add the 'seconds' corresponding to the 'filepath' found
    if not 'seconds' in dataset_df.columns:
        dataset_df = dataset_df.join(df_fixed_dataset, on='filepath')
    return dataset_df

### Librispeech
'seconds' are already given in the original dataset

### Sitw
*TODO : Add 'seconds' column to allow filtering on minimum duration.*

For now the min duration is 6 seconds so it is not useful while we train the model with a duration less or equal than 6 seconds

In [38]:
df_global.loc[df_global['dataset_name'] == 'sitw'].seconds.min()

6.0

### Common Voice

In [27]:
languages = ["fr"]
subsets = ["train", "test"]

Load datasets

In [28]:
common_voice_datasets = []
for language in languages:
    for subset in subsets:
        common_voice_datasets.append(CommonVoice(language, subset, n_seconds, int(downsampling * sampling_rate_ratio_common_voice), stochastic=True, pad=False))

In CommonVoice train there are 1754 speakers
In CommonVoice test there are 4022 speakers


Add 'seconds' column and export as csv

In [29]:
for common_voice in common_voice_datasets:
    new_df = addSecondsColumn(common_voice.df, "CommonVoice")
    if export_datasets_csv:
        new_df.to_csv(f"{DATA_PATH}/CommonVoice/{common_voice.language}/{common_voice.subset}_transformed.csv", index=False)
        print(f"Saved to '{DATA_PATH}/CommonVoice/{common_voice.language}/{common_voice.subset}_transformed.csv'")

Saved to '/home/profenpoche/voicemap/data/CommonVoice/fr/train_transformed.csv'
Saved to '/home/profenpoche/voicemap/data/CommonVoice/fr/test_transformed.csv'


In [30]:
for language in languages:
    for subset in subsets:
        df = pd.read_csv(DATA_PATH+ f'/CommonVoice/{language}/{subset}_transformed.csv')
        print(len(df), df.columns)

130602 Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accent', 'speaker_id', 'filepath', 'index', 'id', 'seconds'],
      dtype='object')
14760 Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accent', 'speaker_id', 'filepath', 'index', 'id', 'seconds'],
      dtype='object')


### TCOF

Run voicemap/datasets/ImportTCOF.py to import the dataset and generate the csv with filepaths first

In [31]:
subsets = ['train', 'test', 'dev']

Add 'seconds' column and export as csv

In [32]:
for subset in subsets:
    source = pd.read_csv(DATA_PATH + f'/TCOF/TCOF_{subset}.csv',
                      delimiter=',', names=['speaker_id', 'filepath', 'transcript', 'wav_filesize', 'seconds'],header=1)
    # Remove 'seconds' column
    source = source.loc[:, source.columns != 'seconds']
    new_df = addSecondsColumn(source, "TCOF")
    if export_datasets_csv:
        new_df.to_csv(f"{DATA_PATH}/TCOF/TCOF_{subset}.csv", index=False)
        print(f"Saved to '/TCOF/TCOF_{subset}.csv'")

Saved to '/TCOF/TCOF_train.csv'
Saved to '/TCOF/TCOF_test.csv'
Saved to '/TCOF/TCOF_dev.csv'


In [33]:
for subset in subsets:
    df = pd.read_csv(DATA_PATH+ f'/TCOF/TCOF_3{subset}.csv')
    print(len(df), df.columns)
df

94891 Index(['speaker_id', 'filepath', 'transcript', 'wav_filesize', 'seconds'], dtype='object')
11860 Index(['speaker_id', 'filepath', 'transcript', 'wav_filesize', 'seconds'], dtype='object')
11861 Index(['speaker_id', 'filepath', 'transcript', 'wav_filesize', 'seconds'], dtype='object')


Unnamed: 0,speaker_id,filepath,transcript,wav_filesize,seconds
0,ema_chiara_cp_proinf_spk569_id_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,vous m'avez donné votre nom tout à l'heure,64718,2.020
1,ema_chiara_cp_proinf_spk569_id_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,ema alors est ce que vous connaissez,70510,2.201
2,ema_chiara_cp_proinf_spk569_id_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,d'accord,15566,0.484
3,ema_chiara_cp_proinf_spk569_id_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,et en quoi elles sont différentes,84590,2.641
4,ema_chiara_cp_proinf_spk570_id_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,elle a,10574,0.328
5,ema_chiara_cp_proinf_spk569_id_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,est ce que tu serais la même petite fille,106414,3.323
6,ema_chiara_cp_proinf_spk569_id_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,en quoi,69006,2.154
7,ema_chiara_cp_proinf_spk569_id_0,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,mais tu serais pas la même petite fille,54830,1.711
8,ema_chiara_cp_proinf_spk570_id_1,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,si elle était moi,45038,1.405
9,ema_chiara_cp_proinf_spk571_id_2,/home/profenpoche/voicemap/data/TCOF/Enfants/e...,hé c'est ma tata qui s'appelle aussi christine,67694,2.113
