In [57]:
import csv
import os
import re
from abc import ABC, abstractmethod
from glob import glob

#import torch
import pandas as pd

class DataBase(ABC):
    """Abstract interface shared by the dataset wrappers in this file.

    Subclasses have to provide both ``make_tidy`` and ``parse_data``; until
    they do, the class cannot be instantiated.
    """

    @abstractmethod
    def make_tidy(self):
        """Abstract hook; no behaviour is defined at this level."""

    @abstractmethod
    def parse_data(self) -> pd.DataFrame:
        """Abstract hook annotated as returning a ``pd.DataFrame``."""
    
class MLS(DataBase):
    """MLS dataset stored as one directory per split (train / test / dev).

    Each split directory is read as holding a header-less, tab-separated
    ``transcripts.txt`` (audio code, then transcript) and its audio files under
    ``audio/<first id>/<second id>/<audio_code>.flac``.
    """

    ext = ".flac"
    basename = "multi_speech_librespeech"

    # An audio code carries three underscore-separated numeric ids; the first
    # two name the nested folders that hold the file. Raw string so that ``\d``
    # is a regex class rather than an (invalid) string escape.
    _AUDIO_CODE_RE = re.compile(r"(\d+)_(\d+)_(\d+)")

    def __init__(self, data_train_dir, data_test_dir, data_dev_dir):
        """Remember the root directory of each split."""
        self.train_path = data_train_dir
        self.test_path = data_test_dir
        self.dev_path = data_dev_dir

    def _create_path(self, path_type: str, audio_code: str):
        """Build the audio file path of ``audio_code`` inside ``path_type``.

        Args:
            path_type: root directory of the split the clip belongs to.
            audio_code: clip identifier as found in ``transcripts.txt``.

        Returns:
            ``<path_type>/audio/<id1>/<id2>/<audio_code><ext>``.

        Raises:
            ValueError: if ``audio_code`` does not contain three
                underscore-separated numeric ids.
        """
        code = str(audio_code)
        match = self._AUDIO_CODE_RE.search(code)
        if match is None:
            # Fail with the offending value instead of an AttributeError on None.
            raise ValueError(
                f"Malformed MLS audio code {audio_code!r}: expected '<id>_<id>_<id>'"
            )
        return os.path.join(path_type, "audio", match.group(1), match.group(2), code + self.ext)

    def _parse_type(self, path_type: str, type_: str) -> pd.DataFrame:
        """Load the transcripts of one split.

        Args:
            path_type: root directory of the split.
            type_: split label written to the ``type`` column.

        Returns:
            DataFrame with the columns ``audio_path``, ``label`` and ``type``.
        """
        path_label = os.path.join(path_type, "transcripts.txt")

        df = pd.read_csv(
            path_label,
            sep="\t",
            header=None,
            names=["audio_code", "label"],
            # Keep codes and transcripts as text even when they look numeric.
            dtype=str,
            # The file is plain tab-separated text: a quote character inside a
            # transcript must not be interpreted as a CSV field delimiter.
            quoting=csv.QUOTE_NONE,
        )
        audio_paths = df["audio_code"].apply(lambda code: self._create_path(path_type, code))
        return df.assign(type=type_, audio_path=audio_paths).filter(["audio_path", "label", "type"])

    def make_tidy(self):
        """Placeholder: currently does nothing."""
        pass

    def parse_data(self) -> pd.DataFrame:
        """Concatenate the train, test and dev splits into a single frame.

        Returns:
            DataFrame with ``audio_path``, ``label``, ``type`` and a constant
            ``base`` column set to ``basename``.
        """
        splits = (
            (self.train_path, "train"),
            (self.test_path, "test"),
            (self.dev_path, "dev"),
        )
        frames = [self._parse_type(split_dir, split) for split_dir, split in splits]
        return pd.concat(frames, ignore_index=True).assign(base=self.basename)

class CommonVoice(DataBase):
    """Common Voice dataset rooted at ``main_path``.

    Split metadata is read from ``train.tsv``, ``test.tsv`` and
    ``validated.tsv`` inside ``main_path``; clip names are resolved against
    ``main_path/clips``.
    """

    ext = ".mp3"
    basename = "common_voice"

    def __init__(self, main_path):
        """Derive the split files and the clips folder from ``main_path``."""
        self.train_path = os.path.join(main_path, "train.tsv")
        self.test_path = os.path.join(main_path, "test.tsv")
        # NOTE(review): the "dev" split is mapped to validated.tsv, not dev.tsv;
        # confirm this is intended.
        self.dev_path = os.path.join(main_path, "validated.tsv")
        self.audios_path = os.path.join(main_path, "clips")

    def _create_path(self, audio_name):
        """Return the path of ``audio_name`` inside the clips folder."""
        return os.path.join(self.audios_path, audio_name)

    def _parse_type(self, type_):
        """Load the metadata of one split.

        Args:
            type_: one of ``"train"``, ``"test"`` or ``"dev"``.

        Returns:
            DataFrame with the columns ``audio_path``, ``label`` and ``type``.

        Raises:
            ValueError: if ``type_`` is not a known split.
        """
        split_files = {
            "train": self.train_path,
            "test": self.test_path,
            "dev": self.dev_path,
        }
        if type_ not in split_files:
            raise ValueError(
                f"Unknown split {type_!r}; expected one of {sorted(split_files)}"
            )

        # Read the split's own TSV here rather than relying on a `df` that
        # happens to exist in the notebook namespace.
        df = pd.read_csv(
            split_files[type_],
            sep="\t",
            usecols=["path", "sentence"],
            dtype=str,
            # Sentences may contain quote characters; treat them as plain text.
            quoting=csv.QUOTE_NONE,
        )
        return (df.assign(type=type_, audio_path=df["path"].apply(self._create_path))
                  .rename(columns={"sentence": "label"})
                  .filter(["audio_path", "label", "type"]))

    def make_tidy(self):
        """Placeholder: currently does nothing."""
        pass

    def parse_data(self) -> pd.DataFrame:
        """Concatenate the train, test and dev splits into a single frame.

        Returns:
            DataFrame with ``audio_path``, ``label``, ``type`` and a constant
            ``base`` column set to ``basename``.
        """
        frames = [self._parse_type(split) for split in ("train", "test", "dev")]
        return pd.concat(frames, ignore_index=True).assign(base=self.basename)

In [58]:
# `ext` is a class attribute, so read it from the class: the visible cells
# never bind a notebook-level `mls` (it is only a local inside
# CreateTidyDataset.__init__), which makes `mls.ext` fail on a fresh kernel.
MLS.ext

'.flac'

In [17]:
# Path of a single Common Voice (pt) clip, assembled from corpus root,
# clips folder and file name.
audio_path = os.path.join(
    "data/common_voice/cv-corpus-7.0-2021-07-21/pt",
    "clips",
    "common_voice_pt_27026042.mp3",
)

In [18]:
from pydub import AudioSegment

# files
# NOTE(review): `src` is assigned but never read in this cell; the conversion
# below loads `audio_path` instead. Confirm whether `src` is still needed.
src = "transcript.mp3"
dst = "test.wav"

# convert mp3 to wav: load the clip at `audio_path` and export it to `dst`
sound = AudioSegment.from_mp3(audio_path)
sound.export(dst, format="wav")

<_io.BufferedRandom name='test.wav'>

In [76]:
from pathlib import Path

class CreateTidyDataset:
    """Drive the per-dataset audio conversion into ``<output_root>/<basename>``.

    Args:
        databases: optional iterable of ``(database, force)`` pairs. Each
            ``database`` must expose ``basename`` and ``ext``; ``force=True``
            requests conversion even when the output folder already exists.
            When omitted, the MLS and Common Voice Portuguese datasets are
            used, both with ``force=False``.
        output_root: folder under which one sub-folder per dataset is created.
    """

    def __init__(self, databases=None, output_root=os.path.join("data", "tidy")):
        if databases is None:
            mls = MLS(data_train_dir="data/mls_portuguese/train",
                      data_test_dir="data/mls_portuguese/test",
                      data_dev_dir="data/mls_portuguese/dev")

            cov = CommonVoice(main_path="data/common_voice/cv-corpus-7.0-2021-07-21/pt")

            databases = [(mls, False), (cov, False)]

        self.databases = list(databases)
        self.output_root = output_root

    def mp3_convert(self, path, database):
        """Placeholder for the ``.mp3`` conversion; currently returns ``None``."""
        pass

    def flac_convert(self, path, database):
        """Placeholder for the ``.flac`` conversion; currently returns ``None``."""
        pass

    def create_folder(self, path):
        """Create ``path`` (including parents) if needed and return it."""
        Path(path).mkdir(parents=True, exist_ok=True)
        return path

    def is_folder_created(self, path, force: bool):
        """Tell whether ``path`` already exists as a directory.

        With ``force`` set the answer is always ``False`` so that the caller
        redoes the work.
        """
        return (not force) and Path(path).is_dir()

    def converter_audio(self):
        """Run the matching converter for every dataset that still needs it.

        A dataset is skipped when its output folder already exists and it is
        not forced.

        Returns:
            List with the return value of each converter that ran, in the
            order of ``self.databases``.

        Raises:
            KeyError: if a dataset's ``ext`` has no registered converter.
        """
        converters = {".mp3": self.mp3_convert, ".flac": self.flac_convert}

        # Collected across all datasets (not reset on each iteration).
        results = []
        for database, force in self.databases:
            target = os.path.join(self.output_root, database.basename)
            if self.is_folder_created(target, force):
                continue

            # Resolve the converter before touching the disk: an unsupported
            # extension must not leave an empty folder behind, otherwise the
            # next run would treat the dataset as already converted.
            convert = converters[database.ext]
            target = self.create_folder(target)
            results.append(convert(target, database))

        return results

# Build the conversion driver with its default MLS and Common Voice wrappers.
tidy = CreateTidyDataset()

In [77]:
# Run the conversion step for each registered dataset whose output folder
# does not exist yet (or that is flagged as forced).
tidy.converter_audio()