# Audio Segmentation for Processing in ASR

## Team: AI-NLP 

Author: Aline Rodrigues

In [None]:
from glob import glob
from pydub import AudioSegment, silence
import os

In [None]:
class Preprocessing():
    """
        Author:  Aline Rodrigues
        Created: 09/02/2022
        Execute preprocessing audio
    """
    
    def __init__(self, format = 'wav'):
        self.format = format
    
    
    def get_splits_from_wav_file(self, path_wav, path_files, min_silence_len=800, max_segs=20, min_segs=10, join_to_max=True):

        sound = AudioSegment.from_file(path_wav, format=self.format)
        
        def spt(snd, ms, ks=True):
            ret = []
            if snd.duration_seconds < min_segs:
                ret.append(snd)
            else:
                chunks = silence.split_on_silence(snd, min_silence_len = ms, silence_thresh = sound.dBFS-16, keep_silence=ks)
                for chunk in chunks:
                    if chunk.duration_seconds >= max_segs:
                        chunks2 = spt(chunk, round(ms*0.6))
                        ret.extend(chunks2)
                    else:
                        ret.append(chunk)
            return ret 
        
        chunks = spt(sound, min_silence_len, True)
        
        if join_to_max:
            ret = []
            actual = chunks[0]
            for chunk in chunks[1:]:
                if (actual.duration_seconds + chunk.duration_seconds) < max_segs:
                    actual += chunk
                else:
                    ret.append(actual)
                    actual = chunk
            ret.append(actual)
            chunks = ret

        file_name = path_wav.split('/')[-1].replace(f'.{self.format}', '')
        saved_files = []
        if os.path.isdir(path_files):
            for i, audio_chunk in enumerate(chunks, start=1):
                chunk_filename = os.path.join(path_files, f'{file_name}_{i:03}.{self.format}' )
                audio_chunk.export(chunk_filename, format=self.format)        
                saved_files.append(chunk_filename)
        else:
            print("É necessário informar uma pasta temporária pré-existente")

In [7]:
path_files = glob('dataset/audio/*')
print(path_files)

[]


In [8]:
output_files = 'dataset/segmentation'

In [None]:
format = 'wav'
preprocessing = Preprocessing(format)
duration_total = 0

for path in path_files:
    if 'palavras' not in path: continue
    if 'pseudopalavras' in path: continue
    os.mkdir(path.replace('audio', 'segmentation'))
    
    files = glob(f'{path}/*.{format}')
    print(f'{len(files)} audios')
    output = path.split('/')[-1]
    print(output)
    for path_file in files:
        preprocessing.get_splits_from_wav_file(path_file, output_files + '/' + output,  min_silence_len=800)

0 audios
palavras.zip
221 audios
palavras
