In [7]:
import torchaudio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import concurrent.futures
import os
from functools import wraps

def make_parallel(func):
    """
    Decorator that runs a one-argument function over a list of inputs in a thread pool.

    The decorated function is called with a single list in which each element is
    one input for the original function. The results come back as a list in the
    SAME ORDER as the inputs, so ``result[i]`` always belongs to ``lst[i]``.

    :param func: function
        The function to parallelize. It is called with exactly one positional
        argument (one element of the input list).
    :return: function
        A wrapper taking the list of inputs and returning the list of results.
    """

    @wraps(func)
    def wrapper(lst):
        """
        Apply ``func`` to every element of ``lst`` and collect the results.

        :param lst:
            The inputs of the function, one element per call. Any iterable is
            accepted; it is materialized into a list first.
        :return:
            A list with one result per input, in input order. An empty input
            yields an empty list. An exception raised by ``func`` propagates to
            the caller.
        """
        items = list(lst)
        if not items:
            return []
        if len(items) == 1:
            # A single input gains nothing from a thread pool, so run it serially.
            return [func(items[0])]

        # Upper bound on spawned threads: a multiple of the CPU count. Too many
        # threads only adds creation overhead.
        number_of_threads_multiple = 2  # Change this multiple according to your requirement
        # os.cpu_count() may return None when the count cannot be determined.
        cpu_count = os.cpu_count() or 1
        # Never create more threads than there are inputs.
        number_of_workers = min(len(items), int(cpu_count * number_of_threads_multiple))

        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_workers) as executer:
            # Executor.map yields results in input order. Collecting futures with
            # as_completed() would return them in completion order and break any
            # caller that pairs results with inputs by index.
            return list(executer.map(func, items))
    return wrapper

In [4]:
def get_sig_len(ex):
    """
    Return the length of the audio file at ``ex`` as samples divided by sample rate.

    The file is loaded with ``torchaudio.load``. The second axis of the loaded
    tensor is treated as the sample axis (assumes the default channels-first
    layout of ``torchaudio.load`` — TODO confirm), which makes the result a
    duration in seconds.

    :param ex:
        Path of the audio file to measure.
    :return:
        ``waveform.shape[1] / sample_rate``.
    """
    waveform, sample_rate = torchaudio.load(ex)
    # Report files whose sample rate is not 16000 so they can be inspected;
    # their length is still computed with their own rate.
    if sample_rate != 16000:
        print(ex)
    return waveform.shape[1] / sample_rate

In [5]:
# Keep only the first comma-separated field of each line in the dataset file;
# these values are later passed to get_sig_len one by one.
with open('dataset_file.txt', 'r') as f:
    tr_ex_list = [row.strip().partition(',')[0] for row in f]

In [8]:
# Run get_sig_len over every entry of tr_ex_list through the thread-pool wrapper.
# NOTE(review): later cells pair results[i] with line i of dataset_file.txt, so
# this list must be in the same order as tr_ex_list — confirm that make_parallel
# returns results in input order.
results = make_parallel(get_sig_len)(tr_ex_list)

In [9]:
# Number of computed lengths; make_parallel returns one result per input element.
len(results)

28539

In [10]:
import numpy as np
# Smallest value in results, i.e. the shortest length returned by get_sig_len.
np.min(results)

1.41

In [12]:
# Count the entries of results that are greater than 10.
np.sum(np.array(results) > 10)

23173

In [13]:
# Boolean mask over results: True where the computed length is greater than 10.
sec_utt_10 = np.array(results) > 10

In [15]:
# Sanity check: the mask has one entry per element of results.
sec_utt_10.shape

(28539,)

In [16]:
# Re-read the dataset file, this time keeping every line intact (including its
# trailing newline) so selected lines can be written back out unchanged.
with open('dataset_file.txt', 'r') as f:
    tr_ex_list_lines = list(f)

In [19]:
# Write out the original dataset lines whose mask entry is True.
# Lines are selected by position: sec_utt_10[idx] must describe line idx of
# dataset_file.txt for the output to contain the intended entries.
with open('dataset_file_10sec.txt', 'w') as f:
    f.writelines(
        tr_ex_list_lines[idx]
        for idx in range(len(results))
        if sec_utt_10[idx]
    )

