In [1]:
import soundfile as sf
import os
from tqdm import tqdm
import argparse

In [2]:
def split_audio(segments_file, output_dir):
    """Cut the segments listed in ``segments_file`` out of their source recordings.

    Each non-blank line of ``segments_file`` is whitespace-separated::

        <record_id> <wav_path> <start_sec> <end_sec> [transcript words ...]

    For every valid line, the frames from ``int(start_sec * sample_rate)`` up to
    (not including) ``int(end_sec * sample_rate)`` of ``wav_path`` are written to
    ``<output_dir>/<record_id>.wav``. Two index files are created in
    ``output_dir``:

    * ``wav.scp`` -- one ``<record_id> <segment wav path>`` line per saved segment
    * ``text``    -- one ``<record_id> <transcript>`` line per saved segment

    Blank lines are skipped silently. Lines with fewer than four fields or with
    non-numeric times are reported and skipped, so one bad line no longer aborts
    the whole file. Any error while reading or writing audio is reported and the
    segment is left out of both index files.

    Args:
        segments_file: Path to the UTF-8 text file describing the segments.
        output_dir: Directory that receives the segment wavs and index files;
            created if it does not exist.
    """
    # exist_ok avoids the check-then-create race when several processes start at once.
    os.makedirs(output_dir, exist_ok=True)

    with open(segments_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    wav_scp_path = os.path.join(output_dir, 'wav.scp')
    text_path = os.path.join(output_dir, 'text')

    # Sample rate per source recording, so each file header is inspected only once.
    sample_rates = {}

    with open(wav_scp_path, 'w', encoding='utf-8') as wav_scp_file, open(text_path, 'w', encoding='utf-8') as text_file:
        for line in tqdm(lines, desc="Processing", unit="wavs"):
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file).
                continue

            try:
                record_id = parts[0]
                wav_path = parts[1]
                start_time = float(parts[2])
                end_time = float(parts[3])
            except (IndexError, ValueError) as e:
                print(f"Skipping malformed segment line {line.strip()!r}: {e}")
                continue
            gt_text = ' '.join(parts[4:])

            try:
                sample_rate = sample_rates.get(wav_path)
                if sample_rate is None:
                    sample_rate = sf.info(wav_path).samplerate
                    sample_rates[wav_path] = sample_rate

                start_sample = int(start_time * sample_rate)
                end_sample = int(end_time * sample_rate)
                # Read only the requested frames instead of decoding the whole
                # recording for every segment; start/stop are clamped to the
                # file length the same way a Python slice would be.
                segment, _ = sf.read(wav_path, start=start_sample, stop=end_sample)

                output_path = os.path.join(output_dir, f"{record_id}.wav")
                sf.write(output_path, segment, sample_rate)
                # Index entries are written only after the wav was saved successfully.
                wav_scp_file.write(f"{record_id} {output_path}\n")
                text_file.write(f"{record_id} {gt_text}\n")
            except Exception as e:
                # Best effort: report the failure and continue with the next segment.
                print(f"Error in saving {record_id}: {e}")

In [None]:
import os
from multiprocessing import Pool, cpu_count


# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path.txt"
# base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_1_chunks"
# temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp"



# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_french.txt"
# base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/french"
# temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp"




# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_german.txt"
# base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/german"
# temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp"



segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_russian.txt"
base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_3_chunks/russian"
temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp/russian"

# Ensure temp directory exists
os.makedirs(temp_dir, exist_ok=True)

# Maximum number of segment lines handed to a single job.
chunk_size = 50000

# Read the segment list explicitly as UTF-8: split_audio() opens the chunk
# files written below with encoding='utf-8', and the transcripts are not
# ASCII, so relying on the locale's default encoding could fail or corrupt text.
with open(segments_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Ceiling division: a partial final chunk still gets its own job.
total_chunks = (len(lines) + chunk_size - 1) // chunk_size

# Prepare job arguments: one (chunk segments file, chunk output directory) pair per chunk.
jobs = []

for i in range(total_chunks):
    start = i * chunk_size
    end = start + chunk_size
    chunk_lines = lines[start:end]

    # Chunk numbering is 1-based in both the temp file and the output directory name.
    temp_path = os.path.join(temp_dir, f"chunk_{i+1}.txt")
    with open(temp_path, 'w', encoding='utf-8') as f:
        f.writelines(chunk_lines)

    output_dir = os.path.join(base_output_dir, f"chunk_{i+1}")
    os.makedirs(output_dir, exist_ok=True)
    jobs.append((temp_path, output_dir))

# Function wrapper for multiprocessing

def run_split_audio(args):
    """Adapter for ``Pool.map``: unpack one job tuple and run ``split_audio`` on it.

    Args:
        args: A two-item sequence ``(segments_file, output_dir)``; it is
            unpacked and forwarded positionally to ``split_audio``.
    """
    chunk_segments_file, chunk_output_dir = args
    split_audio(chunk_segments_file, chunk_output_dir)

In [13]:
# Inspect the (chunk segments file, chunk output directory) pairs queued for the worker pool.
jobs

[('/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp/spanish/chunk_1.txt',
  '/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_3_chunks/spanish/chunk_1'),
 ('/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp/spanish/chunk_2.txt',
  '/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_3_chunks/spanish/chunk_2')]

In [14]:
# Each job is processed by exactly one worker, so starting more workers than
# there are jobs only forks idle processes. The floor of 1 keeps Pool() valid
# when `jobs` is empty (Pool rejects a process count below 1).
n_workers = max(1, min(cpu_count(), len(jobs)))

with Pool(n_workers) as pool:
    # Blocks until every chunk has been processed.
    pool.map(run_split_audio, jobs)

Processing: 100%|██████████| 44006/44006 [1:24:15<00:00,  8.70wavs/s]
Processing: 100%|██████████| 50000/50000 [1:36:57<00:00,  8.60wavs/s]


In [19]:
import os

# Output directories of the two processed chunks.
folder_path_1 = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_3_chunks/spanish/chunk_1"
folder_path_2 = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_3_chunks/spanish/chunk_2"

# Total number of directory entries across both chunks. If these directories
# were produced by split_audio, each also holds its `wav.scp` and `text`
# index files, so the total is the segment count plus those entries.
sum(len(os.listdir(folder)) for folder in (folder_path_1, folder_path_2))

94010

### Check whether every audio file is present in the scratch folder

In [2]:
import pandas as pd
import os

In [3]:
# Table with one row per utterance; the `Path` column holds the audio file location.
scratch_csv_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/scratch_balanced_shuffled_data.csv"
scratch_data = pd.read_csv(scratch_csv_path)

In [4]:
# Display the loaded table (pandas abbreviates the middle rows when rendering).
scratch_data

Unnamed: 0,ID,Text,Path,class
0,English-American-0029-0029_002-O1-129573-129945,Is my um way of bringing back memories.,/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,english
1,0122-0122_005_phone-O1-001562-002014,"Antes as pessoas começaram por ir ao teatro, n...",/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,portuguese
2,0003-0003_005-O2-049763-050186,다은이 나오고 그날 유미랑 만나는 거 다은이랑 유미랑 만나는 거.,/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,korean
3,0110-0110_003_phone-O2-033851-034374,"sehr, sehr viel sehen in hmm weniger oder auch...",/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,german
4,English-American-0055-0055_004-O2-177931-178327,"Well, as I mentioned a minute ago, uh I travel...",/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,english
...,...,...,...,...
1398569,English-Filipino-00211-00211_002_phone-O2-1187...,And it's not just humans who appreciate flowers.,/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,english
1398570,English-American-0316-0316_002-O1-138564-138826,And it was so boring.,/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,english
1398571,English-British-0152-0152_002_phone-O2-010248-...,Qatar did an amazing job with the stadiums.,/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,english
1398572,English-American-0322-0322_005-O1-002448-002623,"I do, um.",/scratch/IITB/ai-at-ieor/23m1508/23m1508_backu...,english


In [None]:
from tqdm import tqdm

# Paths listed in the table that were not found on disk; kept in a list so the
# result can be inspected or reused after the scan instead of only being printed.
missing_paths = []

for path in tqdm(scratch_data['Path'], desc="Checking file paths"):
    # A non-string entry (e.g. NaN from an empty CSV cell) would make
    # os.path.exists raise TypeError and abort the scan, so treat it as missing.
    if not isinstance(path, str) or not os.path.exists(path):
        missing_paths.append(path)
        print(f"File does not exist: {path}")

# Explicit summary so a clean run is distinguishable from a run that checked nothing.
print(f"{len(missing_paths)} of {len(scratch_data)} paths are missing")

Checking file paths: 100%|██████████| 1398574/1398574 [06:14<00:00, 3734.71it/s]
