In [2]:
import os
import argparse

In [3]:
def process_txt_files(root_dir):
    """Collect segment records from every ``.txt`` file found under ``root_dir``.

    Each annotation line is expected to look like
    ``<start> <end> <speaker> <text ...>`` (whitespace separated); lines with
    fewer than four fields are skipped.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.

    Returns:
        A list of ``(segment_id, wav_path, start, end, text)`` tuples where
        ``segment_id`` is ``<record_id>-<speaker>-<start_cs>-<end_cs>``;
        ``record_id`` is the file's path relative to ``root_dir`` with the
        extension dropped and path separators replaced by ``-``; the two
        ``*_cs`` values are the rounded times in centiseconds, zero-padded to
        six digits; ``wav_path`` is the ``.txt`` path with a ``.wav``
        extension; ``start``/``end`` are rounded to two decimals.

    Raises:
        ValueError: If the first two fields of an accepted line are not numbers.
    """
    results = []
    for root, _, files in os.walk(root_dir):
        print(f"Processing directory: {root}")
        for file in files:
            if not file.endswith('.txt'):
                continue

            file_path = os.path.join(root, file)
            wav_path = file_path.rsplit('.', 1)[0] + '.wav'

            # Joining directory components and the file stem also covers the
            # "file directly in root_dir" case (a single component).
            path_parts = os.path.relpath(file_path, root_dir).split(os.sep)
            stem = os.path.splitext(path_parts[-1])[0]
            record_id = "-".join(path_parts[:-1] + [stem])

            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    fields = line.strip().split()
                    if len(fields) < 4:
                        continue

                    start_time_rounded = round(float(fields[0]), 2)
                    end_time_rounded = round(float(fields[1]), 2)
                    spkid = fields[2]
                    gt_text = ' '.join(fields[3:])

                    # round() before int(): 0.29 * 100 is 28.999999999999996,
                    # so a bare int() would truncate to 28 and yield a wrong ID.
                    start_time_str = str(int(round(start_time_rounded * 100))).zfill(6)
                    end_time_str = str(int(round(end_time_rounded * 100))).zfill(6)

                    combined_value = f"{record_id}-{spkid}-{start_time_str}-{end_time_str}"
                    results.append((combined_value, wav_path, start_time_rounded, end_time_rounded, gt_text))
    return results


def write_to_segments(segments_path, segment):
    """Write segment records to ``segments_path`` as UTF-8 text.

    Args:
        segments_path: Output file path; an existing file is overwritten.
        segment: Iterable of ``(seg_id, wav_path, start, end, gt_text)``
            tuples. Each one becomes a single space-separated line.
    """
    rows = (
        f"{seg_id} {wav_path} {start} {end} {gt_text}\n"
        for seg_id, wav_path, start, end, gt_text in segment
    )
    with open(segments_path, 'w', encoding='utf-8') as out:
        out.writelines(rows)

In [None]:
# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_1_extracted/MLC-SLM_Workshop-Training_Set_1/data"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_2_extracted/data/French"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_french.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_2_extracted/data/German"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_german.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_2_extracted/data/Italian"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_italian.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_2_extracted/data/Japanese"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_japanese.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_2_extracted/data/Korean"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_korean.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_3_extracted/MLC-SLM_Workshop-Training_Set_3/data/Portuguese"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_portuguese.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_3_extracted/MLC-SLM_Workshop-Training_Set_3/data/Russian"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_russian.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_3_extracted/MLC-SLM_Workshop-Training_Set_3/data/Spanish"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_spanish.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_3_extracted/MLC-SLM_Workshop-Training_Set_3/data/Thai"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_thai.txt"


# data_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/set_3_extracted/MLC-SLM_Workshop-Training_Set_3/data/Vietnamese"
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_vietnamese.txt"

# =========================================================================


### Segment extraction for the development set

In [15]:
# Build the segment list for the Vietnamese development set and write it to a
# segments file (one line per segment).
data_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Vietnamese"

segments_path = '/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/vietnamese_segments_path.txt'

segment = process_txt_files(data_dir)

write_to_segments(segments_path, segment)

Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Vietnamese


This section is for the Evaluation Set.

Copy the evaluation data to the scratch directory.

In [4]:
import shutil
import os

# Source and destination paths
src_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/Evaluation_Set_1"
dst_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1"  # Adjust username if needed

# Make sure destination parent directory exists
os.makedirs(os.path.dirname(dst_dir), exist_ok=True)

# Copy the entire folder tree. dirs_exist_ok=True (Python 3.8+) lets the copy
# proceed when dst_dir already exists; files with the same name are overwritten.
shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)  # dirs_exist_ok=True for overwriting existing contents

print(f"Copied to: {dst_dir}")


Copied to: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1


In [55]:
import os
import argparse

def process_txt_files(root_dir):
    """Collect evaluation segment records from every ``.txt`` file under ``root_dir``.

    Evaluation annotation lines carry no transcript: only lines with exactly
    three whitespace-separated fields, ``<start> <end> <speaker>``, are used
    and a fixed placeholder string is stored as the text.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.

    Returns:
        A list of ``(segment_id, wav_path, start, end, text)`` tuples where
        ``segment_id`` is ``<record_id>-<speaker>-<start_cs>-<end_cs>``;
        ``record_id`` is the file's path relative to ``root_dir`` with the
        extension dropped and path separators replaced by ``-``; the two
        ``*_cs`` values are the rounded times in centiseconds, zero-padded to
        six digits; ``wav_path`` is the ``.txt`` path with a ``.wav``
        extension; ``start``/``end`` are rounded to two decimals.

    Raises:
        ValueError: If the first two fields of an accepted line are not numbers.
    """
    placeholder_text = "This is a test text"  # Placeholder for actual text
    results = []
    for root, _, files in os.walk(root_dir):
        print(f"Processing directory: {root}")
        for file in files:
            if not file.endswith('.txt'):
                continue

            file_path = os.path.join(root, file)
            wav_path = file_path.rsplit('.', 1)[0] + '.wav'

            # Joining directory components and the file stem also covers the
            # "file directly in root_dir" case (a single component).
            path_parts = os.path.relpath(file_path, root_dir).split(os.sep)
            stem = os.path.splitext(path_parts[-1])[0]
            record_id = "-".join(path_parts[:-1] + [stem])

            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    fields = line.strip().split()
                    if len(fields) != 3:
                        continue

                    start_time_rounded = round(float(fields[0]), 2)
                    end_time_rounded = round(float(fields[1]), 2)
                    spkid = fields[2]

                    # round() before int(): 0.29 * 100 is 28.999999999999996,
                    # so a bare int() would truncate to 28 and yield a wrong ID.
                    start_time_str = str(int(round(start_time_rounded * 100))).zfill(6)
                    end_time_str = str(int(round(end_time_rounded * 100))).zfill(6)

                    combined_value = f"{record_id}-{spkid}-{start_time_str}-{end_time_str}"
                    results.append((combined_value, wav_path, start_time_rounded, end_time_rounded, placeholder_text))

    return results

def write_to_segments(segments_path, segment):
    """Dump segment tuples to a UTF-8 text file, one segment per line.

    Args:
        segments_path: File to create or overwrite.
        segment: Iterable of ``(seg_id, wav_path, start, end, gt_text)``
            tuples; the five values are written separated by single spaces.
    """
    with open(segments_path, 'w', encoding='utf-8') as handle:
        for record in segment:
            seg_id, wav_path, start, end, gt_text = record
            row = f"{seg_id} {wav_path} {start} {end} {gt_text}\n"
            handle.write(row)

In [61]:
# Collect the segment records for the whole evaluation set (all language
# sub-folders are visited because process_txt_files walks recursively).
data_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1"

segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/evaluation_paths.txt"

segment = process_txt_files(data_dir)

Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1
Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1/French
Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1/Korean
Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1/Russian
Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1/Italian
Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1/English
Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Workshop-Evaluation_Set_1/English/Indian
Processing directory: /scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/MLC-SLM_Works

In [62]:
# Number of segment records collected by the previous cell.
len(segment)

29299

In [63]:
# Persist the collected segment records to segments_path.
write_to_segments(segments_path, segment)

### Splitting the audio files into segments and saving them in a given folder

In [1]:
import soundfile as sf
import os
from tqdm import tqdm
import argparse

In [2]:
def split_audio(segments_file, output_dir):
    """Cut each segment listed in ``segments_file`` out of its source wav file.

    Every line of ``segments_file`` is expected to look like
    ``<record_id> <wav_path> <start_sec> <end_sec> <text ...>`` (whitespace
    separated). For each line the samples in ``[start, end)`` are written to
    ``<output_dir>/<record_id>.wav``, and two index files are produced in
    ``output_dir``: ``wav.scp`` (``<record_id> <segment wav path>``) and
    ``text`` (``<record_id> <text>``).

    Blank lines are ignored and lines with fewer than four fields are reported
    and skipped. A failure on one segment (unreadable audio, bad number, write
    error) is printed and processing continues with the next segment; such a
    segment gets no entry in ``wav.scp`` or ``text``.

    Args:
        segments_file: Path of the UTF-8 segments file to read.
        output_dir: Directory receiving the segment wavs and index files; it
            is created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    with open(segments_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    wav_scp_path = os.path.join(output_dir, 'wav.scp')
    text_path = os.path.join(output_dir, 'text')

    # Most recently decoded source file. Reusing it means a recording is read
    # once per run of consecutive segments instead of once per segment.
    cached_wav_path = None
    audio_data = None
    sample_rate = None

    with open(wav_scp_path, 'w', encoding='utf-8') as wav_scp_file, open(text_path, 'w', encoding='utf-8') as text_file:
        for line in tqdm(lines, desc="Processing", unit="wavs"):
            parts = line.strip().split()
            if len(parts) < 4:
                if parts:
                    print(f"Skipping malformed segment line: {line.strip()!r}")
                continue

            record_id = parts[0]
            wav_path = parts[1]
            gt_text = ' '.join(parts[4:])

            try:
                start_time = float(parts[2])
                end_time = float(parts[3])

                if wav_path != cached_wav_path:
                    # If the read raises, the cache variables keep their old
                    # (consistent) values and the next line retries the read.
                    audio_data, sample_rate = sf.read(wav_path)
                    cached_wav_path = wav_path

                start_sample = int(start_time * sample_rate)
                end_sample = int(end_time * sample_rate)
                segment = audio_data[start_sample:end_sample]
                output_path = os.path.join(output_dir, f"{record_id}.wav")
                sf.write(output_path, segment, sample_rate)
                wav_scp_file.write(f"{record_id} {output_path}\n")
                text_file.write(f"{record_id} {gt_text}\n")
            except Exception as e:
                # Deliberate best-effort: one bad segment must not abort the run.
                print(f"Error in saving {record_id}: {e}")

In [9]:
# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path.txt"
# output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/only_audio_set1"
# split_audio(segments_path, output_dir)

In [3]:
# Split the English development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/english_segments_path.txt"
output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/English_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 9540/9540 [14:50<00:00, 10.72wavs/s]


In [5]:
# Split the German development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/german_segments_path.txt"
output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/German_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 2008/2008 [03:09<00:00, 10.57wavs/s]


In [6]:
# Split the Italian development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/italian_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Italian_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 1853/1853 [03:06<00:00,  9.91wavs/s]


In [7]:
# Split the Japanese development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/japanese_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Japanese_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 2816/2816 [03:05<00:00, 15.20wavs/s]


In [8]:
# Split the Korean development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/korean_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Korean_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 2340/2340 [03:10<00:00, 12.26wavs/s]


In [9]:
# Split the Portuguese development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/portuguese_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Portuguese_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 1824/1824 [01:58<00:00, 15.44wavs/s]


In [10]:
# Split the Russian development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/russian_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Russian_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 1687/1687 [03:12<00:00,  8.74wavs/s]


In [11]:
# Split the Spanish development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/spanish_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Spanish_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 1716/1716 [03:44<00:00,  7.65wavs/s]


In [12]:
# Split the Thai development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/thai_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Thai_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 2095/2095 [04:06<00:00,  8.48wavs/s]


In [13]:
# Split the Vietnamese development-set recordings into per-segment wav files.
segments_path = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/data_dev_csv/vietnamese_segments_path.txt"

output_dir = "/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/audio_development_set/Vietnamese_split"

split_audio(segments_path, output_dir)

Processing: 100%|██████████| 1977/1977 [02:39<00:00, 12.42wavs/s]


In [3]:
import os
# Count the directory entries in the evaluation audio folder.
# NOTE(review): if this folder was produced by split_audio, the count also
# includes its wav.scp and text index files, not only segment wavs - confirm.
len(os.listdir("/scratch/IITB/ai-at-ieor/23m1508/23m1508_backup/Evaluation_Set_1/All_Evaluation_audio"))

29301

In [10]:
# import os

# folder_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/only_audio_set1"
# all_files = os.listdir(folder_path)
# print("Total files:", len(all_files))


In [54]:
import os
from multiprocessing import Pool, cpu_count


# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path.txt"
# base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_1_chunks"
# temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp"



# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_french.txt"
# base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/french"
# temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp"




# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_german.txt"
# base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/german"
# temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp"



# segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_csv/segments_path_russian.txt"
# base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/russian"
# temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp/russian"



segments_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/data_dev_csv/vietnamese_segments_path.txt"
base_output_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_dev_set_chunks/vietnamese"
temp_dir = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp/vietnamese"


# Ensure temp directory exists
os.makedirs(temp_dir, exist_ok=True)

# Maximum number of segment lines handed to one split_audio job.
chunk_size = 50000

# Read lines. write_to_segments writes the segments file as UTF-8 and
# split_audio reads its input as UTF-8, so the encoding is pinned here too
# instead of relying on the locale default.
with open(segments_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Ceiling division: number of chunks needed to hold every line (0 if empty).
total_chunks = -(-len(lines) // chunk_size)

# Prepare job arguments: one (chunk segments file, output directory) pair per chunk
jobs = []

for i in range(total_chunks):
    start = i * chunk_size
    end = start + chunk_size
    chunk_lines = lines[start:end]

    temp_path = os.path.join(temp_dir, f"chunk_{i+1}.txt")
    with open(temp_path, 'w', encoding='utf-8') as f:
        f.writelines(chunk_lines)

    output_dir = os.path.join(base_output_dir, f"chunk_{i+1}")
    os.makedirs(output_dir, exist_ok=True)
    jobs.append((temp_path, output_dir))

# Function wrapper for multiprocessing
def run_split_audio(args):
    """Adapter for ``Pool.map``: unpack one job tuple and run ``split_audio``.

    Args:
        args: A two-item sequence ``(segments file path, output directory)``.
    """
    chunk_file, chunk_output_dir = args
    split_audio(chunk_file, chunk_output_dir)

In [55]:
# Show the prepared (chunk segments file, output directory) job tuples.
jobs

[('/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/temp/vietnamese/chunk_1.txt',
  '/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_dev_set_chunks/vietnamese/chunk_1')]

In [56]:
# Process all chunk jobs in a pool with one worker per CPU; each job runs
# split_audio on one chunk file through run_split_audio.
with Pool(cpu_count()) as pool:
    pool.map(run_split_audio, jobs)

Processing: 100%|██████████| 1977/1977 [02:39<00:00, 12.41wavs/s]


In [6]:
# wav_scp_path = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_1_chunks/chunk_1/wav.scp"

# with open(wav_scp_path, 'r', encoding='utf-8') as f:
#     lines = f.readlines()
    
    
#     for line in lines:
#         print(line)
#         break

In [102]:
import os

# Total number of directory entries across the three Italian chunk folders.
# NOTE(review): if these folders were produced by split_audio, each one also
# holds a wav.scp and a text file, which are included in the total - confirm.
folder_path_1 = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/italian/chunk_1"

folder_path_2 = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/italian/chunk_2"

folder_path_3 = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/italian/chunk_3"

len(os.listdir(folder_path_1)) + len(os.listdir(folder_path_2)) + len(os.listdir(folder_path_3))

104886

In [2]:
import os

# Total number of directory entries across the two Russian chunk folders.
# NOTE(review): if these folders were produced by split_audio, each one also
# holds a wav.scp and a text file, which are included in the total - confirm.
folder_path_1 = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/russian/chunk_1"

folder_path_2 = "/home/IITB/ai-at-ieor/23m1508/Shivam_23M1508/Interspeech/data/audio_set_2_chunks/russian/chunk_2"

len(os.listdir(folder_path_1)) + len(os.listdir(folder_path_2))

97593