## Imports and Logins

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install librosa soundfile datasets torchcodec huggingface_hub[cli] -q

In [None]:
import os
from getpass import getpass

# Take the Hugging Face token from the HF_TOKEN environment variable when it is set; otherwise
# prompt for it without echoing. This keeps real credentials out of the saved notebook.
HF_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# `!export ...` only affects a throw-away subshell, so it never reached this kernel or any later
# `!` command. Setting os.environ does both; HF_TOKEN is the name huggingface_hub looks up.
os.environ["HF_TOKEN"] = HF_token

In [None]:
# Log this session in to the Hugging Face Hub with the token defined in the previous cell.
from huggingface_hub import login
login(token=HF_token)

In [None]:
from datasets import load_dataset

# bhasaanuvaad = load_dataset("ai4bharat/NPTEL", "indic2en", split="hindi", streaming=True)


## URLs for Parquet files

In [None]:
import requests

# Hub API endpoint for the parquet export of the NPTEL dataset, config "indic2en", split "marathi".
# NOTE(review): a later cell copies the results to a Drive folder named "hindi" — confirm which
# language this run is meant to process.
url = "https://huggingface.co/api/datasets/ai4bharat/NPTEL/parquet/indic2en/marathi"

# Make request. The timeout stops a stalled connection from hanging the cell indefinitely
# (requests waits forever when no timeout is given).
resp = requests.get(url, headers={"Authorization": f"Bearer {HF_token}"}, timeout=30)
resp.raise_for_status()  # ensure no error

# Parse JSON; the cells below treat the result as a list of parquet file URLs.
urls = resp.json()


In [None]:
# Display the parsed API response for inspection.
urls

## Select only 3 files at a time due to RAM constraints

In [None]:
# Keep only the first three entries; the download cell below processes whatever is left in `urls`.
# NOTE: this rebinds `urls`, so the request cell above must be re-run to get the full list back.
urls=urls[:3]

## Downloading Files

In [None]:
import requests
from tqdm import tqdm
import os

def download_parquet_files(url_list, token, output_dir="parquets", timeout=60):
    """
    Download parquet files from the given URLs, authenticating with a Hugging Face token.

    Each file is streamed to ``<filename>.part`` and renamed to its final name only after the
    whole response body has been written. A download that fails or is interrupted therefore
    never leaves a truncated file under the final name, which the "already downloaded" check
    would otherwise skip on every later run.

    Args:
        url_list (list): List of parquet file URLs.
        token (str): Hugging Face API token, sent as a Bearer token.
        output_dir (str): Directory to save downloaded files (created if missing).
        timeout (float | tuple | None): Forwarded to ``requests.get``; bounds how long to wait
            for the connection and for each read, not the total download time.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any exception raised while streaming is re-raised after the partial file is removed.
    """
    os.makedirs(output_dir, exist_ok=True)
    auth_headers = {"Authorization": f"Bearer {token}"}

    for url in url_list:
        filename = url.split("/")[-1]  # get filename from URL (e.g. 1.parquet)
        out_path = os.path.join(output_dir, filename)
        part_path = out_path + ".part"  # scratch file; promoted to out_path on success

        # Skip if already downloaded (only complete files ever exist under out_path)
        if os.path.exists(out_path):
            print(f"Skipping already downloaded {filename}")
            continue

        try:
            with requests.get(url, headers=auth_headers, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                total_size = int(r.headers.get("Content-Length", 0))
                with open(part_path, "wb") as f, tqdm(
                    desc=filename,
                    # Unknown length -> None, so tqdm shows a plain counter instead of a 0-byte bar
                    total=total_size or None,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as bar:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            bar.update(len(chunk))
            # Atomic promotion: the final name appears only once the body is complete.
            os.replace(part_path, out_path)
        except BaseException:
            # BaseException so that a notebook "interrupt" (KeyboardInterrupt) also cleans up.
            if os.path.exists(part_path):
                os.remove(part_path)
            raise
        print(f"✅ Download complete: {filename}")



In [None]:
# Download the selected files into the default "parquets" directory; files already present are skipped.
download_parquet_files(urls, HF_token)


Skipping already downloaded 0.parquet
Skipping already downloaded 1.parquet
Skipping already downloaded 2.parquet


## Merging DFs in batches

In [None]:
import pandas as pd
import os
import gc
from tqdm import tqdm

def _extract_audio_files(df, audio_folder, batch_count, audio_index):
    """Write each row's embedded audio bytes to ``audio_folder`` and return ``(paths, next_index)``."""
    audio_paths = []
    # Iterate the column directly (much cheaper than iterrows) and size the bar from the frame.
    for cell in tqdm(df["chunked_audio_filepath"], total=len(df)):
        if isinstance(cell, dict) and cell.get("bytes") is not None:
            audio_filename = f"audio_{batch_count}_{audio_index}.wav"
            audio_index += 1
            audio_out_path = os.path.join(audio_folder, audio_filename)

            # Bytes are written verbatim; the ".wav" name is not checked against the content.
            with open(audio_out_path, "wb") as f:
                f.write(cell["bytes"])

            audio_paths.append(audio_out_path)
        else:
            # No usable audio payload for this row.
            audio_paths.append(None)
    return audio_paths, audio_index


def _save_merged_batch(batch, output_folder, batch_count, csv_encoding):
    """Concatenate the frames in ``batch`` and write them as ``merged_batch_<batch_count>.csv``."""
    merged = pd.concat(batch, ignore_index=True)
    out_file = os.path.join(output_folder, f"merged_batch_{batch_count}.csv")
    merged.to_csv(out_file, index=False, encoding=csv_encoding)
    print(f"Saved {out_file} with {len(merged)} rows")


def merge_parquet_files(parquet_folder="parquets",
                        output_folder="merged_chunks",
                        audio_folder="audio_files",
                        selected_columns=None,
                        chunk_size=3,
                        csv_encoding="utf-8-sig"):
    """
    Convert parquet files into audio files on disk plus merged CSV metadata, in batches.

    Every ``*.parquet`` file in ``parquet_folder`` is read in ``sorted()`` (lexicographic) order.
    If a frame has a ``chunked_audio_filepath`` column, each cell that is a dict with non-None
    ``"bytes"`` is written to ``audio_folder`` as ``audio_<batch>_<n>.wav``, where ``n`` is a
    counter that keeps increasing across all files. The column is then replaced by an
    ``audio_filepath`` column holding the written path, or None for rows without audio bytes.
    After every ``chunk_size`` files the accumulated frames are concatenated and saved as
    ``merged_batch_<k>.csv`` in ``output_folder``; leftover frames form a final, smaller batch.

    Args:
        parquet_folder (str): Directory containing the input parquet files.
        output_folder (str): Directory for the merged CSV files (created if missing).
        audio_folder (str): Directory for the extracted audio files (created if missing).
        selected_columns (list | None): Columns to load from each parquet file; None loads all.
        chunk_size (int): Number of parquet files merged into one CSV.
        csv_encoding (str): Encoding used for every CSV. Previously only the final leftover
            batch was written as "utf-8-sig" while full batches used the pandas default; all
            batches now share one encoding.

    Returns:
        None. Progress and the number of CSV files actually written are printed.
    """
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(audio_folder, exist_ok=True)

    batch = []
    batch_count = 0  # number of CSV files written so far; also the index of the next one
    audio_index = 0  # unique counter for audio filenames

    parquet_names = sorted(
        name for name in os.listdir(parquet_folder) if name.endswith(".parquet")
    )

    for file_count, filename in enumerate(parquet_names, start=1):
        file_path = os.path.join(parquet_folder, filename)
        print(f"Loading {filename}...")

        # Load only needed columns
        df = pd.read_parquet(file_path, columns=selected_columns)

        # Move the embedded audio out of the frame so only lightweight columns stay in memory
        if "chunked_audio_filepath" in df.columns:
            audio_paths, audio_index = _extract_audio_files(
                df, audio_folder, batch_count, audio_index
            )
            df = df.drop(columns=["chunked_audio_filepath"])
            df["audio_filepath"] = audio_paths

        batch.append(df)

        # Merge and save every chunk_size files
        if file_count % chunk_size == 0:
            _save_merged_batch(batch, output_folder, batch_count, csv_encoding)
            batch.clear()  # drops the only references to the frames
            gc.collect()
            batch_count += 1

    # Handle leftovers
    if batch:
        _save_merged_batch(batch, output_folder, batch_count, csv_encoding)
        batch.clear()
        gc.collect()
        batch_count += 1

    print(f"✅ Done. Created {batch_count} merged CSV files in {output_folder}")



# Run the merge with the default folders, loading only these three columns from each parquet file;
# "chunked_audio_filepath" is the column whose embedded bytes are written out as audio files.
merge_parquet_files(selected_columns=["text","chunked_audio_filepath","en_text"])

Loading 0.parquet...


100%|██████████| 57892/57892 [00:33<00:00, 1751.04it/s]


Loading 1.parquet...


100%|██████████| 57892/57892 [01:01<00:00, 934.09it/s] 


Loading 2.parquet...


100%|██████████| 57892/57892 [02:11<00:00, 439.09it/s]


Saved merged_chunks/merged_batch_0.csv with 173676 rows
Loading 3.parquet...


  8%|▊         | 4447/57892 [00:08<01:38, 542.94it/s]


KeyboardInterrupt: 

In [None]:
# Copy the extracted audio files and merged CSVs to Google Drive. Paths are relative to the
# current working directory, so this relies on the Drive mount being reachable as ./drive.
# NOTE(review): the destination folder is named "hindi", but the parquet URL requested earlier in
# this notebook is for the "marathi" split — confirm the intended language.
!mkdir -p drive/MyDrive/S2S/hindi
!rsync -ah --info=progress2 audio_files/ drive/MyDrive/S2S/hindi/audio_files/
!rsync -ah --info=progress2 merged_chunks/ drive/MyDrive/S2S/hindi/merged_chunks/

          7.45G  99%    1.63MB/s    1:12:29 (xfr#231246, to-chk=0/231569)
              0   0%    0.00kB/s    0:00:00 (xfr#0, to-chk=0/3)


In [None]:
# !rm -rf drive/MyDrive/S2S/hindi/audio_files
!rm -rf drive/MyDrive/S2S/hindi/merged_chunks
!rm -rf drive/MyDrive/S2S/hindi/paraquets


^C
