📊 Final Results:

---

📚 Total books processed: 388

✅ Successful downloads: 336

❌ Failed downloads: 52

📈 Success rate: 86.6%


In [1]:
from pathlib import Path

# Root folder whose immediate sub-directories are listed as audio directories below.
# Both paths are relative, so they resolve against the kernel's working directory.
audio_save_path = Path("../data/audio/raw/")
# Root folder that is searched recursively for *.pdf text files below.
text_save_path = Path("../data/text/pdf/")

In [2]:
# Immediate sub-directories of the audio root (plain files at that level are ignored).
audio_dirs = [d for d in audio_save_path.iterdir() if d.is_dir()]
# Every regular file ending in .pdf anywhere below the text root; built as a list in a
# single pass so the name is never bound to a one-shot generator.
text_files = [f for f in text_save_path.glob("**/*.pdf") if f.is_file()]

# Counts plus a small preview of each list.
len(text_files), len(audio_dirs), text_files[:5], audio_dirs[:5]

(387,
 344,
 [PosixPath('../data/text/pdf/sach-den-ve-tinh-than-doanh-nhan.pdf'),
  PosixPath('../data/text/pdf/ping-hanh-trinh-ra-bien-lon.pdf'),
  PosixPath('../data/text/pdf/suc-manh-cua-su-tu-te.pdf'),
  PosixPath('../data/text/pdf/chu-be-rac-roi.pdf'),
  PosixPath('../data/text/pdf/chiec-la-cuoi-cung.pdf')],
 [PosixPath('../data/audio/raw/suc-manh-cua-su-khich-le'),
  PosixPath('../data/audio/raw/bi-mat-ho-ca-than'),
  PosixPath('../data/audio/raw/dieu-binh-di-thong-thai'),
  PosixPath('../data/audio/raw/khuc-nam-ai'),
  PosixPath('../data/audio/raw/bi-mat-cua-vua-solomon')])

In [3]:
def get_dir_size(path: Path, print_size: bool = False) -> int:
  """
  Add up the sizes of all regular files found recursively under `path`.

  Args:
      path: Directory to walk.
      print_size: When True, also print the total converted to MB (bytes / 1024**2).

  Returns:
      Total size in bytes.
  """
  size = 0
  for entry in path.rglob("*"):
    # Directories and other non-file entries contribute nothing to the total.
    if not entry.is_file():
      continue
    size += entry.stat().st_size

  if print_size:
    print(f"Directory: {path}, Size: {size / (1024 * 1024):.2f} MB")
  return size


# Print and return the total size in bytes of the audio root, then the text root.
tuple(
  get_dir_size(root, print_size=True)
  for root in (audio_save_path, text_save_path)
)

Directory: ../data/audio/raw, Size: 56943.42 MB
Directory: ../data/text/pdf, Size: 638.70 MB


(59709501712, 669720595)

In [4]:
def matching_audio_text(text_files, audio_dirs, remove_empty_dirs=False):
  """
  Pair each text file with the audio directory that carries the same name.

  Both inputs are sorted by their matching key and walked in lockstep (a
  sorted merge), so keys are expected to be unique within each input.

  Args:
      text_files: Iterable of Path objects for text files; keyed on `stem`
          (file name without its final suffix).
      audio_dirs: Iterable of Path objects for audio directories; keyed on the
          full directory `name`, so names containing a dot are not truncated.
      remove_empty_dirs: When True, a matched audio directory whose files total
          0 bytes is deleted from disk together with its text file, and the
          pair is left out of the result. This is destructive.

  Returns:
      List of (text_file, audio_dir) tuples ordered by name.
  """
  import shutil  # local import keeps this cell runnable on its own

  matched_pairs = []
  text_files = sorted(text_files, key=lambda f: f.stem)
  audio_dirs = sorted(audio_dirs, key=lambda d: d.name)

  i_text, i_audio = 0, 0
  while i_text < len(text_files) and i_audio < len(audio_dirs):
    text_file = text_files[i_text]
    audio_dir = audio_dirs[i_audio]
    text_key, audio_key = text_file.stem, audio_dir.name

    if text_key == audio_key:
      # Check the flag first so the directory is only walked when removal is requested.
      if remove_empty_dirs and get_dir_size(audio_dir) == 0:
        print(f"Removing empty directory: {audio_dir}")
        # Delete the directory before the text file so a failure here does not
        # leave the text file already gone. rmtree is used because a 0-byte
        # directory may still contain empty files or sub-directories, which
        # Path.rmdir() refuses to remove.
        try:
          shutil.rmtree(audio_dir)
        except FileNotFoundError:
          pass  # already gone; nothing left to remove
        # Remove the text file that matched the empty audio directory
        text_file.unlink(missing_ok=True)
      else:
        matched_pairs.append((text_file, audio_dir))
      i_text += 1
      i_audio += 1
    elif text_key < audio_key:
      # Text file has no audio counterpart; skip it.
      i_text += 1
    else:
      # Audio directory has no text counterpart; skip it.
      i_audio += 1

  print(f"Matched pairs: {len(matched_pairs)}")
  return matched_pairs


# NOTE: remove_empty_dirs=True makes this call destructive. For a matched pair whose audio
# directory totals 0 bytes, the function tries to delete both the text file and the directory.
matched_pairs = matching_audio_text(text_files, audio_dirs, remove_empty_dirs=True)

Matched pairs: 344


In [5]:
# Preview the first five (text_file, audio_dir) pairs.
matched_pairs[:5]

[(PosixPath('../data/text/pdf/10-bai-hoc-tren-chiec-khan-an.pdf'),
  PosixPath('../data/audio/raw/10-bai-hoc-tren-chiec-khan-an')),
 (PosixPath('../data/text/pdf/10-bi-quyet-thanh-cong-cua-nhung-dien-gia-mc-tai-nang-nhat-the-gioi.pdf'),
  PosixPath('../data/audio/raw/10-bi-quyet-thanh-cong-cua-nhung-dien-gia-mc-tai-nang-nhat-the-gioi')),
 (PosixPath('../data/text/pdf/10-dieu-khac-biet-nhat-giua-ke-giau-va-nguoi-ngheo.pdf'),
  PosixPath('../data/audio/raw/10-dieu-khac-biet-nhat-giua-ke-giau-va-nguoi-ngheo')),
 (PosixPath('../data/text/pdf/10-dieu-tao-nen-so-phan.pdf'),
  PosixPath('../data/audio/raw/10-dieu-tao-nen-so-phan')),
 (PosixPath('../data/text/pdf/10-nghich-ly-cuoc-song.pdf'),
  PosixPath('../data/audio/raw/10-nghich-ly-cuoc-song'))]

In [6]:
import pandas as pd
from typing import List, Tuple
from pathlib import Path
from tts_data_pipeline import Book

In [7]:
def update_metadata(
  old_df: pd.DataFrame,
  matched_pairs: List[Tuple[Path, Path]],
  name_new_df: str = "after_download_metadata",
  save_json: bool = True,
  save_csv: bool = True,
) -> pd.DataFrame:
  """
  Build a refreshed metadata table for the given text/audio pairs.

  Each pair is looked up in `old_df` by the text file's stem (compared against
  the "name" column). Its Book record then gets the on-disk paths and sizes.
  Pairs with no matching row are skipped and reported in a single warning.

  Args:
      old_df: Existing metadata; must contain a "name" column.
      matched_pairs: (text_file, audio_dir) tuples to process.
      name_new_df: Base name (without extension) of the CSV that is written.
      save_json: Write one JSON file per book under ../data/metadata/book.
      save_csv: Write the combined table to ../data/metadata/<name_new_df>.csv.

  Returns:
      DataFrame of the updated records, restricted to the columns of `old_df`.
  """
  # Index the old rows by name; if a name repeats, the last row wins.
  rows_by_name = {}
  for _, row in old_df.iterrows():
    rows_by_name[row["name"]] = row

  # The JSON output folder is created up front, even when no pair ends up being saved.
  if save_json:
    json_output_dir = Path("../data/metadata/book")
    json_output_dir.mkdir(parents=True, exist_ok=True)

  records = []
  missing_names = []
  for text_file, audio_dir in matched_pairs:
    name = text_file.stem
    row = rows_by_name.get(name)
    if row is None:
      missing_names.append(name)
      continue

    book = Book.from_dict(row)

    # Point the record at the downloaded files, then store their sizes in bytes.
    book.update_paths(audio_path=audio_dir, text_path=text_file)
    audio_bytes = get_dir_size(audio_dir)
    text_bytes = text_file.stat().st_size
    book.update_size(audio_size=audio_bytes, text_size=text_bytes)

    if save_json:
      book.save_json(path=json_output_dir / f"{name}.json")

    records.append(book.to_dict())

  # One summary warning instead of a line per missing book.
  if missing_names:
    print(
      f"Warning: No metadata found for {len(missing_names)} items: {missing_names[:5]}"
    )
    if len(missing_names) > 5:
      print(f"... and {len(missing_names) - 5} more")

  # Build the frame once from the collected records.
  new_df = pd.DataFrame(records, columns=old_df.columns)

  if save_csv:
    csv_output_dir = Path("../data/metadata")
    csv_output_dir.mkdir(parents=True, exist_ok=True)
    new_df.to_csv(csv_output_dir / f"{name_new_df}.csv", index=False)

  return new_df

In [8]:
# Load the existing metadata CSV; update_metadata looks rows up by its "name" column.
old_metadata_df = pd.read_csv("../data/metadata/able_download_metadata_less_5_hour.csv")

In [9]:
# save_json=False skips the per-book JSON files. save_csv keeps its default (True), so the
# result is still written to ../data/metadata/after_download_metadata.csv.
after_download_metadata_df = update_metadata(
  old_metadata_df, matched_pairs, save_json=False
)
after_download_metadata_df.head()

Unnamed: 0,id,name,text_path,audio_path,narrator,duration,author,text_url,audio_url,alignment_path,text_download_url,audio_download_url,sample_rate,quality,word_count,num_sentences,audio_size,text_size
0,a00985dd,10-bai-hoc-tren-chiec-khan-an,../data/text/pdf/10-bai-hoc-tren-chiec-khan-an...,../data/audio/raw/10-bai-hoc-tren-chiec-khan-an,"{'id': '5af6cbeb', 'name': 'Trần Huỳnh Phương ...",2.5039,Don Failla,['https://thuviensachpdf.com/10-bai-hoc-tren-c...,https://sachnoiviet.net/sach-noi/10-bai-hoc-tr...,,https://cloud.thuviensachpdf.com/pdf/vi/10-bai...,['https://archive.org/download/10-bai-hoc-tren...,,,,,144734808,347556
1,2c0c221e,10-bi-quyet-thanh-cong-cua-nhung-dien-gia-mc-t...,../data/text/pdf/10-bi-quyet-thanh-cong-cua-nh...,../data/audio/raw/10-bi-quyet-thanh-cong-cua-n...,"[{'id': '047c61b1', 'name': 'Ái Hòa', 'dialect...",4.2244,Carmine Gallo,['https://thuviensachpdf.com/10-bi-quyet-thanh...,https://sachnoiviet.net/sach-noi/10-bi-quyet-t...,,https://cloud.thuviensachpdf.com/pdf/vi/10-bi-...,['https://archive.org/download/10-bi-02/10-bi-...,,,,,121662956,1017767
2,a0c7d3cd,10-dieu-khac-biet-nhat-giua-ke-giau-va-nguoi-n...,../data/text/pdf/10-dieu-khac-biet-nhat-giua-k...,../data/audio/raw/10-dieu-khac-biet-nhat-giua-...,"{'id': '134c54b3', 'name': 'Trần Huỳnh Phương ...",2.0014,Keith Cameron Smith,['https://thuviensachpdf.com/10-dieu-khac-biet...,https://sachnoiviet.net/sach-noi/10-dieu-khac-...,,https://cloud.thuviensachpdf.com/pdf/vi/10-die...,['https://archive.org/download/truyenngan_2019...,,,,,57600157,2764042
3,33797050,10-dieu-tao-nen-so-phan,../data/text/pdf/10-dieu-tao-nen-so-phan.pdf,../data/audio/raw/10-dieu-tao-nen-so-phan,"[{'id': '97bdf585', 'name': 'Ái Hòa', 'dialect...",3.7058,Maria Shriver,['https://thuviensachpdf.com/10-dieu-tao-nen-s...,https://sachnoiviet.net/sach-noi/10-dieu-tao-n...,,https://cloud.thuviensachpdf.com/pdf/vi/10-die...,['https://archive.org/download/audiobook-10die...,,,,,98223048,464391
4,7685d9c3,10-nghich-ly-cuoc-song,../data/text/pdf/10-nghich-ly-cuoc-song.pdf,../data/audio/raw/10-nghich-ly-cuoc-song,"[{'id': '2b4c5c3b', 'name': 'Ái Hòa', 'dialect...",3.2611,Kent M. Keith Ph. D.,['https://thuviensachpdf.com/10-nghich-ly-cuoc...,https://sachnoiviet.net/sach-noi/10-nghich-ly-...,,https://cloud.thuviensachpdf.com/pdf/vi/10-ngh...,['https://archive.org/download/truyenngan_2019...,,,,,89579482,308308
