In [None]:
# Setup: Install Dependencies & Configure Paths

# Note: For local execution, install dependencies manually:
# pip install torch torchaudio transformers sentencepiece scikit-learn joblib soundfile tqdm
# For Modal execution, see modal_phase1.py

import os

# Local directories for Bible audio files
ANTIGO_TESTAMENTO_DIR = "/Users/joao/Desktop/work/shema/shemaobt/scripts/Antigo_Testamento_COMPLETO"
NOVO_TESTAMENTO_DIR = "/Users/joao/Desktop/work/shema/shemaobt/scripts/NOVO_TESTAMENTO_COMPLETO"

# Project root (local or Modal)
PROJECT_ROOT = os.path.expanduser("~/bible_audio_project")
os.makedirs(PROJECT_ROOT, exist_ok=True)

RAW_AUDIO_DIR = os.path.join(PROJECT_ROOT, "raw_audio")
CONVERTED_DIR = os.path.join(PROJECT_ROOT, "converted_audio")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "satere_units")

os.makedirs(RAW_AUDIO_DIR, exist_ok=True)
os.makedirs(CONVERTED_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Check for audio files in local directories
print("Looking for audio files...")
mp3_count = 0
mp3_files = []

# Check Antigo Testamento
if os.path.exists(ANTIGO_TESTAMENTO_DIR):
    for root, dirs, files in os.walk(ANTIGO_TESTAMENTO_DIR):
        for f in files:
            if f.endswith('.mp3'):
                mp3_count += 1
                mp3_files.append(os.path.join(root, f))

# Check Novo Testamento
if os.path.exists(NOVO_TESTAMENTO_DIR):
    for root, dirs, files in os.walk(NOVO_TESTAMENTO_DIR):
        for f in files:
            if f.endswith('.mp3'):
                mp3_count += 1
                mp3_files.append(os.path.join(root, f))

print(f"✓ Found {mp3_count} MP3 files")
print(f"  - Antigo Testamento: {ANTIGO_TESTAMENTO_DIR}")
print(f"  - Novo Testamento: {NOVO_TESTAMENTO_DIR}")

Mounted at /content/drive
Looking for audio files...
✓ Found 0 MP3 files


In [None]:
# Copy MP3 files from local directories to project directory
import shutil

print("Copying MP3 files to project directory...")
copied = 0

# Copy from Antigo Testamento
if os.path.exists(ANTIGO_TESTAMENTO_DIR):
    for root, dirs, files in os.walk(ANTIGO_TESTAMENTO_DIR):
        for f in files:
            if f.endswith('.mp3'):
                src = os.path.join(root, f)
                dst = os.path.join(RAW_AUDIO_DIR, f)
                if not os.path.exists(dst):
                    shutil.copy2(src, dst)
                    copied += 1

# Copy from Novo Testamento
if os.path.exists(NOVO_TESTAMENTO_DIR):
    for root, dirs, files in os.walk(NOVO_TESTAMENTO_DIR):
        for f in files:
            if f.endswith('.mp3'):
                src = os.path.join(root, f)
                dst = os.path.join(RAW_AUDIO_DIR, f)
                if not os.path.exists(dst):
                    shutil.copy2(src, dst)
                    copied += 1

print(f"✓ Copied {copied} new MP3 files to {RAW_AUDIO_DIR}")
print(f"Total MP3 files in project: {len([f for f in os.listdir(RAW_AUDIO_DIR) if f.endswith('.mp3')])}")

In [None]:
# Let's see what folders exist
import os

project_path = "/content/drive/MyDrive/satere_project"
print("Contents of satere_project:")
for item in os.listdir(project_path):
    print(f"  {item}")

Contents of satere_project:
  raw_audio


In [None]:
# Check inside raw_audio folder
import os

raw_path = "/content/drive/MyDrive/satere_project/raw_audio"
print("Contents of raw_audio:")
for item in os.listdir(raw_path):
    print(f"  {item}")

Contents of raw_audio:
  MAVWYIN1DA_B01_MAT_001.mp3
  MAVWYIN1DA_B01_MAT_002.mp3
  MAVWYIN1DA_B01_MAT_003.mp3
  MAVWYIN1DA_B01_MAT_005.mp3
  MAVWYIN1DA_B01_MAT_004.mp3
  MAVWYIN1DA_B01_MAT_007.mp3
  MAVWYIN1DA_B01_MAT_006.mp3
  MAVWYIN1DA_B01_MAT_008.mp3
  MAVWYIN1DA_B01_MAT_010.mp3
  MAVWYIN1DA_B01_MAT_009.mp3
  MAVWYIN1DA_B01_MAT_011.mp3
  MAVWYIN1DA_B01_MAT_012.mp3
  MAVWYIN1DA_B01_MAT_013.mp3
  MAVWYIN1DA_B01_MAT_014.mp3
  MAVWYIN1DA_B01_MAT_015.mp3
  MAVWYIN1DA_B01_MAT_017.mp3
  MAVWYIN1DA_B01_MAT_016.mp3
  MAVWYIN1DA_B01_MAT_018.mp3
  MAVWYIN1DA_B01_MAT_019.mp3
  MAVWYIN1DA_B01_MAT_021.mp3
  MAVWYIN1DA_B01_MAT_020.mp3
  MAVWYIN1DA_B01_MAT_022.mp3
  MAVWYIN1DA_B01_MAT_023.mp3
  MAVWYIN1DA_B01_MAT_024.mp3
  MAVWYIN1DA_B01_MAT_025.mp3
  MAVWYIN1DA_B01_MAT_026.mp3
  MAVWYIN1DA_B01_MAT_027.mp3
  MAVWYIN1DA_B01_MAT_028.mp3
  MAVWYIN1DA_B02_MRK_001.mp3
  MAVWYIN1DA_B02_MRK_003.mp3
  MAVWYIN1DA_B02_MRK_002.mp3
  MAVWYIN1DA_B02_MRK_004.mp3
  MAVWYIN1DA_B02_MRK_005.mp3
  MAVWYIN1DA_B02_MRK

In [None]:
# Phase 1: Convert MP3 to 16kHz WAV

import subprocess
import os
from tqdm import tqdm

PROJECT_ROOT = "/content/drive/MyDrive/satere_project"
RAW_AUDIO_DIR = f"{PROJECT_ROOT}/raw_audio"
CONVERTED_DIR = f"{PROJECT_ROOT}/converted_audio"

# Create output folder
os.makedirs(CONVERTED_DIR, exist_ok=True)

# Find all MP3 files
mp3_files = [f for f in os.listdir(RAW_AUDIO_DIR) if f.endswith('.mp3')]
print(f"Found {len(mp3_files)} MP3 files")

# Check what's already converted
existing = set(os.listdir(CONVERTED_DIR)) if os.path.exists(CONVERTED_DIR) else set()
to_convert = [f for f in mp3_files if f.replace('.mp3', '.wav') not in existing]
print(f"Already converted: {len(existing)}")
print(f"To convert: {len(to_convert)}")

# Convert each file
for mp3_file in tqdm(to_convert, desc="Converting to WAV"):
    mp3_path = os.path.join(RAW_AUDIO_DIR, mp3_file)
    wav_path = os.path.join(CONVERTED_DIR, mp3_file.replace('.mp3', '.wav'))

    cmd = ["ffmpeg", "-y", "-i", mp3_path, "-ar", "16000", "-ac", "1", wav_path]
    subprocess.run(cmd, capture_output=True)

print(f"\n✓ Done! Converted files are in: {CONVERTED_DIR}")

Found 260 MP3 files
Already converted: 0
To convert: 260


Converting to WAV: 100%|██████████| 260/260 [05:56<00:00,  1.37s/it]


✓ Done! Converted files are in: /content/drive/MyDrive/satere_project/converted_audio





In [None]:
# Phase 2: Acoustic Tokenization (XLSR-53 + K-Means)
# This takes 2-3 hours — you can clean your house now! 🏠

import torch
import torchaudio
import numpy as np
import joblib
import json
import os
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

PROJECT_ROOT = "/content/drive/MyDrive/satere_project"
AUDIO_DIR = f"{PROJECT_ROOT}/converted_audio"
OUTPUT_DIR = f"{PROJECT_ROOT}/satere_units"
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
LAYER = 14
NUM_CLUSTERS = 100

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load model
print("Loading XLSR-53 model (this takes a minute)...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using: {device}")

extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(device)
model.eval()
print("✓ Model loaded!")

def get_features(path):
    """Extract features from audio file."""
    try:
        waveform, rate = torchaudio.load(path)
        if rate != 16000:
            waveform = torchaudio.transforms.Resample(rate, 16000)(waveform)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        duration = waveform.shape[1] / 16000
        inputs = extractor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000)
        inputs = inputs.input_values.to(device)

        with torch.no_grad():
            outputs = model(inputs, output_hidden_states=True)

        feats = outputs.hidden_states[LAYER].squeeze(0).cpu().numpy()
        timestamps = np.linspace(0, duration, feats.shape[0])
        return feats, timestamps, duration
    except Exception as e:
        print(f"Error: {path}: {e}")
        return None, None, None

# Get file list
files = sorted([os.path.join(AUDIO_DIR, f) for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')])
print(f"Found {len(files)} WAV files")

# Step 1: Train K-Means (learn the acoustic alphabet)
print("\n--- Learning Sateré Acoustic Alphabet ---")
kmeans = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, batch_size=1024, random_state=42, n_init=3)

buffer = []
buffer_limit = 50000

for f in tqdm(files, desc="Step 1/2: Extracting features"):
    feats, _, _ = get_features(f)
    if feats is not None:
        buffer.append(feats)
        if sum(b.shape[0] for b in buffer) >= buffer_limit:
            kmeans.partial_fit(np.vstack(buffer))
            buffer = []

if buffer:
    kmeans.partial_fit(np.vstack(buffer))

joblib.dump(kmeans, f"{OUTPUT_DIR}/satere_kmeans.pkl")
print("✓ Acoustic alphabet saved!")

# Step 2: Convert all audio to unit sequences
print("\n--- Converting Audio to Units ---")
corpus = {}

for f in tqdm(files, desc="Step 2/2: Tokenizing"):
    feats, timestamps, duration = get_features(f)
    if feats is None:
        continue

    units = kmeans.predict(feats)
    name = os.path.splitext(os.path.basename(f))[0]

    corpus[name] = {
        "units": units.tolist(),
        "timestamps": timestamps.tolist(),
        "duration_sec": duration,
        "num_frames": len(units)
    }

    with open(f"{OUTPUT_DIR}/{name}.units.txt", "w") as txt:
        txt.write(" ".join(map(str, units)))

# Save everything
with open(f"{OUTPUT_DIR}/satere_corpus_timestamped.json", "w") as f:
    json.dump(corpus, f)

with open(f"{OUTPUT_DIR}/all_units_for_bpe.txt", "w") as f:
    for data in corpus.values():
        f.write(" ".join(map(str, data["units"])) + "\n")

# Summary
total_mins = sum(d["duration_sec"] for d in corpus.values()) / 60
print(f"\n✓ Phase 2 Complete!")
print(f"  Files processed: {len(corpus)}")
print(f"  Total audio: {total_mins:.1f} minutes")
print(f"  Acoustic units: {NUM_CLUSTERS}")

Loading XLSR-53 model (this takes a minute)...
Using: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

✓ Model loaded!
Found 260 WAV files

--- Learning Sateré Acoustic Alphabet ---



Step 1/2: Extracting features: 100%|██████████| 260/260 [00:00<00:00, 4838.90it/s]


Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_001.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_002.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_003.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_004.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_005.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1D


Step 2/2: Tokenizing: 100%|██████████| 260/260 [00:00<00:00, 4467.03it/s]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_001.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_002.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_003.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_004.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_005.wav: TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function.
Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1D




In [None]:
# Fix audio loading issue
!pip install -q soundfile

# Now let's retry with soundfile backend
import torch
import torchaudio
import soundfile as sf
import numpy as np
import joblib
import json
import os
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

PROJECT_ROOT = "/content/drive/MyDrive/satere_project"
AUDIO_DIR = f"{PROJECT_ROOT}/converted_audio"
OUTPUT_DIR = f"{PROJECT_ROOT}/satere_units"
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
LAYER = 14
NUM_CLUSTERS = 100

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load model
print("Loading XLSR-53 model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using: {device}")

extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(device)
model.eval()
print("✓ Model loaded!")

def get_features(path):
    """Extract features using soundfile instead of torchaudio."""
    try:
        # Use soundfile to load audio
        waveform, rate = sf.read(path)

        # Convert to tensor
        waveform = torch.tensor(waveform, dtype=torch.float32)

        # Handle stereo
        if len(waveform.shape) > 1:
            waveform = waveform.mean(dim=1)

        # Resample if needed
        if rate != 16000:
            waveform = torchaudio.functional.resample(waveform, rate, 16000)
            rate = 16000

        duration = len(waveform) / rate

        inputs = extractor(waveform.numpy(), return_tensors="pt", sampling_rate=16000)
        inputs = inputs.input_values.to(device)

        with torch.no_grad():
            outputs = model(inputs, output_hidden_states=True)

        feats = outputs.hidden_states[LAYER].squeeze(0).cpu().numpy()
        timestamps = np.linspace(0, duration, feats.shape[0])
        return feats, timestamps, duration
    except Exception as e:
        print(f"Error: {path}: {e}")
        return None, None, None

# Get file list
files = sorted([os.path.join(AUDIO_DIR, f) for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')])
print(f"Found {len(files)} WAV files")

# Test one file first
print("\nTesting first file...")
test_feats, _, test_dur = get_features(files[0])
if test_feats is not None:
    print(f"✓ Test successful! Duration: {test_dur:.1f}s, Features shape: {test_feats.shape}")
else:
    print("✗ Test failed - check error above")
    raise Exception("Cannot proceed")

# Step 1: Train K-Means
print("\n--- Learning Sateré Acoustic Alphabet ---")
kmeans = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, batch_size=1024, random_state=42, n_init=3)

buffer = []
buffer_limit = 50000

for f in tqdm(files, desc="Step 1/2: Extracting features"):
    feats, _, _ = get_features(f)
    if feats is not None:
        buffer.append(feats)
        if sum(b.shape[0] for b in buffer) >= buffer_limit:
            kmeans.partial_fit(np.vstack(buffer))
            buffer = []

if buffer:
    kmeans.partial_fit(np.vstack(buffer))

joblib.dump(kmeans, f"{OUTPUT_DIR}/satere_kmeans.pkl")
print("✓ Acoustic alphabet saved!")

# Step 2: Tokenize all audio
print("\n--- Converting Audio to Units ---")
corpus = {}

for f in tqdm(files, desc="Step 2/2: Tokenizing"):
    feats, timestamps, duration = get_features(f)
    if feats is None:
        continue

    units = kmeans.predict(feats)
    name = os.path.splitext(os.path.basename(f))[0]

    corpus[name] = {
        "units": units.tolist(),
        "timestamps": timestamps.tolist(),
        "duration_sec": duration,
        "num_frames": len(units)
    }

    with open(f"{OUTPUT_DIR}/{name}.units.txt", "w") as txt:
        txt.write(" ".join(map(str, units)))

# Save everything
with open(f"{OUTPUT_DIR}/satere_corpus_timestamp"):
    pass


: 

In [None]:
# Fix audio loading issue
!pip install -q soundfile

# Now let's retry with soundfile backend
import torch
import torchaudio
import soundfile as sf
import numpy as np
import joblib
import json
import os
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

PROJECT_ROOT = "/content/drive/MyDrive/satere_project"
AUDIO_DIR = f"{PROJECT_ROOT}/converted_audio"
OUTPUT_DIR = f"{PROJECT_ROOT}/satere_units"
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
LAYER = 14
NUM_CLUSTERS = 100

os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Loading XLSR-53 model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using: {device}")

extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(device)
model.eval()
print("✓ Model loaded!")

def get_features(path):
    try:
        waveform, rate = sf.read(path)
        waveform = torch.tensor(waveform, dtype=torch.float32)
        if len(waveform.shape) > 1:
            waveform = waveform.mean(dim=1)
        if rate != 16000:
            waveform = torchaudio.functional.resample(waveform, rate, 16000)
            rate = 16000
        duration = len(waveform) / rate
        inputs = extractor(waveform.numpy(), return_tensors="pt", sampling_rate=16000)
        inputs = inputs.input_values.to(device)
        with torch.no_grad():
            outputs = model(inputs, output_hidden_states=True)
        feats = outputs.hidden_states[LAYER].squeeze(0).cpu().numpy()
        timestamps = np.linspace(0, duration, feats.shape[0])
        return feats, timestamps, duration
    except Exception as e:
        print(f"Error: {path}: {e}")
        return None, None, None

files = sorted([os.path.join(AUDIO_DIR, f) for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')])
print(f"Found {len(files)} WAV files")

print("\nTesting first file...")
test_feats, _, test_dur = get_features(files[0])
if test_feats is not None:
    print(f"✓ Test successful! Duration: {test_dur:.1f}s, Features: {test_feats.shape}")
else:
    raise Exception("Cannot proceed - test failed")

print("\n--- Learning Sateré Acoustic Alphabet ---")
kmeans = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, batch_size=1024, random_state=42, n_init=3)
buffer = []
buffer_limit = 50000

for f in tqdm(files, desc="Step 1/2: Extracting features"):
    feats, _, _ = get_features(f)
    if feats is not None:
        buffer.append(feats)
        if sum(b.shape[0] for b in buffer) >= buffer_limit:
            kmeans.partial_fit(np.vstack(buffer))
            buffer = []

if buffer:
    kmeans.partial_fit(np.vstack(buffer))

joblib.dump(kmeans, f"{OUTPUT_DIR}/satere_kmeans.pkl")
print("✓ Acoustic alphabet saved!")

print("\n--- Converting Audio to Units ---")
corpus = {}

for f in tqdm(files, desc="Step 2/2: Tokenizing"):
    feats, timestamps, duration = get_features(f)
    if feats is None:
        continue
    units = kmeans.predict(feats)
    name = os.path.splitext(os.path.basename(f))[0]
    corpus[name] = {
        "units": units.tolist(),
        "timestamps": timestamps.tolist(),
        "duration_sec": duration,
        "num_frames": len(units)
    }
    with open(f"{OUTPUT_DIR}/{name}.units.txt", "w") as txt:
        txt.write(" ".join(map(str, units)))

with open(f"{OUTPUT_DIR}/satere_corpus_timestamped.json", "w") as f:
    json.dump(corpus, f)

with open(f"{OUTPUT_DIR}/all_units_for_bpe.txt", "w") as f:
    for data in corpus.values():
        f.write(" ".join(map(str, data["units"])) + "\n")

total_mins = sum(d["duration_sec"] for d in corpus.values()) / 60
print(f"\n✓ Phase 2 Complete!")
print(f"  Files processed: {len(corpus)}")
print(f"  Total audio: {total_mins:.1f} minutes")
print(f"  Acoustic units: {NUM_CLUSTERS}")

Loading XLSR-53 model...
Using: cuda
✓ Model loaded!
Found 260 WAV files

Testing first file...
✓ Test successful! Duration: 507.4s, Features: (25367, 1024)

--- Learning Sateré Acoustic Alphabet ---


Step 1/2: Extracting features:  37%|███▋      | 96/260 [16:50<25:59,  9.51s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B05_ACT_007.wav: CUDA out of memory. Tried to allocate 12.18 GiB. GPU 0 has a total capacity of 39.56 GiB of which 10.35 GiB is free. Process 8325 has 29.19 GiB memory in use. Of the allocated memory 25.69 GiB is allocated by PyTorch, and 3.01 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  45%|████▌     | 118/260 [21:26<24:58, 10.55s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_001.wav: CUDA out of memory. Tried to allocate 13.81 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 28.97 GiB is allocated by PyTorch, and 2.99 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  46%|████▌     | 119/260 [21:26<17:43,  7.55s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_002.wav: CUDA out of memory. Tried to allocate 11.44 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 24.21 GiB is allocated by PyTorch, and 7.75 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  46%|████▌     | 120/260 [21:27<12:35,  5.40s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_003.wav: CUDA out of memory. Tried to allocate 8.51 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.31 GiB is allocated by PyTorch, and 13.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  47%|████▋     | 121/260 [21:27<09:01,  3.89s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_004.wav: CUDA out of memory. Tried to allocate 8.52 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.33 GiB is allocated by PyTorch, and 13.63 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  48%|████▊     | 124/260 [21:58<15:13,  6.72s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_007.wav: CUDA out of memory. Tried to allocate 7.54 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.37 GiB is allocated by PyTorch, and 15.59 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  48%|████▊     | 125/260 [21:58<10:57,  4.87s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_008.wav: CUDA out of memory. Tried to allocate 12.35 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 26.04 GiB is allocated by PyTorch, and 5.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  48%|████▊     | 126/260 [21:59<07:57,  3.56s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_009.wav: CUDA out of memory. Tried to allocate 11.16 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 23.63 GiB is allocated by PyTorch, and 8.33 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  49%|████▉     | 128/260 [22:15<11:37,  5.28s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_011.wav: CUDA out of memory. Tried to allocate 12.44 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 26.21 GiB is allocated by PyTorch, and 5.76 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  50%|█████     | 131/260 [22:39<13:09,  6.12s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_014.wav: CUDA out of memory. Tried to allocate 8.77 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.85 GiB is allocated by PyTorch, and 13.12 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  51%|█████     | 132/260 [22:40<09:26,  4.42s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_015.wav: CUDA out of memory. Tried to allocate 10.35 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 22.02 GiB is allocated by PyTorch, and 9.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  51%|█████     | 133/260 [22:40<06:47,  3.21s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_016.wav: CUDA out of memory. Tried to allocate 8.18 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 17.66 GiB is allocated by PyTorch, and 14.30 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  54%|█████▍    | 140/260 [23:15<07:02,  3.52s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B07_1CO_007.wav: CUDA out of memory. Tried to allocate 7.38 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.04 GiB is allocated by PyTorch, and 15.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  57%|█████▋    | 148/260 [24:19<13:37,  7.30s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B07_1CO_015.wav: CUDA out of memory. Tried to allocate 8.90 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 19.11 GiB is allocated by PyTorch, and 12.86 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  66%|██████▌   | 172/260 [27:25<10:09,  6.93s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B10_EPH_004.wav: CUDA out of memory. Tried to allocate 8.79 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.87 GiB is allocated by PyTorch, and 13.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  67%|██████▋   | 175/260 [27:50<09:47,  6.91s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B11_PHP_001.wav: CUDA out of memory. Tried to allocate 7.52 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.32 GiB is allocated by PyTorch, and 15.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  68%|██████▊   | 176/260 [27:50<06:54,  4.94s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B11_PHP_002.wav: CUDA out of memory. Tried to allocate 7.31 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 15.91 GiB is allocated by PyTorch, and 16.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  81%|████████  | 211/260 [30:50<03:49,  4.69s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_007.wav: CUDA out of memory. Tried to allocate 7.16 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 15.61 GiB is allocated by PyTorch, and 16.36 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  82%|████████▏ | 214/260 [31:12<04:39,  6.07s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_010.wav: CUDA out of memory. Tried to allocate 9.51 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 20.32 GiB is allocated by PyTorch, and 11.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  83%|████████▎ | 215/260 [31:13<03:17,  4.40s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_011.wav: CUDA out of memory. Tried to allocate 10.68 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 22.68 GiB is allocated by PyTorch, and 9.29 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  83%|████████▎ | 216/260 [31:13<02:20,  3.20s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_012.wav: CUDA out of memory. Tried to allocate 8.24 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 17.78 GiB is allocated by PyTorch, and 14.19 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features:  92%|█████████▏| 238/260 [33:54<00:57,  2.63s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B26_JUD_001.wav: CUDA out of memory. Tried to allocate 9.62 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 20.55 GiB is allocated by PyTorch, and 11.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 1/2: Extracting features: 100%|██████████| 260/260 [36:05<00:00,  8.33s/it]


✓ Acoustic alphabet saved!

--- Converting Audio to Units ---


Step 2/2: Tokenizing:  10%|█         | 26/260 [03:16<29:56,  7.68s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B01_MAT_026.wav: CUDA out of memory. Tried to allocate 7.74 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.77 GiB is allocated by PyTorch, and 15.20 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  16%|█▌        | 42/260 [05:15<23:01,  6.34s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B02_MRK_014.wav: CUDA out of memory. Tried to allocate 7.33 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 15.94 GiB is allocated by PyTorch, and 16.02 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  21%|██        | 55/260 [07:08<27:28,  8.04s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B03_LUK_011.wav: CUDA out of memory. Tried to allocate 8.03 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 17.35 GiB is allocated by PyTorch, and 14.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  22%|██▏       | 56/260 [07:08<19:32,  5.75s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B03_LUK_012.wav: CUDA out of memory. Tried to allocate 8.12 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 17.54 GiB is allocated by PyTorch, and 14.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  25%|██▌       | 66/260 [08:47<26:26,  8.18s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B03_LUK_022.wav: CUDA out of memory. Tried to allocate 8.04 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 17.37 GiB is allocated by PyTorch, and 14.60 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  35%|███▌      | 91/260 [12:54<16:16,  5.78s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B05_ACT_002.wav: CUDA out of memory. Tried to allocate 8.61 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.52 GiB is allocated by PyTorch, and 13.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  37%|███▋      | 96/260 [13:46<20:26,  7.48s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B05_ACT_007.wav: CUDA out of memory. Tried to allocate 12.18 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 25.69 GiB is allocated by PyTorch, and 6.27 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  37%|███▋      | 97/260 [13:46<14:30,  5.34s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B05_ACT_008.wav: CUDA out of memory. Tried to allocate 7.47 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.23 GiB is allocated by PyTorch, and 15.73 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  38%|███▊      | 99/260 [14:03<16:48,  6.27s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B05_ACT_010.wav: CUDA out of memory. Tried to allocate 8.35 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.00 GiB is allocated by PyTorch, and 13.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  39%|███▉      | 102/260 [14:21<13:45,  5.23s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B05_ACT_013.wav: CUDA out of memory. Tried to allocate 7.38 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.06 GiB is allocated by PyTorch, and 15.91 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  45%|████▌     | 118/260 [17:04<24:21, 10.29s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_001.wav: CUDA out of memory. Tried to allocate 13.81 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 28.97 GiB is allocated by PyTorch, and 2.99 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  46%|████▌     | 119/260 [17:05<17:17,  7.36s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_002.wav: CUDA out of memory. Tried to allocate 11.44 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 24.21 GiB is allocated by PyTorch, and 7.75 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  46%|████▌     | 120/260 [17:05<12:17,  5.27s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_003.wav: CUDA out of memory. Tried to allocate 8.51 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.31 GiB is allocated by PyTorch, and 13.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  47%|████▋     | 121/260 [17:06<08:48,  3.80s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_004.wav: CUDA out of memory. Tried to allocate 8.52 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.33 GiB is allocated by PyTorch, and 13.63 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  48%|████▊     | 124/260 [17:36<15:07,  6.67s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_007.wav: CUDA out of memory. Tried to allocate 7.54 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.37 GiB is allocated by PyTorch, and 15.59 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  48%|████▊     | 125/260 [17:37<10:53,  4.84s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_008.wav: CUDA out of memory. Tried to allocate 12.35 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 26.04 GiB is allocated by PyTorch, and 5.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  48%|████▊     | 126/260 [17:37<07:54,  3.54s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_009.wav: CUDA out of memory. Tried to allocate 11.16 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 23.63 GiB is allocated by PyTorch, and 8.33 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  49%|████▉     | 128/260 [17:54<11:29,  5.23s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_011.wav: CUDA out of memory. Tried to allocate 12.44 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 26.21 GiB is allocated by PyTorch, and 5.76 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  50%|█████     | 131/260 [18:17<12:59,  6.05s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_014.wav: CUDA out of memory. Tried to allocate 8.77 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.85 GiB is allocated by PyTorch, and 13.12 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  51%|█████     | 132/260 [18:18<09:19,  4.37s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_015.wav: CUDA out of memory. Tried to allocate 10.35 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 22.02 GiB is allocated by PyTorch, and 9.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  51%|█████     | 133/260 [18:18<06:42,  3.17s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B06_ROM_016.wav: CUDA out of memory. Tried to allocate 8.18 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 17.66 GiB is allocated by PyTorch, and 14.30 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  54%|█████▍    | 140/260 [18:53<07:01,  3.51s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B07_1CO_007.wav: CUDA out of memory. Tried to allocate 7.38 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.04 GiB is allocated by PyTorch, and 15.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  57%|█████▋    | 148/260 [19:56<13:26,  7.20s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B07_1CO_015.wav: CUDA out of memory. Tried to allocate 8.90 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 19.11 GiB is allocated by PyTorch, and 12.86 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  66%|██████▌   | 172/260 [23:00<10:03,  6.85s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B10_EPH_004.wav: CUDA out of memory. Tried to allocate 8.79 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 18.87 GiB is allocated by PyTorch, and 13.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  67%|██████▋   | 175/260 [23:25<09:41,  6.84s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B11_PHP_001.wav: CUDA out of memory. Tried to allocate 7.52 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 16.32 GiB is allocated by PyTorch, and 15.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  68%|██████▊   | 176/260 [23:25<06:51,  4.89s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B11_PHP_002.wav: CUDA out of memory. Tried to allocate 7.31 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 15.91 GiB is allocated by PyTorch, and 16.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  81%|████████  | 211/260 [26:23<03:45,  4.60s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_007.wav: CUDA out of memory. Tried to allocate 7.16 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 15.61 GiB is allocated by PyTorch, and 16.36 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  82%|████████▏ | 214/260 [26:45<04:35,  5.99s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_010.wav: CUDA out of memory. Tried to allocate 9.51 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 20.32 GiB is allocated by PyTorch, and 11.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  83%|████████▎ | 215/260 [26:45<03:15,  4.34s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_011.wav: CUDA out of memory. Tried to allocate 10.68 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 22.68 GiB is allocated by PyTorch, and 9.29 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  83%|████████▎ | 216/260 [26:46<02:18,  3.15s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B19_HEB_012.wav: CUDA out of memory. Tried to allocate 8.24 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 17.78 GiB is allocated by PyTorch, and 14.19 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing:  92%|█████████▏| 238/260 [29:25<00:57,  2.60s/it]

Error: /content/drive/MyDrive/satere_project/converted_audio/MAVWYIN1DA_B26_JUD_001.wav: CUDA out of memory. Tried to allocate 9.62 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.09 GiB is free. Process 8325 has 32.46 GiB memory in use. Of the allocated memory 20.55 GiB is allocated by PyTorch, and 11.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Step 2/2: Tokenizing: 100%|██████████| 260/260 [31:35<00:00,  7.29s/it]



✓ Phase 2 Complete!
  Files processed: 229
  Total audio: 2692.8 minutes
  Acoustic units: 100


In [None]:
# Phase 3: Motif Discovery (BPE)

import sentencepiece as spm
from collections import Counter
import json
import os

PROJECT_ROOT = "/content/drive/MyDrive/satere_project"
UNITS_DIR = f"{PROJECT_ROOT}/satere_units"
MOTIFS_DIR = f"{PROJECT_ROOT}/satere_motifs"
VOCAB_SIZE = 500

os.makedirs(MOTIFS_DIR, exist_ok=True)

print("--- Training BPE Model ---")
spm.SentencePieceTrainer.train(
    input=f"{UNITS_DIR}/all_units_for_bpe.txt",
    model_prefix=f"{MOTIFS_DIR}/satere_bpe",
    vocab_size=VOCAB_SIZE,
    model_type='bpe',
    character_coverage=1.0,
    num_threads=4,
    max_sentence_length=100000
)
print(f"✓ BPE model saved!")

# Load BPE model
sp = spm.SentencePieceProcessor()
sp.load(f"{MOTIFS_DIR}/satere_bpe.model")

# Load timestamped corpus
with open(f"{UNITS_DIR}/satere_corpus_timestamped.json") as f:
    corpus = json.load(f)

print(f"Loaded {len(corpus)} files from corpus")

# Build motif index with timestamps
print("\n--- Building Motif Index ---")
motif_index = {}
motif_counts = Counter()

from tqdm import tqdm

for filename, data in tqdm(corpus.items(), desc="Indexing motifs"):
    unit_string = " ".join(map(str, data["units"]))
    pieces = sp.encode_as_pieces(unit_string)
    timestamps = data["timestamps"]

    unit_pos = 0
    for motif in pieces:
        clean_motif = motif[1:] if motif.startswith("▁") else motif
        motif_units = clean_motif.split()
        length = max(1, len(motif_units))

        if unit_pos < len(timestamps):
            start_t = timestamps[unit_pos]
            end_idx = min(unit_pos + length, len(timestamps) - 1)
            end_t = timestamps[end_idx]

--- Training BPE Model ---


: 

In [None]:
# Phase 3: Motif Discovery (BPE) - Fixed vocabulary size

import sentencepiece as spm
from collections import Counter
import json
import os

PROJECT_ROOT = "/content/drive/MyDrive/satere_project"
UNITS_DIR = f"{PROJECT_ROOT}/satere_units"
MOTIFS_DIR = f"{PROJECT_ROOT}/satere_motifs"
VOCAB_SIZE = 200  # Reduced from 500

os.makedirs(MOTIFS_DIR, exist_ok=True)

print("--- Training BPE Model ---")
spm.SentencePieceTrainer.train(
    input=f"{UNITS_DIR}/all_units_for_bpe.txt",
    model_prefix=f"{MOTIFS_DIR}/satere_bpe",
    vocab_size=VOCAB_SIZE,
    model_type='bpe',
    character_coverage=1.0,
    num_threads=4,
    max_sentence_length=100000
)
print(f"✓ BPE model saved!")

# Load BPE model
sp = spm.SentencePieceProcessor()
sp.load(f"{MOTIFS_DIR}/satere_bpe.model")

# Load timestamped corpus
with open(f"{UNITS_DIR}/satere_corpus_timestamped.json") as f:
    corpus = json.load(f)

print(f"Loaded {len(corpus)} files from corpus")

# Build motif index with timestamps
print("\n--- Building Motif Index ---")
motif_index = {}
motif_counts = Counter()

from tqdm import tqdm

for filename, data in tqdm(corpus.items(), desc="Indexing motifs"):
    unit_string = " ".join(map(str, data["units"]))
    pieces = sp.encode_as_pieces(unit_string)
    timestamps = data["timestamps"]

    unit_pos = 0
    for motif in pieces:
        clean_motif = motif[1:] if motif.startswith("▁") else motif
        motif_units = clean_motif.split()
        length = max(1, len(motif_units))

        if unit_pos < len(timestamps):
            start_t = timestamps[unit_pos]
            end_idx = min(unit_pos + length, len(timestamps) - 1)
            end_t = timestamps[end_idx]

            if motif not in motif_index:
                motif_index[motif] = []
            motif_index[motif].append({
                "file": filename,
                "start_sec": round(start_t, 3),
                "end_sec": round(end_t, 3),
                "unit_position": unit_pos
            })
            motif_counts[motif] += 1

        unit_pos += length

# Save motif index
with open(f"{MOTIFS_DIR}/motif_index.json", "w") as f:
    json.dump(motif_index, f, indent=2)

# Save statistics
motif_stats = []
for motif, count in motif_counts.most_common(100):
    clean = motif[1:] if motif.startswith("▁") else motif
    motif_stats.append({
        "motif": motif,
        "count": count,
        "num_units": len(clean.split())
    })

with open(f"{MOTIFS_DIR}/motif_statistics.json", "w") as f:
    json.dump(motif_stats, f, indent=2)

# Display results
print("\n" + "="*60)
print("TOP 20 DISCOVERED MOTIFS")
print("="*60)
print(f"{'Rank':<5} {'Motif':<30} {'Count':<10} {'Units':<6}")
print("-"*60)

for i, stat in enumerate(motif_stats[:20], 1):
    print(f"{i:<5} {stat['motif']:<30} {stat['count']:<10} {stat['num_units']:<6}")

print(f"\n✓ Phase 3 Complete!")
print(f"  Total unique motifs: {len(motif_index)}")
print(f"  Motif index saved to: {MOTIFS_DIR}/motif_index.json")
print(f"\n📋 Next: Tag deixis markers in AViTA, then run Phase 4")

--- Training BPE Model ---
✓ BPE model saved!
Loaded 229 files from corpus

--- Building Motif Index ---


Indexing motifs: 100%|██████████| 229/229 [00:39<00:00,  5.79it/s]



TOP 20 DISCOVERED MOTIFS
Rank  Motif                          Count      Units 
------------------------------------------------------------
1     ▁29                            266472     1     
2     ▁82                            217421     1     
3     ▁26                            179759     1     
4     ▁81                            161271     1     
5     ▁85                            158889     1     
6     ▁55                            153712     1     
7     ▁18                            148241     1     
8     ▁62                            138527     1     
9     ▁44                            131139     1     
10    ▁23                            130346     1     
11    ▁19                            125666     1     
12    ▁16                            122186     1     
13    ▁1                             119696     1     
14    ▁78                            118846     1     
15    ▁39                            116044     1     
16    ▁7                         