In [5]:
import torch

In [3]:
import re

def german_to_ipa(text):
    ipa_mapping = {
        "sch": "ʃ", "ch": "x", "z": "ts", "j": "j", "r": "ʁ", "ng": "ŋ",
        "au": "aʊ̯", "ei": "aɪ̯", "eu": "ɔʏ̯", "äu": "ɔʏ̯", "sp": "ʃp", "st": "ʃt",
        "ä": "ɛ", "ö": "œ", "ü": "ʏ", "ß": "s", "ph": "f", "qu": "kv"
    }

    words = text.lower().split()
    ipa_words = []

    for word in words:
        ipa_word = word
        for key, val in ipa_mapping.items():
            ipa_word = ipa_word.replace(key, val)

        # Insert syllable boundaries and primary stress for the first syllable
        ipa_word = re.sub(r"([aeiouäöüɪɛœʏ])", r"\1.", ipa_word)  # syllables after vowels
        ipa_word = re.sub(r"\.$", "", ipa_word)  # remove trailing syllable
        ipa_word = "ˈ" + ipa_word  # add primary stress

        ipa_words.append(ipa_word)

    return " ".join(ipa_words)

# Test the function
sentences = [
    "Wollen wir das Ganze einfach mal ausführen",
    "Das ist ein Test",
    "Können wir den Text ins IPA umwandeln"
]

for sentence in sentences:
    ipa = german_to_ipa(sentence.lower())
    print(f"{sentence} → {ipa}")


Wollen wir das Ganze einfach mal ausführen → ˈwo.lle.n ˈwi.ʁ ˈda.s ˈga.ntse ˈa.ɪ.̯nfa.x ˈma.l ˈa.ʊ̯sfʏ.hʁe.n
Das ist ein Test → ˈda.s ˈi.ʃt ˈa.ɪ.̯n ˈte.ʃt
Können wir den Text ins IPA umwandeln → ˈkœ.nne.n ˈwi.ʁ ˈde.n ˈte.xt ˈi.ns ˈi.pa ˈu.mwa.nde.ln


In [4]:
import csv

def convert_metadata_to_ipa(input_file='audio_dataset_2/metadata.csv', output_file='audio_dataset_2/metadata_ipa.csv'):
    """Converts the second column of metadata.csv to IPA and saves it"""
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter='|')
        writer = csv.writer(outfile, delimiter='|')

        for row in reader:
            if len(row) < 3:
                continue

            original_text = row[1]
            ipa_text = german_to_ipa(original_text)

            new_row = [row[0], f"/{ipa_text}/", row[2]]
            writer.writerow(new_row)

    print(f"✅ IPA conversion completed. Saved to {output_file}")


# Run the conversion
convert_metadata_to_ipa()


✅ IPA conversion completed. Saved to audio_dataset_2/metadata_ipa.csv


In [10]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()

In [10]:
import os
import librosa
import soundfile as sf

input_audio = "org.mp4"
output_dir = "audio_dataset_2/wavs"
os.makedirs(output_dir, exist_ok=True)

def segment_audio(input_audio, output_dir, segment_durations=[5, 10, 15]):
    print("🎯 Segmenting audio into chunks...")
    y, sr = librosa.load(input_audio, sr=22050)
    total_duration = librosa.get_duration(y=y, sr=sr)

    current_position = 0
    segment_count = 0

    while current_position < total_duration:
        duration = segment_durations[segment_count % len(segment_durations)]
        end_position = min(current_position + duration, total_duration)

        # Extract chunk
        chunk = y[int(current_position * sr):int(end_position * sr)]
        file_path = os.path.join(output_dir, f"segment_{segment_count:03d}.wav")

        # Save chunk
        sf.write(file_path, chunk, sr)
        print(f"✅ Saved: {file_path} ({duration}s)")
        current_position = end_position
        segment_count += 1

    print("🎯 Audio segmentation completed!")

segment_audio(input_audio, output_dir)


🎯 Segmenting audio into chunks...


  y, sr = librosa.load(input_audio, sr=22050)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✅ Saved: audio_dataset_2/wavs/segment_000.wav (5s)
✅ Saved: audio_dataset_2/wavs/segment_001.wav (10s)
✅ Saved: audio_dataset_2/wavs/segment_002.wav (15s)
✅ Saved: audio_dataset_2/wavs/segment_003.wav (5s)
✅ Saved: audio_dataset_2/wavs/segment_004.wav (10s)
✅ Saved: audio_dataset_2/wavs/segment_005.wav (15s)
✅ Saved: audio_dataset_2/wavs/segment_006.wav (5s)
✅ Saved: audio_dataset_2/wavs/segment_007.wav (10s)
✅ Saved: audio_dataset_2/wavs/segment_008.wav (15s)
✅ Saved: audio_dataset_2/wavs/segment_009.wav (5s)
✅ Saved: audio_dataset_2/wavs/segment_010.wav (10s)
✅ Saved: audio_dataset_2/wavs/segment_011.wav (15s)
✅ Saved: audio_dataset_2/wavs/segment_012.wav (5s)
✅ Saved: audio_dataset_2/wavs/segment_013.wav (10s)
✅ Saved: audio_dataset_2/wavs/segment_014.wav (15s)
✅ Saved: audio_dataset_2/wavs/segment_015.wav (5s)
✅ Saved: audio_dataset_2/wavs/segment_016.wav (10s)
✅ Saved: audio_dataset_2/wavs/segment_017.wav (15s)
✅ Saved: audio_dataset_2/wavs/segment_018.wav (5s)
✅ Saved: audio_data

In [11]:
import os
import whisper

model = whisper.load_model("medium", download_root="/home/ahmet/.cache/whisper")
input_dir = "audio_dataset_2/wavs"
metadata_path = "audio_dataset_2/metadata.csv"

with open(metadata_path, 'w', encoding='utf-8') as f_meta:
    for file in sorted(os.listdir(input_dir)):
        if file.endswith(".wav"):
            file_path = os.path.join(input_dir, file)
            print(f"🎙️ Transcribing {file}...")

            try:
                result = model.transcribe(file_path)
                text = result['text'].strip()
                f_meta.write(f"{file}|{text}|{text}\n")
                print(f"✅ {file}: {text[:50]}...")
            except Exception as e:
                print(f"⚠️ Failed to transcribe {file}: {e}")

print("🎯 Transcription completed!")


  checkpoint = torch.load(fp, map_location=device)


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 7.75 GiB of which 24.44 MiB is free. Including non-PyTorch memory, this process has 7.17 GiB memory in use. Of the allocated memory 7.03 GiB is allocated by PyTorch, and 19.14 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
import os
import whisper
import csv

def transcribe_and_generate_metadata(audio_folder, metadata_file="metadata.csv", model_size="medium"):
    # Initialize Whisper model
    model = whisper.load_model(model_size, download_root="/home/ahmet/.cache/whisper", device="cuda")

    # Prepare CSV file
    with open(metadata_file, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter='|')
        
        # Process each WAV file
        for wav_file in sorted(os.listdir(audio_folder)):
            if wav_file.endswith(".wav"):
                file_path = os.path.join(audio_folder, wav_file)

                # Transcribe audio
                result = model.transcribe(file_path, language="de")  # Set language accordingly
                transcript = result.get("text", "").strip()

                # Add entry to CSV
                writer.writerow([wav_file, transcript, transcript])
                print(f"📝 {wav_file}: {transcript}")

    print(f"✅ Metadata saved to: {metadata_file}")

# Run transcription
audio_folder = "segmented_audio"
transcribe_and_generate_metadata(audio_folder)


  checkpoint = torch.load(fp, map_location=device)


📝 segment_0001.wav: Hallo und herzlich willkommen zur zweiten Folge von Einführung in React mit dem Thema React Setup. Noch einmal kurz zu mir, mein Name ist David Losert, ich bin Software Engineer und seit über zehn Jahren im Web unterwegs und arbeite nun auch bereits seit vier Jahren.
📝 segment_0002.wav: Jahren mit React. Neben React mache ich die Arbeit mit JavaScript, TypeScript, Node.js, Linux-Servern, Docker und AWS. Die heutige Folge dreht sich also nun komplett darum, eine Entwicklungs-
📝 segment_0003.wav: Umgebung aufzusetzen und dort eine erste Reaction.
📝 segment_0004.wav: Hello World Applikation zu implementieren. Wenn wir uns kurz erinnern, in der letzten Folge habe ich die Geschichte und Prinzipien von React kurz vorgestellt und einen ersten theoretischen Einblick auf die Geschichte und Prinzipien von React.
📝 segment_0005.wav: Blick in den Virtual Dom und in JSX gegeben. Das habe ich an dieser Stelle auch einmal kurz visualisiert. Wir erinnern uns der Virtual Dom.
📝 segm

In [8]:
import csv

# Paths
input_csv = "audio_dataset_2/metadata.csv"
output_csv = "audio_dataset_2/metadata_fixed.csv"

# Fix .wav extensions
with open(input_csv, "r", encoding="utf-8") as infile, open(output_csv, "w", encoding="utf-8", newline="") as outfile:
    reader = csv.reader(infile, delimiter="|")
    writer = csv.writer(outfile, delimiter="|")

    for row in reader:
        # Remove '.wav' from first column if it exists
        filename = row[0].replace(".wav", "")
        writer.writerow([filename, row[1], row[2]])

print("✅ Fixed metadata.csv! Saved as metadata_fixed.csv")

# Optional: Replace the old file with the new one
import os
os.replace(output_csv, input_csv)

✅ Fixed metadata.csv! Saved as metadata_fixed.csv
