In [2]:
import os
import subprocess
import pandas as pd
from IPython.display import display
import re

import yt_dlp

import whisper
from deep_translator import GoogleTranslator

In [3]:
# Sanity check that ffmpeg can be launched from this kernel; only the first
# three words of its banner (e.g. "ffmpeg version 7.1") are shown.
result = subprocess.run(["ffmpeg", "-version"], capture_output=True, text=True)
print(*result.stdout.split()[:3])

ffmpeg version 7.1


In [4]:
def download_audio(url, filename_base, verbose=False, noprogress=False):
    """Download the best available audio of ``url`` and convert it to a 128 kbps MP3.

    Args:
        url: Address of the video/audio page, handed to yt-dlp unchanged.
        filename_base: Value used as yt-dlp's ``outtmpl`` (output name without
            extension). The rest of this notebook reads the converted file
            from ``filename_base + '.mp3'``.
        verbose: When False (default), yt-dlp's own messages and warnings are
            silenced.
        noprogress: When True, yt-dlp's download progress bar is turned off.
    """
    # yt-dlp options: fetch the best audio stream, then let the
    # FFmpegExtractAudio post-processor re-encode it as MP3.
    ydl_options = {
        'format': 'bestaudio/best',  # best available audio
        'outtmpl': filename_base,
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '128',  # 128 kbps
            },
        ],
    }
    # Suppress yt-dlp's console output unless the caller asked for it.
    if not verbose:
        ydl_options['quiet'] = True
        ydl_options['no_warnings'] = True
    if noprogress:
        # The progress bar is controlled by its own 'noprogress' option.
        # An empty 'progress_hooks' list only means "no callbacks registered"
        # and leaves the bar on screen.
        ydl_options['noprogress'] = True

    with yt_dlp.YoutubeDL(ydl_options) as ydl:
        print(f"Downloading: {url}")
        ydl.download([url])

    print("Download complete!")

# Source clip and the base name (no extension) shared by the later cells.
url = "https://www.youtube.com/watch?v=JKjeLNCnkcM"
filename_base = "dutch_audio_short"
download_audio(url, filename_base, verbose=False, noprogress=True)

Downloading: https://www.youtube.com/watch?v=JKjeLNCnkcM
Download complete!                                       


In [5]:
def get_file_size(filename_base):
    """Return the size of ``filename_base + '.mp3'`` in mebibytes (bytes / 1024**2)."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.stat(filename_base + '.mp3').st_size
    return size_in_bytes / bytes_per_mb
# Report the size of the downloaded MP3, rounded to two decimals.
audio_size_mb = get_file_size(filename_base)
print(f"Audio file size: {audio_size_mb:.2f} MB")

Audio file size: 1.69 MB


In [6]:
# Pick the Whisper checkpoint to load. Typical choices are "small", "medium"
# and "large": "small" is faster but less accurate, "large" is more accurate
# but slower.
model_size = "small"
model = whisper.load_model(name=model_size)

print(f"Loaded Whisper model: {model_size}")


Loaded Whisper model: small


In [7]:
def transcribe_audio(filename_base):
    """Transcribe ``filename_base + '.mp3'`` with the notebook-level Whisper ``model``.

    The language is pinned to Dutch ('nl'). Only the "text" entry of Whisper's
    result dictionary is returned.
    """
    mp3_path = filename_base + '.mp3'
    return model.transcribe(mp3_path, language="nl")["text"]

dutch_text = transcribe_audio(filename_base)

# Preview only the first 100 whitespace-separated words of the transcript.
dutch_preview = " ".join(dutch_text.split()[:100]) + "..."
print("Dutch transcription (first 100 words):\n", dutch_preview)



Dutch transcription (first 100 words):
 De man en de pet door Noni. We gingen allemaal naar de dorpsmarkt. Papa kocht voor Chintu een zonnepril. Mama kocht voor mij een helderblauwe pet. De baby kreeg supe. Op weg naar huis was er een hele sterke wind. Die blies mijn pet weg. Mijn pet bleef halen aan het dak van de oude pippelboom. Ik helde veel en z'avonds had ik niets. Later die avond ging de maan schijnen. Die keek naar mijn pet aan de oude pippelboom. Die zette mijn pet op. De maan glimlacht het blij en geluid. Ik moest ook glimlachen. De volgende dag gaf...


In [8]:
# Keep a copy of the Dutch transcript on disk, so that accidentally overwriting
# `dutch_text` does not force another Whisper run (about 15 minutes on the
# author's MacBook).
with open('dutch_transcritpion_short.txt', mode='w', encoding='utf-8') as transcript_file:
    transcript_file.write(dutch_text)

In [10]:
def translate(dutch_text):
    """Translate Dutch text to English with deep_translator's GoogleTranslator.

    The whole text is sent as a single request.
    NOTE(review): the chunking cell further down keeps each request within
    5000 characters; long transcripts passed here presumably hit that same
    limit. Confirm before using this on a full-length transcript.
    """
    return GoogleTranslator(source="nl", target="en").translate(dutch_text)

english_text = translate(dutch_text)

# Preview only the first 100 whitespace-separated words of the translation.
english_preview = " ".join(english_text.split()[:100]) + "..."
print("English translation (first 100 words):\n", english_preview)

English translation (first 100 words):
 The man and the cap by Noni. We all went to the village market. Dad bought a sunril for Chintu. Mama bought a clear blue cap for me. The baby got Supe. On the way home there was a very strong wind. It blew my cap away. My cap kept on the roof of the old Pippelboom. I led a lot and I had nothing to do in the evening. Later that evening the moon shine. He looked at my cap on the old Pippelboom. He put on my cap. The moon smiles happy and sound. I also had to...


In [None]:
# Read the previously saved full-length Dutch transcript back from disk.
with open('dutch_transcritpion_01.txt', encoding='utf-8') as transcript_file:
    text = transcript_file.read()

# Cut the transcript wherever the phrase "[Tt]ot de volgende keer" occurs;
# the phrase itself is consumed by the split.
chapter_end = re.compile(r'\b[Tt]ot de volgende keer')
chunks = chapter_end.split(text)
print(len(chunks))

In [12]:
# Ensure no chunk handed to the translator reaches 5000 characters.
# NOTE(review): the limit is kept strictly below 5000 so the boundary case
# cannot be rejected by the translator's length validation. Confirm whether
# exactly 5000 is accepted.
MAX_CHUNK_CHARS = 4999


def _split_long_chunk(chunk, max_chars):
    """Split ``chunk`` at ". " boundaries into stripped pieces of at most ``max_chars``."""
    # Re-attach the ". " separator to every sentence except the last one, so
    # the original text is reproduced exactly and no extra "." is appended.
    sentences = chunk.split(". ")
    sentences = [sentence + ". " for sentence in sentences[:-1]] + sentences[-1:]

    pieces = []
    current = ""
    for sentence in sentences:
        candidate = current + sentence
        # Measure what would actually be stored (the stripped text), including
        # the separator, so a piece can never end up over the limit.
        if len(candidate.strip()) <= max_chars:
            current = candidate
            continue
        if current.strip():
            pieces.append(current.strip())
        # A single sentence can itself be too long; as a last resort cut it
        # at the character limit (this may split a word).
        sentence = sentence.strip()
        while len(sentence) > max_chars:
            pieces.append(sentence[:max_chars])
            sentence = sentence[max_chars:].strip()
        current = sentence
    if current.strip():
        pieces.append(current.strip())
    return pieces


final_chunks = []
for chunk in chunks:
    if len(chunk) <= MAX_CHUNK_CHARS:
        # Short enough: keep the chunk exactly as it is.
        final_chunks.append(chunk)
    else:
        final_chunks.extend(_split_long_chunk(chunk, MAX_CHUNK_CHARS))

# One Dutch -> English translator instance is reused for every chunk.
translator = GoogleTranslator(source="nl", target="en")

# Results are appended one by one, so the chunks translated so far remain in
# `translated_chunks` if the loop is interrupted.
translated_chunks = []
for chunk in final_chunks:
    translated_chunks.append(translator.translate(chunk))

KeyboardInterrupt: 

In [188]:
# Write all translated chunks to one file, separated by a "***" line so the
# chapter breaks stay visible.
chapter_separator = "\n\n***\n\n"
with open("english_transcript.txt", mode="w", encoding="utf-8") as transcript_file:
    transcript_file.write(chapter_separator.join(translated_chunks))

In [13]:
def convert_to_df(dutch_text=None, english_text=None):
    """Build a two-column DataFrame pairing Dutch and English sentences by position.

    Both texts are cut into sentences at runs of '.', '!' or '?'. Sentence i of
    the Dutch text is placed next to sentence i of the English text; no other
    alignment is attempted.

    Args:
        dutch_text: Dutch source text.
        english_text: English translation of ``dutch_text``.
            If either argument is None or empty, BOTH are replaced by a
            built-in three-sentence example pair.

    Returns:
        pandas.DataFrame with columns "Dutch (Original)" and
        "English (Translation)". The shorter sentence list is padded with
        empty strings so both columns have the same length.
    """
    if not dutch_text or not english_text:
        # Example Dutch and English text (replace these with your actual transcriptions)
        dutch_text = """Dit is een voorbeeldzin in het Nederlands. Ik ben bezig met het testen van Whisper! De resultaten lijken behoorlijk accuraat te zijn?"""

        english_text = """This is an example sentence in Dutch. I am testing Whisper! The results seem to be quite accurate?"""

    def split_and_recombine(text):
        """Splits the text into sentences while keeping punctuation."""
        pattern = r'([.!?]+)'
        # Because the pattern has a capture group, re.split returns
        # [text, punct, text, punct, ..., text]: always an odd number of items,
        # ending with whatever follows the last punctuation run.
        split_text = re.split(pattern, text)
        sentences = ["".join(split_text[i:i+2]).strip() for i in range(0, len(split_text)-1, 2)]
        # Keep a final sentence that has no closing punctuation instead of
        # silently dropping it.
        trailing = split_text[-1].strip()
        if trailing:
            sentences.append(trailing)
        return sentences

    # Apply sentence splitting to both Dutch and English text
    dutch_sentences = split_and_recombine(dutch_text)
    english_sentences = split_and_recombine(english_text)

    # Ensure both lists have the same length (fill with empty strings if needed)
    max_len = max(len(dutch_sentences), len(english_sentences))
    dutch_sentences += [""] * (max_len - len(dutch_sentences))
    english_sentences += [""] * (max_len - len(english_sentences))

    # Create DataFrame
    df = pd.DataFrame({"Dutch (Original)": dutch_sentences, "English (Translation)": english_sentences})

    return df

# Build the sentence-by-sentence table from the transcription and its translation.
df = convert_to_df(dutch_text, english_text)


In [14]:
# Print the first five rows (fewer if the table is shorter) as "Dutch | English".
for _, row in df.head(5).iterrows():
    print(f'{row.iloc[0]} | {row.iloc[1]}')

De man en de pet door Noni. | The man and the cap by Noni.
We gingen allemaal naar de dorpsmarkt. | We all went to the village market.
Papa kocht voor Chintu een zonnepril. | Dad bought a sunril for Chintu.
Mama kocht voor mij een helderblauwe pet. | Mama bought a clear blue cap for me.
De baby kreeg supe. | The baby got Supe.


In [15]:
# Display every row of the table (same rows as head(len(df))).
df.iloc[:len(df)]

Unnamed: 0,Dutch (Original),English (Translation)
0,De man en de pet door Noni.,The man and the cap by Noni.
1,We gingen allemaal naar de dorpsmarkt.,We all went to the village market.
2,Papa kocht voor Chintu een zonnepril.,Dad bought a sunril for Chintu.
3,Mama kocht voor mij een helderblauwe pet.,Mama bought a clear blue cap for me.
4,De baby kreeg supe.,The baby got Supe.
5,Op weg naar huis was er een hele sterke wind.,On the way home there was a very strong wind.
6,Die blies mijn pet weg.,It blew my cap away.
7,Mijn pet bleef halen aan het dak van de oude p...,My cap kept on the roof of the old Pippelboom.
8,Ik helde veel en z'avonds had ik niets.,I led a lot and I had nothing to do in the eve...
9,Later die avond ging de maan schijnen.,Later that evening the moon shine.


In [160]:
# Export the table to CSV. With pandas' defaults the row index is written as
# an unnamed first column.
df.to_csv(path_or_buf='test.csv')

In [16]:
# The program name and its flag are separate list items: without a shell, a
# single 'ffmpeg -version' string would be looked up as one executable name.
res = subprocess.run(
    ['ffmpeg', '-version'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg -version'