In [None]:
import os
import sys
import torch

from tqdm import tqdm
from PIL import Image
from moviepy import VideoFileClip
from transformers import pipeline
from transformers import AutoProcessor
from transformers import AutoModelForSpeechSeq2Seq 

In [12]:
# A notebook has no real command line, so the argument vector is overwritten
# with the three working directories that the configuration cell reads from
# positions 0, 1 and 2. (In a regular script, position 0 would hold the
# script name instead.)
sys.argv = [
    "data/videos",  # input videos
    "data/audios",  # extracted audio tracks
    "data/texts",   # transcripts
]

In [13]:
# Working directories, taken positionally from the argument vector:
# 0 -> source videos, 1 -> extracted audio, 2 -> transcripts.
INPUT_DIR_VIDEO, OUTPUT_DIR_AUDIO, OUTPUT_DIR_TEXT = (
    sys.argv[0],
    sys.argv[1],
    sys.argv[2],
)

# Model identifier passed to the `from_pretrained` loaders further down.
MODEL_ID = "openai/whisper-tiny"

# Last expression of the cell: echo the main settings as the cell output.
(INPUT_DIR_VIDEO, OUTPUT_DIR_AUDIO, MODEL_ID)

('data/videos', 'data/audios', 'openai/whisper-tiny')

In [4]:
# Pick the compute device and the matching tensor precision in one place:
# half precision on a CUDA GPU, full precision on the CPU.
if torch.cuda.is_available():
    DEVICE, TORCH_DTYPE = torch.device('cuda'), torch.float16
else:
    DEVICE, TORCH_DTYPE = torch.device('cpu'), torch.float32

print('Using device:', DEVICE, TORCH_DTYPE)

Using device: cpu torch.float32


## Video to Audio

In [14]:
def extract_sound(video_path, output_dir):
    """Extract the audio track of a video and save it as an MP3 file.

    The output file is named after the video (extension replaced by
    ``.mp3``) and is written into ``output_dir``, which is created if it
    does not exist yet.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the MP3 file.

    Returns:
        The path of the written MP3 file.

    Raises:
        ValueError: If the loaded clip has no audio (``audio`` is ``None``).
        Exception: Anything raised while opening the video or writing the
            audio is propagated; the clips are closed before it propagates.
    """
    # Derive "<output_dir>/<video name>.mp3" from the video path
    stem, _ = os.path.splitext(os.path.basename(video_path))
    audio_path = os.path.join(output_dir, f"{stem}.mp3")

    # Load the video file
    video_clip = VideoFileClip(video_path)
    try:
        # Take the audio clip; fail with a clear message instead of an
        # AttributeError on None when there is nothing to export
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in video: {video_path}")

        try:
            # Create the target directory, then export the audio as MP3
            os.makedirs(output_dir, exist_ok=True)
            audio_clip.write_audiofile(audio_path)
        finally:
            audio_clip.close()
    finally:
        # Always close the clip so its resources are released, even when
        # the export above fails
        video_clip.close()

    return audio_path

In [6]:
# Collect the videos to process. Only regular files are kept, so
# sub-directories (including Jupyter's ".ipynb_checkpoints") are not handed
# to the extractor. Sorting makes the processing order reproducible, since
# os.listdir() returns entries in arbitrary order.
files = sorted(
    os.path.join(INPUT_DIR_VIDEO, item)
    for item in os.listdir(INPUT_DIR_VIDEO)
    if item != ".ipynb_checkpoints"
    and os.path.isfile(os.path.join(INPUT_DIR_VIDEO, item))
)
print("Number of files:", len(files))

Number of files: 1


In [7]:
print("Extrat audios from videos")
for video_path in files:
    try:
        extract_sound(video_path, OUTPUT_DIR_AUDIO)
    except Exception as exc:
        # Best effort: one broken video must not stop the batch, but the
        # reason is reported. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit working so the loop can be aborted.
        print("Hiba:", video_path, "-", repr(exc))

Extrat audios from videos
MoviePy - Writing audio in data/audios\inlp bead 2024 1.mp3


                                                                      

MoviePy - Done.




## Audio to Text

In [17]:
# Collect the audio files to transcribe. Besides Jupyter's
# ".ipynb_checkpoints" entry, any other non-file entry is skipped as well,
# and the list is sorted so the processing order is reproducible
# (os.listdir() returns entries in arbitrary order).
files = sorted(
    os.path.join(OUTPUT_DIR_AUDIO, item)
    for item in os.listdir(OUTPUT_DIR_AUDIO)
    if item != ".ipynb_checkpoints"
    and os.path.isfile(os.path.join(OUTPUT_DIR_AUDIO, item))
)
print("Number of files:", len(files))

Number of files: 1


In [15]:
# Load the speech-to-text model weights for MODEL_ID in the precision chosen
# for the current device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    dtype=TORCH_DTYPE,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)

# The processor supplies the tokenizer and the feature extractor that are
# handed to the pipeline below.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Bundle model, tokenizer and feature extractor into one callable that is
# later invoked with an audio file path.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=DEVICE,
    dtype=TORCH_DTYPE,
)
print("Model load complete!")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


Model load complete!


In [19]:
print("Extrat texts from audios")

# Make sure the transcript directory exists before the first file is written;
# open() does not create missing parent directories.
os.makedirs(OUTPUT_DIR_TEXT, exist_ok=True)

pbar = tqdm(files)
for file in pbar:
    pbar.set_description(file)

    # Extract the text from the audio.
    # NOTE: when given a file name, the pipeline needs the `ffmpeg`
    # executable to decode it and raises
    # "ValueError: ffmpeg was not found ..." if it is not installed.
    result = pipe(file, return_timestamps=True)

    # Build "<OUTPUT_DIR_TEXT>/<audio name>.txt" from the audio path
    file_name_without_ext, _ = os.path.splitext(os.path.basename(file))
    file_name_txt = os.path.join(OUTPUT_DIR_TEXT, f"{file_name_without_ext}.txt")

    # Write the transcript. UTF-8 is set explicitly so non-ASCII characters
    # are stored the same way on every platform instead of depending on the
    # locale's default encoding.
    with open(file_name_txt, "w", encoding="utf-8") as f:
        f.write(result["text"])

Extrat texts from audios


data/audios/inlp bead 2024 1.mp3:   0%|          | 0/1 [00:00<?, ?it/s]


ValueError: ffmpeg was not found but is required to load audio files from filename