In [None]:
# Install jiwer (used below to compute word error rate) into the kernel's environment.
%pip install jiwer

In [None]:
import torch
import os
import accelerate
import jiwer
import pandas as pd

from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [None]:
# Use the first GPU with half precision when CUDA is available;
# otherwise run on the CPU in full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Ground-truth table, read from a semicolon-separated CSV.
gt = pd.read_csv(
    '/home/jupyter/datasphere/project/rodion_dir/GT_train.csv',
    sep=';',
)

In [None]:
def make_audio_files(directory):
    """Convert every audio file under ``directory`` to 16 kHz mono WAV.

    The directory tree is walked recursively. Each file whose extension
    (compared case-insensitively) is one of mp3, m4a, ogg, flac, aac or wav
    is decoded, down-mixed to one channel, resampled to 16 kHz and written
    next to the source as ``<stem>.wav``. A source that is already a
    ``.wav`` file is therefore rewritten in place.

    Each target WAV path is produced and reported at most once, so calling
    the function again on a directory that already contains the generated
    WAV files returns the same list instead of duplicated entries.

    Args:
        directory: Root directory to search for audio files.

    Returns:
        List of paths to the WAV files, in sorted traversal order.
    """
    audio_extensions = {".mp3", ".m4a", ".ogg", ".flac", ".aac", ".wav"}
    audio_files = []
    seen_wav_paths = set()
    for root, dirs, files in os.walk(directory):
        # Sorting sub-directories in place and file names below makes the
        # returned order reproducible between runs and machines.
        dirs.sort()
        for file in sorted(files):
            stem, extension = os.path.splitext(file)
            if extension.lower() not in audio_extensions:
                continue

            file_path = os.path.join(root, file)
            # Whatever the source format, the output lives next to it as .wav.
            wav_path = os.path.join(root, stem + ".wav")

            # "name.mp3" and a previously generated "name.wav" map to the same
            # output; handle it once. All other supported extensions sort
            # before ".wav", so the non-WAV source is the one that is converted.
            if wav_path in seen_wav_paths:
                continue
            seen_wav_paths.add(wav_path)

            # Decode, then convert to single-channel 16 kHz audio.
            audio = AudioSegment.from_file(file_path)
            audio = audio.set_channels(1).set_frame_rate(16000)

            # Write the result as WAV and remember where it went.
            audio.export(wav_path, format="wav")
            audio_files.append(wav_path)
    return audio_files

In [None]:
def predict_directory(pipe, directory: str) -> pd.DataFrame:
    """Transcribe every audio file under ``directory`` and tabulate the results.

    The directory is passed through ``make_audio_files`` (which converts the
    recordings to WAV as a side effect), and every returned WAV path is sent
    to ``pipe`` with Russian transcription settings.

    Args:
        pipe: Speech-recognition callable; invoked as
            ``pipe(path, generate_kwargs=...)`` and expected to return a
            mapping with a ``"text"`` entry.
        directory: Root directory containing the audio files.

    Returns:
        DataFrame with one row per audio file and two columns:
        ``"Наименование аудиозаписи"`` (name derived from the file path) and
        ``"predicted"`` (the transcription text).
    """
    # make_audio_files is the helper this notebook defines for collecting
    # audio paths; no function called find_audio_files exists here.
    audio_files = make_audio_files(directory)
    names = []
    transcriptions = []
    for file_path in audio_files:
        # Transcribe the file of this iteration, not a notebook-level variable.
        result = pipe(file_path, generate_kwargs={"language": "ru", "task": "transcribe"})
        transcriptions.append(result["text"])
        # NOTE(review): fixed-width slice — drops the first 14 and the last 17
        # characters of the path. Presumably tuned to one specific path layout
        # so the result matches the names in the ground-truth table; confirm
        # before using a different directory or file-naming scheme.
        names.append(file_path[14:-17])
    return pd.DataFrame({"Наименование аудиозаписи": names, "predicted": transcriptions})

def calc_sub_metric(submission, gt=gt) -> pd.DataFrame:
    """Score predicted transcriptions against the ground truth.

    Args:
        submission: DataFrame with columns ``'Наименование аудиозаписи'`` and
            ``'predicted'``.
        gt: Ground-truth DataFrame with columns ``'Наименование аудиозаписи'``
            and ``'Транскрибированный текст'``. Defaults to the notebook-level
            ``gt`` as it was when this function was defined.

    Returns:
        ``gt`` left-merged with ``submission`` plus a ``'WER'`` column.
        Despite its name, the column holds ``1 - WER`` (higher is better).
        Ground-truth rows without a prediction get ``0.0``, the score an
        empty transcription would receive (WER of 1).
    """
    df = gt.merge(submission, how="left", on='Наименование аудиозаписи')

    scores = []
    for reference, hypothesis in zip(df['Транскрибированный текст'], df['predicted']):
        if pd.isna(hypothesis):
            # The left merge leaves NaN where no prediction exists; jiwer
            # only accepts strings, so score the row directly.
            scores.append(0.0)
        else:
            scores.append(1 - jiwer.wer(reference, hypothesis))
    df['WER'] = scores
    return df


def test_model(pipe, directory, gt=gt) ->pd.DataFrame:
    """Transcribe a directory with ``pipe`` and score it against ``gt``.

    Args:
        pipe: Speech-recognition callable handed to ``predict_directory``.
        directory: Root directory containing the audio files.
        gt: Ground-truth DataFrame handed to ``calc_sub_metric``. Defaults to
            the notebook-level ``gt`` as it was when this function was defined.

    Returns:
        The scored DataFrame produced by ``calc_sub_metric``.
    """
    submission = predict_directory(pipe, directory)
    # Forward the caller's ground truth so a custom ``gt`` is actually used.
    return calc_sub_metric(submission, gt=gt)

In [None]:
# Convert the training recordings to 16 kHz mono WAV and keep the resulting paths.
directory = "/home/jupyter/datasphere/project/rodion_dir/train"
audio_files = make_audio_files(directory)

In [None]:
model_id = "openai/whisper-large-v3"

# Load the checkpoint in the dtype chosen above and place it on the target device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    torch_dtype=torch_dtype,
).to(device)

# The processor supplies the tokenizer and the feature extractor for the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline built from the model and the processor's components.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
)

In [None]:
# Pick one converted file for a quick check; index 2 needs at least three files in audio_files.
sample = audio_files[2]

In [None]:
# Transcribe the single sample with Russian transcription settings and print the text.
result = pipe(sample, generate_kwargs={"language": "ru", "task": "transcribe"})
print(result["text"])