In [None]:
# System setup: register the jonathonf/ffmpeg-4 PPA, refresh the apt package
# index, then install ffmpeg from apt (all in quiet mode).
# NOTE(review): this depends on a third-party PPA — confirm it is still published.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4 -q
!apt update -q
!apt install -y ffmpeg -q

In [None]:

!pip install -q "datasets>=2.6.1"
!pip install -q "evaluate>=0.3.0"
!pip install -q git+https://github.com/huggingface/transformers.git@main
!pip install -q librosa
!pip install -q jiwer
!pip install -q gradio
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/evaluate.git@main



In [None]:
# Authenticate this notebook session with the Hugging Face Hub by calling
# huggingface_hub's notebook_login().
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import os

# Restrict CUDA to device 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Base checkpoint and decoding settings.
model_name_or_path = "openai/whisper-small"
language = "English"
language_abbr = "en"
task = "transcribe"

# Hub dataset id in "<namespace>/<name>" form. The previous value contained a
# doubled slash ("Trelis//llm-lingo"), which is not a valid repository id and
# did not match the repository cloned later in this notebook.
dataset_name = "Trelis/llm-lingo"

# Destination repositories for the trained adapters and the merged model.
org = "Trelis"
trained_adapter_name = "whisper-small-llm-lingo-adapters"
trained_model_name = "whisper-small-llm-lingo"
trained_adapter_repo = f"{org}/{trained_adapter_name}"
trained_model_repo = f"{org}/{trained_model_name}"


In [None]:
from google.colab import files
uploaded = files.upload()
!pip install moviepy

from moviepy.editor import *

video = VideoFileClip("train1.mp4")
video.audio.write_audiofile("train1.mp3")

In [None]:
from google.colab import files
uploaded = files.upload()
!pip install moviepy

from moviepy.editor import *

video = VideoFileClip("validation_MyVoice.mp4")
video.audio.write_audiofile("validation_MyVoice.mp3")

In [None]:
!pip install -q transformers datasets librosa torchaudio

from transformers import pipeline
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperTimeStampLogitsProcessor,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
import torch

In [None]:
# Checkpoint used for the baseline (pre-fine-tuning) transcription.
model_name_or_path = "openai/whisper-small"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
_asr_device = "cuda" if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline configured with 30-second chunking.
whisper_asr = pipeline(
    task="automatic-speech-recognition",
    model=model_name_or_path,
    device=_asr_device,
    chunk_length_s=30,
    ignore_warning=True,
)

In [None]:
def format_time(seconds):
    """
    Format a time offset as a WebVTT timestamp, ``HH:MM:SS.mmm``.

    Parameters:
    - seconds: offset in seconds (int or float), or None

    Returns:
    - the timestamp string; ``"00:00:00.000"`` when ``seconds`` is None

    Notes:
    - WebVTT uses a period before the milliseconds (a comma is the SubRip/SRT
      convention), so the period is always emitted — matching what the None
      branch already returned.
    - The value is rounded to whole milliseconds first and then split into
      fields, so a value such as 59.9996 carries into the minutes field
      ("00:01:00.000") instead of producing an invalid "60.000" seconds field.
    - Negative offsets are clamped to zero.
    """
    if seconds is None:
        return "00:00:00.000"

    total_ms = max(int(round(float(seconds) * 1000)), 0)
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}.{millis:03d}"


def process_audio_and_create_vtt(audio_filename, audio_type, whisper_asr, output_filename=None):
    """
    Generate VTT subtitle file from audio using Whisper ASR.

    Parameters:
    - audio_filename: base name (without extension)
    - audio_type: e.g., 'mp3', 'mp4', 'wav'
    - whisper_asr: pipeline object from Hugging Face; called with the audio
      path and return_timestamps=True, and expected to return a mapping that
      may contain a "chunks" list of {"timestamp": (start, end), "text": ...}
    - output_filename: optional output filename (e.g., 'output.vtt');
      defaults to '<audio_filename>.vtt'

    Returns:
    - the path of the VTT file that was written

    Chunks without both a start and an end timestamp cannot be expressed as a
    cue; they are left out of the file and reported through a warning rather
    than being dropped silently.
    """
    import warnings

    prediction = whisper_asr(f"{audio_filename}.{audio_type}", return_timestamps=True)

    vtt_file_name = output_filename if output_filename else f"{audio_filename}.vtt"

    skipped_chunks = 0
    with open(vtt_file_name, "w", encoding="utf-8") as vtt_file:
        vtt_file.write("WEBVTT\n\n")
        for chunk in prediction.get("chunks", []):
            # `or` also covers an explicit `"timestamp": None` entry, which
            # would otherwise fail on tuple unpacking.
            start, end = chunk.get("timestamp") or (None, None)

            if start is None or end is None:
                skipped_chunks += 1
                continue

            # WebVTT cue timings need '.' before the milliseconds; normalise
            # here so the file is valid even if the formatter emits the
            # SRT-style ','.
            start_time = format_time(start).replace(",", ".")
            end_time = format_time(end).replace(",", ".")
            text = chunk.get("text", "").strip()

            vtt_file.write(f"{start_time} --> {end_time}\n{text}\n\n")

    if skipped_chunks:
        warnings.warn(
            f"{vtt_file_name}: skipped {skipped_chunks} chunk(s) with a missing "
            "start or end timestamp; their text is not in the subtitle file."
        )

    return vtt_file_name


In [None]:
# Transcribe both extracted MP3 files and write a .vtt file next to each.
for audio_stem in ("train1", "validation_MyVoice"):
    process_audio_and_create_vtt(audio_stem, "mp3", whisper_asr)

In [None]:
# Enable Git LFS, then clone the Trelis/llm-lingo dataset repository into
# ./llm-lingo; later cells read the parquet files from this checkout.
!git lfs install
!git clone https://huggingface.co/datasets/Trelis/llm-lingo


In [None]:
import os

# Print every file in the cloned dataset checkout.
# The loop variables are deliberately NOT called `files` / `file`: at notebook
# scope `files` is the `google.colab.files` module imported by the upload
# cells, and rebinding it here would break any later files.upload() or
# files.download() call.
for dirpath, _dirnames, filenames in os.walk("llm-lingo"):
    for filename in filenames:
        print(os.path.join(dirpath, filename))



In [None]:
from datasets import Dataset, DatasetDict, Audio
import pandas as pd

# Read the two parquet shards from the cloned repository.
train_df = pd.read_parquet("llm-lingo/data/train-00000-of-00001.parquet")
val_df = pd.read_parquet("llm-lingo/data/validation-00000-of-00001.parquet")

print(f" Train rows: {len(train_df)}")
print(f" Validation rows: {len(val_df)}")

# Wrap each frame as a Dataset and cast its "audio" column to the Audio feature.
train_dataset = Dataset.from_pandas(train_df).cast_column("audio", Audio())
val_dataset = Dataset.from_pandas(val_df).cast_column("audio", Audio())

# Bundle both splits under the names used by the rest of the notebook.
dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})

print("\n Dataset loaded:")
print(dataset)

# List every sample of each split: audio path plus transcript text.
print("\n All Train Samples:")
for i, sample in enumerate(dataset["train"]):
    print(f"Train {i+1}: {sample['audio']['path']} — Text: {sample.get('text', 'No text')}")

print("\n All Validation Samples:")
for i, sample in enumerate(dataset["validation"]):
    print(f"Validation {i+1}: {sample['audio']['path']} — Text: {sample.get('text', 'No text')}")


In [None]:
from transformers import WhisperFeatureExtractor

# Load the feature extractor for the checkpoint named by model_name_or_path
# (set in an earlier cell to "openai/whisper-small").
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
