In [15]:
import os
import numpy as np
import webvtt
from datasets import Dataset, concatenate_datasets
from scipy.io import wavfile
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer, DataCollatorForSeq2Seq

In [2]:
def vtt_to_text(vtt_file):
    transcript = []
    for caption in webvtt.read(vtt_file):
        transcript.append((caption.start, caption.end, caption.text.strip()))
    return transcript

In [3]:
def create_dataset(audio_file, transcript_data, target_sample_rate=16000):
    try:
        #Load the audio file with scipy
        sample_rate, audio_data = wavfile.read(audio_file)
        print(f"Audio data shape: {audio_data.shape}, Sample rate: {sample_rate}")
    except Exception as e:
        raise RuntimeError(f"Failed to load audio file {audio_file}: {e}")

    #Preparing the dataset
    data = []
    
    for start, end, text in transcript_data:
        #Converting time from HH:MM:SS to seconds
        start_sec = sum(float(x) * 60 ** i for i, x in enumerate(reversed(start.split(":"))))
        end_sec = sum(float(x) * 60 ** i for i, x in enumerate(reversed(end.split(":"))))
        
        #Calculating start and end frames based on the target sample rate
        start_frame = max(0, min(int(start_sec * sample_rate), len(audio_data)))
        end_frame = max(0, min(int(end_sec * sample_rate), len(audio_data)))

        #Cutting the audio chunk for the corresponding transcript
        audio_chunk = audio_data[start_frame:end_frame]

        #Ensure the audio chunk is in the correct format
        audio_chunk_list = audio_chunk.tolist()
        data.append({"audio": audio_chunk_list, "text": text})

    #Check if transcript data is empty
    if not data:
        print(f"No valid transcript data for {audio_file}.")
        return None
    
    #Creating the dataset
    dataset = Dataset.from_dict({
        "audio": [d["audio"] for d in data],
        "text": [d["text"] for d in data]
    })
    
    print(f"Created dataset for {audio_file} with {len(data)} entries.")
    return dataset

In [4]:
def find_subtitle_file(audio_filename, subtitle_folder):
    base_audio_name = os.path.splitext(audio_filename)[0]  
    # Searching for the subtitle file that contains the audio filename in their name
    for subtitle_filename in os.listdir(subtitle_folder):
        if base_audio_name in subtitle_filename and subtitle_filename.endswith('.vtt'):
            return os.path.join(subtitle_folder, subtitle_filename)
    
    return None

In [5]:
def process_videos(audio_folder, subtitle_folder):
    datasets = []

    for audio_file in os.listdir(audio_folder):
        if audio_file.endswith('.wav'):
            input_file_path = os.path.join(audio_folder, audio_file)

            #Find the corresponding subtitle file
            subtitle_file_path = find_subtitle_file(audio_file, subtitle_folder)
            if subtitle_file_path:
                print(f"Found subtitle file: {subtitle_file_path}")

                #Convert subtitles to text transcript
                transcript_data = vtt_to_text(subtitle_file_path)
                if transcript_data:
                    #Create the dataset using the existing WAV file
                    dataset = create_dataset(input_file_path, transcript_data)
                    if dataset:  #Check if the dataset is created
                        datasets.append(dataset)
                    else:
                        print(f"Failed to create dataset for {input_file_path}.")
                else:
                    print(f"No transcript data found for {subtitle_file_path}. The file may be empty or incorrectly formatted.")
            else:
                print(f"Subtitle file not found for {audio_file}. Expected naming convention: {os.path.splitext(audio_file)[0]}*.vtt")

    #Concatenate all datasets
    if datasets:
        combined_dataset = concatenate_datasets(datasets)
        print(f"Combined dataset created with {len(datasets)} individual datasets.")
        return combined_dataset
    else:
        print("No datasets were created.")
        return None

In [6]:
audio_path = 'audio_files/'
subtitle_path = 'manual_sub/'
combined_dataset = process_videos(audio_path, subtitle_path)

Found subtitle file: manual_sub/Lec13LagrangemultipliersMIT1802MultivariableCalculusFall2007_manual.en.vtt
Audio data shape: (48159301,), Sample rate: 16000
Created dataset for audio_files/Lec13LagrangemultipliersMIT1802MultivariableCalculusFall2007.wav with 583 entries.
Found subtitle file: manual_sub/Lecture20TaylorsTheoremandtheDefinitionofRiemannSums_manual.en-j3PyPqV-e1s.vtt
Audio data shape: (50431513,), Sample rate: 16000
Created dataset for audio_files/Lecture20TaylorsTheoremandtheDefinitionofRiemannSums.wav with 583 entries.
Found subtitle file: manual_sub/Lecture12TheRatioRootandAlternatingSeriesTests_manual.en-j3PyPqV-e1s.vtt
Audio data shape: (57920598,), Sample rate: 16000
Created dataset for audio_files/Lecture12TheRatioRootandAlternatingSeriesTests.wav with 661 entries.
Found subtitle file: manual_sub/Lecture19DifferentiationRulesRollesTheoremandtheMeanValueTheorem_manual.en-j3PyPqV-e1s.vtt
Audio data shape: (71469906,), Sample rate: 16000
Created dataset for audio_files

Created dataset for audio_files/Lecture17UniformContinuityandtheDefinitionoftheDerivative.wav with 819 entries.
Found subtitle file: manual_sub/Lecture4TheCharacterizationoftheRealNumbers_manual.en-j3PyPqV-e1s.vtt
Audio data shape: (78781099,), Sample rate: 16000
Created dataset for audio_files/Lecture4TheCharacterizationoftheRealNumbers.wav with 898 entries.
Found subtitle file: manual_sub/Lec10SecondderivativetestboundariesinfinityMIT1802MultivariableCalculusFall2007_manual.en-qlPKC2UN_YU.vtt
Audio data shape: (50207115,), Sample rate: 16000
Created dataset for audio_files/Lec10SecondderivativetestboundariesinfinityMIT1802MultivariableCalculusFall2007.wav with 627 entries.
Found subtitle file: manual_sub/Lecture13LimitsofFunctions_manual.en-j3PyPqV-e1s.vtt
Audio data shape: (69971940,), Sample rate: 16000
Created dataset for audio_files/Lecture13LimitsofFunctions.wav with 910 entries.
Found subtitle file: manual_sub/Lec20PathindependenceandconservativefieldsMIT1802MultivariableCalcul

In [7]:
print("Number of entries in the dataset:", len(combined_dataset))
print("Dataset shape:", combined_dataset.shape)
print("Features of the dataset:", combined_dataset.features)

Number of entries in the dataset: 32348
Dataset shape: (32348, 2)
Features of the dataset: {'audio': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'text': Value(dtype='string', id=None)}


In [10]:
model_name = 'openai/whisper-base'
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, language='en', task='transcribe')


def preprocess_function(dataset):
    #Use the feature extractor to process audio inputs
    audio = dataset['audio']
    audio_inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt", padding=True)

    #Use the tokenizer to process text inputs
    text = dataset['text']
    text_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    return {
        "input_features": audio_inputs.input_features,
        "labels": text_inputs.input_ids
    }

encoded_dataset = combined_dataset.map(preprocess_function, remove_columns=["audio", "text"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/32348 [00:00<?, ? examples/s]

In [12]:
encoded_dataset.save_to_disk("encoded_dataset")

Saving the dataset (0/10 shards):   0%|          | 0/32348 [00:00<?, ? examples/s]

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_name, padding=True)
data_collator

DataCollatorForSeq2Seq(tokenizer=WhisperTokenizer(name_or_path='openai/whisper-base', vocab_size=50258, model_max_length=448, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>', '<|he|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>', '<|ta|>', '<|no|>', '<|th|>', '<|ur|>', '<|hr|>', '<|bg|>', '<|lt|>', '<|la|>', '<|mi|>', '<|ml|>', '<|cy|>', '<|sk|>', '<|te|>', '<|fa|>', '<|lv|>', '<|bn|>', '<|sr|>', '<|az|>', '<|sl|>', '<|kn|>', '<|et|>', '<|mk|>', '<|br|>', '<|eu|>', '<|is|>', '<|hy|>', '<|ne|>', '<|mn|>', '<|bs|>', '<|kk|>', '<|sq|>', '<|sw|>', '<|g

In [14]:
print("Number of entries in the dataset:", len(encoded_dataset))
print("Dataset shape:", encoded_dataset.shape)
print("Features of the dataset:", encoded_dataset.features)

Number of entries in the dataset: 32348
Dataset shape: (32348, 2)
Features of the dataset: {'input_features': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), 'labels': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None)}
