In [15]:
import webvtt

#vtt similar to our data_collection file
def vtt_to_text(vtt_file):
    """Read a WebVTT file into a list of caption tuples.

    Args:
        vtt_file: Path of the subtitle file; handed unchanged to ``webvtt.read``.

    Returns:
        A list of ``(start, end, text)`` tuples, one per caption in the order
        yielded by ``webvtt.read``. ``text`` has leading and trailing
        whitespace stripped; line breaks inside a caption are kept.
    """
    return [
        (cue.start, cue.end, cue.text.strip())
        for cue in webvtt.read(vtt_file)
    ]

In [16]:
from datasets import Dataset, Audio, concatenate_datasets
import torchaudio

def _timestamp_to_seconds(timestamp):
    """Convert a colon-separated timestamp (e.g. ``HH:MM:SS.mmm``) to seconds."""
    # Walk the fields right-to-left so seconds, minutes and hours are
    # weighted by 60**0, 60**1 and 60**2 respectively.
    return sum(
        float(field) * 60 ** power
        for power, field in enumerate(reversed(timestamp.split(":")))
    )


def create_dataset(audio_file, transcript_data, target_sample_rate=16000):
    """Cut one audio file into caption-aligned clips and wrap them in a Dataset.

    Args:
        audio_file: Path of the audio/video file, loaded with ``torchaudio.load``.
        transcript_data: Iterable of ``(start, end, text)`` tuples whose
            timestamps are colon-separated strings such as ``HH:MM:SS.mmm``.
        target_sample_rate: Sample rate (Hz) the audio is resampled to before
            it is cut; also used to turn timestamps into frame indices.

    Returns:
        A ``datasets.Dataset`` with two columns: ``"audio"`` (a nested list
        holding a single mono channel of samples per clip) and ``"text"``
        (the caption text of that clip).

    Notes:
        * Multi-channel audio is averaged down to one channel. Whisper's
          feature extractor only accepts mono input and would otherwise
          treat every channel as a separate batch item.
        * Captions whose time window contains no audio frames (for example a
          caption that starts after the end of the recording) are skipped,
          because an empty clip paired with text is not a usable example.
    """
    # Assumes torchaudio's default channels-first layout: (channels, frames).
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix first; keepdim preserves the (1, frames) layout that the
    # slicing below and downstream consumers rely on.
    if waveform.dim() == 2 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the file is not already at the target rate.
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    audio_column = []
    text_column = []
    for start, end, text in transcript_data:
        start_frame = int(_timestamp_to_seconds(start) * target_sample_rate)
        end_frame = int(_timestamp_to_seconds(end) * target_sample_rate)

        # Slicing clamps to the available frames, so a window that lies
        # outside the recording yields an empty clip.
        clip = waveform[:, start_frame:end_frame]
        if clip.size(1) == 0:
            continue

        # Plain nested lists so Dataset.from_dict can ingest the samples.
        audio_column.append(clip.numpy().tolist())
        text_column.append(text)

    return Dataset.from_dict({"audio": audio_column, "text": text_column})


In [17]:
import os, re

# Finding the matching subtitle file for a given audio file
def find_subtitle_file(audio_filename, subtitle_folder):
    """Locate the subtitle file that belongs to an audio file.

    A subtitle matches when its file name contains the audio file name
    without its extension. When several names match, one that *starts with*
    that stem wins over one that merely contains it, and ties are broken
    alphabetically so the result does not depend on directory listing order.

    Args:
        audio_filename: File name of the audio file (extension included).
        subtitle_folder: Directory that is searched for subtitle files.

    Returns:
        The path of the chosen subtitle file (``subtitle_folder`` joined with
        its name), or ``None`` when nothing matches.
    """
    base_audio_name = os.path.splitext(audio_filename)[0]

    # An empty stem is a substring of every name and would match any file.
    if not base_audio_name:
        return None

    # os.listdir returns entries in arbitrary order; sort for determinism.
    candidates = sorted(
        name for name in os.listdir(subtitle_folder) if base_audio_name in name
    )
    if not candidates:
        return None

    # Prefer a name that begins with the stem; fall back to the first
    # substring match so previously matching files are still found.
    for name in candidates:
        if name.startswith(base_audio_name):
            return os.path.join(subtitle_folder, name)
    return os.path.join(subtitle_folder, candidates[0])

# Processing all audio and subtitle files in the folder
def process_videos(audio_folder, subtitle_folder):
    """Build one combined dataset from every audio file that has subtitles.

    Each regular file in ``audio_folder`` is paired with a subtitle file via
    ``find_subtitle_file``; the subtitle is parsed with ``vtt_to_text`` and
    turned into a dataset with ``create_dataset``. Files are visited in
    sorted name order so the row order of the result is reproducible.

    Args:
        audio_folder: Directory containing the audio files.
        subtitle_folder: Directory containing the subtitle files.

    Returns:
        The result of ``concatenate_datasets`` over all per-file datasets.

    Raises:
        ValueError: If no audio file could be paired with a subtitle file.
    """
    datasets = []

    # os.listdir order is arbitrary; sorting keeps runs reproducible.
    for audio_filename in sorted(os.listdir(audio_folder)):
        audio_file = os.path.join(audio_folder, audio_filename)

        # Sub-directories are not audio files and cannot be loaded.
        if not os.path.isfile(audio_file):
            continue

        subtitle_file = find_subtitle_file(audio_filename, subtitle_folder)
        if not subtitle_file:
            print(f"No matching subtitle for {audio_filename}")
            continue

        print(f"Processing {audio_filename} and {subtitle_file}")
        transcript_data = vtt_to_text(subtitle_file)
        datasets.append(create_dataset(audio_file, transcript_data))

    if not datasets:
        raise ValueError("No datasets were processed.")

    return concatenate_datasets(datasets)


In [18]:
# Input folders: one with the audio files, one with the subtitle files.
audio_path = 'audio_files/'
subtitle_path = 'manual_sub/'

# Pair every audio file with its subtitles and merge the results.
combined_dataset = process_videos(
    audio_folder=audio_path,
    subtitle_folder=subtitle_path,
)

Processing Lecture10TheCompletenessoftheRealNumbersandBasicPropertiesofInfiniteSeries.webm and manual_sub/Lecture10TheCompletenessoftheRealNumbersandBasicPropertiesofInfiniteSeries_manual.en-j3PyPqV-e1s.vtt...
Processing Lecture11AbsoluteConvergenceandtheComparisonTestforSeries.mp4 and manual_sub/Lecture11AbsoluteConvergenceandtheComparisonTestforSeries_manual.en-j3PyPqV-e1s.vtt...
Processing Lecture12TheRatioRootandAlternatingSeriesTests.mp4 and manual_sub/Lecture12TheRatioRootandAlternatingSeriesTests_manual.en-j3PyPqV-e1s.vtt...
Processing Lecture13LimitsofFunctions.mp4 and manual_sub/Lecture13LimitsofFunctions_manual.en-j3PyPqV-e1s.vtt...
Processing Lecture14LimitsofFunctionsinTermsofSequencesandContinuity.mp4 and manual_sub/Lecture14LimitsofFunctionsinTermsofSequencesandContinuity_manual.en-j3PyPqV-e1s.vtt...
Processing Lecture15TheContinuityofSineandCosineandtheManyDiscontinuitiesofDirichletsFunction.mp4 and manual_sub/Lecture15TheContinuityofSineandCosineandtheManyDiscontinuitie

In [20]:
import torch
from datasets import Dataset, Audio
from transformers import WhisperTokenizer, WhisperProcessor, Trainer, TrainingArguments, WhisperFeatureExtractor

# The feature extractor, processor and tokenizer all come from one checkpoint.
WHISPER_CHECKPOINT = 'openai/whisper-base'

feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
# The tokenizer is configured for English transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    WHISPER_CHECKPOINT,
    language='en',
    task='transcribe',
)


import numpy as np
from datasets import Dataset

def preprocess_function(dataset):
    """Turn one ``{"audio", "text"}`` example into Whisper model inputs.

    Intended for non-batched ``Dataset.map``: ``dataset`` is a single example,
    not a whole dataset.

    Args:
        dataset: Mapping with an ``"audio"`` entry (samples at 16 kHz, either
            a flat list or a list of per-channel sample lists) and a
            ``"text"`` entry holding the transcript string.

    Returns:
        A dict with ``"input_features"`` (the feature extractor's
        ``input_features``) and ``"labels"`` (the tokenizer's ``input_ids``).
        Both keep their leading batch dimension of size one.
    """
    # Whisper only accepts mono audio. Passing several channel lists would be
    # read as a batch of several clips and no longer pair with the single
    # label row, so collapse the channels into one waveform first.
    waveform = np.asarray(dataset['audio'], dtype=np.float32)
    if waveform.ndim == 2:
        waveform = waveform.mean(axis=0)

    # "max_length" pads/truncates every clip to Whisper's fixed 30-second
    # window. padding=True would only pad to the longest clip in the call and
    # produce features shorter than the encoder expects.
    audio_inputs = feature_extractor(
        [waveform],
        sampling_rate=16000,
        return_tensors="pt",
        padding="max_length",
    )

    # Tokenize the transcript into label ids.
    text_inputs = tokenizer(dataset['text'], return_tensors="pt", padding=True, truncation=True)

    return {
        "input_features": audio_inputs.input_features,
        "labels": text_inputs.input_ids
    }

# Encode every example and drop the raw columns, keeping only model inputs.
encoded_dataset = combined_dataset.map(
    function=preprocess_function,
    remove_columns=["audio", "text"],
)

Map:   0%|          | 0/20688 [00:00<?, ? examples/s]

In [24]:
# Sanity check: display the label token ids stored for the example at index 5.
encoded_dataset[5]['labels']

[[50258,
  50259,
  50359,
  50363,
  6455,
  321,
  600,
  1612,
  2121,
  3467,
  295,
  198,
  11834,
  2667,
  11,
  1108,
  310,
  546,
  22978,
  11,
  50257]]