<a href="https://colab.research.google.com/github/sadhiika/speech_task/blob/main/sarvam_task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages
!pip install pytube pydub transformers torch nltk huggingface_hub

# Install FFmpeg
!apt-get install ffmpeg

import os
import numpy as np
from pytube import YouTube
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import nltk
from nltk.tokenize import sent_tokenize
import gc
import logging
from huggingface_hub import login

nltk.download('punkt')

# Suppress warnings from transformers library
logging.getLogger("transformers").setLevel(logging.ERROR)

# Authenticate with Hugging Face
huggingface_token = "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"
login(huggingface_token)

# Verify FFmpeg installation
!ffmpeg -version
!ffprobe -version

# Step 1: Download Video and Extract Audio
def download_youtube_video(url, output_path="video.mp4"):
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    stream.download(filename=output_path)
    print(f"Downloaded video to {output_path}")
    if os.path.exists(output_path):
        print(f"Video file {output_path} exists.")
    else:
        print(f"Video file {output_path} does not exist.")

def extract_audio(video_path, audio_path="audio.wav"):
    print(f"Extracting audio from {video_path}")
    if not os.path.exists(video_path):
        print(f"Video file {video_path} does not exist.")
        return
    try:
        video = AudioSegment.from_file(video_path)
        video.export(audio_path, format="wav")
        print(f"Extracted audio to {audio_path}")
    except Exception as e:
        print(f"Error during audio extraction: {e}")

# Function to process audio in chunks
def process_audio_in_chunks(audio_path, chunk_length_ms=60000):  # 60 seconds per chunk
    audio = AudioSegment.from_wav(audio_path)
    chunks = []
    for i in range(0, len(audio), chunk_length_ms):
        chunk = audio[i:i+chunk_length_ms]
        chunk_path = f"chunk_{i//chunk_length_ms}.wav"
        chunk.export(chunk_path, format="wav")
        chunks.append(chunk_path)
    return chunks

# Step 2: Transcribe Audio
def transcribe_audio(audio_path):
    # Load pre-trained model and processor
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    # Load audio file
    audio = AudioSegment.from_wav(audio_path)
    audio = audio.set_frame_rate(16000)
    audio = audio.set_channels(1)
    audio_array = np.array(audio.get_array_of_samples(), dtype=np.float32)

    # Transcription
    input_values = processor(audio_array, return_tensors="pt", sampling_rate=16000).input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

# Step 3: Time-Align Transcript with Audio
def time_align_transcript(audio_path, transcription):
    # Dummy alignment example
    duration = len(AudioSegment.from_wav(audio_path)) / 1000.0
    words = transcription.split()
    avg_word_duration = duration / len(words)

    aligned_transcript = []
    current_time = 0.0

    for word in words:
        word_start_time = current_time
        word_end_time = current_time + avg_word_duration
        aligned_transcript.append((word, word_start_time, word_end_time))
        current_time = word_end_time

    return aligned_transcript

# Step 4: Semantic Chunking of Data
def semantic_chunking(transcript, max_chunk_length=15.0):
    sentences = sent_tokenize(transcript)
    chunks = []
    chunk = []
    current_length = 0.0

    for sentence in sentences:
        sentence_duration = len(sentence.split()) * (max_chunk_length / len(transcript.split()))
        if current_length + sentence_duration > max_chunk_length:
            chunks.append(chunk)
            chunk = []
            current_length = 0.0
        chunk.append(sentence)
        current_length += sentence_duration

    if chunk:
        chunks.append(chunk)

    # Assign IDs and timestamps to chunks
    chunked_data = []
    chunk_start_time = 0.0

    for i, chunk in enumerate(chunks):
        chunk_text = " ".join(chunk)
        chunk_end_time = chunk_start_time + (len(chunk_text.split()) * (max_chunk_length / len(transcript.split())))
        chunk_length = chunk_end_time - chunk_start_time
        chunked_data.append({
            "chunk_id": i + 1,
            "chunk_length": chunk_length,
            "text": chunk_text,
            "start_time": chunk_start_time,
            "end_time": chunk_end_time,
        })
        chunk_start_time = chunk_end_time

    return chunked_data

# Download video and extract audio
download_youtube_video("https://youtu.be/Sby1uJ_NFIY?si=gwHJ-Y17itix3l1Z", "video.mp4")
extract_audio("video.mp4", "audio.wav")

# Process audio in chunks
audio_chunks = process_audio_in_chunks("audio.wav")

all_transcriptions = []
for chunk_path in audio_chunks:
    transcription = transcribe_audio(chunk_path)
    aligned_transcript = time_align_transcript(chunk_path, transcription)
    chunks = semantic_chunking(transcription)
    all_transcriptions.extend(chunks)
    # Clean up the chunk to save memory
    os.remove(chunk_path)
    del transcription, aligned_transcript, chunks
    gc.collect()

# Output results
import json
output_path = "semantic_chunks.json"
with open(output_path, "w") as f:
    json.dump(all_transcriptions, f, indent=4)

# Display output
from IPython.display import display, JSON
display(JSON(all_transcriptions))

# Provide a download link for the JSON file
from google.colab import files
files.download(output_path)


Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

<IPython.core.display.JSON object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


# Short Description of the Code

*Download Video and Extract Audio:*
I used the pytube library to download the YouTube video because its a straightforward and reliable tool for accessing video content. For extracting the audio, pydub is very effective as it handles various audio formats and provides easy-to-use methods for exporting audio files.


*Transcription of Audio*
To transcribe the audio, I chose the Wav2Vec2ForCTC model from Hugging Face. This model is well-regarded for its high accuracy in speech recognition tasks. I used it because it's pre-trained on a large dataset, which makes it highly effective at converting spoken language into text without needing additional training. By setting the audio sample rate to 16kHz and ensuring it is mono, I optimized the transcription quality.


*Time-Align Transcript with Audio*
For aligning the transcript with the audio, I devised a method to calculate the average duration of each word. Although this method assumes a uniform distribution of words, it is straightforward and allows for a basic alignment without complex algorithms. This approach provides a quick and understandable way to match the text with the corresponding audio segments.


*Semantic Chunking of Data*
I used the nltk library to split the transcription into sentences for semantic chunking. This library is well-suited for natural language processing tasks. By grouping sentences into chunks and ensuring each chunk is less than 15 seconds long, I maintained the semantic integrity of the text while also adhering to a manageable chunk length for analysis. This approach balances the need for meaningful text segments with practical audio chunk sizes.


*Handling RAM Limitations*
Initially, I encountered a "ran out of RAM" error when processing the entire audio file at once. To overcome this, I modified the approach to process the audio in smaller chunks, specifically 60-second segments. This chunking method allowed me to handle the audio transcription without exceeding the memory limits of the environment. By processing each chunk individually and then combining the results, I managed to efficiently transcribe the entire audio while keeping the memory usage within acceptable limits.

### [Bonus-2] Utilizing Ground-Truth Transcripts


To improve the quality of the transcript, I would use a ground-truth transcript. We could compare the automatic transcription with this perfect version and use a method called dynamic time warping (DTW) to line up the words in both versions, which should help us see where the automatic transcription made mistakes. By looking at where the words don't match, wr can correct the errors in the automatic transcription by replacing the wrong words with the right ones from the ground-truth transcript. This way, the ground-truth transcript helps me make the automatic transcription much more accurate.