<center><a target="_blank" href="https://githubtocolab.com/sayan1999/YouTube-Video-Summarizer/blob/main/summary.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a></center>

## Imports

In [1]:
# !pip install yt-dlp
# !pip install gradio
# !pip install gtts
# !pip install langchain
# !pip install dotenv
# !pip install huggingface_hub
# !pip install tiktoken

In [2]:
import os

In [3]:
from yt_dlp import YoutubeDL
import glob

def get_transcript(url):
    with YoutubeDL({"skip_download":True, "writeautomaticsub":True, "subtitleslangs":["en", "en-us", "en-us", "en-uk", "en-in", "en-es", "en-fr"]}) as ydl:
        if ydl.download(url):
            print("-----------------------------------------------------------------------------------------------------------LINK FAILED")
            return None, None
        else:
            # print(ydl.extract_info(url))
            op = [f for f in os.listdir() if f.startswith(os.path.splitext(ydl.prepare_filename(ydl.extract_info(url)))[0]) and f.endswith('.vtt')]
            if op:
              return ydl.extract_info(url)['title'], op[0]
            else:
              print("-----------------------------------------------------------------------------------------------------------file download FAILED")
              return None, None

In [4]:
from pathlib import Path
import math, os

from dotenv import load_dotenv

from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)
from huggingface_hub import InferenceClient



load_dotenv(Path("huggingface/.huggingface"))
API_TOKEN = os.getenv("API_TOKEN")
headers = {"Authorization": f"Bearer {API_TOKEN}"}

client = InferenceClient()

import tiktoken

tok = tiktoken.get_encoding("cl100k_base")
tok_len_of = lambda x: len(tok.encode(x))

MAX_INPUT_SIZE = 1024
MAX_OUTPUT_SIZE = 512
MIN_OUTPUT_SIZE = 150


def API_call(text, tempdumpfile):
    print("API call ->>>>>>>>>>>>>>> input length:", tok_len_of(text))
    tempdumpfile.write('\n\n\n\n\nInput>>>>>>>>>>>>>>>>>>\n\n'+ text) 
    summary = client.summarization(
        text,
        parameters={
            "min_length": min(tok_len_of(text), MIN_OUTPUT_SIZE),
            "max_length": MAX_OUTPUT_SIZE,
        },
    ).strip()
    tempdumpfile.write('\n\nResponse>>>>>>>>>>>>>>>>>>\n\n'+ summary) 
    # print("API response <<<<<<<<<<<<<-", summary)
    return summary


def adaptive_chunkify_bart(text):
    if tok_len_of(text) <= MAX_OUTPUT_SIZE:
        return [text]
    n_chunks = math.ceil(tok_len_of(text) / MAX_INPUT_SIZE)
    chunk_size = math.ceil(tok_len_of(text) / n_chunks) + 200
    print(f"{n_chunks=}, {chunk_size=}")

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=50,
        is_separator_regex=False,
    )
    chunks = list(
        map(lambda page: page.page_content, text_splitter.create_documents([text]))
    )
    print("Chunks sizes are:", [tok_len_of(c) for c in chunks])
    return chunks


def summarize(comprehension, tempdumpfile=None):
    if not tempdumpfile:
        tempdumpfile=open('tempdump.txt', 'w+')
    chunks = adaptive_chunkify_bart(comprehension)
    if len(chunks) == 1:
        return API_call(chunks[0], tempdumpfile)
    chunk_summaries = [API_call(chunk, tempdumpfile) for chunk in chunks]
    return summarize(" ".join(chunk_summaries), tempdumpfile)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import re
def video_summarize(url):
  title, vttfile=get_transcript(url)
  if vttfile is None:
    return "SOME ISSUE WITH VIDEO LINK OR DOWNLOAINDING VIDEO CONTENTS"
  print("text file expected", vttfile+'.txt')
  os.system(f"cat \"{vttfile}\" | grep : -v | awk '!seen[$0]++' > \"{vttfile}.txt\"")
  tscript = re.sub(r'[\s|\n]',' ',open(f'{vttfile}.txt').read().replace('WEBVTT', '', 1))
  return title, summarize(tscript)

In [None]:
import gradio as gr
from gtts import gTTS

def text_to_speech(text):
    audio = gTTS(text=text, lang="en", tld='co.in',  slow=False)
    audio.save("bot.mp3")
    return "bot.mp3"

with gr.Blocks(title="Youtube Video Summarizer") as demo:
    inputlink = gr.Textbox(label="Insert an youtube link here", autofocus=True)
    btn = gr.Button("Submit")
    outputtext = gr.Textbox(label="Summary")
    opaudio = gr.Audio(label="Audio Summary", source="microphone", type="filepath", autoplay=True)
    clear = gr.ClearButton([outputtext, inputlink, opaudio])

    def respond(link):
        title, summary = video_summarize(link.strip())
        return title + '\n\n' + summary, text_to_speech(summary)

    btn.click(lambda :None, None, inputlink)
    btn.click(lambda :None, None, outputtext)
    btn.click(lambda :None, None, opaudio)
    btn.click(respond, [inputlink], [outputtext, opaudio])

    inputlink.submit(lambda :None, None, inputlink)
    inputlink.submit(lambda :None, None, outputtext)
    inputlink.submit(lambda :None, None, opaudio)
    inputlink.submit(respond, [inputlink], [outputtext, opaudio])

demo.queue(concurrency_count=1)
demo.launch(debug=True, share=True).queue()
pass

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://7a27bf1647eaaf6fb5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


[youtube] Extracting URL: https://www.youtube.com/watch?v=lHrDS_BNoRU
[youtube] lHrDS_BNoRU: Downloading webpage
[youtube] lHrDS_BNoRU: Downloading ios player API JSON
[youtube] lHrDS_BNoRU: Downloading android player API JSON
[youtube] lHrDS_BNoRU: Downloading m3u8 information
[info] lHrDS_BNoRU: Downloading subtitles: en
[info] lHrDS_BNoRU: Downloading 1 format(s): 248+251
Deleting existing file Israel-Hamas war intensifying as survivors of Hamas attacks speak out [lHrDS_BNoRU].en.vtt
[info] Writing video subtitles to: Israel-Hamas war intensifying as survivors of Hamas attacks speak out [lHrDS_BNoRU].en.vtt
[download] Destination: Israel-Hamas war intensifying as survivors of Hamas attacks speak out [lHrDS_BNoRU].en.vtt
[download] 100% of   51.25KiB in 00:00:00 at 234.58KiB/s
[youtube] Extracting URL: https://www.youtube.com/watch?v=lHrDS_BNoRU
[youtube] lHrDS_BNoRU: Downloading webpage
[youtube] lHrDS_BNoRU: Downloading ios player API JSON
[youtube] lHrDS_BNoRU: Downloading android

Traceback (most recent call last):
  File "/media/instantinopaul/data/Code/ML/github.com/youtube-summarizer/env/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py", line 269, in hf_raise_for_status
    response.raise_for_status()
  File "/media/instantinopaul/data/Code/ML/github.com/youtube-summarizer/env/lib/python3.10/site-packages/requests/models.py", line 1021, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/media/instantinopaul/data/Code/ML/github.com/youtube-summarizer/env/lib/python3.10/site-packages/gradio/queueing.py", line 406, in call_prediction
    output = await route_utils.call_process_api(
  File "/media/instantinopaul/data/Code/ML/github.com/youtube-summarizer/env/lib/python3.