In [10]:
!pip install yt-dlp
!pip install gradio
!pip install gtts
!pip install transformers



In [11]:
import os

import torch  # must be imported here: DEVICE below is evaluated before any later cell runs

# Device string used when loading the summarization model:
# "cuda" when torch reports an available GPU, otherwise "cpu".
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [12]:
from yt_dlp import YoutubeDL
import glob

def get_transcript(url):
    """Download the automatic English subtitles of a video as a ``.vtt`` file.

    Only subtitles are written (``skip_download``); the media itself is not
    downloaded. Files are written to the current working directory.

    Args:
        url: Video URL understood by yt-dlp.

    Returns:
        ``(title, vtt_filename)`` on success, or ``(None, None)`` when the link
        could not be processed or no ``.vtt`` file was produced.
    """
    # Local import keeps this cell self-contained; yt_dlp is already a dependency here.
    from yt_dlp.utils import DownloadError

    ydl_opts = {
        "skip_download": True,
        "writeautomaticsub": True,
        "subtitleslangs": ["en", "en-us", "en-uk", "en-in", "en-es", "en-fr"],
    }
    with YoutubeDL(ydl_opts) as ydl:
        try:
            # One call both extracts the metadata and writes the subtitle files,
            # instead of hitting the network once to download and twice more for info.
            info = ydl.extract_info(url, download=True)
        except DownloadError:
            # yt-dlp raises on failure by default, so this is the real failure path.
            info = None
        if not info:
            print("-----------------------------------------------------------------------------------------------------------LINK FAILED")
            return None, None

        # Subtitle files share the media filename's stem: "<stem>.<lang>.vtt".
        stem = os.path.splitext(ydl.prepare_filename(info))[0]
        vtt_files = sorted(
            f for f in os.listdir() if f.startswith(stem) and f.endswith('.vtt')
        )
        if not vtt_files:
            print("-----------------------------------------------------------------------------------------------------------file download FAILED")
            return None, None

        # Prefer the plain "en" track; otherwise fall back deterministically to the
        # first match in sorted order (os.listdir() order is arbitrary).
        preferred = stem + ".en.vtt"
        chosen = preferred if preferred in vtt_files else vtt_files[0]
        return info['title'], chosen

In [13]:
from transformers import BartForConditionalGeneration, AutoTokenizer
import torch, math

# PROMPT_LEN: token count at or below which an already-summarized text is returned as the
#             final summary; also passed as `max_length` to `model.generate`.
# MAX_N_TOKEN: number of tokens per chunk when a tokenized text is split for summarization.
PROMPT_LEN, MAX_N_TOKEN = 1024, 1024

class SUMMARY:
    """Chunked, multi-pass text summarizer built on ``facebook/bart-large-cnn``.

    Text is tokenized, cut into chunks of ``MAX_N_TOKEN`` tokens, each chunk is
    summarized, and the chunk summaries are joined with spaces. The joined text
    is summarized again until it fits in ``PROMPT_LEN`` tokens.
    """

    def __init__(self):
        """Load the tokenizer and model and move the model to ``DEVICE``."""
        torch.cuda.empty_cache()
        self.model_ckpt = "facebook/bart-large-cnn"
        self.device = DEVICE
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)
        self.model = BartForConditionalGeneration.from_pretrained(self.model_ckpt).to(
            self.device
        )

    def _encode(self, text):
        """Tokenize ``text`` into a ``(1, n_tokens)`` id tensor on ``self.device``."""
        return self.tokenizer.encode(text, return_tensors="pt").to(self.device)

    def _summarize_chunks(self, input_ids):
        """Summarize each ``MAX_N_TOKEN``-wide slice of ``input_ids``; return decoded strings."""
        n_split = math.ceil(input_ids.shape[1] / MAX_N_TOKEN)
        pieces = []
        for i in range(n_split):
            chunk = input_ids[:, MAX_N_TOKEN * i : MAX_N_TOKEN * (i + 1)]
            generated = self.model.generate(chunk, max_length=PROMPT_LEN)
            pieces.append(self.tokenizer.decode(generated[0], skip_special_tokens=True))
        return pieces

    def summarize(self, text, first=True):
        """Summarize ``text`` and return the summary string.

        Args:
            text: Text to summarize.
            first: When true (the default) at least one summarization pass is
                always run. When false, text that already fits in
                ``PROMPT_LEN`` tokens is returned without another pass.

        Returns:
            The final summary, decoded from its token ids with special tokens
            removed.
        """
        input_ids = self._encode(text)

        if first or input_ids.shape[1] > PROMPT_LEN:
            while True:
                pieces = self._summarize_chunks(input_ids)
                print(
                    "Intermediate Summaries ->>>>>>>>>>>>>>>>>>>>>>>>>\n",
                    pieces,
                )
                previous_len = input_ids.shape[1]
                input_ids = self._encode(" ".join(pieces))
                current_len = input_ids.shape[1]
                if current_len <= PROMPT_LEN:
                    break
                if current_len >= previous_len:
                    # A pass that does not shorten the text would repeat forever
                    # (each chunk may generate up to PROMPT_LEN tokens), so stop
                    # and return what we have instead of looping without bound.
                    print("Summary stopped shrinking; returning the current text.")
                    break

        summary = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
        print("Summary is ->>>>>>>>>>>>>>>>>>>>>>>>>\n" + summary)
        return summary

# Shared summarizer instance used by video_summarize(); constructing it loads the
# tokenizer and the model (see SUMMARY.__init__).
summarizer=SUMMARY()

In [14]:
def video_summarize(url):
    """Fetch a video's subtitles and summarize them.

    The downloaded ``.vtt`` file is reduced to plain text by dropping every line
    that contains a colon (this removes the cue timestamps and header fields)
    and then removing repeated lines while keeping first-seen order. The
    resulting text is also written next to the subtitle file as
    ``<vttfile>.txt``.

    Args:
        url: Video URL passed to ``get_transcript``.

    Returns:
        ``(title, summary)`` on success, or a single error-message string when
        no subtitle file could be obtained.
    """
    title, vttfile = get_transcript(url)
    if vttfile is None:
        return "SOME ISSUE WITH VIDEO LINK OR DOWNLOAINDING VIDEO CONTENTS"

    txtfile = vttfile + '.txt'
    print("text file expected", txtfile)

    # Done in Python rather than through a shell pipeline: the filename comes
    # from the video title, so interpolating it into a shell command would let
    # characters such as `$(...)` or backticks be executed.
    with open(vttfile, encoding="utf-8") as vtt:
        kept = (line.rstrip("\n") for line in vtt if ":" not in line)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_lines = list(dict.fromkeys(kept))

    tscript = "".join(line + "\n" for line in unique_lines)
    with open(txtfile, "w", encoding="utf-8") as out:
        out.write(tscript)

    return title, summarizer.summarize(tscript)

In [None]:
import gradio as gr
from gtts import gTTS

def text_to_speech(text):
    """Synthesize ``text`` as English speech with gTTS and save it as an MP3.

    Args:
        text: The text to speak.

    Returns:
        The path of the written file, always ``"bot.mp3"`` (overwritten on
        every call).
    """
    mp3_path = "bot.mp3"
    gTTS(text=text, lang="en", tld='co.in', slow=False).save(mp3_path)
    return mp3_path

# Gradio UI: a link box and submit button in, a text summary and spoken summary out.
with gr.Blocks(title="Youtube Video Summarizer") as demo:
    inputlink = gr.Textbox(label="Insert an youtube link here", autofocus=True)
    btn = gr.Button("Submit")
    outputtext = gr.Textbox(label="Summary")
    # Output-only player: no input `source` is given, since nothing is recorded here
    # and the `source` keyword is not accepted by newer Gradio releases.
    opaudio = gr.Audio(label="Audio Summary", type="filepath", autoplay=True)
    clear = gr.ClearButton([outputtext, inputlink, opaudio])

    def respond(link):
        """Summarize the linked video; return (text for the summary box, audio file path)."""
        result = video_summarize(link.strip())
        if isinstance(result, str):
            # video_summarize returns a bare error string on failure; show it as
            # text and leave the audio player empty instead of failing to unpack.
            return result, None
        title, summary = result
        return title + '\n\n' + summary, text_to_speech(summary)

    # Both the button click and pressing Enter in the textbox first blank the three
    # components and then run the summarizer (same registration order as before).
    for trigger in (btn.click, inputlink.submit):
        for component in (inputlink, outputtext, opaudio):
            trigger(lambda :None, None, component)
        trigger(respond, [inputlink], [outputtext, opaudio])

# Enable the request queue with its default settings before launching.
demo.queue()
# launch() does not return the Blocks object, so nothing is chained onto it;
# the trailing semicolon suppresses the cell's return-value output.
demo.launch(debug=True, share=True);

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://f4da23a0d10681da94.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


[youtube] Extracting URL: https://www.youtube.com/watch?v=FfgT6zx4k3Q
[youtube] FfgT6zx4k3Q: Downloading webpage
[youtube] FfgT6zx4k3Q: Downloading ios player API JSON
[youtube] FfgT6zx4k3Q: Downloading android player API JSON
[youtube] FfgT6zx4k3Q: Downloading player da7c2a60
[youtube] FfgT6zx4k3Q: Downloading m3u8 information
[info] FfgT6zx4k3Q: Downloading subtitles: en, en-es, en-fr
[info] FfgT6zx4k3Q: Downloading 1 format(s): 303+251
[info] Writing video subtitles to: Could Your Phone Hurt You？ Electromagnetic Pollution [FfgT6zx4k3Q].en.vtt
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 1
[download] Destination: Could Your Phone Hurt You？ Electromagnetic Pollution [FfgT6zx4k3Q].en.vtt
[download] 100% of   11.57KiB in 00:00:00 at 201.06KiB/s             
[info] Writing video subtitles to: Could Your Phone Hurt You？ Electromagnetic Pollution [FfgT6zx4k3Q].en-es.vtt
[download] Destination: Could Your Phone Hurt You？ Electromagnetic Pollution [FfgT6zx4k3Q].en-es.vt

Token indices sequence length is longer than the specified maximum sequence length for this model (1574 > 1024). Running this sequence through the model will result in indexing errors


Intermediate Summaries ->>>>>>>>>>>>>>>>>>>>>>>>>
 ['Electricity is the movement of an electrical charge. This movement generates electric and magnetic fields. These fields propagate through space and carry energy. We call this phenomenon, "Electromagnetic Radiation" "Radiation" is a word that makes people very nervous. But, to radiate simply means "to be giving"', 'The WHO classified radio wave fields as "potentially carcinogenic" There is some indication that it might cause cancer, but we can\'t prove it. There is no consistent evidence that electromagnetic radiation below exposure thresholds causes any health problems. The research suggests that people who say the radiation is affecting them could be experiencing the "Nocebo Effect"']
Summary is ->>>>>>>>>>>>>>>>>>>>>>>>>
Electricity is the movement of an electrical charge. This movement generates electric and magnetic fields. These fields propagate through space and carry energy. We call this phenomenon, "Electromagnetic Radiation"