Transcript extraction ✅ <br>
Summarization ✅

In [1]:
!pip install -q youtube_transcript_api
!pip install -q bert-extractive-summarizer

In [2]:
import os
import re
import json
import spacy
import requests
from string import punctuation
import spacy.cli
spacy.cli.download("en_core_web_lg")
from spacy.lang.en.stop_words import STOP_WORDS

from summarizer import Summarizer
from youtube_transcript_api import YouTubeTranscriptApi as yta

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
# spaCy English pipeline; preprocess_text() relies on it for sentence splitting.
nlp = spacy.load("en_core_web_lg")
# Extractive summarizer created with its default configuration.
summary_model = Summarizer()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def get_ids(video_links):
  """Extract the video id from each YouTube watch URL.

  The id is taken as the text following the last "v=" in the link, cut at the
  first "&" or "#" so that extra query parameters (e.g. "&t=30s", "&list=...")
  or a URL fragment do not end up inside the id.

  Args:
    video_links: iterable of URL strings of the form "...watch?v=<id>".
      A string without "v=" is returned as-is (minus anything after "&"/"#").

  Returns:
    List of id strings, in the same order as `video_links`.
  """
  ids = []
  for link in video_links:
    video_id = link.split("v=")[-1]
    # Drop trailing query parameters / fragment that follow the id.
    for separator in ("&", "#"):
      video_id = video_id.split(separator)[0]
    ids.append(video_id)
  return ids

In [5]:
def extract_text(text_content):
  """Concatenate the caption segments of one transcript into a single string.

  Each segment contributes " " + its text. A segment whose text starts with
  "[" or ends with "]" (e.g. "[Music]") contributes only a single space, so
  bracketed annotations are left out of the result.

  Args:
    text_content: iterable of dicts, each with a 'text' key holding a string.

  Returns:
    The joined transcript. It is not stripped, so it normally starts with a
    space; an empty `text_content` yields "".
  """
  pieces = []
  for value_dict in text_content:
    text = value_dict['text']
    # startswith/endswith are safe on an empty caption, unlike text[0]/text[-1].
    is_annotation = text.startswith("[") or text.endswith("]")
    pieces.append(" " if is_annotation else " " + text)
  return "".join(pieces)

def get_transcript_from_ids(video_ids):
  """Fetch and flatten the transcripts of the given videos.

  Args:
    video_ids: list of YouTube video ids.

  Returns:
    Dict mapping each video id found in the fetched data to its transcript
    text with surrounding whitespace removed ("" when nothing usable remains).
  """
  transcripts = {}
  # Only the first element of the returned value is used: it is iterated by
  # video id and indexed to obtain that video's caption segments.
  data = yta.get_transcripts(video_ids)[0]
  for video_id in data:
    print(f"Getting transcript for {video_id}")
    transcript = extract_text(data[video_id]).strip()
    transcripts[video_id] = transcript
    # Test the stripped text: extract_text() emits a space for every filtered
    # segment, so the raw string is non-empty even when no caption survived.
    if transcript == "":
      print(f"No transcript found for {video_id}")
    else:
      print(f"Successfully extracted transcript for {video_id}")
    print("------------------------")
  return transcripts

In [6]:
def preprocess_text(text):
  """Split `text` into sentences and rebuild it from the longer ones.

  The text is run through the module-level spaCy pipeline `nlp`. Sentences
  that split into more than three pieces on single spaces are kept, passed
  through str.capitalize(), and joined with "." (no space after the period).

  Args:
    text: raw transcript string.

  Returns:
    The joined string of kept sentences.
  """
  kept_sentences = []
  for sentence in nlp(text).sents:
    sentence_text = sentence.text
    # Very short fragments (three pieces or fewer) are dropped.
    if len(sentence_text.split(" ")) > 3:
      kept_sentences.append(sentence_text.capitalize())
  return ".".join(kept_sentences)

In [19]:
def scrape_info(ids, timeout=10):
  """Scrape title, like count and view count from each video's watch page.

  The values are cut out of the raw page text with chained str.split() calls
  on fixed marker strings, so the function depends on the exact page layout.
  NOTE(review): a missing marker surfaces as an IndexError from the split
  chain; presumably any page layout change will trigger that - confirm.

  Args:
    ids: iterable of YouTube video ids.
    timeout: seconds passed to requests.get() so a stalled connection cannot
      hang the notebook indefinitely.

  Returns:
    Dict mapping each id to {"title": str, "likes": str, "views": str}. The
    counts are kept as the strings found in the page (e.g. '4,108').

  Raises:
    requests.HTTPError: if the page request returns an error status.
  """
  metadata = {}
  for video_id in ids:
    url = f"https://www.youtube.com/watch?v={video_id}"
    r = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque IndexError further down.
    r.raise_for_status()
    # Narrow the page text to the region between the two markers.
    base_text = r.text.split("twoColumnWatchNextResults")[1].split("videoPrimaryInfoRenderer:")[0]
    title = base_text.split('"title":')[1].split("}]}")[0].split('"runs":[{"text":')[1].replace('"',"")
    # First space-separated token of the view-count text.
    views = base_text.split('"title":')[1].split("videoViewCountRenderer")[1].split("}")[0].split('"simpleText":')[1].split(" ")[0].replace('"',"")
    # First space-separated token of the like button's label.
    likes = base_text.split("segmentedLikeDislikeButtonRenderer")[1].split("defaultText")[1].split("}}")[0].split('"label":')[1].split(" ")[0].replace('"',"")
    meta = {"title":title, "likes":likes, "views":views}
    print(f"{video_id}: {meta}")
    metadata[video_id] = meta
  return metadata

In [22]:
def main(video_links, save_path="/content/summaries", ratio=0.6):
  """Summarize the transcripts of the given videos and save one JSON per video.

  Steps: extract ids from the links, scrape page metadata, fetch transcripts,
  then for every transcript preprocess it, summarize it with `summary_model`
  and write the result to "<save_path>/<title with spaces as underscores>.json".

  Args:
    video_links: list of YouTube watch URLs.
    save_path: directory for the JSON files; created if it does not exist.
    ratio: passed to `summary_model` as its `ratio` argument.

  Each JSON file holds the keys "url", "video id", "metadata", "points",
  "full_summary" and "transcript".
  """
  video_ids = get_ids(video_links)
  metadata = scrape_info(video_ids)
  print(f"Extracted ids and its meta info: {video_ids}")
  transcripts_data = get_transcript_from_ids(video_ids)

  # open() does not create missing directories; do it up front so a fresh
  # runtime does not fail only after the expensive summarization step.
  if save_path:
    os.makedirs(save_path, exist_ok=True)

  for video_id in transcripts_data:
    video_meta = metadata[video_id]
    transcript = transcripts_data[video_id]
    print(f"---------------video ID : {video_id} | title : {video_meta['title']} -----------------")
    print(f"Preprcoessing transcript")
    clean_text = preprocess_text(transcript)
    print("Preprocessing complete")
    print("Summarizing...")
    result = summary_model(clean_text, ratio=ratio)
    print("Summarization complete")
    print("saving data...")
    # Keys are the fragment's position in the "."-split summary, so numbering
    # has gaps wherever an empty fragment was skipped.
    json_result = {}
    for idx, r in enumerate(result.split(".")):
      r = r.strip()
      if r != '':
        json_result[idx] = r

    save_file = {"url": f"https://www.youtube.com/watch?v={video_id}", "video id":video_id, "metadata":video_meta, "points":json_result, "full_summary":result, "transcript":transcript}
    # A "/" in the title would be read as a directory separator in the path.
    file_name = video_meta['title'].replace(' ', '_').replace('/', '_')
    with open(f"{save_path}/{file_name}.json", "w") as f:
      json.dump(save_file, f, indent=4)
    print("saved!!")

In [23]:
if __name__ == "__main__":
  # Example run: summarize two videos and write the results to the default save_path.
  video_links = [
      "https://www.youtube.com/watch?v=OdzAQFmyxNo",
      "https://www.youtube.com/watch?v=mU7hdGKOGyk",
  ]
  main(video_links)

OdzAQFmyxNo: {'title': 'Freud and Philosophy', 'likes': '4,108', 'views': '151,865'}
mU7hdGKOGyk: {'title': 'Machiavelli', 'likes': '3,050', 'views': '81,902'}
Extracted ids and its meta info: ['OdzAQFmyxNo', 'mU7hdGKOGyk']
Getting transcript for OdzAQFmyxNo
Successfully extracted transcript for OdzAQFmyxNo
------------------------
Getting transcript for mU7hdGKOGyk
Successfully extracted transcript for mU7hdGKOGyk
------------------------
---------------video ID : OdzAQFmyxNo | title : Freud and Philosophy -----------------
Preprcoessing transcript
Preprocessing complete
Summarizing...
Summarization complete
saving data...
saved!!
---------------video ID : mU7hdGKOGyk | title : Machiavelli -----------------
Preprcoessing transcript
Preprocessing complete
Summarizing...
Summarization complete
saving data...
saved!!
