<a href="https://colab.research.google.com/github/shivaprajapati34390-netizen/ML-project/blob/main/AI_System_to_Summarize_YouTube_Videos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U youtube-transcript-api transformers accelerate sentencepiece

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.2.4-py3-none-any.whl.metadata (24 kB)
Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Downloading youtube_transcript_api-1.2.4-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.2/485.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed transformers-5.1.0 youtube-transcript-api-1.2.4


In [5]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import re

In [3]:
def extract_video_id(URL):
  """Return the 11-character YouTube video id contained in ``URL``.

  The id is the run of 11 characters (letters, digits, ``_`` or ``-``) that
  follows either ``v=`` (e.g. ``.../watch?v=<id>``) or ``youtu.be/``.

  Args:
    URL: The video URL as a string.

  Returns:
    The video id as a string, or ``None`` when ``URL`` is not a string or
    contains no recognizable id.
  """
  # Anything that is not text cannot contain an id; report "not found"
  # instead of letting re.search raise TypeError.
  if not isinstance(URL, str):
    return None
  # Search the argument itself, not a module-level variable.
  match = re.search(r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})", URL)
  return match.group(1) if match else None

In [6]:
from logging import disable
def get_transcript(video_id):
  """Fetch the transcript for ``video_id`` and flatten it into one string.

  A ``YouTubeTranscriptApi`` instance is created and its ``fetch`` method is
  called with the id; the ``text`` attribute of every returned item is then
  joined with single spaces.

  Args:
    video_id: The YouTube video id to fetch the transcript for.

  Returns:
    The joined transcript text. On failure no exception is raised; a message
    string starting with ``"Error"`` is returned instead.
  """
  try:
    snippets = YouTubeTranscriptApi().fetch(video_id)
    # Collect the text of each snippet, then glue everything into one long line.
    pieces = []
    for snippet in snippets:
      pieces.append(snippet.text)
    return " ".join(pieces)
  except TranscriptsDisabled:
    return "Error:Transcript disable for this video"
  except NoTranscriptFound:
    return "Error:No transcript found for this video"
  except Exception as exc:
    # Any other failure is reported with the exception's own message.
    return "Error: " + str(exc)

In [7]:
import torch
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM
# Run on the GPU (CUDA) when one is available to speed things up; otherwise use the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"



In [8]:
# Checkpoint identifier handed to both loaders below.
model_name = "google/Flan-t5-Base"

# The tokenizer converts text into the numeric ids the model consumes.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the sequence-to-sequence model and place it on the selected device (GPU or CPU).
model = AutoModelForSeq2SeqLM.from_pretrained(
  model_name,
).to(device)


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
def summarize_chunk(text_chunk):
  """Summarize one piece of text with the module-level model and tokenizer.

  Args:
    text_chunk: The text to summarize.

  Returns:
    The decoded summary string with special tokens removed.
  """
  # Instruction-style prompt: the task description comes first, then the text.
  prompt = f"summarize the following text clearly:\n{text_chunk}"

  # Encode to PyTorch tensors (truncated at 1024 tokens) and move them to
  # the same device as the model.
  encoded = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=1024,
  )
  encoded = encoded.to(device)

  generation_options = {
    "max_new_tokens": 120,   # upper bound on the summary length
    "num_beams": 4,          # beam search over four candidate paths
    "length_penalty": 1.0,   # neutral preference between short and long output
    "early_stopping": True,
  }
  output_ids = model.generate(**encoded, **generation_options)

  # Only the first generated sequence is turned back into text.
  return tokenizer.decode(output_ids[0], skip_special_tokens=True)




In [16]:
def chunk_text(text, chunk_size=1200):
  """Split ``text`` into chunks of at most ``chunk_size`` characters.

  The text is split into sentences on ``'.'``; consecutive sentences are then
  packed into a chunk (joined with ``". "`` and closed with ``"."``) for as
  long as they fit. A single sentence that is too long to fit in a chunk by
  itself (common for transcripts without punctuation) is wrapped on word
  boundaries so that no chunk exceeds ``chunk_size``. The wrapped pieces that
  end mid-sentence are emitted without an added period.

  Args:
    text: The text to split. ``None`` or an empty string yields ``[]``.
    chunk_size: Maximum chunk length in characters; must be at least 2.

  Returns:
    A list of chunk strings, in the original order.

  Raises:
    ValueError: If ``chunk_size`` is smaller than 2.
  """
  import textwrap  # local import: used only for wrapping oversized sentences

  if chunk_size < 2:
    raise ValueError("chunk_size must be at least 2")
  if not text:
    return []

  # Split on '.' and drop the empty strings that result from the split.
  sentences = [s.strip() for s in text.split('.') if s.strip()]

  # Longest sentence that still fits in a chunk once the closing '.' is added.
  max_piece_len = chunk_size - 1

  chunks = []
  current_chunk_sentences = []
  current_chunk_len = 0

  for sentence in sentences:
    if len(sentence) > max_piece_len:
      # The sentence cannot fit in any chunk: close the chunk in progress,
      # then wrap the sentence on word boundaries (overlong words are cut).
      if current_chunk_sentences:
        chunks.append(". ".join(current_chunk_sentences) + ".")
      pieces = textwrap.wrap(sentence, width=max_piece_len, break_on_hyphens=False)
      # Every piece but the last is a full chunk; the last one starts the
      # next chunk so that following sentences can still be packed after it.
      chunks.extend(pieces[:-1])
      current_chunk_sentences = [pieces[-1]]
      current_chunk_len = len(pieces[-1]) + 2
      continue

    # Length the sentence will occupy once joined: +1 for space, +1 for dot.
    sentence_len = len(sentence) + 2

    if current_chunk_len + sentence_len <= chunk_size:
      current_chunk_sentences.append(sentence)
      current_chunk_len += sentence_len
    else:
      # The sentence does not fit: finalize the current chunk and start anew.
      if current_chunk_sentences:
        chunks.append(". ".join(current_chunk_sentences) + ".")
      current_chunk_sentences = [sentence]
      current_chunk_len = sentence_len

  # Whatever is left over becomes the last chunk.
  if current_chunk_sentences:
    chunks.append(". ".join(current_chunk_sentences) + ".")

  return chunks

In [20]:
def generate_video_notes(video_URL):
  """Print bullet-point notes for a YouTube video.

  The video id is extracted from ``video_URL``, the transcript is fetched and
  split into chunks, each chunk is summarized, and the summaries are printed
  as a list of notes. Progress messages are printed along the way.

  The function stops early, after printing a message, when the URL yields no
  video id, when the transcript text starts with ``"Error"``, or when
  chunking produces nothing.

  Args:
    video_URL: The URL of the video to process.

  Returns:
    None. All results are written to standard output.
  """
  print(f"\n 🎬Processing video:{video_URL}")

  # Step 1: locate the video id inside the URL.
  video_id = extract_video_id(video_URL)
  if not video_id:
    print("Invalid Youtube URL")
    return

  # Step 2: download the transcript; failures arrive as an "Error..." string.
  print("🎧 Fetching transcript...")
  transcript = get_transcript(video_id)
  if transcript.startswith("Error"):
    print(transcript)
    return

  # Step 3: cut the transcript into pieces.
  print( "🔪 Chunking transcript...")
  chunks = chunk_text(transcript)
  if not chunks:
    print("Error: No chunks created from transcript.")
    return
  total = len(chunks)
  print(f" -> {total} chunk created.")

  # Step 4: summarize the pieces one at a time, reporting progress before each.
  print("🧠 Generating AI notes...")
  notes = []
  for position, chunk_item in enumerate(chunks, start=1):
    print(f"-> Processing chunk {position}/{total}")
    notes.append(f"-{summarize_chunk(chunk_item)}")

  # Step 5: print the notes under a banner.
  rule = "\n" + "=" * 50
  print(rule)
  print("📝 AI GENERATED NOTES")
  print(rule)
  print("\n".join(notes))

# Entry point: ask for a video URL on standard input and print the notes generated for it.
if __name__=="__main__":
    url=input("Enter Youtube Video URL: ") # the reply is passed on exactly as typed (no stripping)
    generate_video_notes(url) # output is printed by the function; its return value is not used

Enter Youtube Video URL:  https://www.youtube.com/watch?v=KLfer0MES2w

 🎬Processing video: https://www.youtube.com/watch?v=KLfer0MES2w
🎧 Fetching transcript...
🔪 Chunking transcript...
 -> 8 chunk created.
🧠 Generating AI notes...
-> Processing chunk 1/8
-> Processing chunk 2/8
-> Processing chunk 3/8
-> Processing chunk 4/8
-> Processing chunk 5/8
-> Processing chunk 6/8
-> Processing chunk 7/8
-> Processing chunk 8/8

📝 AI GENERATED NOTES

-No team has ever won a T20 World Cup playing at home. No team has ever defended a T20 World Cup title.
-It takes away the pressure for 1 2 3 4 because the one factor the one key factor still in today's T20 cricket is how many wickets do you lose up front because the most important factor and that is the way the game is being driven is how powerfully how strongly do you finish and to be able to finish strongly powerfully you need wickets in hand.
-Bumrah's economy rate is so important because with someone like him, people are just trying to play hi