# 📹 YouTube Transcript Extractor & Translator

This Colab notebook lets you:
1. Input a YouTube video URL
2. Extract the transcript using `youtube-transcript-api`
3. Translate it into English using `googletrans`
4. Display the translated transcript

In [None]:
# Auto-run on load (if using browser automation)
# NOTE(review): `import IPython` appears unused in this cell; only
# `Javascript` is referenced below.
import IPython
from IPython.display import Javascript

# This Javascript object is the cell's last expression, so the notebook
# displays it; the script is meant to run in the browser front end, not in
# the Python kernel.
# NOTE(review): the script asks Colab to invoke a kernel callback named
# 'notebook.RunAll' with no arguments. Nothing in the visible notebook
# registers a callback under that name -- confirm this really triggers
# "Run all" before relying on it.
Javascript('''
(async function() {
  await google.colab.kernel.invokeFunction('notebook.RunAll', [], {});
})()
''')


In [None]:
# ‚úÖ Install required packages
!pip install youtube-transcript-api googletrans==4.0.0-rc1
!pip install pytube deep-translator


Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m57.6/57.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [None]:
pip install playwright

Collecting playwright
  Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl (45.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m45.1/45.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.52.0 pyee-13.0.0


In [None]:
!playwright install

Downloading Chromium 136.0.7103.25 (playwright build v1169)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1169/chromium-linux.zip[22m
[1G167.7 MiB [] 0% 0.0s[0K[1G167.7 MiB [] 0% 19.1s[0K[1G167.7 MiB [] 0% 9.7s[0K[1G167.7 MiB [] 0% 5.1s[0K[1G167.7 MiB [] 1% 3.7s[0K[1G167.7 MiB [] 2% 3.3s[0K[1G167.7 MiB [] 2% 3.2s[0K[1G167.7 MiB [] 3% 3.1s[0K[1G167.7 MiB [] 4% 3.1s[0K[1G167.7 MiB [] 4% 2.8s[0K[1G167.7 MiB [] 5% 2.7s[0K[1G167.7 MiB [] 5% 2.9s[0K[1G167.7 MiB [] 6% 2.9s[0K[1G167.7 MiB [] 7% 2.8s[0K[1G167.7 MiB [] 7% 2.7s[0K[1G167.7 MiB [] 8% 2.6s[0K[1G167.7 MiB [] 8% 2.7s[0K[1G167.7 MiB [] 9% 2.6s[0K[1G167.7 MiB [] 9% 2.7s[0K[1G167.7 MiB [] 10% 2.7s[0K[1G167.7 MiB [] 11% 2.6s[0K[1G167.7 MiB [] 11% 2.7s[0K[1G167.7 MiB [] 12% 2.6s[0K[1G167.7 MiB [] 13% 2.5s[0K[1G167.7 MiB [] 14% 2.5s[0K[1G167.7 MiB [] 15% 2.4s[0K[1G167.7 MiB [] 16% 2.4s[0K[1G167.7 MiB [] 17% 2.3s[0K[1G167.7 MiB [] 18% 2.2s[0K[1G167.

In [None]:

import nest_asyncio
# Patch asyncio so that run_until_complete() (used further down in this cell)
# can be called re-entrantly -- presumably needed because the notebook kernel
# already has an event loop running.
nest_asyncio.apply()

import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import re

# Helper: turn one block of visible text into time-tagged Telugu entries
def _split_time_tagged_telugu(raw_text, timestamp_regex, telugu_regex):
    """Split ``raw_text`` on its time markers and keep the Telugu pieces.

    Each marker is paired with the text that FOLLOWS it, up to the next
    marker or the end of the string. Text that appears before the first
    marker has no marker of its own and is reported with an empty time.

    Args:
        raw_text: Non-empty text of a single tag.
        timestamp_regex: Compiled pattern matching a time marker.
        telugu_regex: Compiled pattern matching at least one Telugu character.

    Returns:
        A list of ``{"time": str, "text": str}`` dicts, in document order.
        Pieces without any Telugu character are dropped.
    """
    markers = list(timestamp_regex.finditer(raw_text))
    if not markers:
        # No markers at all: keep the whole text, untagged, if it is Telugu.
        if telugu_regex.search(raw_text):
            return [{"time": "", "text": raw_text}]
        return []

    entries = []

    # Text ahead of the first marker must not borrow that marker's time.
    leading = raw_text[:markers[0].start()].strip()
    if telugu_regex.search(leading):
        entries.append({"time": "", "text": leading})

    # Slice by match positions so empty segments (e.g. two markers back to
    # back) cannot shift the marker/text pairing.
    for idx, marker in enumerate(markers):
        if idx + 1 < len(markers):
            segment_end = markers[idx + 1].start()
        else:
            segment_end = len(raw_text)
        sentence = raw_text[marker.end():segment_end].strip()
        if telugu_regex.search(sentence):
            entries.append({"time": marker.group(), "text": sentence})

    return entries


# Function to extract Telugu + time marker entries
async def fetch_time_tagged_telugu(url):
    """Render ``url`` in headless Chromium and collect Telugu text entries.

    The page is loaded, given a fixed 5000 ms to settle, and its HTML is
    parsed. Every ``p``, ``h1``, ``h2``, ``div`` and ``span`` tag is examined;
    text containing ``[m:ss]`` / ``[mm:ss]`` markers is split on them and each
    Telugu piece is tagged with the marker that precedes it.

    Note: nested tags are each visited on their own, so text inside e.g. a
    ``span`` within a ``div`` can be reported more than once.

    Args:
        url: Address of the page to load.

    Returns:
        A list of ``{"time": str, "text": str}`` dicts; ``"time"`` is ``""``
        for text that has no preceding marker.

    Raises:
        Whatever Playwright raises on launch/navigation failure; the browser
        is closed before the exception propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            # Fixed delay to let client-side rendering finish.
            await page.wait_for_timeout(5000)
            html = await page.content()
        finally:
            # Always release the browser, even if navigation failed.
            await browser.close()

    soup = BeautifulSoup(html, 'html.parser')

    timestamp_regex = re.compile(r"\[\d{1,2}:\d{2}\]")
    telugu_regex = re.compile(r"[\u0C00-\u0C7F]")

    entries = []
    for tag in soup.find_all(['p', 'h1', 'h2', 'div', 'span']):
        raw_text = tag.get_text(strip=True)
        if not raw_text:
            continue
        entries.extend(
            _split_time_tagged_telugu(raw_text, timestamp_regex, telugu_regex)
        )

    return entries

# Demo: scrape a single page on the notebook's current event loop
url = "https://www.eenadu.net/telugu-news"  # Replace with actual article URL
results = asyncio.get_event_loop().run_until_complete(
    fetch_time_tagged_telugu(url)
)

# Print one line per entry, prefixing the time marker only when there is one
for entry in results:
    print(f"{entry['time']} {entry['text']}" if entry["time"] else entry["text"])


[12:36] TRENDINGFlight CrashIND vs ENGRainsBreaking|Feedback|ePratibha|E-PAPER|Pratibha‡∞Ü‡∞Ç‡∞ß‡±ç‡∞∞‡∞™‡±ç‡∞∞‡∞¶‡±á‡∞∂‡±ç‡∞∞‡∞æ‡∞∑‡±ç‡∞ü‡±ç‡∞∞ ‡∞µ‡∞æ‡∞∞‡±ç‡∞§‡∞≤‡±Å‡∞ú‡∞ø‡∞≤‡±ç‡∞≤‡∞æ ‡∞µ‡∞æ‡∞∞‡±ç‡∞§‡∞≤‡±Å‡∞§‡±Ü‡∞≤‡∞Ç‡∞ó‡∞æ‡∞£‡∞∞‡∞æ‡∞∑‡±ç‡∞ü‡±ç‡∞∞ ‡∞µ‡∞æ‡∞∞‡±ç‡∞§‡∞≤‡±Å‡∞ú‡∞ø‡∞≤‡±ç‡∞≤‡∞æ ‡∞µ‡∞æ‡∞∞‡±ç‡∞§‡∞≤‡±Å‡∞ú‡∞æ‡∞§‡±Ä‡∞Ø‡∞Ç‡∞Ö‡∞Ç‡∞§‡∞∞‡±ç‡∞ú‡∞æ‡∞§‡±Ä‡∞Ø‡∞Ç‡∞ï‡±ç‡∞∞‡±à‡∞Æ‡±ç‡∞¨‡∞ø‡∞ú‡∞ø‡∞®‡±Ü‡∞∏‡±ç‡∞ï‡±ç‡∞∞‡±Ä‡∞°‡∞≤‡±Å‡∞∏‡∞ø‡∞®‡∞ø‡∞Æ‡∞æ‡∞µ‡∞∏‡±Å‡∞Ç‡∞ß‡∞∞‡∞´‡±Ä‡∞ö‡∞∞‡±ç ‡∞™‡±á‡∞ú‡±Ä‡∞≤‡±Å‡∞ö‡∞¶‡±Å‡∞µ‡±Å‡∞∏‡±Å‡∞ñ‡±Ä‡∞≠‡∞µ‡∞à-‡∞®‡∞æ‡∞°‡±Å‡∞Æ‡∞ï‡∞∞‡∞Ç‡∞¶‡∞Ç‡∞à ‡∞§‡∞∞‡∞Ç‡∞Ü‡∞π‡∞æ‡∞π‡∞æ‡∞Ø‡±ç ‡∞¨‡±Å‡∞ú‡±ç‡∞ú‡±Ä‡∞∏‡±ç‡∞•‡∞ø‡∞∞‡∞æ‡∞∏‡±ç‡∞§‡∞ø‡∞¶‡±á‡∞µ‡∞§‡∞æ‡∞∞‡±ç‡∞ö‡∞®‡∞µ‡±Ü‡∞¨‡±ç ‡∞∏‡±ç‡∞ü‡±ã‡∞∞‡±Ä‡∞∏‡±ç‡∞ï‡∞•‡∞æ‡∞Æ‡±É‡∞§‡∞Ç‡∞é‡∞®‡±ç‡∞Ü‡∞∞‡±ç‡∞ê‡∞á‡∞Ç‡∞ï‡∞æ..‡∞´‡±ä‡∞ü‡±ã‡∞≤‡±Å‡∞µ‡±Ä‡∞°‡∞ø‡∞Ø‡±ã‡∞≤‡±Å‡∞µ‡±Ü‡∞¨‡±ç ‡∞™‡±ç‡∞∞‡∞§‡±ç‡∞Ø‡±á‡∞ï‡∞Ç‡∞∏‡∞Ç‡∞°‡±á ‡∞Æ‡±ç‡∞Ø‡∞æ‡∞ó‡∞ú‡±à‡∞®‡±ç‡∞ï‡±ç‡∞Ø‡∞æ‡∞≤‡±Ü‡∞Ç‡∞°‡∞∞‡±ç‡∞∞‡∞æ‡∞∂‡∞ø‡∞´‡∞≤‡∞Ç‡∞∞‡∞ø‡∞ú‡∞≤‡±ç‡∞ü‡±ç‡∞∏‡±ç‡∞¨‡±ç‡∞∞‡±á‡∞ï‡∞ø‡∞Ç‡∞ó‡±ç‡∞Æ‡±Ä‡∞∞