# Extract YouTube Data with Playwright and GPT-4o

In [None]:
# Notebook dependencies (shell commands run through the `!` magic):
#   instructor  - wraps the OpenAI client to return pydantic-typed responses
#   markdownify - converts the scraped HTML into Markdown
#   playwright  - drives the headless Chromium browser used for scraping
!pip install instructor
!pip install markdownify
!pip install playwright
# Runs Playwright's own installer (presumably fetches the browser binaries
# that p.chromium.launch() needs - confirm against the Playwright docs).
!python -m playwright install

In [None]:
import re
import asyncio
import instructor
from openai import OpenAI
from pydantic import BaseModel
from markdownify import markdownify as md
from playwright.async_api import async_playwright
from google.colab import userdata

## Extract search results from YouTube

In [None]:
search_query = "super cars 2025"

async with async_playwright() as p:
  browser = await p.chromium.launch(headless=True)
  context = await browser.new_context()
  page = await context.new_page()

  # Navigating to the YouTube video URL
  await page.goto("https://www.youtube.com/", wait_until="networkidle")
  await page.fill('input[name="search_query"]', search_query)
  await page.press('input[name="search_query"]', 'Enter')

  # Scrolling down to load more comments
  for _ in range(20):
    await page.mouse.wheel(0, 200)
    await asyncio.sleep(0.5)

  # Giving some time for additional content to load
  await page.wait_for_timeout(1000)

  # Extracting the page content
  page_content = await page.content()

In [None]:
# Convert the scraped HTML into Markdown, using ATX-style headings
markdown_text = md(page_content, heading_style="ATX")

# Collapse every run of three or more newlines into a single blank line
excess_newlines = re.compile(r'\n{3,}')
content = excess_newlines.sub('\n\n', markdown_text)

In [None]:
# Inspect the cleaned-up Markdown that will be appended to the prompt
print(content)

## Extract insights using GPT-4o

Use the Instructor library to extract insights from the Markdown document in a fixed, structured format.

In [None]:
# Response schema for one video extracted from the search-results markdown.
# Passed as `response_model` to the Instructor client; every field is kept as
# a plain string.
# NOTE(review): documented with `#` comments rather than a docstring on
# purpose - a class docstring would presumably end up in the schema sent to
# the model and change the request.
class youtube_info(BaseModel):
  video_title: str        # video name
  video_duration: str     # duration of the video
  views: str              # views count
  video_upload_time: str  # time passed since the video was uploaded
  channel_id: str         # channel id of the video

In [None]:
# Build the OpenAI client with the key stored under 'OPENAI-KEY' in Colab's
# userdata, then wrap it with Instructor so responses are parsed into
# pydantic models.
openai_client = OpenAI(api_key=userdata.get('OPENAI-KEY'))
client = instructor.from_openai(openai_client)

# Extraction instructions; the page markdown is appended right after them.
prompt = """
From the following markdown content of a youtube search results page, extract info like -

1. video name
2. duration of video
3. views count
4. time past since video upload
5. channel id of each video

* Do not consider YouTube Shorts videos.
\n\n
"""

# Single user message: instructions followed by the scraped markdown
messages = [{"role": "user", "content": prompt + content}]

# `res` is an iterable of `youtube_info` objects, consumed in the next cell
res = client.chat.completions.create_iterable(
    messages=messages,
    response_model=youtube_info,
    model="gpt-4o",
)

In [None]:
# Collect every extracted video as a plain dict (one entry per `youtube_info`).
# `extend` appends item by item, so if iterating `res` raises part-way the
# entries gathered so far are still available in `video_info`.
video_info = []
video_info.extend(v.model_dump() for v in res)

In [None]:
# Number of video records extracted (shown as the cell's output)
len(video_info)

In [None]:
# Display the extracted records: a list of dicts, one per video
video_info