# 📘 Notebook: Criação da Tabela `users_yt` a partir da Wikipedia API
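Este notebook consulta a API da Wikipedia para cada página listada em `default.creators_scrape_wiki`, extrai o identificador de usuário/canal do YouTube encontrado no HTML da página e grava o resultado na tabela Delta **`users_yt`** no Databricks.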


In [0]:
import requests
from pyspark.sql import SparkSession
import re

In [0]:
# Obtain the Spark session: returns the existing one, or creates it if absent.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [0]:
# Load the table that lists the Wikipedia pages to look up.
df_wiki = spark.table("default.creators_scrape_wiki")
# Only the `wiki_page` column is used below, so project it before collecting
# to avoid shipping every other column of the table to the driver.
wiki_pages = [row["wiki_page"] for row in df_wiki.select("wiki_page").collect()]

In [0]:
def extract_user_id_from_wiki(page_name, timeout=10):
    """Return the first YouTube user/channel id linked from a Wikipedia page.

    The page is fetched through the English Wikipedia `action=parse` API and
    its rendered HTML is searched for the first
    ``https://www.youtube.com/(user|channel|c)/<id>`` link.

    Args:
        page_name: Title of the Wikipedia page to parse.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a single stalled connection would block the
            whole extraction loop indefinitely.

    Returns:
        The matched id as a string, or ``None`` when the request fails, the
        API does not return a parsed page, or no matching link is found.
        Failures are reported with ``print`` and never raised (best effort).
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": page_name,
        "format": "json"
    }
    try:
        response = requests.get(
            url,
            params=params,
            # Explicit User-Agent sent with every API request.
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
        if response.status_code != 200:
            return None

        payload = response.json()
        if "parse" not in payload:
            # The API can answer without a parsed page (e.g. an "error"
            # object for a missing title); report its message instead of
            # surfacing an opaque KeyError.
            info = payload.get("error", {}).get("info", "resposta sem 'parse'")
            print(f"Erro ao processar {page_name}: {info}")
            return None
        html = payload["parse"]["text"]["*"]

        # Match /user/, /channel/ or /c/ links and capture the id segment.
        match = re.search(
            r'https:\/\/www\.youtube\.com\/(?:user|channel|c)\/([a-zA-Z0-9_-]+)',
            html
        )
        if match:
            return match.group(1)
    except Exception as e:
        # Deliberate best effort: one bad page must not abort the whole run.
        print(f"Erro ao processar {page_name}: {e}")
    return None

In [0]:
# Look up each Wikipedia page in order and keep only the pages for which a
# YouTube id was found (falsy results are dropped).
users_data = [
    {"wiki_page": page, "user_id": user_id}
    for page, user_id in (
        (title, extract_user_id_from_wiki(title)) for title in wiki_pages
    )
    if user_id
]

In [0]:
# Pass an explicit schema instead of relying on inference from the dicts:
# inference fails when `users_data` is empty (no id found for any page).
# The column order (user_id, wiki_page) matches the table output shown below.
df_users = spark.createDataFrame(
    users_data,
    schema="user_id string, wiki_page string",
)

In [0]:
# Persist the result as a Delta table, replacing any previous contents.
(
    df_users.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("default.users_yt")
)

In [0]:
# Confirmation message printed once the preceding cells have run.
print("Tabela 'users_yt' criada com sucesso 🚀")

Tabela 'users_yt' criada com sucesso 🚀


In [0]:
%sql
-- Preview the rows stored in default.users_yt.
SELECT * FROM default.users_yt

user_id,wiki_page
CanalKondZilla,KondZilla
luccasneto,Luccas_Neto
UCbCmjCuTUZos6Inko4u57UQ,Cocomelon
portadosfundos,Porta_dos_Fundos
felipeneto,Felipe_Neto
PewDiePie,PewDiePie
judsonlaipply,Judson_Laipply
raywilliamjohnson,Ray_William_Johnson
MoreZoella,Zoe_Sugg
