In [None]:
import time
import re
import sqlite3
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Site root only; page URLs are derived from it so the path is never duplicated.
BASE_URL = "https://github.com"
ORG = "google"
# Organization repository listing. The query string must come last, after the
# path, otherwise the resulting URL is malformed and GitHub answers 404.
LIST_URL = f"{BASE_URL}/orgs/{ORG}/repositories?type=all"

# Request headers sent with every page fetch.
HEADERS = {
    "User-Agent": "google-repos-scraper/1.0 (+contact@example.com)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "close",
}

def parse_star_count(text: str) -> int:
    """Convert an abbreviated count string into an integer.

    Accepted forms look like "1.2k", "3,456", "987" or "2.1M": optional
    thousands separators, an optional decimal fraction and an optional
    case-insensitive ``k`` (thousand) or ``m`` (million) suffix.

    Args:
        text: Raw text of the counter. ``None`` or an empty string is
            treated as "no count".

    Returns:
        The count as an integer, or 0 when the text is empty or does not
        match the expected format. Digits finer than one unit are truncated.
    """
    if not text:
        return 0
    normalized = text.strip().lower().replace(",", "")
    m = re.match(r"^([0-9]+)(?:\.([0-9]+))?([km]?)$", normalized)
    if not m:
        return 0
    whole = m.group(1)
    fraction = m.group(2) or ""
    # Number of decimal places the suffix shifts the value by.
    shift = {"": 0, "k": 3, "m": 6}[m.group(3)]
    # Scale with digit manipulation instead of float multiplication: a float
    # product can land just below the true value and then lose a whole unit
    # when truncated to int.
    shifted_fraction = fraction[:shift].ljust(shift, "0")
    return int(whole + shifted_fraction)

def fetch(url: str, timeout=(10, 60)) -> str:
    """Download ``url`` with the module's HEADERS and return the body text.

    ``timeout`` is passed straight through to ``requests.get``. A non-success
    HTTP status is surfaced via ``raise_for_status()``.
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    response.raise_for_status()
    body = response.text
    return body

def _select_first(node, selectors):
    """Return the first element matched by any selector, tried in order."""
    for selector in selectors:
        found = node.select_one(selector)
        if found is not None:
            return found
    return None


def extract_repos_from_list_page(html: str) -> List[Dict[str, Optional[str]]]:
    """Parse one organization listing page into repository records.

    Args:
        html: HTML of a repository listing page.

    Returns:
        One dict per repository card that yielded a name, with the keys
        ``"name"`` (str), ``"language"`` (str or None) and ``"stars"``
        (int, 0 when no star counter was found or it could not be parsed).
        Cards without a usable name are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Locate the repository cards. GitHub's markup can change, so several
    # candidate selectors are tried in turn.
    repo_cards = (
        soup.select("li[data-test-selector='repo-list-item']")
        or soup.select("div.org-repos li")
        or soup.select("div[data-test-selector='org-repositories'] li")
    )

    # Selector lists are ordered most-specific first. A loose selector placed
    # first would match a superset of the narrower ones, so those would never
    # get the chance to pick the right element.
    name_selectors = (
        "a[data-testid='repo-name-link']",
        "a[href^='/{}/']".format(ORG),
        "a[href*='/{}/']".format(ORG),
    )
    star_selectors = (
        "a[aria-label*='star'][href$='/stargazers']",
        "a.Link--muted[href$='/stargazers']",
        "a[href$='/stargazers']",
    )

    results = []
    for card in repo_cards:
        # Repository name link,
        # e.g. <a data-testid="repo-name-link" href="/google/guava">guava</a>
        link = _select_first(card, name_selectors)
        if link is None:
            continue
        name = link.get_text(strip=True)
        # The text is sometimes "google / repo"; keep the trailing part.
        if "/" in name:
            name = name.split("/")[-1].strip()
        if not name:
            continue

        # Language, e.g. <span itemprop="programmingLanguage">Java</span>.
        # The final fallback (first span that has a direct string) is only a
        # heuristic, hence the sanity filter below.
        lang = None
        lang_el = (
            card.select_one("[itemprop='programmingLanguage']")
            or card.select_one("span:has(svg[aria-label='Programming language'])")
            or card.find("span", string=True)
        )
        if lang_el is not None:
            cand = lang_el.get_text(strip=True)
            # Reject text that is clearly not a language name.
            if cand and len(cand) <= 30 and "Updated" not in cand and "Star" not in cand:
                lang = cand

        # Stars, e.g. <a href="/google/guava/stargazers">1.2k</a>
        stars = 0
        star_el = _select_first(card, star_selectors)
        if star_el is not None:
            stars = parse_star_count(star_el.get_text(strip=True))

        results.append({"name": name, "language": lang, "stars": stars})
    return results

def find_next_page_url(html: str, current_url: str) -> Optional[str]:
    """Return the absolute URL of the pagination "next" link, if any.

    Looks for ``a.next_page`` first and ``a[rel='next']`` second. The link's
    ``href`` is resolved against ``current_url``. Returns None when no such
    link exists or it carries no href.
    """
    document = BeautifulSoup(html, "html.parser")

    anchor = document.select_one("a.next_page")
    if not anchor:
        anchor = document.select_one("a[rel='next']")
    if not anchor:
        return None

    href = anchor.get("href")
    if not href:
        return None
    return urljoin(current_url, href)

# DB関連
def init_db(db_path: str = "repos.db"):
    """Open the SQLite database at ``db_path`` and ensure ``repos`` exists.

    The table is created only if missing. The change is committed and the
    open connection is handed back to the caller, who must close it.
    """
    connection = sqlite3.connect(db_path)
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS repos (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE,
            language TEXT,
            stars INTEGER
        )
    """
    connection.cursor().execute(create_table_sql)
    connection.commit()
    return connection

def upsert_repos(con, repos: List[Dict[str, Optional[str]]]):
    """Insert each repo record into ``repos``, updating rows whose name exists.

    Each record must have a ``"name"`` key; ``"language"`` defaults to NULL
    and ``"stars"`` to 0 when absent. All rows are committed together after
    the loop.
    """
    statement = """
            INSERT INTO repos (name, language, stars)
            VALUES (?, ?, ?)
            ON CONFLICT(name) DO UPDATE SET
                language=excluded.language,
                stars=excluded.stars
        """
    cursor = con.cursor()
    for repo in repos:
        params = (repo["name"], repo.get("language"), repo.get("stars", 0))
        cursor.execute(statement, params)
    con.commit()

def main(max_pages: int = 5, db_path: str = "repos.db"):
    """Scrape the organization's repository listing into SQLite and print it.

    Starting at LIST_URL, fetches up to ``max_pages`` listing pages by
    following the pagination "next" link, upserts the repositories found on
    each page, then prints every stored row ordered by stars (descending)
    and name.

    Args:
        max_pages: Upper bound on the number of pages fetched.
        db_path: Path of the SQLite database file.

    Raises:
        Whatever ``fetch`` raises for a failed request (e.g. an HTTP error);
        the database connection is closed before the error propagates.
    """
    con = init_db(db_path)
    try:
        page_url = LIST_URL
        pages_fetched = 0
        # A "next" link pointing back at a page already fetched would
        # otherwise be followed again and again until max_pages is used up.
        visited = set()

        while page_url and pages_fetched < max_pages and page_url not in visited:
            visited.add(page_url)
            print(f"Fetching: {page_url}")
            html = fetch(page_url)
            repos = extract_repos_from_list_page(html)
            print(f" - Found {len(repos)} repos on this page")
            upsert_repos(con, repos)
            pages_fetched += 1

            page_url = find_next_page_url(html, page_url)

            # Throttle consecutive requests; no need to wait after the last one.
            if page_url and pages_fetched < max_pages:
                time.sleep(1)

        # Show what has been stored.
        cur = con.cursor()
        for name, lang, stars in cur.execute(
            "SELECT name, language, stars FROM repos ORDER BY stars DESC, name ASC"
        ):
            print(f"{name}\t{lang or ''}\t{stars}")
    finally:
        # Release the connection even when a fetch or parse step fails.
        con.close()

# Script entry point: crawl up to 1000 listing pages into google_repos.db.
if __name__ == "__main__":
    main(db_path="google_repos.db", max_pages=1000)

Fetching: https://github.com/orgs/google/repositories/orgs/google/repositories


HTTPError: 404 Client Error: Not Found for url: https://github.com/orgs/google/repositories/orgs/google/repositories