In [12]:
import sqlite3

db_path = "google_repos.db"

conn = sqlite3.connect(db_path)
cur = conn.cursor()

cur.execute("""
CREATE TABLE IF NOT EXISTS repos (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    language TEXT,
    stars INTEGER
)
""")

conn.commit()
conn.close()

print("DB作成完了")


DB作成完了


In [13]:
import requests
from bs4 import BeautifulSoup
import time

BASE_URL = "https://github.com/orgs/google/repositories"


In [14]:
def fetch_page(url):
    try:
        res = requests.get(url, timeout=5)
        res.raise_for_status()
        return res.text
    except Exception as e:
        print("Error:", e)
        return None


In [None]:
def parse_repos(html):
    soup = BeautifulSoup(html, "html.parser")
    repo_list = []

    items = soup.select("div.Box-row")
    print("HTMLから取得した件数:", len(items))

    for item in items:
        name_tag = item.select_one("h3 a")
        name = name_tag.text.strip() if name_tag else None

        lang_tag = item.select_one("span[itemprop='programmingLanguage']")
        lang = lang_tag.text.strip() if lang_tag else None

        star_tag = item.select_one("a[href$='stargazers']")
        if star_tag:
            s = star_tag.text.strip().lower().replace(",", "")
            if "k" in s:
                stars = int(float(s.replace("k", "")) * 1000)
            else:
                stars = int(s) if s.isdigit() else 0
        else:
            stars = 0

        repo_list.append((name, lang, stars))

    return repo_list


In [None]:
def scrape_all():
    url = BASE_URL
    all_repos = []

    while url:
        print("Fetch:", url)
        html = fetch_page(url)
        if not html:
            break

        repos = parse_repos(html)
        all_repos.extend(repos)

        soup = BeautifulSoup(html, "html.parser")
        next_btn = soup.select_one("a[rel='next']")
        url = "https://github.com" + next_btn["href"] if next_btn else None

        time.sleep(1)

    return all_repos


In [17]:
def save_to_db(repos):
    conn = sqlite3.connect("google_repos.db")
    cur = conn.cursor()

    for r in repos:
        cur.execute("""
            INSERT INTO repos (name, language, stars)
            VALUES (?, ?, ?)
        """, r)
        conn.commit()

    conn.close()
    print("DB保存完了")


In [18]:
repos = scrape_all()

print("取得したリポジトリ数:", len(repos))

save_to_db(repos)


Fetch: https://github.com/orgs/google/repositories
HTMLから取得した件数: 0
Fetch: https://github.com#2
HTMLから取得した件数: 0
取得したリポジトリ数: 0
DB保存完了


In [19]:
conn = sqlite3.connect("google_repos.db")
cur = conn.cursor()

for row in cur.execute("SELECT * FROM repos LIMIT 30"):
    print(row)

conn.close()
