# GitHubリポジトリのスクレイピング

In [23]:
import requests
from bs4 import BeautifulSoup
import time
import sqlite3


## 設定


In [24]:
# Listing page for the repositories of the "google" organization on GitHub.
base_url = "https://github.com/orgs/google/repositories"

# SQLite database file that the later cells create and fill.
db_name = "課題.db"

# Request headers sent with every page fetch: a browser-style User-Agent string.
headers = dict([
    ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
])


## データベースの作成


In [25]:
# Create the `repositories` table in the SQLite file if it does not exist yet.
#
# `conn` starts out as None so the cleanup in `finally` stays safe when
# sqlite3.connect() itself raises: without this, `conn` would be unbound there
# (NameError), or would still point at a connection left over from an earlier
# run of a cell in the same kernel.
conn = None
try:
    conn = sqlite3.connect(db_name)
    cur = conn.cursor()

    # `stars` is TEXT because the scraper stores the star count as the text
    # shown on the page (e.g. "1.3k"), not as a number.
    sql = '''CREATE TABLE IF NOT EXISTS repositories (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                language TEXT,
                stars TEXT
            );'''

    cur.execute(sql)
    conn.commit()

except sqlite3.Error as e:
    print('エラーが発生しました:', e)
else:
    print('テーブルの作成が完了しました。')
finally:
    # Only close a connection that was actually opened by this cell.
    if conn is not None:
        conn.close()


テーブルの作成が完了しました。


## スクレイピング関数


In [26]:
def _parse_repo_heading(h3):
    """Extract ``(name, language, stars)`` from one ``<h3>`` tag of the listing.

    Args:
        h3: A BeautifulSoup tag for an ``<h3>`` element.

    Returns:
        A ``(name, language, stars)`` tuple of strings, or None when the
        heading contains no ``<a>`` element. ``language`` falls back to the
        string "None" and ``stars`` to the string "0" when the corresponding
        element is not found.
    """
    name_elem = h3.find('a')
    if not name_elem:
        return None

    name = name_elem.text.strip()
    language = "None"
    stars = "0"

    li_parent = h3.find_parent('li')
    if li_parent:
        # NOTE(review): this class name looks build-generated and may change
        # when the site is redeployed; if every language comes back as "None",
        # check this selector first.
        lang_elem = li_parent.find('span', class_='ReposListItem-module__Text_4--mkG7R')
        if lang_elem:
            language = lang_elem.text.strip()

        # The star count is the text of the link whose href contains "stargazers".
        star_elem = li_parent.find('a', href=lambda x: x and 'stargazers' in x)
        if star_elem:
            stars = star_elem.text.strip()

    return (name, language, stars)


def scrape_google_repos(max_pages=3):
    """Scrape repository name, language and star count from the listing pages.

    Requests ``{base_url}?page=N`` for N = 1..max_pages using the module-level
    ``headers``, and parses every ``<h3>`` on each page. Scraping stops early
    when a request fails (the error is printed) or when a page contains no
    ``<h3>`` elements.

    Args:
        max_pages: Maximum number of listing pages to fetch.

    Returns:
        A list of ``(name, language, stars)`` tuples of strings, in page order.
        Contains whatever was collected before an early stop.
    """
    repos = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print("HTTPエラーが発生しました:", e)
            break
        except requests.exceptions.RequestException as e:
            print("リクエストエラーが発生しました:", e)
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        h3_tags = soup.find_all('h3')

        # No headings at all: treat as the end of the listing.
        if not h3_tags:
            break

        for h3 in h3_tags:
            try:
                repo = _parse_repo_heading(h3)
            except Exception as e:
                # Best effort: one malformed entry must not abort the page.
                # `Exception` (not a bare except) so Ctrl-C still interrupts.
                print("リポジトリ情報の解析に失敗したためスキップしました:", e)
                continue
            if repo is not None:
                repos.append(repo)

        # Report completion only for pages that were actually fetched and
        # parsed, so a failed first request no longer prints "ページ 0 完了".
        print(f"ページ {page} 完了")

        # Pause between requests; there is nothing to wait for after the last page.
        if page < max_pages:
            time.sleep(1)

    return repos


## スクレイピング実行

In [27]:
# Fetch up to three listing pages and report how many repositories were collected.
repos = scrape_google_repos(max_pages=3)
print("{}件取得".format(len(repos)))

ページ 1 完了
ページ 2 完了
ページ 3 完了
90件取得


## データベースへの保存

In [28]:
# Insert every scraped (name, language, stars) tuple into the `repositories` table.
#
# NOTE: this is a plain INSERT, so running the cell again appends the same
# rows a second time.
#
# `conn` starts out as None so the cleanup in `finally` stays safe when
# sqlite3.connect() itself raises (otherwise `conn` would be unbound there, or
# would refer to a connection left over from an earlier cell run).
conn = None
try:
    conn = sqlite3.connect(db_name)
    cur = conn.cursor()

    # Placeholders keep the scraped text out of the SQL string itself.
    sql = "INSERT INTO repositories (name, language, stars) VALUES (?, ?, ?);"
    cur.executemany(sql, repos)
    conn.commit()

except sqlite3.Error as e:
    print('エラーが発生しました:', e)
else:
    print(f'{len(repos)}件保存完了')
finally:
    # Only close a connection that was actually opened by this cell.
    if conn is not None:
        conn.close()


90件保存完了


## データベースの表示

In [29]:
# Print every row of the `repositories` table as "id | name | language | stars".
#
# `conn` starts out as None so the cleanup in `finally` stays safe when
# sqlite3.connect() itself raises (otherwise `conn` would be unbound there, or
# would refer to a connection left over from an earlier cell run).
conn = None
try:
    conn = sqlite3.connect(db_name)
    cur = conn.cursor()

    sql = "SELECT * FROM repositories;"
    cur.execute(sql)

except sqlite3.Error as e:
    print('エラーが発生しました:', e)
else:
    # Rows are read while the connection is still open; it is closed in `finally`.
    for row in cur:
        # `repo_id` rather than `id`: this is a top-level cell, so the name
        # would otherwise shadow the builtin id() for the rest of the kernel session.
        repo_id, name, language, stars = row
        print(f"{repo_id} | {name} | {language} | {stars}")
finally:
    # Only close a connection that was actually opened by this cell.
    if conn is not None:
        conn.close()


1 | site-kit-wp | JavaScript | 1.3k
2 | sedpack | Python | 28
3 | nomulus | Java | 1.8k
4 | docsy | JavaScript | 2.9k
5 | deps.dev | Go | 354
6 | budoux | Python | 1.5k
7 | perfetto | C++ | 5k
8 | or-tools | C++ | 13k
9 | googletest-rust | Rust | 393
10 | dwh-migration-tools | Java | 54
11 | gtm-session-fetcher | Objective-C | 265
12 | flatbuffers | C++ | 25k
13 | nearby | C++ | 887
14 | orbax | Python | 455
15 | angle | C++ | 3.8k
16 | osv-scalibr | Go | 537
17 | tcmalloc | C++ | 5k
18 | bigwheels | C++ | 101
19 | device-infra | Java | 58
20 | secops-wrapper | Python | 44
21 | kotlin-fhirpath | Kotlin | 6
22 | protobuf.dart | Dart | 558
23 | tabuli | C++ | 35
24 | tsl | C++ | 101
25 | closure-compiler | JavaScript | 7.6k
26 | yggdrasil-decision-forests | C++ | 622
27 | adk-go | Go | 4.8k
28 | ts-bridge | Go | 54
29 | filonov | Python | 11
30 | synopsys-dw-uart | Rust | 1
31 | percore | Rust | 8
32 | aarch64-rt | Rust | 17
33 | safe-mmio | Rust | 17
34 | pl011-uart | Rust | 6
35 | aarc