In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import logging
from datetime import datetime

In [3]:
# ロギングの設定
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# スクレイピング関数の定義
def scrape_page(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('a', href=True)
        urls_titles = []
        for a_tag in articles:
            h2_tag = a_tag.find('h2')
            if h2_tag:
                urls_titles.append({
                    "Title": h2_tag.text,
                    "URL": a_tag['href']
                })
        return urls_titles
    except requests.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        return []

# スクレイピング開始処理
def start_scraping(base_url, max_pages):
    all_data = []
    for page_num in range(1, max_pages + 1):
        current_url = f"{base_url}/page/{page_num}/"
        logging.info(f"Scraping {current_url}")
        data = scrape_page(current_url)
        all_data.extend(data)
    return all_data

# ページから名言を抽出する関数
def extract_quotes(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        divs = soup.find_all('div', class_='blank-box bb-green')
        quotes_texts = []
        for div in divs:
            strong_tags = div.find_all('strong')
            combined_text = ' '.join(tag.get_text(strip=True) for tag in strong_tags)
            if combined_text:
                quotes_texts.append(combined_text)
        return quotes_texts
    except Exception as e:
        logging.error(f"Error extracting quotes from {url}: {e}")
        return []

# 全てのURLに対して名言を抽出し、リストに追加
def scrape_all_quotes(urls_titles):
    all_quotes = []
    for url_title in urls_titles:
        logging.info(f"Extracting quotes from {url_title['URL']}")
        quotes = extract_quotes(url_title['URL'])
        for quote in quotes:
            all_quotes.append({"Title": url_title['Title'], "Quote": quote, "URL": url_title['URL']})
    return all_quotes

#DBファイルとして保存する関数
def save_to_database(quotes, db_name=None):
    if db_name is None:
        date_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        db_name = f'quotes_{date_str}.db'
    
    try:
        df = pd.DataFrame(quotes)
        conn = sqlite3.connect(db_name)
        df.to_sql('quotes', conn, if_exists='append', index=False)
        conn.close()
        logging.info(f"Data successfully saved to {db_name}")
    except Exception as e:
        logging.error(f"Error saving data to database: {e}")

# 実行部分 もともとは　213pages
if __name__ == "__main__":
    base_url = "https://bontoku.com"
    max_pages = 1
    scraped_data = start_scraping(base_url, max_pages)
    all_quotes = scrape_all_quotes(scraped_data)
    save_to_database(all_quotes)


2024-04-20 08:52:42,190 - INFO - Scraping https://bontoku.com/page/1/


2024-04-20 08:52:42,918 - INFO - Extracting quotes from https://bontoku.com/meigen-yorokobareruhitoni
2024-04-20 08:52:43,668 - INFO - Extracting quotes from https://bontoku.com/meigen-nagamatusigehisa
2024-04-20 08:52:44,531 - INFO - Extracting quotes from https://bontoku.com/akindle-sellinfo
2024-04-20 08:52:46,702 - INFO - Extracting quotes from https://bontoku.com/meigen-nisioiti
2024-04-20 08:52:47,572 - INFO - Data successfully saved to quotes_20240420_085247.db


In [5]:
#dbのクレンジング

import sqlite3

# データベース名を指定
db_name = 'quotes_20240417_135122_加工用.db'  # ここに実際のデータベースファイル名を入力してください

# SQLiteに接続
conn = sqlite3.connect(db_name)
cursor = conn.cursor()

# 不要な行を削除
delete_queries = [
    "DELETE FROM quotes WHERE Quote LIKE '%下に目次あります%'",
    "DELETE FROM quotes WHERE Quote LIKE '%↓目次をクリック↓%'"
]

for query in delete_queries:
    cursor.execute(query)
    print(f"Deleted rows: {cursor.rowcount}")  # 削除された行数を表示

# 変更をコミットして接続を閉じる
conn.commit()
conn.close()

print("The database has been updated.")


OperationalError: no such table: quotes