# In [None]:
import os
import re
import time
import math
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# Configuration constants
KW_PARAMETER = '公式サイト'  # extra search term ("official site") appended to every search query
INPUT_FILE = '/content/input.csv'  # CSV that main() reads the company list from
OUTPUT_FILE = '/content/seturl.csv'  # CSV that main() rewrites after every processed row
TIMEOUT = 30  # seconds; passed as the `timeout` of requests.get when fetching pages
SLEEP_RANGE = (2, 4)  # (min, max) seconds for the random pause between rows

# Regular-expression patterns
#
# PHONE_PATTERN: a phone number written as three hyphen-separated digit groups.
# - `\d` in a str pattern already matches full-width digits (０-９), so no
#   separate full-width alternative is needed; `－` is the full-width hyphen.
# - The look-arounds keep a match from starting or ending in the middle of a
#   longer digit run (e.g. an order number), which would yield a bogus number.
# - The alternation is non-capturing so findall()/groups() behave the same for
#   every format instead of returning '' for all but the last one.
PHONE_PATTERN = re.compile(
    r'(?<!\d)(?:'
    r'\d{4}[-－]\d{2}[-－]\d{4}|'  # 4-2-4 format
    r'\d{2}[-－]\d{4}[-－]\d{4}|'  # 2-4-4 format
    r'\d{3}[-－]\d{3}[-－]\d{4}|'  # 3-3-4 format
    r'\d{3}[-－]\d{4}[-－]\d{4}|'  # 3-4-4 format
    r'\d{4}[-－]\d{3}[-－]\d{3}'   # 4-3-3 format
    r')(?!\d)'
)
# EMAIL_PATTERN: dot-separated runs of [A-Za-z0-9_-] on both sides of '@',
# ending in an alphabetic top-level label. The hyphen is placed last in each
# character class so it cannot be misread as a range operator.
EMAIL_PATTERN = re.compile(
    r'[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*'
    r'@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*\.[a-zA-Z]+'
)

# Shared keyword lists.
# extract_company_info() hands these to find_priority_link(), which picks a
# link when one of the keywords occurs in the link's href.
CONTACT_KEYWORDS = ['contact', 'inquiry', 'support']
COMPANY_INFO_KEYWORDS = ['会社概要', '会社案内', '企業概要', '企業情報', 'company', 'about']

# Public IP lookup
def get_public_ip():
    """Return this machine's public IP address as reported by httpbin.org.

    Returns:
        The ``origin`` field of the JSON document served by
        ``https://httpbin.org/ip``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            service answers with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get('https://httpbin.org/ip', timeout=TIMEOUT)
    # Surface HTTP errors directly instead of failing later on an error page
    # that is not the expected JSON.
    response.raise_for_status()
    return response.json()['origin']

# URL sanity check
def is_valid_url(url):
    """Return True only when *url* has both a scheme and a network location."""
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)

# Relative-to-absolute URL conversion
def convert_to_absolute_url(base_url, link):
    """Resolve *link* against *base_url* and return the resulting URL."""
    resolved = urljoin(base_url, link)
    return resolved

# Look up a company's website through a Google search
def get_company_url(company_name, representative_director, address):
    """Search Google for the company and return the href of the result anchor.

    The search text is the company name, representative director and address
    (whichever are present) followed by ``KW_PARAMETER``.

    Args:
        company_name: Company name; ``None``/NaN/empty values are left out.
        representative_director: Representative's name; same handling.
        address: Company address; same handling.

    Returns:
        The ``href`` of the first ``<a>`` element that has a ``data-sb``
        attribute and ``jsname="UWckNb"``, or the literal string ``"null"``
        when the response contains no such element.

    Raises:
        requests.RequestException: If the search request fails or times out.
    """
    search_url = 'https://www.google.com/search'

    # Empty CSV cells arrive from pandas as NaN, which is truthy; without this
    # filter they would be searched for as the literal text "nan".
    terms = [
        str(value).strip()
        for value in (company_name, representative_director, address)
        if not pd.isna(value)
    ]
    terms.append(KW_PARAMETER)
    search_text = ' '.join(term for term in terms if term)

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }

    # Passing the text through `params` URL-encodes it, so names containing
    # spaces, '&', '#', '+' or non-ASCII characters reach the server intact.
    res = requests.get(
        search_url,
        params={'q': search_text},
        headers=headers,
        timeout=TIMEOUT,
    )
    soup = BeautifulSoup(res.text, 'html.parser')

    # Show the URL that was actually requested.
    print(res.url)

    # Extract the result link.
    # NOTE(review): this selector depends on the markup of the results page.
    result_anchor = soup.find("a", attrs={"data-sb": True, "jsname": "UWckNb"})
    url = result_anchor.get("href") if result_anchor else "null"
    print(url)
    return url

# Scrape contact details and key links from a company page
def extract_company_info(url):
    """Fetch *url* and extract a phone number, an e-mail address and two links.

    Args:
        url: Address of the company page.

    Returns:
        A 4-tuple ``(phone_number, email_address, contact_link,
        company_info_link)``. An element is ``None`` when nothing was found;
        all four are ``None`` when *url* is not a valid URL or when fetching
        or parsing the page raised an exception.
    """
    if not is_valid_url(url):
        print(f"Invalid URL skipped: {url}")
        return None, None, None, None

    try:
        # NOTE(security): verify=False turns off TLS certificate validation.
        # Kept as in the original; review whether it is really required.
        response = requests.get(url, verify=False, timeout=TIMEOUT)

        # requests decodes text/* responses as ISO-8859-1 when the header
        # carries no charset, which garbles Japanese pages. Parse the raw
        # bytes instead so BeautifulSoup can use the page's own <meta>
        # declaration, and only force the header's charset when one was sent.
        content_type = response.headers.get('Content-Type', '')
        header_encoding = response.encoding if 'charset' in content_type.lower() else None
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=header_encoding)
        text = soup.get_text()

        phone_match = PHONE_PATTERN.search(text)
        email_match = EMAIL_PATTERN.search(text)
        phone_number = phone_match.group() if phone_match else None
        email_address = email_match.group() if email_match else None

        contact_link = find_priority_link(soup, CONTACT_KEYWORDS, url)
        company_info_link = find_priority_link(soup, COMPANY_INFO_KEYWORDS, url)

        return phone_number, email_address, contact_link, company_info_link

    except Exception as e:
        # Deliberately broad: one unreachable or malformed site must not stop
        # the whole batch, so the failure is reported and the row left empty.
        print(f"Error accessing {url}: {e}")
        return None, None, None, None

# Find the first link whose href mentions one of the keywords
def find_priority_link(soup, keywords, base_url):
    """Return the first link in *soup* whose href contains one of *keywords*.

    The href is compared in lower case, both as written and with its
    percent-escapes decoded, so a non-ASCII keyword such as '会社概要' also
    matches a link written as ``/%E4%BC%9A%E7%A4%BE...``.

    Args:
        soup: Parsed document; must provide ``find_all('a')``.
        keywords: Substrings to look for. They are compared against the
            lower-cased href, so they should be lower case themselves.
        base_url: URL used to resolve a relative href.

    Returns:
        The absolute URL of the first matching anchor in document order, or
        ``None`` when no anchor matches.
    """
    # Local import: the module level only imports urlparse and urljoin.
    from urllib.parse import unquote

    for link in soup.find_all('a'):
        href = link.get('href', '')
        if not href:
            continue
        # Computed once per link rather than once per keyword.
        raw = href.lower()
        decoded = unquote(href).lower()
        if any(keyword in raw or keyword in decoded for keyword in keywords):
            return convert_to_absolute_url(base_url, href)
    return None

# Main processing and its resume helper
def _next_unprocessed_position(df):
    """Return the row position just past the last row of *df* flagged complete.

    Returns 0 when *df* has no '完了フラグ' column or no row whose flag is 1.
    """
    if '完了フラグ' not in df.columns:
        return 0
    last_done = df[df['完了フラグ'] == 1].index.max()
    return 0 if pd.isna(last_done) else int(last_done) + 1


def main():
    """Process the rows of INPUT_FILE and write the results to OUTPUT_FILE.

    For every unprocessed row the company's website is looked up with
    get_company_url(), scraped with extract_company_info(), and the combined
    record is appended to the output table, which is written back to
    OUTPUT_FILE after each row. A random pause drawn from SLEEP_RANGE follows
    every row.

    Raises:
        KeyError: If the input CSV lacks the '企業名', '代表取締役' or '住所'
            column.
    """
    input_df = pd.read_csv(INPUT_FILE)
    if os.path.exists(OUTPUT_FILE):
        output_df = pd.read_csv(OUTPUT_FILE)
    else:
        output_df = pd.DataFrame(columns=['企業名', '代表取締役', '住所', 'ホームページ', '完了フラグ'])

    # Completion flags are only ever written to OUTPUT_FILE, so progress made
    # by an earlier run has to be read from there; reading it from the input
    # alone restarted from the same row every time and duplicated output rows
    # (or raised KeyError when the input has no flag column). Flags already
    # present in the input are still honoured.
    # Assumes the existing output rows were produced from this input in order.
    start_index = max(
        _next_unprocessed_position(input_df),
        _next_unprocessed_position(output_df),
    )

    for _, row in input_df.iloc[start_index:].iterrows():
        company_name = row['企業名']
        representative_director = row['代表取締役']
        address = row['住所']

        url = get_company_url(company_name, representative_director, address)
        phone, email, contact, company_info = extract_company_info(url)

        new_row = pd.DataFrame({
            '企業名': [company_name],
            '代表取締役': [representative_director],
            '住所': [address],
            'ホームページ': [url],
            '電話番号': [phone],
            'メールアドレス': [email],
            '問い合わせページ': [contact],
            '企業概要ページ': [company_info],
            '完了フラグ': [1]
        })

        output_df = pd.concat([output_df, new_row], ignore_index=True)
        # Written after every row so an interrupted run keeps its progress.
        output_df.to_csv(OUTPUT_FILE, index=False)

        # Random pause between rows.
        time.sleep(random.uniform(*SLEEP_RANGE))

    print("処理が完了しました。")

if __name__ == '__main__':
    # Report the public IP first, then run the batch.
    public_ip = get_public_ip()
    print("パブリックIPアドレス:", public_ip)
    main()
