<a href="https://colab.research.google.com/github/shrmtmt/Get-English-Japanese-translation-from-weblio/blob/main/EJ_dictionary_with_GoogleWorksheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#@title #設定 { run: "auto" }

# Colab form cell: every value below is a notebook-level setting read by the
# cells further down. The `#@title`, `#@markdown` and `#@param` comments are
# Colab form markup and must keep their exact shape.

# How the spreadsheet is identified; one of the three options listed in #@param.
#@markdown ###英単語が記されたGoogle Spreadsheetの指定方法を選択してください
specify_method = 'File name' #@param ["File name", "URL", "Spreadsheet ID"] {allow-input: false}

# The spreadsheet's file name, URL or ID, interpreted according to specify_method.
#@markdown ###英単語が記されたGoogle Spreadsheet名、Google Spreadsheet URL、Google Spreadsheet IDは
spreadsheet_name = 'EnglishWorkListSample'#@param {type:"string"}

# Name of the worksheet (tab) that contains the English words.
#@markdown ###英単語が記されたシート名は
sheet_name = 'Sheet1'#@param {type:"string"}

# 1-based column number (A = 1) holding the English words to look up.
#@markdown ###英単語が記された列番号は（取得しない場合=0、A列=1）
input_column = 1#@param {type:"integer"}

# 1-based output column for the Japanese translation; 0 disables this output.
#@markdown ###和訳の保存先列番号は（取得しない場合=0、A列=1）
translation_column = 2#@param {type:"integer"}

# 1-based output column for the word level; 0 disables this output.
#@markdown ###レベルの保存先列番号は（取得しない場合=0、A列=1）
level_column = 3#@param {type:"integer"}

# 1-based output column for the phonetic transcription; 0 disables this output.
#@markdown ###発音記号の保存先列番号は（取得しない場合=0、A列=1）
phonetic_column = 4#@param {type:"integer"}

# 1-based output column for the "audio downloaded" marker; 0 disables this output.
#@markdown ###音声取得済みチェックの出力先列番号は（取得しない場合=0、A列=1）
sound_column = 5#@param {type:"integer"}

# Folder for the downloaded MP3 files; it is joined onto
# '/content/drive/My Drive/' by WeblioScraper. The \u escapes are Japanese
# folder names meaning "study" and "English".
#@markdown ###音声MP3ファイルの保存先フォルダ名は
sound_path = '\u52C9\u5F37/\u82F1\u8A9E/EnglishWorkListSample_sound'#@param {type:"string"}

# Confirmation message ("Settings complete.") shown after the form values are applied.
print('設定が終了しました。')

設定が終了しました。


In [2]:
!pip install gspread



In [3]:
import requests
import os
from bs4 import BeautifulSoup
import urllib.parse
import gspread
from google.colab import drive, auth
from google.auth import default
from time import sleep

from google.colab import drive
# Mount Google Drive at /content/drive; downloaded MP3 files are written
# beneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
class WeblioScraper:
    """Look up English words on Weblio and record the results in a Google Sheet.

    For each word the scraper requests the Weblio English-Japanese dictionary
    page, extracts the translation, level and phonetic transcription with CSS
    selectors, optionally downloads the pronunciation MP3, and writes the
    values into the configured worksheet columns.

    Column numbers are 1-based (A = 1); a value of 0 disables that output.
    The CSS selectors are tied to Weblio's page markup; when an element is
    missing the corresponding field is reported as ``"not_found"``.
    """

    # Placeholder stored for every field that could not be obtained.
    NOT_FOUND = "not_found"
    # Base address of a Weblio dictionary entry; the escaped word is appended.
    BASE_URL = "https://ejje.weblio.jp/content/"
    # Seconds to wait for an HTTP response before giving up on the request.
    REQUEST_TIMEOUT = 10

    def __init__(self, spreadsheet_name, sheet_name, sound_path, input_column, translation_column, level_column, phonetic_column, sound_column, specify_method=None):
        """Authenticate with Google and open the target worksheet.

        Args:
            spreadsheet_name: File name, URL or ID of the spreadsheet,
                interpreted according to ``specify_method``.
            sheet_name: Name of the worksheet inside the spreadsheet.
            sound_path: Folder for MP3 files, relative to
                '/content/drive/My Drive/'.
            input_column: Column holding the English words.
            translation_column: Output column for the translation (0 = off).
            level_column: Output column for the level (0 = off).
            phonetic_column: Output column for the phonetic text (0 = off).
            sound_column: Output column for the MP3 file name (0 = off).
            specify_method: "File name", "URL" or "Spreadsheet ID". When
                omitted, the notebook-level ``specify_method`` setting is used
                (falling back to "File name"), which keeps existing callers
                working.

        Raises:
            Exception: If the worksheet cannot be opened.
        """
        if specify_method is None:
            # Backward compatibility: older callers relied on the global setting.
            specify_method = globals().get("specify_method", "File name")

        auth.authenticate_user()
        creds, _ = default()
        gc = gspread.authorize(creds)

        self.worksheet = self._get_worksheet(gc, specify_method, spreadsheet_name, sheet_name)
        self.download_folder = os.path.join('/content/drive/My Drive/', sound_path)
        self.input_column = input_column
        self.translation_column = translation_column
        self.level_column = level_column
        self.phonetic_column = phonetic_column
        self.sound_column = sound_column

    def _get_worksheet(self, gc, specify_method, spreadsheet_name, sheet_name):
        """Open ``sheet_name`` in the spreadsheet selected by ``specify_method``.

        Raises:
            Exception: With a readable message and the original error chained
                as ``__cause__`` when the method is unknown or the spreadsheet
                or worksheet cannot be opened.
        """
        try:
            if specify_method == "File name":
                spreadsheet = gc.open(spreadsheet_name)
            elif specify_method == "URL":
                spreadsheet = gc.open_by_url(spreadsheet_name)
            elif specify_method == "Spreadsheet ID":
                spreadsheet = gc.open_by_key(spreadsheet_name)
            else:
                raise Exception("Incorrect spreadsheet specification")
            return spreadsheet.worksheet(sheet_name)
        except Exception as e:
            # Format the cause into the message (instead of passing it as a
            # second positional argument) and chain it explicitly.
            raise Exception(f"Error accessing the worksheet: {e}") from e

    def _build_url(self, word):
        """Return the Weblio entry URL for ``word`` with the word percent-encoded."""
        # safe="" also escapes "/", "?" and "#", which would otherwise change
        # the path or start a query string / fragment.
        return self.BASE_URL + urllib.parse.quote(str(word).strip(), safe="")

    def fetch_data(self, word):
        """Fetch translation, level, phonetic text and audio for ``word``.

        Returns:
            dict with the keys "translation", "level", "phonetic" and "sound".
            Every value is "not_found" when the page cannot be retrieved
            (network error, timeout or non-200 status).
        """
        word = str(word).strip()
        not_found = {key: self.NOT_FOUND for key in ("translation", "level", "phonetic", "sound")}

        try:
            response = requests.get(self._build_url(word), timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # A failed lookup is reported like a missing page so that one bad
            # request does not abort the whole run.
            return not_found

        if response.status_code != 200:
            return not_found

        response.encoding = response.apparent_encoding
        data = BeautifulSoup(response.text, "html.parser")
        return {
            "translation": self._get_translation(data),
            "level": self._get_level(data),
            "phonetic": self._get_phonetic(data),
            "sound": self._get_sound(data, word),
        }

    def _get_translation(self, data):
        """Return the stripped translation text, or "not_found"."""
        translation_selector = "#summary > div.summaryM.descriptionWrp > p > span.content-explanation.ej"
        translation_element = data.select_one(translation_selector)
        return translation_element.text.strip() if translation_element else self.NOT_FOUND

    def _get_level(self, data):
        """Return the level text, or "not_found"."""
        level_selector = "#learning-level-table > div > span:nth-of-type(1) > span:nth-of-type(3)"
        level_element = data.select_one(level_selector)
        return level_element.text if level_element else self.NOT_FOUND

    def _get_phonetic(self, data):
        """Return the phonetic transcription text, or "not_found"."""
        phonetic_selector = "#phoneticEjjeNavi > div > span:nth-child(2)"
        phonetic_element = data.select_one(phonetic_selector)
        return phonetic_element.text if phonetic_element else self.NOT_FOUND

    def _get_sound(self, data, word):
        """Download the pronunciation MP3 and return its file name.

        Returns "not_found" when audio output is disabled (``sound_column`` is
        0, which the settings define as "do not fetch"), when the page has no
        audio source, or when the download fails.
        """
        if self.sound_column <= 0:
            return self.NOT_FOUND

        sound_selector = "#summary > div.summary-title-wrp > div.summary-icon-cells > div:nth-child(1) > i > audio > source"
        mp3_url_element = data.select_one(sound_selector)
        if mp3_url_element is None:
            return self.NOT_FOUND

        # .get avoids a KeyError when the <source> tag has no src attribute.
        mp3_url = mp3_url_element.get('src')
        if not mp3_url:
            return self.NOT_FOUND

        try:
            response = requests.get(mp3_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return self.NOT_FOUND
        if response.status_code != 200:
            return self.NOT_FOUND

        os.makedirs(self.download_folder, exist_ok=True)
        mp3_filename = word + '.mp3'
        with open(os.path.join(self.download_folder, mp3_filename), 'wb') as mp3_file:
            mp3_file.write(response.content)
        return mp3_filename

    def update_worksheet(self, row, data):
        """Write the fetched values for ``row`` into every enabled output column.

        Args:
            row: 1-based worksheet row to update.
            data: dict as returned by ``fetch_data``.
        """
        targets = (
            (self.translation_column, "translation"),
            (self.level_column, "level"),
            (self.phonetic_column, "phonetic"),
            (self.sound_column, "sound"),
        )
        # Resolve every value first so a missing key fails before any write.
        pending = [(column, data[key]) for column, key in targets if column > 0]
        for column, value in pending:
            self.worksheet.update_cell(row, column, value)


In [7]:
def main():
    """Process every word in the input column until the first empty cell.

    A row is looked up only when its marker cell (the right-most configured
    output column) is still empty, so re-running the notebook resumes where a
    previous run stopped. Settings are read from the notebook-level
    configuration variables.
    """
    scraper = WeblioScraper(spreadsheet_name, sheet_name, sound_path, input_column, translation_column, level_column, phonetic_column, sound_column)

    # Loop-invariant: the right-most output column marks a row as processed.
    check_column = max(translation_column, level_column, phonetic_column, sound_column)
    if check_column <= 0:
        # Column 0 is not a valid cell address, and with every output disabled
        # there is nothing to write.
        print("No output column is configured; nothing to do.")
        return

    # gspread reports an empty cell as '' or as None depending on its version,
    # so both must be treated as empty or the loop never terminates.
    empty_values = (None, '')

    row = 1
    while True:
        # Read the input cell once per row; it is both the loop condition and
        # the word to look up.
        word = scraper.worksheet.cell(row, input_column).value
        if word in empty_values:
            break

        if scraper.worksheet.cell(row, check_column).value in empty_values:
            result = scraper.fetch_data(word)
            scraper.update_worksheet(row, result)
            print(result)

        # Pause between rows to stay polite to Weblio and the Sheets API.
        sleep(1)
        row += 1


In [None]:
# Run the scraper over the spreadsheet selected in the settings cell.
main()