This notebook creates a dictionary out of the vocabulary provided by 実用日本語表現辞典.

In [9]:
# String utility

import re

# Matches every hiragana letter from "ぁ" (U+3041) to "ゖ" (U+3096).
# The previous upper bound "ん" (U+3093) left "ゔ" (U+3094) unconverted.
hiragana_re = re.compile(r'[ぁ-ゖ]')

def hiragana2katakana(hiragana, convert_split_vu=False):
    """Convert every hiragana letter in ``hiragana`` to katakana.

    Each matched hiragana letter is shifted by 0x60 code points, which is
    the offset between a hiragana letter and its katakana counterpart
    (e.g. "あ" U+3042 -> "ア" U+30A2). All other characters are returned
    unchanged.

    Args:
        hiragana: The text to convert.
        convert_split_vu: Set to True if the provided text writes "ゔ" as
            the two characters "う゛"; that pair is then replaced by "ヴ"
            before the conversion. Defaults to False.

    Returns:
        The converted string.
    """
    if convert_split_vu:
        hiragana = hiragana.replace('う゛', 'ヴ')
    return hiragana_re.sub(lambda x: chr(ord(x.group(0)) + 0x60), hiragana)

def has_katakana_only(string):
    """Return True if ``string`` consists solely of katakana letters "ァ"-"ヴ".

    The empty string counts as katakana-only. Characters outside the range
    U+30A1-U+30F4 (for example the prolonged sound mark "ー", Latin letters,
    whitespace) make the check fail.

    ``re.fullmatch`` is used instead of ``re.match`` with ``^...$`` because
    ``$`` also matches just before a trailing newline, which let strings
    such as "カタカナ\\n" pass.

    Args:
        string: The text to check.

    Returns:
        bool: True when every character is in the accepted katakana range.
    """
    return re.fullmatch("[ァ-ヴ]*", string) is not None

In [26]:
# Website information

# Base URL of the category index. The scraping cell appends an entry of
# `initial_letters` (and then '/<page number>') to this prefix.
ROOT_PATH = 'https://www.weblio.jp/category/dictionary/jtnhj/'

# Romanized kana keys, one per index section, listed in the order in which
# the sections are scraped.
initial_letters = [
    'aa', 'ii', 'uu', 'ee', 'oo',
    'ka', 'ki', 'ku', 'ke', 'ko',
    'sa', 'shi', 'su', 'se', 'so',
    'ta', 'chi', 'tsu', 'te', 'to',
    'na', 'ni', 'nu', 'ne', 'no',
    'ha', 'hi', 'fu', 'he', 'ho',
    'ma', 'mi', 'mu', 'me', 'mo',
    'ya', 'yu', 'yo',
    'ra', 'ri', 'ru', 're', 'ro',
    'wa', 'wo', 'nn',
    'ga', 'gi', 'gu', 'ge', 'go',
    'za', 'zi', 'zu', 'ze', 'zo',
    'da', 'du', 'de', 'do', # 'di' doesn't exist
    'ba', 'bi', 'bu', 'be', 'bo',
    'pa', 'pi', 'pu', 'pe', 'po'
]

# Each section is split into numbered pages, addressed below ROOT_PATH as
# '<initial_letter>/<page_number>'.

In [None]:
# Scraping

import time
from requests import get
from requests.exceptions import RequestException
from contextlib import closing # For with construction
from bs4 import BeautifulSoup

def get_html(url, timeout=30):
    """Download ``url`` and return its body if the response is an HTML page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The raw response body (bytes) when the status code is 200 and the
        Content-Type header mentions "html"; otherwise None. None is also
        returned, after printing the error, when the request itself fails
        (including a timeout).
    """
    try:
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            # .get() with a default avoids a KeyError when the server sends
            # no Content-Type header; such a response is simply not "good".
            content_type = resp.headers.get('Content-Type', '').lower()
            is_good_response = (resp.status_code == 200
                and 'html' in content_type)
            if is_good_response:
                return resp.content
            return None

    except RequestException as e:
        print(e)
        return None

def _fetch_soup(url):
    """Download ``url`` and parse it; return None when the download failed."""
    raw_html = get_html(url)
    if raw_html is None:
        return None
    return BeautifulSoup(raw_html, 'html.parser')

# Collected [original_word, katakana_word] pairs, in scraping order.
words = []
# URLs that could not be downloaded; reported at the end so that an
# incomplete result does not go unnoticed.
failed_urls = []

for initial_letter in initial_letters:
    print(initial_letter)

    # Get the max page number of that initial_letter
    first_page_url = ROOT_PATH + initial_letter
    first_page_html = _fetch_soup(first_page_url)
    if first_page_html is None:
        # get_html returns None on any failure; skip this section instead of
        # crashing inside BeautifulSoup.
        failed_urls.append(first_page_url)
        continue
    pager = first_page_html.select_one('.CtgryPg')
    if pager is None: # There is only one page
        page_max = 1
    else: # There are multiple pages
        # The second-to-last link of the pager is read as the last page
        # number (presumably the final link is a "next" link - TODO confirm).
        page_max = int(pager.select('a')[-2].get_text())

    for page_number in range(1, page_max + 1):
        print('page: ' + str(page_number))
        page_url = ROOT_PATH + initial_letter + '/' + str(page_number)
        html = _fetch_soup(page_url)
        if html is None:
            failed_urls.append(page_url)
        else:
            # Entries are spread over a left and a right column; tolerate a
            # page on which one of the two columns is missing.
            entries = []
            for column_selector in ('.CtgryUlL', '.CtgryUlR'):
                column = html.select_one(column_selector)
                if column is not None:
                    entries.extend(column.select('li'))

            for li in entries:
                # Strip surrounding whitespace so it neither ends up in the
                # output nor makes an otherwise valid word fail the check.
                original_word = li.get_text().strip()
                katakana_word = hiragana2katakana(original_word)
                # Keep kana-only words and drop consecutive duplicates.
                if has_katakana_only(katakana_word) and (not words or words[-1][1] != katakana_word):
                    words.append([original_word, katakana_word])
        # Be polite to the server between page requests.
        time.sleep(0.1)

if failed_urls:
    print('Could not fetch ' + str(len(failed_urls)) + ' page(s):')
    for failed_url in failed_urls:
        print('  ' + failed_url)

print(words[:50])

In [29]:
# Convert the list into a pandas DataFrame and save it

import os

import pandas as pd

OUTPUT_PATH = 'dic/practical.csv'

# to_csv does not create missing directories, so make sure 'dic/' exists.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# One row per [original_word, katakana_word] pair collected by the scraper.
df = pd.DataFrame(words)
# Write the rows only: no header line and no index column.
df.to_csv(OUTPUT_PATH, encoding='utf-8', header=False, index=False)