### 采集趣词词素数据

词素包含前缀后缀和词根

[Word Roots Dictionary](https://www.quword.com/root/)

In [35]:
# 导入库并定义初始变量
import requests
from bs4 import BeautifulSoup

# 测试使用
debug = 0
# groups = ['a', 'b']
groups = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
          'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
main_url = "https://www.quword.com/root/page/"
fake_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}


class WordMap:
    def __init__(self):
        # 前缀
        self.prefix = []
        # 后缀
        self.suffix = []
        # 词根
        self.root = []
        # 其它
        self.other = []

    def extend(self, map):
        self.prefix.extend(map.prefix)
        self.suffix.extend(map.suffix)
        self.root.extend(map.root)
        self.other.extend(map.other)

In [38]:
# 工具方法

# 获取当前字母组最大页码
def getMaxPage(content):
    max_page = 0
    page_bar = content.find('div', class_="paging")
    if page_bar == None:
        return 0
    for page_tag in page_bar.find_all('a'):
        page_now = int(page_tag.text)
        if max_page < page_now:
            max_page = page_now
    # 页码从0开始
    if max_page > 0:
        max_page -= 1
    return max_page

# 获取当前页所有词根词缀
def getPageData(content):
    map = WordMap()
    title_list = content.find_all('h2')
    for title in title_list:
        text = title.text
        type = text[0:text.find("：")]
        # 去除类型前缀
        text = text[text.find("：")+1:len(text)]
        # 去除带有解释的单词和字符
        if text.find('=') > -1:
            text = text[0:text.find('=')].strip()
        # 拆分遍历
        for item in text.split(','):
            # 清理字符串
            item = cleanText(item)
            if len(item) == 0:
                continue
            # 根据标识判断类型
            if item.startswith('-') and item.endswith('-'):
                map.root.append(item)
            elif item.startswith('-'):
                map.prefix.append(item)
            elif item.endswith('-'):
                map.suffix.append(item)
            else:
                if type.startswith("词根"): map.root.append(item)
                else: map.other.append(item)
    return map

# 清理字符串
def cleanText(txt):
    txt = txt.strip()
    list = txt.split(' ')
    item = list[0]
    if item.find('”') > -1:
        return ''
    if item.endswith('(') or item.endswith(')'):
        item = item[0:len(item)-1]
    if item.endswith(';'):
        return item[0:len(item)-1]
    return item

# 获取页面内容
def getPageContent(group, page):
    url = "{}{}/{}".format(main_url, group, page)
    response = requests.get(url, headers=fake_headers)
    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
    return soup.find('div', id="article")

In [39]:
# 获取字母组所有页数据
def getAllGroup():
    pubMap = WordMap()
    for group in groups:
        now_page = 0
        max_page = getMaxPage(getPageContent(group, now_page))
        if debug == 1: print("[%c] 总页数: %i" % (group, max_page))
        while now_page <= max_page:
            map = getPageData(getPageContent(group, now_page))
            if debug == 1: print("第%i页\t前缀: %i\t后缀: %i\t词根: %i\t其他: %i" % (now_page, len(map.prefix), len(map.suffix), len(map.root), len(map.other)))
            now_page += 1
            pubMap.extend(map)
    return pubMap

map = getAllGroup()
print("前缀: {} {}\n后缀: {} {}\n词根: {} {}\n其他: {} {}".format(
    len(map.prefix), map.prefix, len(map.suffix), map.suffix, len(map.root), map.root, len(map.other), map.other))

前缀: 70 ['-i', '-ial', '-ian', '-ibility', '-ible', '-ject', '-jecting', '-jected', '-jection', '-jector', '-jectory', '-kin', '-kinesia', '-kinesis', '-kinetic', '-kinesias', '-kineses', '-kinetical', '-kinetically', '-kleptic', '-clepty', '-cleptic', '-nomy', '-quirement', '-quirable', '-quisition', '-quisitive', '-ual', '-ular', '-ule', '-ulous', '-um', '-uous', '-ure', '-ular', '-ule', '-ole', '-le', '-ulous', '-ulously', '-ulus', '-olus', '-ulum', '-ola', '-um', '-uncle', '-ward', '-wards', '-ways', '-wise', '-xenic', '-xenism', '-xenist', '-xenous', '-xeny', '-y', '-yer', '-y', '-zoic', '-zoid', '-zoite', '-zoal', '-zonal', '-zooid', '-zoon', '-zoa', '-zoan', '-zygous', '-zyme', '-zymic']
后缀: 385 ['a-', 'ab-', 'abs-', 'ac-', 'ad-', 'af-', 'ag-', 'al-', 'all-', 'am-', 'amph-', 'amphi-', 'an-', 'ana-', 'ance-', 'ante-', 'ant-', 'anti-', 'ap-', 'aph-', 'apo-', 'ar-', 'as-', 'at-', 'auto-', 'be-', 'bene-', 'beni-', 'bi-', 'by-', 'bene-', 'calli-', 'cat-', 'cata-', 'cath-', 'chili-', '