In [1]:
import re

In [2]:
from bs4 import BeautifulSoup

In [3]:
import numpy as np

In [4]:
import pandas as pd

定义两个函数：
- find_phonetic 寻找所有音标
- find_speech 寻找所有词性

In [5]:
def find_phonetic(data):
    """Collect the phonetic transcriptions found in an HTML fragment.

    Parses ``data`` with the lxml parser and gathers the text of every
    ``<font>`` tag whose ``color`` attribute is ``darkslategray``.

    Args:
        data: HTML markup as a string.

    Returns:
        The text of the matched tags joined with newlines; an empty string
        when no tag matches.
    """
    soup = BeautifulSoup(data, "lxml")
    phonetic_tags = soup.find_all('font', attrs={'color': 'darkslategray'})
    return '\n'.join(tag.get_text() for tag in phonetic_tags)

In [6]:
def find_speech(data):
    """Collect the part-of-speech labels found in an HTML fragment.

    Parses ``data`` with the lxml parser and gathers the text of every
    ``<span>`` tag whose ``style`` attribute is exactly the white-on-dark-green,
    xx-small style string used below.

    Args:
        data: HTML markup as a string.

    Returns:
        The text of the matched tags joined with newlines; an empty string
        when no tag matches. The text is returned as-is, without trimming
        whitespace.
    """
    label_style = 'color: #FFFFFF; background-color: #006400; font-size: xx-small'
    soup = BeautifulSoup(data, "lxml")
    label_tags = soup.find_all('span', attrs={'style': label_style})
    return '\n'.join(tag.get_text() for tag in label_tags)

导入从欧路导出的韦伯详细解释 html 文件

In [7]:
# Read the whole HTML export into memory as text.
# The encoding is pinned so the read does not depend on the platform's default
# codec; the entries contain IPA symbols that a non-UTF-8 default would garble
# or reject.
# NOTE(review): assumes the export is UTF-8 encoded — confirm against the file.
with open('top1000_Eudic_WB.html', encoding='utf-8') as file:
    data = file.read()

In [8]:
# Split the export on the row opener and keep one piece per table row in column `all`.
# Pieces 0 and 1 are dropped; per the preview below, the first kept piece is the
# row numbered 1.
# NOTE(review): pieces 0 and 1 are presumably the page preamble and the header row — confirm.
df = pd.DataFrame({'all': data.split('<tr>\n                ')[2:]}) # keep pieces from index 2 onward

In [9]:
# Preview the first five rows of the freshly split table.
df.head()

Unnamed: 0,all
0,"<td class=""export-td"">1</td>\n ..."
1,"<td class=""export-td"">2</td>\n ..."
2,"<td class=""export-td"">3</td>\n ..."
3,"<td class=""export-td"">4</td>\n ..."
4,"<td class=""export-td"">5</td>\n ..."


In [12]:
# Run find_phonetic on each row's HTML and store the result in a new `phonetic` column.
df['phonetic'] = df['all'].map(find_phonetic)

In [13]:
# Preview again to check the new `phonetic` column.
df.head()

Unnamed: 0,all,phonetic
0,"<td class=""export-td"">1</td>\n ...","/ˈeı/\n/ə, ˈeı/\n/ən, ˈæn/"
1,"<td class=""export-td"">2</td>\n ...",/əˈbıləti/
2,"<td class=""export-td"">3</td>\n ...",/ˈeıbəl/
3,"<td class=""export-td"">4</td>\n ...",/əˈbaʊt/
4,"<td class=""export-td"">5</td>\n ...",/əˈbʌv/


In [15]:
# Show the full phonetic string of the row labelled 0; multiple transcriptions are newline-separated.
df.phonetic[0]

'/ˈeı/\n/ə, ˈeı/\n/ən, ˈæn/'

导入一个 top1000词 的 csv 文件做拼接用

In [17]:
# Load the CSV export of the same word list; only its word column is used for the join below.
df_csv = pd.read_csv('top1000_Eudic_WB.csv')

In [18]:
# Rename the two Chinese column headers to `words` and `meaning`.
df_csv = df_csv.rename(columns={'单词': 'words', '解释': 'meaning'})

In [19]:
# Put the `words` column side by side with the parsed HTML rows.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0, so the column is
# selected by label with `.loc` instead.
# concat(axis=1) aligns on the index, so row i of the CSV pairs with row i of `df`.
# NOTE(review): assumes both tables list the same words in the same order — confirm.
df = pd.concat([df_csv.loc[:, ['words']], df], axis=1)

In [20]:
# Preview the table after prepending the `words` column.
df.head()

Unnamed: 0,words,all,phonetic
0,a,"<td class=""export-td"">1</td>\n ...","/ˈeı/\n/ə, ˈeı/\n/ən, ˈæn/"
1,ability,"<td class=""export-td"">2</td>\n ...",/əˈbıləti/
2,able,"<td class=""export-td"">3</td>\n ...",/ˈeıbəl/
3,about,"<td class=""export-td"">4</td>\n ...",/əˈbaʊt/
4,above,"<td class=""export-td"">5</td>\n ...",/əˈbʌv/


In [21]:
# Run find_speech on each row's HTML and store every label it finds in `all_speech`.
df['all_speech'] = df['all'].map(find_speech)

In [22]:
# Look at the first ten rows; row 6 ("according") shows no phonetic or all_speech text.
df.head(10)

Unnamed: 0,words,all,phonetic,all_speech
0,a,"<td class=""export-td"">1</td>\n ...","/ˈeı/\n/ə, ˈeı/\n/ən, ˈæn/",or \n noun \n pl \n or \n or \n or \n[ count...
1,ability,"<td class=""export-td"">2</td>\n ...",/əˈbıləti/,also \n noun suffix \n noun \n pl \n[ count ...
2,able,"<td class=""export-td"">3</td>\n ...",/ˈeıbəl/,also \n adj suffix \n adj
3,about,"<td class=""export-td"">4</td>\n ...",/əˈbaʊt/,adv \n prep \n adj
4,above,"<td class=""export-td"">5</td>\n ...",/əˈbʌv/,adv \n prep \n chiefly Brit \n adj
5,accept,"<td class=""export-td"">6</td>\n ...",/ıkˈsɛpt/\n/ıkˈsɛptɚ/,verb \n[ + obj ]\n[ no obj ]\n[ + obj ]...
6,according,"<td class=""export-td"">7</td>\n ...",,
7,across,"<td class=""export-td"">8</td>\n ...",/əˈkʰrɑːs/,prep \n adv
8,act,"<td class=""export-td"">9</td>\n ...",/ˈækt/,noun \n pl \n[ count ]\n[ count ]\n[ cou...
9,action,"<td class=""export-td"">10</td>\n ...",/ˈækʃən/,noun \n pl \n[ count ]\n[ noncount ]\n[ ...


导入一个 8 大词性文件，便于从 all_speech 列的全部词性里面提取有价值的词性

In [25]:
# Load the part-of-speech lookup table (columns `speech` and `count`, per the output below).
df_eight_speech = pd.read_csv('eight_speech.txt')

In [26]:
# Display the whole lookup table: eight rows, one per part-of-speech label.
df_eight_speech

Unnamed: 0,speech,count
0,noun,802
1,verb,435
2,adj,363
3,adv,229
4,pronoun,71
5,prep,51
6,conj,31
7,interj,9


定义一个函数求以下两个 array 的交集
- df.all_speech 中每一行按换行符拆分得到的词性列表
- df_eight_speech.speech.values（8 大词性）

In [35]:
def find_intersection(values, speech_labels=None):
    """Keep only the recognised part-of-speech labels from a newline-joined string.

    Args:
        values: Newline-separated labels, as produced by ``find_speech``.
        speech_labels: Optional array-like of labels to keep. When omitted,
            the ``speech`` column of the notebook-level ``df_eight_speech``
            table is used, which is what the one-argument call has always done.

    Returns:
        The labels present in both inputs, joined with newlines. Because
        ``np.intersect1d`` is used, the result is de-duplicated and sorted
        rather than kept in order of appearance. Matching is exact: labels
        are compared as-is, without trimming whitespace.
    """
    if speech_labels is None:
        # Default to the global lookup table so `Series.map(find_intersection)` keeps working.
        speech_labels = df_eight_speech.speech.values
    candidates = np.asarray(values.split('\n'))
    common = np.intersect1d(candidates, speech_labels)
    return '\n'.join(common.tolist())

In [36]:
# Reduce each row's `all_speech` labels to those listed in the lookup table.
df['speech'] = df['all_speech'].map(find_intersection)

In [41]:
# Preview the new `speech` column next to the unfiltered `all_speech`.
df.head()

Unnamed: 0,words,all,phonetic,all_speech,speech
0,a,"<td class=""export-td"">1</td>\n ...","/ˈeı/\n/ə, ˈeı/\n/ən, ˈæn/",or \n noun \n pl \n or \n or \n or \n[ count...,noun
1,ability,"<td class=""export-td"">2</td>\n ...",/əˈbıləti/,also \n noun suffix \n noun \n pl \n[ count ...,noun
2,able,"<td class=""export-td"">3</td>\n ...",/ˈeıbəl/,also \n adj suffix \n adj,adj
3,about,"<td class=""export-td"">4</td>\n ...",/əˈbaʊt/,adv \n prep \n adj,adj \n adv \n prep
4,above,"<td class=""export-td"">5</td>\n ...",/əˈbʌv/,adv \n prep \n chiefly Brit \n adj,adj \n adv \n prep


导入一个 coca 的文件，方便合并排序

In [42]:
# Load the COCA table; per the merged preview below it supplies `rank`, `words` and `coca_speech`.
df_coca = pd.read_csv('top1000_coca.csv')

In [44]:
# Left-join the parsed dictionary data onto the COCA table by word, so every
# COCA row is kept and words without a dictionary row get missing values.
# Not idempotent: `df` is overwritten, so run this cell once per kernel session.
df = df_coca.merge(df, on='words', how='left')

In [45]:
# Preview the merged table; e.g. "the" has an empty `speech` because
# "definite article" is not one of the eight labels in the lookup table.
df.head()

Unnamed: 0,rank,words,coca_speech,all,phonetic,all_speech,speech
0,1,the,a,"<td class=""export-td"">786</td>\n ...","/ðə before consonant sounds, ði before vowel s...",definite article,
1,2,be,v,"<td class=""export-td"">72</td>\n ...","/ˈbiː/\n/ˈæm, əm/\n/ˈɑɚ, ɚ/\n/ˈız, əz/\n/ˈwəz/...",verb \n present first singular \n second sing...,verb
2,3,and,c,"<td class=""export-td"">37</td>\n ...","/ˈænd, ənd, ən/",conj,conj
3,4,of,i,"<td class=""export-td"">518</td>\n ...","/ˈʌv, əv, Brit ˈɒv, əv, ə/",prep \n US \n informal \n abbr,prep
4,5,a,a,"<td class=""export-td"">1</td>\n ...","/ˈeı/\n/ə, ˈeı/\n/ən, ˈæn/",or \n noun \n pl \n or \n or \n or \n[ count...,noun


In [46]:
# List the column names before renaming `speech`.
df.columns

Index(['rank', 'words', 'coca_speech', 'all', 'phonetic', 'all_speech',
       'speech'],
      dtype='object')

In [47]:
# Rename `speech` to `WB_speech` so it reads as the counterpart of `coca_speech`.
df = df.rename(columns={'speech': 'WB_speech'})

In [49]:
# Write the five result columns to CSV, without the index.
df[['rank', 'words', 'phonetic', 'coca_speech', 'WB_speech']].to_csv('top1000_phonetic.csv', index=False)

In [50]:
# Write the same five result columns to an Excel workbook, without the index.
df[['rank', 'words', 'phonetic', 'coca_speech', 'WB_speech']].to_excel('top1000_phonetic.xlsx', index=False)