# 下载数据

In [1]:
# Fetch the dataset repository into ./neural-chinese-address-parsing (requires git and network access).
!git clone https://github.com/leodotnet/neural-chinese-address-parsing

Cloning into 'neural-chinese-address-parsing'...
remote: Enumerating objects: 87, done.[K
remote: Total 87 (delta 0), reused 0 (delta 0), pack-reused 87[K
Unpacking objects: 100% (87/87), 4.52 MiB | 12.00 KiB/s, done.


In [2]:
# Preview the first lines of every .txt file in the cloned data directory.
!head neural-chinese-address-parsing/data/*.txt

==> neural-chinese-address-parsing/data/dev.txt <==
宁 B-city
波 I-city
市 I-city
江 B-district
东 I-district
区 I-district
金 B-road
家 I-road
一 I-road
路 I-road

==> neural-chinese-address-parsing/data/labels.txt <==
country
prov
city
district
devzone
town
community
road
subroad
roadno

==> neural-chinese-address-parsing/data/test.txt <==
龙 B-town
港 I-town
镇 I-town
泰 B-poi
和 I-poi
小 I-poi
区 I-poi
B B-houseno
懂 I-houseno
1097 B-roomno

==> neural-chinese-address-parsing/data/train.txt <==
龙 B-town
山 I-town
镇 I-town
慈 B-community
东 I-community
滨 B-redundant
海 I-redundant
区 I-redundant
海 B-road
丰 I-road


In [3]:
def get_data_list(fn):
    """Read a token-per-line annotation file into a list of samples.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the second as its label. A blank line ends the current
    sample.

    Args:
        fn: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        A list with one ``[tokens, labels]`` pair per sample, where both
        members are equally long lists of strings.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    data_list = []  # all samples read so far
    token, label = [], []  # token / label sequences of the sample being read
    # Decode explicitly as UTF-8: the data is Chinese text and the platform
    # default encoding is not UTF-8 everywhere.
    with open(fn, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:  # a blank line means the current sample is complete
                data_list.append([token, label])
                token, label = [], []
                continue
            token.append(fields[0])
            label.append(fields[1])
    # Keep the last sample even when the file lacks the closing blank line.
    if token:
        data_list.append([token, label])
    return data_list

In [4]:
import collections

# Parse the three dataset splits.
train_data = get_data_list('neural-chinese-address-parsing/data/train.txt')
dev_data = get_data_list('neural-chinese-address-parsing/data/dev.txt')
test_data = get_data_list('neural-chinese-address-parsing/data/test.txt')

# Tally how often each label occurs across all splits (d[1] is a sample's label list).
label_counter = collections.Counter()
all_cnt = 0
for d in train_data + dev_data + test_data:
    label_counter.update(d[1])
    all_cnt += len(d[1])
print(len(label_counter))

# Report the labels from most to least frequent, with their share of all labels.
label_list = sorted(label_counter.items(), key=lambda x: x[1], reverse=True)
for label, cnt in label_list:
    print('%12s  %5d  %4.2f %%' % (label, cnt, cnt / all_cnt * 100))

46
       I-poi  40691  16.29 %
      I-road  21570  8.64 %
  I-district  19177  7.68 %
      I-town  15232  6.10 %
      I-city  14823  5.94 %
      I-prov  12303  4.93 %
       B-poi  10527  4.22 %
  B-district   9829  3.94 %
      B-road   8999  3.60 %
      B-city   8099  3.24 %
    B-roadno   7152  2.86 %
    I-subpoi   6668  2.67 %
    I-roadno   6607  2.65 %
      B-town   6580  2.63 %
      B-prov   6376  2.55 %
 B-redundant   5862  2.35 %
 I-redundant   5191  2.08 %
   B-houseno   4914  1.97 %
 I-community   4509  1.81 %
    B-roomno   4409  1.77 %
   I-houseno   4324  1.73 %
    I-person   3388  1.36 %
   I-devZone   2487  1.00 %
    B-subpoi   2383  0.95 %
 B-community   2110  0.84 %
    I-cellno   2094  0.84 %
    B-cellno   1880  0.75 %
   I-floorno   1810  0.72 %
   B-floorno   1796  0.72 %
    I-roomno   1736  0.70 %
    B-assist   1165  0.47 %
    B-person   1065  0.43 %
    I-assist   1058  0.42 %
   I-subRoad   1045  0.42 %
   B-subRoad    569  0.23 %
   B-devZone    

In [5]:
# Running count of B- tags that get_data_list rewrites to I- tags.
mod_cnt = 0

# Entity types split into five groups. Only T4 is used in this cell (as
# remove_labels). NOTE(review): the meaning of T0-T3 is not visible here.
T0 = ['redundant']
T1 = ['town', 'poi', 'assist']
T2 = ['houseno', 'city', 'district', 'road', 'roadno', 'subpoi', 'subRoad', 'person']
T3 = ['prov']
T4 = ['roomno', 'cellno', 'community', 'devZone', 'subroadno', 'floorno', 'country', 'otherinfo']

# Original label inventory, including both casings of the subroad labels.
olabels = [
    'B-assist', 'I-assist',
    'B-cellno', 'I-cellno',
    'B-city', 'I-city',
    'B-community', 'I-community',
    'B-country', 'I-country',
    'B-devZone', 'I-devZone',
    'B-district', 'I-district',
    'B-floorno', 'I-floorno',
    'B-houseno', 'I-houseno',
    'B-otherinfo', 'I-otherinfo',
    'B-person', 'I-person',
    'B-poi', 'I-poi',
    'B-prov', 'I-prov',
    'B-redundant', 'I-redundant',
    'B-road', 'I-road',
    'B-roadno', 'I-roadno',
    'B-roomno', 'I-roomno',
    'B-subRoad', 'I-subRoad',
    'B-subRoadno', 'I-subRoadno',
    'B-subpoi', 'I-subpoi',
    'B-subroad', 'I-subroad',
    'B-subroadno', 'I-subroadno',
    'B-town', 'I-town',
]
olabels2id = {name: idx for idx, name in enumerate(olabels)}

# Reduced label set; a label's position in this list is its id.
# The types listed in T4 appear here with their I- tag only.
labels = [
    'B-prov', 'I-prov',
    'B-city', 'I-city',
    'B-district', 'I-district',
    'B-town', 'I-town',
    'I-community',
    'B-road', 'I-road',
    'B-roadno', 'I-roadno',
    'B-poi', 'I-poi',
    'B-houseno', 'I-houseno',
    'I-cellno',
    'I-floorno',
    'I-roomno',
    'B-assist', 'I-assist',
    'I-country',
    'I-devZone',
    'I-otherinfo',
    'B-person', 'I-person',
    'B-redundant', 'I-redundant',
    'B-subpoi', 'I-subpoi',
    'B-subroad', 'I-subroad',
    'I-subroadno',
]
num_labels = len(labels)
print(num_labels)

label2id = {name: idx for idx, name in enumerate(labels)}
print(label2id)

# Types whose B- tag is folded into the I- tag when the data is loaded.
remove_labels = T4
def get_data_list(fn):
    """Read an annotation file into per-character samples with label ids.

    Redefines the earlier ``get_data_list`` of this notebook. Every non-blank
    line is split on whitespace; the first field is the token text and the
    second its label. A blank line ends the current sample.

    Per line:
      * the ``subRoad`` / ``subRoadno`` labels are rewritten to their
        lower-case spelling;
      * a ``B-`` label whose type is in the module-level ``remove_labels`` is
        replaced by the matching ``I-`` label, and the module-level
        ``mod_cnt`` is incremented;
      * the token text is split into characters: the first character keeps
        the line's label, every following character gets the ``I-`` label of
        the same type.

    Args:
        fn: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        A list with one ``[token, label, origin_label, origin_token]`` entry
        per sample:
          * ``token``: the characters of the sample;
          * ``label``: ``label2id`` ids, one per character;
          * ``origin_label``: label strings, one per character, after the
            casing fix but before the ``B-`` to ``I-`` replacement;
          * ``origin_token``: the unsplit first field of every line, so it is
            shorter than the other three lists whenever a line holds a
            multi-character token.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        KeyError: If a resulting label is missing from ``label2id``.
    """
    global mod_cnt
    # Casing fixes applied to the raw label before anything else.
    case_fixes = {
        'B-subRoadno': 'B-subroadno',
        'I-subRoadno': 'I-subroadno',
        'B-subRoad': 'B-subroad',
        'I-subRoad': 'I-subroad',
    }
    data_list = []
    origin_token, token, label, origin_label = [], [], [], []
    # Decode explicitly as UTF-8: the data is Chinese text and the platform
    # default encoding is not UTF-8 everywhere.
    with open(fn, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:  # a blank line means the current sample is complete
                data_list.append([token, label, origin_label, origin_token])
                origin_token, token, label, origin_label = [], [], [], []
                continue
            text = fields[0]
            raw_label = case_fixes.get(fields[1], fields[1])

            # Drop the B- tag for the types listed in remove_labels.
            train_label = raw_label
            if raw_label[0] == 'B' and raw_label[2:] in remove_labels:
                train_label = 'I' + raw_label[1:]
                mod_cnt += 1

            # One entry per character; a single-character token only ever
            # takes the i == 0 branch.
            for i, ch in enumerate(text):
                token.append(ch)
                if i == 0:
                    label.append(label2id[train_label])
                    origin_label.append(raw_label)
                else:
                    label.append(label2id['I' + train_label[1:]])
                    origin_label.append('I' + raw_label[1:])
            origin_token.append(text)
    # Keep the last sample even when the file lacks the closing blank line.
    if token:
        data_list.append([token, label, origin_label, origin_token])
    return data_list

34
{'B-prov': 0, 'I-prov': 1, 'B-city': 2, 'I-city': 3, 'B-district': 4, 'I-district': 5, 'B-town': 6, 'I-town': 7, 'I-community': 8, 'B-road': 9, 'I-road': 10, 'B-roadno': 11, 'I-roadno': 12, 'B-poi': 13, 'I-poi': 14, 'B-houseno': 15, 'I-houseno': 16, 'I-cellno': 17, 'I-floorno': 18, 'I-roomno': 19, 'B-assist': 20, 'I-assist': 21, 'I-country': 22, 'I-devZone': 23, 'I-otherinfo': 24, 'B-person': 25, 'I-person': 26, 'B-redundant': 27, 'I-redundant': 28, 'B-subpoi': 29, 'I-subpoi': 30, 'B-subroad': 31, 'I-subroad': 32, 'I-subroadno': 33}


In [6]:
import collections

# Re-parse the three splits with the redefined get_data_list.
train_data = get_data_list('neural-chinese-address-parsing/data/train.txt')
dev_data = get_data_list('neural-chinese-address-parsing/data/dev.txt')
test_data = get_data_list('neural-chinese-address-parsing/data/test.txt')

# Tally how often each label id occurs across all splits (d[1] is a sample's label-id list).
label_counter = collections.Counter()
all_cnt = 0
for d in train_data + dev_data + test_data:
    label_counter.update(d[1])
    all_cnt += len(d[1])
print(len(label_counter))

# Report the label ids from most to least frequent, with their share of all labels.
label_list = sorted(label_counter.items(), key=lambda x: x[1], reverse=True)
for label, cnt in label_list:
    print('%12s  %5d  %4.2f %%' % (label, cnt, cnt / all_cnt * 100))

34
          14  40899  14.45 %
          10  21687  7.66 %
          12  21272  7.51 %
           5  19177  6.77 %
          19  17511  6.19 %
           7  15232  5.38 %
           3  14823  5.24 %
           1  12303  4.35 %
          13  10527  3.72 %
           4   9829  3.47 %
           9   8999  3.18 %
          16   8413  2.97 %
           2   8099  2.86 %
          11   7152  2.53 %
          30   6750  2.38 %
           8   6623  2.34 %
           6   6580  2.32 %
           0   6376  2.25 %
          27   5862  2.07 %
          28   5623  1.99 %
          15   4914  1.74 %
          17   4649  1.64 %
          18   4356  1.54 %
          26   3616  1.28 %
          23   3042  1.07 %
          29   2383  0.84 %
          32   1266  0.45 %
          20   1165  0.41 %
          21   1131  0.40 %
          25   1065  0.38 %
          33   1038  0.37 %
          31    572  0.20 %
          22    141  0.05 %
          24     16  0.01 %


In [7]:
# Load the embedding file: on each line the first space-separated field is the
# key and the remaining fields are parsed as floats.
word_embedding_file = 'neural-chinese-address-parsing/data/giga.vec100'
word2vec = {}
# Decode explicitly as UTF-8 rather than relying on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded, like the dataset files shown
# earlier in this notebook - confirm.
with open(word_embedding_file, encoding='utf-8') as ff:
    for line in ff:
        fields = line.strip().split(' ')
        word2vec[fields[0]] = [float(x) for x in fields[1:]]
print(len(word2vec))

6082
