In [1]:
import re
import json

In [2]:
with open('./addr_raw.json') as fp:
    addr_raw = json.load(fp)

In [3]:
def clean(s):
    s = re.sub(r'[，。、；]+$', '', s)
    s = s.replace('（住宅）', '')
    s = s.replace('（宿舍）', '')
    s = s.strip()

    # 分割一些地址的简写
    if '、' in s:
        # 嘉定区江桥镇五四村、红光村、增建村
        if '镇' in s:
            pos = s.index('镇') + 1
            parent_addr = s[:pos]
            others = s[pos:].split('、')
            return [
                parent_addr + o
                for o in others
            ]
        # 嘉定区嘉定工业区草庵村、灯塔村、旺泾村、陆渡村
        if '工业区' in s:
            pos = s.index('工业区') + 3
            parent_addr = s[:pos]
            others = s[pos:].split('、')
            return [
                parent_addr + o
                for o in others
            ]
        # 嘉定区真新街道丰庄三队、新郁路欣会公寓
        if '真新街道' in s:
            pos = s.index('真新街道') + 4
            parent_addr = s[:pos]
            others = s[pos:].split('、')
            return [
                parent_addr + o
                for o in others
            ]
        # 宝山区月浦六村、恒高路128弄
        if s == '宝山区月浦六村、恒高路128弄':
            return [
                '宝山区月浦六村',
                '宝山区恒高路128弄'
            ]
        
    return [s]

In [4]:
assert clean("徐汇区蒲汇塘路50号，") == ["徐汇区蒲汇塘路50号"]
assert clean("长宁区虹桥路961弄（住宅）") == ["长宁区虹桥路961弄"]
assert clean('嘉定区江桥镇五四村、红光村、增建村') == ['嘉定区江桥镇五四村', '嘉定区江桥镇红光村', '嘉定区江桥镇增建村']
assert clean('嘉定区嘉定工业区草庵村、灯塔村、旺泾村、陆渡村') == [
    '嘉定区嘉定工业区草庵村', '嘉定区嘉定工业区灯塔村', '嘉定区嘉定工业区旺泾村', '嘉定区嘉定工业区陆渡村']
assert clean('嘉定区真新街道丰庄三队、新郁路欣会公寓') == ['嘉定区真新街道丰庄三队', '嘉定区真新街道新郁路欣会公寓']

In [5]:
addr = []
for item in addr_raw:
    new_obj = {
        'date': item['date'],
        'addr': []
    }
    new_obj['addr'] = []
    for a in item['addr_raw']:
        new_obj['addr'] += clean(a)
    
    new_obj['addr'] = sorted(
        list(set(new_obj['addr'])),
        key=lambda x: -len(x)
    )

    addr.append(new_obj)

In [6]:
with open('./addr.json', 'w') as fp:
    json.dump(addr, fp, indent=4, ensure_ascii=False)

In [7]:
addr_flatten = []
for x in addr:
    addr_flatten += x['addr']

In [8]:
print(len(addr_flatten))

155244


In [9]:
addr_flatten = list(set(addr_flatten))
print(len(addr_flatten))

36635


In [10]:
addr_flatten = sorted(addr_flatten, key=lambda x: -len(x))

In [11]:
print('\n'.join(addr_flatten[:5]))

浦东新区白莲泾路与浦东南路交界口工地生活区
闵行区江川路街道剑川路综合服务中心工地宿舍
浦东新区东大公路老芦公路交界口工地生活区
浦东新区御桥路290-292号工地生活区
浦东新区金银花路凌霄花路交界口工地生活区


In [12]:
print('\n'.join(addr_flatten[-5:]))

奉贤区奉城村
闵行区光辉村
青浦区泖阳路
青浦区众舟
金山区油车
