# 将数据处理为W2NER可以接受的格式

## 处理CLUENER数据集

一个样例的转换

In [3]:
def convert_cluener_to_w2nerformat(sample):
    """
    Convert one CLUENER-format sample into the W2NER format.

    Args:
        sample (dict): CLUENER sample holding a 'text' string and a 'label'
            mapping shaped like
            ``{entity_type: {entity_text: [[start, end], ...]}}``.
            ``end`` is treated as inclusive.

    Returns:
        dict: ``{'sentence': [char, ...], 'ner': [{'index': [...], 'type': ...}, ...]}``
        where 'index' lists every character position covered by the entity.

    Raises:
        KeyError: if 'text' or 'label' is missing from ``sample``.
    """
    text = sample['text']
    label = sample['label']

    ners = []
    for entity_type, mentions in label.items():
        for spans in mentions.values():
            # The same entity string can occur several times in one text, so
            # emit one W2NER entity per span instead of only the first span.
            for span in spans:
                start_index = int(span[0])
                end_index = int(span[-1])
                ners.append({'index': list(range(start_index, end_index + 1)),
                             'type': entity_type})

    # W2NER expects the sentence as a list of tokens; here one token per character.
    return {'sentence': list(text), 'ner': ners}

In [4]:
# Hand-written CLUENER sample: two 'name' entities and one 'position' entity.
s = {
    "text": "方传柳实习生王梦菲",
    "label": {
        "name": {"方传柳": [[0, 2]], "王梦菲": [[6, 8]]},
        "position": {"实习生": [[3, 5]]},
    },
}
s

{'text': '方传柳实习生王梦菲',
 'label': {'name': {'方传柳': [[0, 2]], '王梦菲': [[6, 8]]},
  'position': {'实习生': [[3, 5]]}}}

In [5]:
# Run the single-sample converter on the sample `s` defined in the previous cell.
convert_cluener_to_w2nerformat(s)

{'sentence': ['方', '传', '柳', '实', '习', '生', '王', '梦', '菲'],
 'ner': [{'index': [0, 1, 2], 'type': 'name'},
  {'index': [6, 7, 8], 'type': 'name'},
  {'index': [3, 4, 5], 'type': 'position'}]}

整个数据集的转换

In [6]:
def convert_cluener_to_w2nerformat_file(cluener_file_path, w2ner_file_path):
    """
    Convert a labelled CLUENER file into a W2NER-format file.

    The input is read as JSON Lines (one JSON object per line); the output is
    written as a single JSON array.

    Args:
        cluener_file_path (str): path of the CLUENER-format input file.
        w2ner_file_path (str): path the converted W2NER data is written to.

    Returns:
        list: the converted samples, in input order, as produced by
        ``convert_cluener_to_w2nerformat``.
    """

    import json

    data = []
    with open(cluener_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. an extra newline at end of file) rather
            # than letting json.loads fail on an empty string.
            if line:
                data.append(json.loads(line))

    w2ner_data = [convert_cluener_to_w2nerformat(sample) for sample in data]

    with open(w2ner_file_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Chinese characters readable in the file.
        json.dump(w2ner_data, f, ensure_ascii=False)

    return w2ner_data

In [11]:
# Try the labelled-data converter on the test split; the recorded result of this cell is KeyError: 'label'.
convert_cluener_to_w2nerformat_file(cluener_file_path="data/cluener_public/test.json", w2ner_file_path="data/cluener/test.json")

KeyError: 'label'

测试集没有 `label` 字段（因此上面的调用会抛出 `KeyError: 'label'`），需要单独处理：只把文本切分成字符，`ner` 置为空列表。

In [12]:
def convert_test_cluener_to_w2nerformat_file(test_cluener_file_path, test_w2ner_file_path):
    """
    Convert an unlabelled CLUENER test file into a W2NER-format file.

    The input is read as JSON Lines (one JSON object per line); only the
    'text' field of each object is used. The output is written as a single
    JSON array in which every sample has an empty 'ner' list.

    Args:
        test_cluener_file_path (str): path of the CLUENER-format test file.
        test_w2ner_file_path (str): path the converted W2NER data is written to.

    Returns:
        list: one ``{'sentence': [char, ...], 'ner': []}`` dict per input line,
        in input order.

    Raises:
        KeyError: if an input object has no 'text' field.
    """

    import json

    w2ner_data = []
    with open(test_cluener_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. an extra newline at end of file) rather
            # than letting json.loads fail on an empty string.
            if not line:
                continue
            sample = json.loads(line)
            # One token per character; no gold entities exist for the test split.
            w2ner_data.append({'sentence': list(sample['text']), 'ner': []})

    with open(test_w2ner_file_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Chinese characters readable in the file.
        json.dump(w2ner_data, f, ensure_ascii=False)

    return w2ner_data

In [13]:
# Convert the unlabelled test split; the function returns the whole converted list, which this cell echoes as its output.
convert_test_cluener_to_w2nerformat_file(test_cluener_file_path="data/cluener_public/test.json", test_w2ner_file_path="data/cluener/test.json")

[{'sentence': ['四',
   '川',
   '敦',
   '煌',
   '学',
   '”',
   '。',
   '近',
   '年',
   '来',
   '，',
   '丹',
   '棱',
   '县',
   '等',
   '地',
   '一',
   '些',
   '不',
   '知',
   '名',
   '的',
   '石',
   '窟',
   '迎',
   '来',
   '了',
   '海',
   '内',
   '外',
   '的',
   '游',
   '客',
   '，',
   '他',
   '们',
   '随',
   '身',
   '携',
   '带',
   '着',
   '胡',
   '文',
   '和',
   '的',
   '著',
   '作',
   '。'],
  'ner': []},
 {'sentence': ['尼',
   '日',
   '利',
   '亚',
   '海',
   '军',
   '发',
   '言',
   '人',
   '当',
   '天',
   '在',
   '阿',
   '布',
   '贾',
   '向',
   '尼',
   '日',
   '利',
   '亚',
   '通',
   '讯',
   '社',
   '证',
   '实',
   '了',
   '这',
   '一',
   '消',
   '息',
   '。'],
  'ner': []},
 {'sentence': ['销',
   '售',
   '冠',
   '军',
   '：',
   '辐',
   '射',
   '3',
   '-',
   'B',
   'e',
   't',
   'h',
   'e',
   's',
   'd',
   'a'],
  'ner': []},
 {'sentence': ['所',
   '以',
   '大',
   '多',
   '数',
   '人',
   '都',
   '是',
   '从',
   '巴',
   '厘',
   '岛',
   '南',
   '部',
   '开',
   '始',
   '环',
   