In [1]:
import os
import json
import random
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_dataset_dir = "dataset"
new_dataset_dir = "data"

In [3]:
if not os.path.exists(new_dataset_dir):
    os.mkdir(new_dataset_dir)

In [4]:
file_list1 = os.listdir(raw_dataset_dir)
file_list1 = [file for file in file_list1 if file.endswith(".json")]

In [5]:
result_datas = []
entity_set = set()
intent_list = []
for file in file_list1:
    file_path = os.path.join(raw_dataset_dir, file)
    # 获取意图
    intent = os.path.splitext(file)[0].upper()
    intent_list.append(intent)
    # 读取文件
    with open(file_path, "rt", encoding="utf-8") as f:
        data_list = json.load(f)
    for data in data_list:
        text = data["text"]
        entity_dict = data.get("entity", {})
        entity = []
        if len(entity_dict) > 0:
            entity_type = intent + "/" + entity_dict.get("type")
            entity_start = entity_dict.get("start")
            entity_end = entity_dict.get("end")
            entity.append([entity_start, entity_end, entity_type])
            # 记录一下实体总类
            entity_set.add(entity_type)
        temp_data = {"text": text, "intent": intent, "entity": entity}
        result_datas.append(temp_data)

In [6]:
print("数据总数：", len(result_datas), "意图总数：", len(intent_list), "实体总数：", len(entity_set))

数据总数： 3212 意图总数： 11 实体总数： 19


In [7]:
intent_list

['VOLUME',
 'POWER_SAVING_MODE',
 'VIDEO_CHAT',
 'BATTERY',
 'TASK_MANAGER',
 'CAMERA',
 'SCREENSHOT',
 'BRIGHTNESS',
 'AIRPLANE_MODE',
 'SYSTEM_INFO',
 'CALCULATOR']

In [8]:
entity_list = list(entity_set)
entity_list.sort()
entity_list

['AIRPLANE_MODE/Off',
 'AIRPLANE_MODE/On',
 'BRIGHTNESS/Add',
 'BRIGHTNESS/Sub',
 'BRIGHTNESS/To',
 'CALCULATOR/Off',
 'CALCULATOR/On',
 'CAMERA/Off',
 'CAMERA/On',
 'POWER_SAVING_MODE/Off',
 'POWER_SAVING_MODE/On',
 'TASK_MANAGER/Off',
 'TASK_MANAGER/On',
 'VIDEO_CHAT/Off',
 'VIDEO_CHAT/On',
 'VOLUME/Add',
 'VOLUME/Close',
 'VOLUME/Sub',
 'VOLUME/To']

### 分析Token长度分布

In [9]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
seq_len_list = []
for data in result_datas:
    token_dict = tokenizer(data["text"])
    input_ids = token_dict["input_ids"]
    seq_len = len(input_ids)
    seq_len_list.append(seq_len)

In [10]:
# seq_len_list

In [11]:
df1 = pd.DataFrame({"seq_len": seq_len_list})

In [12]:
df1.describe()

Unnamed: 0,seq_len
count,3212.0
mean,12.089041
std,2.883848
min,6.0
25%,10.0
50%,12.0
75%,14.0
max,23.0


In [13]:
# 最大输入长度可以定128
max_seq_len = 128

### 按类别/实体类型做切分，暂定9:1

In [14]:
from collections import defaultdict
label_datas = defaultdict(list)

In [15]:
for data in result_datas:
    intent = data["intent"]
    entity_data = data.get("entity", [])
    # 优先对实体类别做切分
    if len(entity_data) > 0:
        entity_type = entity_data[0][2]
        label_datas[entity_type].append(data)
    else:
        label_datas[intent].append(data)

In [16]:
len(label_datas.keys())

22

In [17]:
train_datasets = []
valid_datasets = []
for (key, values) in label_datas.items():
    data_size = len(values)
    train_size = int(data_size * 0.9)
    train_indices = np.random.choice(list(range(data_size)), size=train_size, replace=False)
    train_datas = np.take(values, train_indices)
    valid_indices = list(set(range(data_size)) - set(train_indices))
    valid_datas = np.take(values, valid_indices)
    train_datasets.extend(train_datas)
    valid_datasets.extend(valid_datas)

In [18]:
print(f"训练集数据为：{len(train_datasets)}条，测试级数据为：{len(valid_datasets)}条")

训练集数据为：2881条，测试级数据为：331条


### 保存结果

In [19]:
train_path = os.path.join("data", "train.json")
valid_path = os.path.join("data", "valid.json")
intent_label_path = os.path.join("data", "intent_label.json")
entity_label_path = os.path.join("data", "entity_label.json")

In [20]:
intent2id = {intent: i for i, intent in enumerate(intent_list)}
id2intent = {str(i): intent for i, intent in enumerate(intent_list)}
intent_dict = {"intent2id": intent2id, "id2intent": id2intent}

In [21]:
entity2id = {entity: i for i, entity in enumerate(entity_list)}
id2entity = {str(i): entity for i, entity in enumerate(entity_list)}
entity_dict = {"entity2id": entity2id, "id2entity": id2entity}

In [22]:
with open(train_path, "wt", encoding="utf-8") as f1:
    json.dump(train_datasets, f1, indent=4, ensure_ascii=False)
with open(valid_path, "wt", encoding="utf-8") as f2:
    json.dump(valid_datasets, f2, indent=4, ensure_ascii=False)
with open(intent_label_path, "wt", encoding="utf-8") as f3:
    json.dump(intent_dict, f3, indent=4, ensure_ascii=False)
with open(entity_label_path, "wt", encoding="utf-8") as f4:
    json.dump(entity_dict, f4, indent=4, ensure_ascii=False)