In [1]:
import json

In [7]:
# 长度为300的列表，切分训练集和验证集，0.8为训练集，0.2为验证集，json文件格式，同时切成json文件保存
with open('/mnt/cfs/huangzhiwei/BAE2025/data/mrbench_v3_devset.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)
    
# 切分训练集和验证集
train_data = datas[:int(len(datas)*0.8)]
valid_data = datas[int(len(datas)*0.8):]

# 保存训练集和验证集
with open('/mnt/cfs/huangzhiwei/BAE2025/data/train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)
    
with open('/mnt/cfs/huangzhiwei/BAE2025/data/valid.json', 'w', encoding='utf-8') as f:
    json.dump(valid_data, f, ensure_ascii=False, indent=4)

print('Done!')

Done!


In [9]:
import json
import numpy as np
from sklearn.model_selection import KFold

# 读取数据
with open('/mnt/cfs/huangzhiwei/BAE2025/data/mrbench_v3_devset.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)
    
# 创建5折交叉验证
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 获取所有折的索引并保存
all_folds = []
for train_index, valid_index in kf.split(datas):
    all_folds.append({
        "train_indices": train_index.tolist(),
        "valid_indices": valid_index.tolist()
    })

# 保存所有折的索引信息
with open('/mnt/cfs/huangzhiwei/BAE2025/data/fold_indices.json', 'w', encoding='utf-8') as f:
    json.dump(all_folds, f, ensure_ascii=False, indent=4)

# 原有的代码继续执行...
fold = 0
for train_index, valid_index in kf.split(datas):
    # 根据索引获取对应的数据
    train_data = [datas[i] for i in train_index]
    valid_data = [datas[i] for i in valid_index]
    
    # 保存当前折的训练集和验证集
    with open(f'/mnt/cfs/huangzhiwei/BAE2025/data/train_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(train_data, f, ensure_ascii=False, indent=4)
    
    with open(f'/mnt/cfs/huangzhiwei/BAE2025/data/valid_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(valid_data, f, ensure_ascii=False, indent=4)
    
    print(f'Fold {fold} - 训练集大小: {len(train_data)}, 验证集大小: {len(valid_data)}')
    
    # 打印前几个索引，帮助调试
    print(f"Fold {fold} - 训练集前5个索引: {train_index[:5]}")
    print(f"Fold {fold} - 验证集前5个索引: {valid_index[:5]}")
    fold += 1

print('所有5折交叉验证数据已保存完成!')


Fold 0 - 训练集大小: 240, 验证集大小: 60
Fold 0 - 训练集前5个索引: [0 1 2 3 4]
Fold 0 - 验证集前5个索引: [ 5  7  9 17 24]
Fold 1 - 训练集大小: 240, 验证集大小: 60
Fold 1 - 训练集前5个索引: [0 1 2 3 4]
Fold 1 - 验证集前5个索引: [ 6 10 15 16 18]
Fold 2 - 训练集大小: 240, 验证集大小: 60
Fold 2 - 训练集前5个索引: [0 1 3 4 5]
Fold 2 - 验证集前5个索引: [ 2 12 26 28 29]
Fold 3 - 训练集大小: 240, 验证集大小: 60
Fold 3 - 训练集前5个索引: [1 2 3 5 6]
Fold 3 - 验证集前5个索引: [ 0  4  8 11 14]
Fold 4 - 训练集大小: 240, 验证集大小: 60
Fold 4 - 训练集前5个索引: [0 2 4 5 6]
Fold 4 - 验证集前5个索引: [ 1  3 13 20 21]
所有5折交叉验证数据已保存完成!


In [2]:
# 新的数据划分方法，基于conversation_id划分
import json
import random
import numpy as np
from sklearn.model_selection import train_test_split

# 读取数据
with open('/mnt/cfs/huangzhiwei/BAE2025/data/mrbench_v3_devset.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)

# 提取所有唯一的conversation_id
conversation_ids = list(set(data["conversation_id"] for data in datas))

# 设置随机种子以确保结果可复现
random.seed(42)
np.random.seed(42)

# 按照0.8:0.2的比例划分conversation_ids
train_conversation_ids, valid_conversation_ids = train_test_split(
    conversation_ids, test_size=0.2, random_state=42
)

# 根据对话ID筛选数据
train_data = [data for data in datas if data["conversation_id"] in train_conversation_ids]
valid_data = [data for data in datas if data["conversation_id"] in valid_conversation_ids]

# 打印划分信息
print(f'训练集大小: {len(train_data)} 对话, 验证集大小: {len(valid_data)} 对话')
print(f'训练集占比: {len(train_data) / (len(train_data) + len(valid_data)):.2f}')

# 检查类别分布
def count_labels(data_list):
    label_counts = {"Yes": 0, "To some extent": 0, "No": 0}
    total_samples = 0
    
    for data in data_list:
        for model, response_data in data["tutor_responses"].items():
            label = response_data["annotation"]["Providing_Guidance"]
            label_counts[label] += 1
            total_samples += 1
    
    return label_counts, total_samples

train_labels, train_samples = count_labels(train_data)
valid_labels, valid_samples = count_labels(valid_data)

print("\n标签分布:")
print(f"训练集 ({train_samples} 样本):")
for label, count in train_labels.items():
    print(f"  - {label}: {count} ({count/train_samples*100:.2f}%)")

print(f"验证集 ({valid_samples} 样本):")
for label, count in valid_labels.items():
    print(f"  - {label}: {count} ({count/valid_samples*100:.2f}%)")

# 保存划分后的数据
with open('/mnt/cfs/huangzhiwei/BAE2025/data/train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

with open('/mnt/cfs/huangzhiwei/BAE2025/data/valid.json', 'w', encoding='utf-8') as f:
    json.dump(valid_data, f, ensure_ascii=False, indent=4)

print('\n数据已保存!')

训练集大小: 240 对话, 验证集大小: 60 对话
训练集占比: 0.80

标签分布:
训练集 (1975 样本):
  - Yes: 1107 (56.05%)
  - To some extent: 417 (21.11%)
  - No: 451 (22.84%)
验证集 (501 样本):
  - Yes: 300 (59.88%)
  - To some extent: 86 (17.17%)
  - No: 115 (22.95%)

数据已保存!


In [2]:
# 检测一下train.json和valid.json的数据中“conversation_history”长度超过512的数量
with open('/mnt/cfs/huangzhiwei/BAE2025/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/mnt/cfs/huangzhiwei/BAE2025/data/valid.json', 'r', encoding='utf-8') as f:
    valid_data = json.load(f)

train_count = 0
valid_count = 0

for data in train_data:
    if len(data["conversation_history"]) > 512:
        train_count += 1

for data in valid_data:
    if len(data["conversation_history"]) > 512:
        valid_count += 1

print(f'训练集中conversation_history长度超过512的数量: {train_count}')
print(f'验证集中conversation_history长度超过512的数量: {valid_count}')

训练集中conversation_history长度超过512的数量: 185
验证集中conversation_history长度超过512的数量: 39


# 切分新的训练集和测试集（1）

In [3]:
# 新的数据划分方法，基于conversation_id划分
import json
import random
import numpy as np
from sklearn.model_selection import train_test_split

# 读取数据
with open('/mnt/cfs/huangzhiwei/BAE2025/data_extend/extend_1_8+8.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)

# 提取所有唯一的conversation_id
conversation_ids = list(set(data["conversation_id"] for data in datas))

# 设置随机种子以确保结果可复现
random.seed(42)
np.random.seed(42)

# 按照0.8:0.2的比例划分conversation_ids
train_conversation_ids, valid_conversation_ids = train_test_split(
    conversation_ids, test_size=0.2, random_state=42
)

# 根据对话ID筛选数据
train_data = [data for data in datas if data["conversation_id"] in train_conversation_ids]
valid_data = [data for data in datas if data["conversation_id"] in valid_conversation_ids]

# 打印划分信息
print(f'训练集大小: {len(train_data)} 对话, 验证集大小: {len(valid_data)} 对话')
print(f'训练集占比: {len(train_data) / (len(train_data) + len(valid_data)):.2f}')

# 检查类别分布
def count_labels(data_list):
    label_counts = {"Yes": 0, "To some extent": 0, "No": 0}
    total_samples = 0
    
    for data in data_list:
        for model, response_data in data["tutor_responses"].items():
            label = response_data["annotation"]["Providing_Guidance"]
            label_counts[label] += 1
            total_samples += 1
    
    return label_counts, total_samples

train_labels, train_samples = count_labels(train_data)
valid_labels, valid_samples = count_labels(valid_data)

print("\n标签分布:")
print(f"训练集 ({train_samples} 样本):")
for label, count in train_labels.items():
    print(f"  - {label}: {count} ({count/train_samples*100:.2f}%)")

print(f"验证集 ({valid_samples} 样本):")
for label, count in valid_labels.items():
    print(f"  - {label}: {count} ({count/valid_samples*100:.2f}%)")

# 保存划分后的数据
with open('/mnt/cfs/huangzhiwei/BAE2025/data_extend/train_8+8.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

with open('/mnt/cfs/huangzhiwei/BAE2025/data_extend/valid_8+8.json', 'w', encoding='utf-8') as f:
    json.dump(valid_data, f, ensure_ascii=False, indent=4)

print('\n数据已保存!')

训练集大小: 444 对话, 验证集大小: 112 对话
训练集占比: 0.80

标签分布:
训练集 (2205 样本):
  - Yes: 1171 (53.11%)
  - To some extent: 514 (23.31%)
  - No: 520 (23.58%)
验证集 (527 样本):
  - Yes: 272 (51.61%)
  - To some extent: 125 (23.72%)
  - No: 130 (24.67%)

数据已保存!


In [None]:
# 检测一下train.json和valid.json的数据中“conversation_history”长度超过512的数量
with open('/mnt/cfs/huangzhiwei/BAE2025/data_extend/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/mnt/cfs/huangzhiwei/BAE2025/data_extend/valid.json', 'r', encoding='utf-8') as f:
    valid_data = json.load(f)

train_count = 0
valid_count = 0

for data in train_data:
    if len(data["conversation_history"]) > 512:
        train_count += 1

for data in valid_data:
    if len(data["conversation_history"]) > 512:
        valid_count += 1

print(f'训练集中conversation_history长度超过512的数量: {train_count}')
print(f'验证集中conversation_history长度超过512的数量: {valid_count}')