# 清洗LUCID v1数据
- 对对话进行编号
- 记录轮次以及对话位置

In [4]:
import json
import csv
import os
from pathlib import Path

def process_lucid_data(input_path, output_path="lucid_processed.csv"):
    """
    Flatten LUCID conversations into a one-message-per-row CSV file.

    Only turns authored by "User" (content taken from ``query``) and by
    "Response" (content taken from ``text``) are kept; turns by any other
    author are skipped. Each kept message becomes one CSV row with the
    columns ``conversation_id``, ``role``, ``content``, ``turn`` and
    ``position``.

    Args:
        input_path: Path to a single JSON file, or to a directory whose
            ``*.json`` files are all processed in sorted path order. Each
            file holds either one conversation object or a list of them.
        output_path: Destination CSV path. Missing parent directories are
            created before writing.

    Returns:
        A ``(total_conversations, total_messages)`` tuple: the number of
        conversations read and the number of CSV rows written.

    Raises:
        KeyError: If a conversation has no "turns" key, a turn has no
            "author" key, or a kept turn lacks its "query"/"text" field.
    """
    input_path = Path(input_path)
    if input_path.is_dir():
        # Sort so that row order and fallback IDs do not depend on the
        # order in which the filesystem happens to enumerate files.
        files = sorted(input_path.glob("*.json"))
    else:
        files = [input_path]

    all_rows = []
    total_conversations = 0

    for file_path in files:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # A file may hold a single conversation or a list of conversations.
        conversations = data if isinstance(data, list) else [data]

        for conversation in conversations:
            # "_id" may be missing, null, or non-string; normalise before
            # stripping the dataset prefix, and fall back to the running
            # conversation index when nothing usable remains.
            raw_id = conversation.get("_id") or ""
            conversation_id = str(raw_id).replace("LUCID_", "")
            if not conversation_id:
                conversation_id = str(total_conversations)

            # Keep only user queries and assistant responses.
            messages = []
            for entry in conversation["turns"]:
                author = entry["author"]
                if author == "User":
                    messages.append(("user", entry["query"]))
                elif author == "Response":
                    messages.append(("assistant", entry["text"]))

            # position: 0-based index of the message within the conversation.
            # turn: 0-based exchange counter, advanced after each assistant
            # message so the following messages belong to the next exchange.
            turn = 0
            for position, (role, content) in enumerate(messages):
                all_rows.append({
                    "conversation_id": conversation_id,
                    "role": role,
                    "content": content,
                    "turn": turn,
                    "position": position,
                })
                if role == "assistant":
                    turn += 1

            total_conversations += 1

    # Make sure the destination directory exists so a fresh checkout does
    # not fail with FileNotFoundError after all the parsing work is done.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["conversation_id", "role", "content", "turn", "position"],
        )
        writer.writeheader()
        writer.writerows(all_rows)

    return total_conversations, len(all_rows)

# Source dump of LUCID v1.0 and the destination for the flattened CSV.
# Adjust both to match your local checkout.
input_path = "../original_data/ml-lucid-datagen/lucid_v1.0/LUCID_data.json"
output_path = "../cleaned_data/lucid_processed.csv"

# Convert the dataset and keep the counts for the summary below.
total_conversations, total_messages = process_lucid_data(input_path, output_path)

# Emit the three-line run summary in a single call.
print(
    f"Processed data saved to {output_path}",
    f"Total conversations: {total_conversations}",
    f"Total messages: {total_messages}",
    sep="\n",
)

# Processed data saved to ../cleaned_data/lucid_processed.csv
# Total conversations: 4277
# Total messages: 40137

Processed data saved to ../cleaned_data/lucid_processed.csv
Total conversations: 4277
Total messages: 40137
