In [1]:
import os
import json
import csv
import glob

# Define the path to the MultiWOZ 2.2 dataset
data_path = "..../original_data/multiwoz/data/MultiWOZ_2.2/train"

# Create a list to hold all the conversation data
conversation_data = []

# Get all JSON files in the directory
json_files = glob.glob(os.path.join(data_path, "*.json"))

# Initialize conversation_id
conversation_id = 0

# Process each JSON file
for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        
        for dialogue in data:
            turns = dialogue.get('turns', [])
            print(f"Processing dialogue with {len(turns)} turns.")
            
            # Skip empty dialogues
            if not turns:
                continue
            
            # Process each turn in the dialogue
            position = 0
            turn_counter = 0
            
            for i in range(0, len(turns), 2):
                # Get user utterance
                if i < len(turns) and turns[i]['speaker'] == 'USER':
                    user_utterance = turns[i]['utterance']
                    conversation_data.append([
                        conversation_id,
                        'user',
                        user_utterance,
                        turn_counter,
                        position
                    ])
                    position += 1
                
                # Get system response
                if i + 1 < len(turns) and turns[i + 1]['speaker'] == 'SYSTEM':
                    system_utterance = turns[i + 1]['utterance']
                    conversation_data.append([
                        conversation_id,
                        'assistant',
                        system_utterance,
                        turn_counter,
                        position
                    ])
                    position += 1
                    turn_counter += 1
            
            # Increment conversation_id for the next dialogue
            conversation_id += 1

# Write the processed data to a CSV file
output_file = "../cleaned_data/multiwoz_processed.csv"

with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header
    csv_writer.writerow(['conversation_id', 'role', 'content', 'turn', 'position'])
    # Write the data
    csv_writer.writerows(conversation_data)

print(f"Data processing complete. Output saved to {output_file}")
print(f"Total conversations processed: {conversation_id}")

Data processing complete. Output saved to ../cleaned_data/multiwoz_processed.csv
Total conversations processed: 0
