In [1]:
import json
import os
import csv
import glob

# Flatten the SGD training dialogues into a CSV with one row per utterance.
#
# Each dialogue's turns are read two at a time as (USER, SYSTEM) exchanges.
# Every kept exchange yields two rows (role 'user', then role 'assistant')
# that share the same 'turn' index; 'position' is the 0-based index of the row
# within its conversation. Exchanges whose speakers are not USER followed by
# SYSTEM are skipped, and dialogues with no kept exchange get no
# conversation_id.
# NOTE(review): assumes each JSON file holds a list of dialogue dicts, each
# with a 'turns' list of dicts carrying 'speaker' and 'utterance' -- confirm
# against the dataset schema.

# Define the path to the SGD dataset
data_dir = "../original_data/dstc8-schema-guided-dialogue/train"

# Flat list of message rows across all conversations (one dict per CSV row)
conversations = []

# Running totals: the next conversation_id to assign, and rows emitted so far
conversation_id = 0
message_counter = 0

# glob.glob() returns matches in arbitrary, filesystem-dependent order; sort
# them so conversation_id assignment is reproducible across runs and machines.
for filepath in sorted(glob.glob(os.path.join(data_dir, "*.json"))):
    # Parse the whole file, then release the handle before processing it.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process each dialogue in the file
    for dialogue in data:
        turns = dialogue.get('turns', [])

        turn_id = 0
        turn_pairs = []

        # Walk the turns pairwise: (0, 1), (2, 3), ... zip() stops at the
        # shorter slice, so a trailing unpaired turn is dropped.
        for user_turn, assistant_turn in zip(turns[0::2], turns[1::2]):
            # Keep only well-formed USER -> SYSTEM exchanges
            if (user_turn.get('speaker') != 'USER'
                    or assistant_turn.get('speaker') != 'SYSTEM'):
                continue

            for role, turn in (('user', user_turn), ('assistant', assistant_turn)):
                turn_pairs.append({
                    'conversation_id': conversation_id,
                    'role': role,
                    'content': turn.get('utterance', ''),
                    'turn': turn_id,
                    # Rows are appended in order, so the current length is
                    # this row's 0-based position within the conversation.
                    'position': len(turn_pairs),
                })

            turn_id += 1

        # Only dialogues that produced rows consume a conversation_id, which
        # keeps the ids dense (0, 1, 2, ...) in the output.
        if turn_pairs:
            conversations.extend(turn_pairs)
            message_counter += len(turn_pairs)
            conversation_id += 1

# Write the data to a CSV file
output_file = "../cleaned_data/sgd_processed.csv"
# Create the output directory if needed so a fresh checkout does not fail with
# FileNotFoundError after all the processing work has been done.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['conversation_id', 'role', 'content', 'turn', 'position'])
    writer.writeheader()
    writer.writerows(conversations)

print(f"Processed {conversation_id} conversations with {message_counter} utterances.")
print(f"Output saved to {output_file}")

# Processed 16142 conversations with 329964 utterances.
# Output saved to ../cleaned_data/sgd_processed.csv

Processed 16142 conversations with 329964 utterances.
Output saved to ../cleaned_data/sgd_processed.csv
