In [1]:
import pandas as pd
import json
from collections import Counter

In [4]:
def convert_json_to_agreement_df(input_file='Dataset_Clean.json', output_file='agreement_data.csv'):
    """
    Convert a cleaned annotation dataset (JSON) to the long agreement_df format
    used for IAA analysis, and save it as CSV.

    The input is read with ``pd.read_json`` and must provide ``id`` and ``text``
    columns. Annotator columns ``A_1`` .. ``A_7`` are used when present; a cell
    contributes a row only if it holds a list with at least two entries, whose
    first two items are taken as the level-1 and level-2 labels.

    Parameters
    ----------
    input_file : str
        Path of the JSON dataset to load.
    output_file : str
        Path of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        One row per (document, annotator) annotation with the columns
        ``id``, ``annotator``, ``text``, ``L1_label``, ``L2_label`` and
        ``full_label`` (``"<L1>_<L2>"``). The columns are present even when
        no annotation qualifies, so the summary and CSV export still work.
    """
    print(f"Loading {input_file}...")

    # Load the cleaned dataset
    df = pd.read_json(input_file)
    print(f"Loaded {len(df)} documents")

    output_columns = ['id', 'annotator', 'text', 'L1_label', 'L2_label', 'full_label']

    # Annotator columns (A_1 to A_7) that actually exist in this dataset.
    # Column presence does not vary by row, so resolve it once up front.
    annotator_columns = [f'A_{i}' for i in range(1, 8) if f'A_{i}' in df.columns]

    agreement_data = []

    # Process each document
    for _, row in df.iterrows():
        doc_id = row['id']
        text = row['text']

        for annotator in annotator_columns:
            labels = row[annotator]

            # Skip missing values and annotations without both hierarchy levels
            if not isinstance(labels, list) or len(labels) < 2:
                continue

            l1_label = labels[0]
            l2_label = labels[1]

            agreement_data.append({
                'id': doc_id,
                'annotator': annotator,
                'text': text,
                'L1_label': l1_label,
                'L2_label': l2_label,
                'full_label': f"{l1_label}_{l2_label}"
            })

    # Explicit columns keep the schema stable when agreement_data is empty;
    # without them the column lookups below would raise KeyError.
    agreement_df = pd.DataFrame(agreement_data, columns=output_columns)

    print(f"Created agreement_df with {len(agreement_df)} annotations")
    print(f"Documents: {agreement_df['id'].nunique()}")
    print(f"Annotators: {agreement_df['annotator'].nunique()}")
    print(f"Unique labels: {agreement_df['full_label'].nunique()}")

    # Save to CSV
    agreement_df.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")

    return agreement_df

In [5]:
if __name__ == "__main__":
    # Build the long-format annotation table from the default input/output paths.
    agreement_df = convert_json_to_agreement_df()

    # Preview the first rows of the converted table.
    print("\nSample data:", agreement_df.head(), sep="\n")

    # Show the most frequent combined (L1_L2) labels.
    print("\nLabel distribution:", agreement_df['full_label'].value_counts().head(), sep="\n")

Loading Dataset_Clean.json...
Loaded 1935 documents
Created agreement_df with 13545 annotations
Documents: 1935
Annotators: 7
Unique labels: 9
Saved to agreement_data.csv

Sample data:
   id annotator                                               text  \
0   1       A_1  Hello, I need to reset my password. I have for...   
1   1       A_2  Hello, I need to reset my password. I have for...   
2   1       A_3  Hello, I need to reset my password. I have for...   
3   1       A_4  Hello, I need to reset my password. I have for...   
4   1       A_5  Hello, I need to reset my password. I have for...   

             L1_label        L2_label                         full_label  
0  Account Management  Password Reset  Account Management_Password Reset  
1  Account Management  Password Reset  Account Management_Password Reset  
2  Account Management  Password Reset  Account Management_Password Reset  
3  Account Management  Password Reset  Account Management_Password Reset  
4  Account Manageme