In [16]:
# Label JSON files to analyze. SUM_INDEXES refers to positions in this list.
LABEL_FILES = [
    "Dataset1/Labels/dataset_1_vul_two_one_names_labels.json",
    "Dataset1/Labels/dataset_1_vul_five_one_names_labels.json",
    "Dataset2/Labels/dataset2_train_test_names_labels.json",
]

# Top-level JSON keys (categories) that the analysis keeps; all others are ignored.
FILTER_CATEGORIES = ["Reentrancy", "reentrancy-eth"]

# Each inner list is one group of LABEL_FILES indexes whose statistics are summed.
SUM_INDEXES = [
    [0, 2],
    [1, 2],
]

In [17]:
import json
import pandas as pd
from pathlib import Path

def _count_split(category_data, split):
    """Return (number of names, count of label 0, count of label 1) for one split.

    `split` is the key prefix, i.e. 'train' or 'test'.
    """
    # `or []` covers both a missing key and an explicit JSON null, so names and
    # labels are treated the same way instead of only the labels being guarded.
    names = category_data.get(f'{split}_names') or []
    labels = category_data.get(f'{split}_labels') or []
    return len(names), labels.count(0), labels.count(1)


def analyze_dataset_file(file_path, categories=None):
    """Print and return per-category sample/label counts for one label JSON file.

    The file is expected to hold a JSON object whose top-level keys are category
    names; each category value is read through the keys 'train_names',
    'test_names', 'train_labels' and 'test_labels'.

    Args:
        file_path: Path of the JSON file to analyze.
        categories: Category names to keep. Defaults to the notebook-level
            FILTER_CATEGORIES when omitted, which preserves the original
            single-argument call.

    Returns:
        A dict mapping each kept category to a dict with the keys
        'train_names', 'test_names', 'train_labels_0', 'train_labels_1',
        'test_labels_0', 'test_labels_1' and 'total_samples'; or None when the
        file does not exist or none of the requested categories is present.
    """
    print(f"\n=== Analysis for {file_path} ===")

    path = Path(file_path)
    if not path.exists():
        print(f"File not found: {file_path}")
        return None

    # Read explicitly as UTF-8 rather than the platform's locale encoding.
    with path.open('r', encoding='utf-8') as f:
        data = json.load(f)

    # Resolve the default lazily so an explicit `categories` never touches the global.
    if categories is None:
        categories = FILTER_CATEGORIES

    # Keep only the requested categories, in the order they appear in the file.
    filtered_data = {k: v for k, v in data.items() if k in categories}

    if not filtered_data:
        print(f"No matching categories found. Available categories: {list(data.keys())}")
        return None

    stats = {}
    for category, category_data in filtered_data.items():
        n_train, train_labels_0, train_labels_1 = _count_split(category_data, 'train')
        n_test, test_labels_0, test_labels_1 = _count_split(category_data, 'test')

        stats[category] = {
            'train_names': n_train,
            'test_names': n_test,
            'train_labels_0': train_labels_0,
            'train_labels_1': train_labels_1,
            'test_labels_0': test_labels_0,
            'test_labels_1': test_labels_1,
            # Sample totals are based on the name lists, not on the label lists.
            'total_samples': n_train + n_test
        }

        print(f"{category}:")
        print(f"  Train names: {n_train}")
        print(f"  Train labels - 0s: {train_labels_0}, 1s: {train_labels_1}")
        print(f"  Test names: {n_test}")
        print(f"  Test labels - 0s: {test_labels_0}, 1s: {test_labels_1}")
        print(f"  Total samples: {n_train + n_test}")

    # Create summary table: one row per category, one column per statistic.
    df = pd.DataFrame(stats).T
    print("\nSummary table:")
    print(df)

    # Overall totals
    print("\nOverall totals across filtered categories:")
    print(f"Total train samples: {df['train_names'].sum()}")
    print(f"Total test samples: {df['test_names'].sum()}")
    print(f"Total train labels - 0s: {df['train_labels_0'].sum()}, 1s: {df['train_labels_1'].sum()}")
    print(f"Total test labels - 0s: {df['test_labels_0'].sum()}, 1s: {df['test_labels_1'].sum()}")
    print(f"Grand total samples: {df['total_samples'].sum()}")

    return stats

# Run the per-file analysis and keep only the files that produced statistics
all_stats = {}
for file_path in LABEL_FILES:
    stats = analyze_dataset_file(file_path)
    if not stats:
        # Nothing usable came back for this file; leave it out of all_stats.
        continue
    all_stats[file_path] = stats


=== Analysis for Dataset1/Labels/dataset_1_vul_two_one_names_labels.json ===
reentrancy-eth:
  Train names: 1018
  Train labels - 0s: 678, 1s: 340
  Test names: 255
  Test labels - 0s: 170, 1s: 85
  Total samples: 1273

Summary table:
                test_labels_0  test_labels_1  test_names  total_samples  \
reentrancy-eth            170             85         255           1273   

                train_labels_0  train_labels_1  train_names  
reentrancy-eth             678             340         1018  

Overall totals across filtered categories:
Total train samples: 1018
Total test samples: 255
Total train labels - 0s: 678, 1s: 340
Total test labels - 0s: 170, 1s: 85
Grand total samples: 1273

=== Analysis for Dataset1/Labels/dataset_1_vul_five_one_names_labels.json ===
reentrancy-eth:
  Train names: 2039
  Train labels - 0s: 1699, 1s: 340
  Test names: 509
  Test labels - 0s: 424, 1s: 85
  Total samples: 2548

Summary table:
                test_labels_0  test_labels_1  test_names 

In [18]:
# Combine statistics using SUM_INDEXES
#
# For every group of file indexes in SUM_INDEXES, the per-category counts stored
# in all_stats for the referenced LABEL_FILES entries are added together, then
# per-category and overall label totals are printed.

def _format_ratio(ones, total):
    """Return ones/total with three decimals, or 'n/a' when total is zero."""
    # A category (or a whole group) with no labels would otherwise raise
    # ZeroDivisionError and abort the remaining report.
    return f"{ones / total:.3f}" if total else "n/a"


print("\n" + "="*60)
print("COMBINED STATISTICS USING SUM_INDEXES")
print("="*60)

for idx, sum_group in enumerate(SUM_INDEXES):
    print(f"\n--- Sum Group {idx + 1}: Files {sum_group} ---")

    combined_stats = {}
    valid_files = []

    # Collect stats from specified file indexes
    for file_idx in sum_group:
        if file_idx >= len(LABEL_FILES):
            # Report instead of silently dropping a mistyped index.
            print(f"Skipping index {file_idx}: LABEL_FILES has only {len(LABEL_FILES)} entries")
            continue

        file_path = LABEL_FILES[file_idx]
        if file_path not in all_stats:
            # The per-file analysis stored nothing for this path.
            print(f"Skipping (no statistics available): {file_path}")
            continue

        valid_files.append(file_path)
        print(f"Including: {file_path}")

        for category, stats in all_stats[file_path].items():
            if category not in combined_stats:
                combined_stats[category] = {
                    'train_names': 0,
                    'test_names': 0,
                    'train_labels_0': 0,
                    'train_labels_1': 0,
                    'test_labels_0': 0,
                    'test_labels_1': 0
                }

            # Sum up the statistics
            for key in combined_stats[category]:
                combined_stats[category][key] += stats[key]

    if not combined_stats:
        print("No valid statistics found for this group.")
        continue

    # Display combined results
    print(f"\nCombined results for files: {valid_files}")

    total_0s = 0
    total_1s = 0

    for category, stats in combined_stats.items():
        category_0s = stats['train_labels_0'] + stats['test_labels_0']
        category_1s = stats['train_labels_1'] + stats['test_labels_1']

        print(f"\n{category}:")
        print(f"  Total samples: {stats['train_names'] + stats['test_names']}")
        print(f"  Total 0s: {category_0s}")
        print(f"  Total 1s: {category_1s}")
        print(f"  Ratio (1s/total): {_format_ratio(category_1s, category_0s + category_1s)}")

        total_0s += category_0s
        total_1s += category_1s

    print(f"\n{'='*30}")
    print(f"OVERALL TOTALS FOR GROUP {idx + 1}:")
    print(f"Total 0s across all categories: {total_0s}")
    print(f"Total 1s across all categories: {total_1s}")
    # Note: this grand total is the number of 0/1 labels, not the number of names.
    print(f"Grand total samples: {total_0s + total_1s}")
    print(f"Overall ratio (1s/total): {_format_ratio(total_1s, total_0s + total_1s)}")
    print(f"{'='*30}")


COMBINED STATISTICS USING SUM_INDEXES

--- Sum Group 1: Files [0, 2] ---
Including: Dataset1/Labels/dataset_1_vul_two_one_names_labels.json
Including: Dataset2/Labels/dataset2_train_test_names_labels.json

Combined results for files: ['Dataset1/Labels/dataset_1_vul_two_one_names_labels.json', 'Dataset2/Labels/dataset2_train_test_names_labels.json']

reentrancy-eth:
  Total samples: 1273
  Total 0s: 848
  Total 1s: 425
  Ratio (1s/total): 0.334

Reentrancy:
  Total samples: 63
  Total 0s: 42
  Total 1s: 21
  Ratio (1s/total): 0.333

OVERALL TOTALS FOR GROUP 1:
Total 0s across all categories: 890
Total 1s across all categories: 446
Grand total samples: 1336
Overall ratio (1s/total): 0.334

--- Sum Group 2: Files [1, 2] ---
Including: Dataset1/Labels/dataset_1_vul_five_one_names_labels.json
Including: Dataset2/Labels/dataset2_train_test_names_labels.json

Combined results for files: ['Dataset1/Labels/dataset_1_vul_five_one_names_labels.json', 'Dataset2/Labels/dataset2_train_test_names_la