# ReasonAQA Dataset Exploration

This notebook helps you explore and understand the ReasonAQA dataset structure, content, and characteristics.

## 1. Setup and Imports

In [4]:
!pip install matplotlib seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [5]:
import json
import os
from collections import Counter, defaultdict
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Base path: set the REASONAQA_BASE_PATH environment variable to point the
# notebook at a different checkout; otherwise the original location is used.
BASE_PATH = Path(os.environ.get(
    'REASONAQA_BASE_PATH',
    '/home/ikulkar1/qwen_omni_finetune/audio_reasoning_interspeech',
))
DATA_PATH = BASE_PATH / 'src/data/reasonaqa/reasonaqa'

print(f"Data directory: {DATA_PATH}")
print(f"Directory exists: {DATA_PATH.exists()}")

Data directory: /home/ikulkar1/qwen_omni_finetune/audio_reasoning_interspeech/src/data/reasonaqa/reasonaqa
Directory exists: True


## 2. Load Dataset Files

In [6]:
def load_split(split_name, data_dir=None):
    """Load one dataset split from ``<data_dir>/<split_name>.json``.

    Args:
        split_name: Stem of the split file, e.g. 'train', 'val' or 'test'.
        data_dir: Directory containing the split files. When omitted, the
            notebook-level ``DATA_PATH`` is used.

    Returns:
        The parsed JSON content of the split file.

    Raises:
        FileNotFoundError: If the split file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    base_dir = DATA_PATH if data_dir is None else Path(data_dir)
    file_path = base_dir / f"{split_name}.json"
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded {split_name}: {len(data):,} samples")
    return data

# Read the three splits in order: train, val, test
train_data, val_data, test_data = (
    load_split(name) for name in ('train', 'val', 'test')
)

total_samples = sum(len(split) for split in (train_data, val_data, test_data))
print(f"\nTotal samples: {total_samples:,}")

Loaded train: 968,071 samples
Loaded val: 114,188 samples
Loaded test: 161,695 samples

Total samples: 1,243,954


## 3. Dataset Structure Overview

In [7]:
# Pretty-print the first test entry, then list its field names
first_sample = test_data[0]

print("Sample structure (first entry from test set):")
print(json.dumps(first_sample, indent=2))

print("\nKeys in each sample:")
print(list(first_sample))

Sample structure (first entry from test set):
{
  "taskname": "audiocaps",
  "filepath1": "AudioCapsLarger/test/Y7fmOlUlwoNg.wav",
  "filepath2": "AudioCapsLarger/test/YZYWCwfCkBp4.wav",
  "caption1": "Constant rattling noise and sharp vibrations",
  "caption2": "Sawing wood with music playing in the distance",
  "input": "explain the difference in few words",
  "answer": "Audio 1 features a constant, high-frequency rattling noise with sharp vibrations, while Audio 2 combines a mid-frequency sawing sound with a distant, low-frequency music accompaniment.",
  "subtype": "ACD-1.json"
}

Keys in each sample:
['taskname', 'filepath1', 'filepath2', 'caption1', 'caption2', 'input', 'answer', 'subtype']


## 4. Dataset Statistics

In [8]:
def get_dataset_stats(data, split_name):
    """Summarise one dataset split.

    Returns a dict with the split name, the sample count, Counters over
    'taskname' and 'subtype', and counts of samples whose 'filepath1' /
    'filepath2' values are truthy (individually, both, and first-only).
    """
    task_counts = Counter()
    subtype_counts = Counter()
    n_first = n_second = n_both = n_first_only = 0

    for entry in data:
        task_counts[entry['taskname']] += 1
        subtype_counts[entry['subtype']] += 1

        first_present = bool(entry['filepath1'])
        second_present = bool(entry['filepath2'])
        if first_present:
            n_first += 1
            if second_present:
                n_both += 1
            else:
                n_first_only += 1
        if second_present:
            n_second += 1

    return {
        'split': split_name,
        'total_samples': len(data),
        'tasknames': task_counts,
        'subtypes': subtype_counts,
        'has_audio1': n_first,
        'has_audio2': n_second,
        'dual_audio': n_both,
        'single_audio': n_first_only,
    }

# Compute the statistics of each split in turn
train_stats, val_stats, test_stats = (
    get_dataset_stats(split, name)
    for split, name in ((train_data, 'train'), (val_data, 'val'), (test_data, 'test'))
)

# Print a banner followed by one summary section per split
banner = "=" * 80
print(banner)
print("DATASET STATISTICS SUMMARY")
print(banner)

for stats in (train_stats, val_stats, test_stats):
    section = [
        f"\n{stats['split'].upper()} SPLIT:",
        f"  Total samples: {stats['total_samples']:,}",
        f"  Dual audio samples: {stats['dual_audio']:,}",
        f"  Single audio samples: {stats['single_audio']:,}",
        "\n  Task breakdown:",
    ]
    # Tasks are listed from most to least frequent
    section.extend(
        f"    {task}: {count:,}" for task, count in stats['tasknames'].most_common()
    )
    print("\n".join(section))

DATASET STATISTICS SUMMARY

TRAIN SPLIT:
  Total samples: 968,071
  Dual audio samples: 203,565
  Single audio samples: 764,506

  Task breakdown:
    audiocaps: 675,303
    clothov21: 278,680
    clotho_aqa_train: 14,088

VAL SPLIT:
  Total samples: 114,188
  Dual audio samples: 23,043
  Single audio samples: 91,145

  Task breakdown:
    clothov21: 75,871
    audiocaps: 34,189
    clotho_aqa_val: 4,128

TEST SPLIT:
  Total samples: 161,695
  Dual audio samples: 29,715
  Single audio samples: 131,980

  Task breakdown:
    audiocaps: 80,102
    clothov21: 75,917
    clotho_aqa_test: 5,676


## 5. Visualize Task Distribution

In [None]:
# One bar chart per split, drawn side by side
split_panels = [('Train', train_data), ('Val', val_data), ('Test', test_data)]
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (title, data) in zip(axes, split_panels):
    tasknames = Counter(item['taskname'] for item in data)
    labels = list(tasknames)
    counts = list(tasknames.values())

    ax.bar(labels, counts)
    ax.set_title(f'{title} Split - Task Distribution', fontsize=14, fontweight='bold')
    ax.set_xlabel('Task Name')
    ax.set_ylabel('Number of Samples')
    ax.tick_params(axis='x', rotation=45)

    # Write each count just above its bar (bar i sits at x position i)
    for i, count in enumerate(counts):
        ax.text(i, count, f'{count:,}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 6. Explore Different Subtypes

In [None]:
# List the 15 most frequent subtypes of the test split, ranked from 1
print("TOP 15 SUBTYPES IN TEST SET:")
print("=" * 80)

subtypes = Counter(item['subtype'] for item in test_data)
for rank, (subtype, count) in enumerate(subtypes.most_common(15), start=1):
    print(f"{rank:2d}. {subtype:<40} {count:>6,} samples")

## 7. Example Samples from Different Task Types

In [None]:
# Show examples from different subtypes
subtypes_to_show = ['ACD-1.json', 'ACD-2.json', 'ACE.json', 'AudioCaps-MCQ.json', 'AudioCaps-Detail.json']

print("EXAMPLE SAMPLES FROM DIFFERENT SUBTYPES:")
print("="*80)

for subtype in subtypes_to_show:
    # Find first sample with this subtype
    sample = next((item for item in test_data if item['subtype'] == subtype), None)
    if sample is None:
        # Report the miss so a mistyped or absent subtype name is visible
        # instead of silently producing no output.
        print(f"\n(no test samples found for subtype {subtype!r})")
        continue
    print(f"\n{'='*80}")
    print(f"SUBTYPE: {subtype}")
    print(f"{'='*80}")
    print(json.dumps(sample, indent=2))
    print()

## 8. Audio File Path Analysis

In [None]:
# Analyze audio file paths
def analyze_audio_paths(data):
    """Count the top-level directory of every non-empty audio path.

    Args:
        data: Sequence of sample dicts with 'filepath1' and 'filepath2' keys.
            It is iterated twice, so it must not be a one-shot iterator.

    Returns:
        A tuple ``(dirs1, dirs2)`` of Counters mapping the first path
        component to its number of occurrences, for 'filepath1' and
        'filepath2' respectively. Samples with a falsy path are skipped.
    """
    def count_top_level_dirs(key):
        # Falsy paths are filtered out here, so no per-path None fallback is needed.
        return Counter(Path(item[key]).parts[0] for item in data if item[key])

    return count_top_level_dirs('filepath1'), count_top_level_dirs('filepath2')

dirs1, dirs2 = analyze_audio_paths(test_data)

# Same listing for both path fields; the second header carries a leading blank line
listings = (
    ("AUDIO FILE DIRECTORIES (filepath1):", dirs1),
    ("\nAUDIO FILE DIRECTORIES (filepath2):", dirs2),
)
for header, dir_counts in listings:
    print(header)
    for dir_name, count in dir_counts.most_common():
        print(f"  {dir_name}: {count:,} files")

## 9. Input and Answer Length Analysis

In [None]:
# Character lengths of every question and answer in the test split
input_lengths = [len(item['input']) for item in test_data]
answer_lengths = [len(item['answer']) for item in test_data]

# Compute each mean once and reuse it for the marker line, legend and summary
input_mean = sum(input_lengths) / len(input_lengths)
answer_mean = sum(answer_lengths) / len(answer_lengths)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: inputs, right panel: answers
panels = [
    (axes[0], input_lengths, input_mean, 'Question/Input Length Distribution'),
    (axes[1], answer_lengths, answer_mean, 'Answer Length Distribution'),
]
for ax, lengths, mean_length, title in panels:
    ax.hist(lengths, bins=50, edgecolor='black')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Character Count')
    ax.set_ylabel('Frequency')
    ax.axvline(mean_length, color='r', linestyle='--', label=f'Mean: {mean_length:.1f}')
    ax.legend()

plt.tight_layout()
plt.show()

print(f"Input length - Min: {min(input_lengths)}, Max: {max(input_lengths)}, Avg: {input_mean:.1f}")
print(f"Answer length - Min: {min(answer_lengths)}, Max: {max(answer_lengths)}, Avg: {answer_mean:.1f}")

## 10. Compare Single vs Dual Audio Tasks

In [None]:
# Partition the test samples that have a first audio file by whether a second one exists
single_audio, dual_audio = [], []
for item in test_data:
    if not item['filepath1']:
        continue  # samples without a first audio file belong to neither group
    group = dual_audio if item['filepath2'] else single_audio
    group.append(item)

n_test = len(test_data)
print(f"Single audio tasks: {len(single_audio):,} ({len(single_audio)/n_test*100:.1f}%)")
print(f"Dual audio tasks: {len(dual_audio):,} ({len(dual_audio)/n_test*100:.1f}%)")

single_subtypes = Counter(item['subtype'] for item in single_audio)
dual_subtypes = Counter(item['subtype'] for item in dual_audio)

# Ten most common subtypes per group
for label, subtype_counts in (('SINGLE', single_subtypes), ('DUAL', dual_subtypes)):
    print(f"\nSubtypes for {label} audio tasks:")
    for subtype, count in subtype_counts.most_common(10):
        print(f"  {subtype}: {count:,}")

## 11. Create DataFrame for Easy Exploration

In [None]:
# Build the test DataFrame and derive the helper columns in one chained expression
df_test = pd.DataFrame(test_data).assign(
    has_dual_audio=lambda frame: frame['filepath2'].map(bool),
    input_length=lambda frame: frame['input'].map(len),
    answer_length=lambda frame: frame['answer'].map(len),
)

print("DataFrame shape:", df_test.shape)
print("\nColumn names:")
print(list(df_test.columns))

print("\nFirst 5 rows:")
df_test.head()

## 12. Filter and Search Samples

In [None]:
# Example: Find all MCQ (Multiple Choice Question) tasks
mcq_samples = df_test[df_test['subtype'].str.contains('MCQ', na=False)]
print(f"Found {len(mcq_samples)} MCQ samples")
if mcq_samples.empty:
    # iloc[0] would raise IndexError when no subtype contains 'MCQ'
    print("\nNo MCQ sample to show.")
else:
    print("\nExample MCQ sample:")
    print(json.dumps(mcq_samples.iloc[0].to_dict(), indent=2))

In [None]:
# Example: answers under 50 characters are likely multiple-choice style responses
is_short = df_test['answer_length'].lt(50)
short_answers = df_test.loc[is_short]
print(f"Found {len(short_answers)} samples with short answers (<50 chars)")
print("\nSample short answers:")

# Preview the first five: question cut to 60 characters, answer in full
preview = short_answers.head(5)
for question, answer in zip(preview['input'], preview['answer']):
    print(f"  Input: {question[:60]}...")
    print(f"  Answer: {answer}")
    print()

## 13. Export Sample Subset for Testing

In [None]:
# Export a small subset for testing
subset_size = 100
test_subset = test_data[:subset_size]

output_path = BASE_PATH / 'notebooks/reasonaqa_test_subset.json'
# Create the target directory if needed so the export works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(test_subset, f, indent=2)

# Report what was actually written; the slice is shorter than subset_size
# when the test split has fewer samples.
print(f"Exported {len(test_subset)} samples to: {output_path}")

## 14. Custom Exploration

Use the cells below for your own custom exploration of the dataset.

In [None]:
# Your custom exploration code here
