In [9]:
import pandas as pd
import json
from collections import Counter

In [28]:
# Load data
def _load_json(path):
    """Parse the JSON file at ``path`` and return the resulting object.

    The file is decoded explicitly as UTF-8. Without an explicit encoding,
    ``open`` falls back to the platform's locale encoding, which can
    mis-decode non-ASCII characters (for example an en dash in a title)
    on systems whose default is not UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# One parsed object per released split; paths are relative to the
# notebook's working directory.
hover_train = _load_json('data/hover_train_release_v1.1.json')
hover_dev = _load_json('data/hover_dev_release_v1.1.json')
hover_test = _load_json('data/hover_test_release_v1.1.json')

print(f"Training examples: {len(hover_train)}")
print(f"Dev examples: {len(hover_dev)}")
print(f"Test examples: {len(hover_test)}")

Training examples: 18171
Dev examples: 4000
Test examples: 4000


In [26]:
# Dataset structure and distributions
def _print_shares(pairs, total, unit=""):
    """Print one indented line per (key, count) pair with its share of ``total``.

    ``unit`` is appended verbatim after the key (e.g. " hops").
    """
    for key, occurrences in pairs:
        print(f"  {key}{unit}: {occurrences} ({occurrences/total*100:.1f}%)")


# Show one raw training record and the fields it carries.
example = hover_train[101]
print("\nExample:")
print(example)
print("Keys in each example:", list(example))

# Label balance in the train and dev splits, in first-seen label order.
train_labels = [record['label'] for record in hover_train]
dev_labels = [record['label'] for record in hover_dev]
train_dist = Counter(train_labels)
dev_dist = Counter(dev_labels)
print("\nTraining set:")
_print_shares(train_dist.items(), len(hover_train))
print("\nDev set:")
_print_shares(dev_dist.items(), len(hover_dev))

# Share of training claims per hop count, ascending by hop count.
train_hops = [record['num_hops'] for record in hover_train]
hop_dist = Counter(train_hops)
print("\nNumber of hops distribution:")
_print_shares(sorted(hop_dist.items()), len(hover_train), unit=" hops")

# Summary statistics for the number of supporting facts attached to each claim.
num_facts = [len(record['supporting_facts']) for record in hover_train]
print(f"\nAverage supporting facts per claim: {sum(num_facts)/len(num_facts):.2f}")
print(f"Min supporting facts: {min(num_facts)}")
print(f"Max supporting facts: {max(num_facts)}")

# Distribution
facts_dist = Counter(num_facts)
print("\nDistribution of supporting facts:")
# Only the five smallest fact counts are listed.
for fact_count in sorted(facts_dist)[:5]:
    print(f"  {fact_count} facts: {facts_dist[fact_count]} examples")


Example:
{'uid': '2427c810-c00d-4876-8b10-c87c4ba3648b', 'claim': "A hockey team calls Madison Square Garden it's home. That team, along with the New York Islanders, and the New Jersey Devils NHL franchise, are popular in the New York metropolitan area.", 'supporting_facts': [['1974–75 New York Islanders season', 2], ['New York Rangers', 3], ['Madison Square Garden', 5]], 'label': 'SUPPORTED', 'num_hops': 3, 'hpqa_id': '5a820e8855429926c1cdae14'}
Keys in each example: ['uid', 'claim', 'supporting_facts', 'label', 'num_hops', 'hpqa_id']

Training set:
  SUPPORTED: 11023 (60.7%)
  NOT_SUPPORTED: 7148 (39.3%)

Dev set:
  SUPPORTED: 2000 (50.0%)
  NOT_SUPPORTED: 2000 (50.0%)

Number of hops distribution:
  2 hops: 9052 (49.8%)
  3 hops: 6084 (33.5%)
  4 hops: 3035 (16.7%)

Average supporting facts per claim: 3.07
Min supporting facts: 2
Max supporting facts: 14

Distribution of supporting facts:
  2 facts: 6153 examples
  3 facts: 6445 examples
  4 facts: 4216 examples
  5 facts: 1045 e