In [9]:
import json
import pandas as pd
from pathlib import Path

# Dataset splits, keyed by the label used for each row of the summary table.
# All three files are looked up in the current working directory.
data_dir = Path(".")
files = dict(
    Training=data_dir / "training.json",
    Validation=data_dir / "validation.json",
    Test=data_dir / "test.json",
)

# Function to analyze a dataset
def analyze_dataset(file_path):
    """Count the records in a JSON dataset, in total and per star rating.

    The file is expected to hold a JSON array of objects, each optionally
    carrying a numeric ``"stars"`` field.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the JSON file.

    Returns:
        A ``(total_count, star_counts)`` tuple. ``total_count`` is the number
        of elements in the top-level array. ``star_counts`` maps each rating
        1-5 to the number of records whose ``"stars"`` value equals it
        (``4`` and ``4.0`` both count as four stars). Records that are not
        JSON objects, or whose ``"stars"`` value is missing, non-numeric or
        outside 1-5, are included in the total but in no rating bucket.

        If the file is missing, is not valid JSON, or does not contain a
        JSON array, a warning is printed and ``(0, {1: 0, ..., 5: 0})`` is
        returned.
    """
    star_counts = {star: 0 for star in range(1, 6)}

    # Only opening and parsing can raise the errors handled here, so the
    # counting logic below stays outside the try block.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found.")
        return 0, star_counts
    except json.JSONDecodeError:
        print(f"Warning: File {file_path} contains invalid JSON.")
        return 0, star_counts

    # Any other top-level value (object, string, number, null) has no records
    # to iterate; report it like the other unusable-file cases.
    if not isinstance(data, list):
        print(f"Warning: File {file_path} does not contain a JSON array.")
        return 0, star_counts

    # Single pass over the records instead of one scan per rating.
    for record in data:
        if not isinstance(record, dict):
            continue  # stray scalar/list/null entry: counted in the total only
        stars = record.get("stars")
        # The numeric check keeps unhashable values (lists, dicts) away from
        # the dict lookup; 4 and 4.0 hash and compare equal, so both match.
        if isinstance(stars, (int, float)) and stars in star_counts:
            star_counts[int(stars)] += 1

    return len(data), star_counts

# Build one summary row per dataset split: the total first, then one
# "<n> Star" column for each rating bucket returned by analyze_dataset.
results = {}
for name, file_path in files.items():
    total, star_counts = analyze_dataset(file_path)
    results[name] = dict(
        [("Total", total)]
        + [(f"{i} Star", count) for i, count in star_counts.items()]
    )

# The dict of dicts yields one column per split; transpose so that each
# split becomes a row instead.
df = pd.DataFrame(results).transpose()

# Plain-text rendering: blank line, heading, then the full table.
print("\nDataset Statistics:", df.to_string(), sep="\n")

# Optional: additionally show a captioned, styled table when IPython is
# available. Any ImportError raised in this section means the plain-text
# table printed above is the only output.
try:
    from IPython.display import display
    styled_df = df.style.set_caption("Dataset Statistics by Star Rating")
    display(styled_df)
except ImportError:
    pass


Dataset Statistics:
            Total  1 Star  2 Star  3 Star  4 Star  5 Star
Training    16000    3200    3200    3200    3200    3200
Validation    800     320     320     160       0       0
Test          800       0       0     160     320     320
