# POC — Exploratory Analysis (CIFAR-10, AG News, mini-UCF)

Minimal dataset sanity checks and quick visuals for the technical report.

In [None]:
import os, sys, math, random
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# Locate the project root: when the working directory is the "notebooks"
# folder the project lives one level up, otherwise the working directory is
# the project itself. The root is appended to sys.path.
ROOT = Path.cwd().resolve()
PROJ = ROOT.parent if ROOT.name == "notebooks" else ROOT
sys.path.append(str(PROJ))

print("Project root:", PROJ)

## CIFAR-10 — label distribution and sample grid

In [None]:
from datasets import load_dataset
from collections import Counter

# Use the first 90% of the "train" split for training and the remaining 10%
# for validation.
ds_train = load_dataset("cifar10", split="train[:90%]")
ds_val   = load_dataset("cifar10", split="train[90%:]")
names = ds_train.features["label"].names

print(f"{'Class':<12} {'Name':<12} {'Train Count':<12} {'Val Count':<10} {'Train %':<8} {'Val %':<8}")
print("-" * 70)

# Count labels from the "label" column only. Iterating over whole rows would
# also materialise (and decode) the image of every example just to read one
# integer.
train_counts = Counter(int(label) for label in ds_train["label"])
val_counts = Counter(int(label) for label in ds_val["label"])

train_total = len(ds_train)
val_total = len(ds_val)

# One table row per class: id, name, absolute counts and share of each split.
for i, class_name in enumerate(names):
    train_count = train_counts.get(i, 0)
    val_count = val_counts.get(i, 0)
    train_pct = (train_count / train_total) * 100
    val_pct = (val_count / val_total) * 100

    print(f"{i:<12} {class_name:<12} {train_count:<12} {val_count:<10} {train_pct:<8.1f} {val_pct:<8.1f}")

print("-" * 70)
print(f"{'Total':<12} {'':<12} {train_total:<12} {val_total:<10} {100.0:<8.1f} {100.0:<8.1f}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def show_cifar_grid(ds, n=12, title="CIFAR-10 samples"):
    """Plot a grid of randomly chosen images from *ds*, titled by class name.

    Args:
        ds: Indexable dataset whose rows expose "img" and "label" entries and
            whose ``features["label"].names`` maps label ids to class names.
        n: Maximum number of images to draw; capped at ``len(ds)``.
        title: Title placed above the whole figure.

    Returns:
        None. When there is nothing to draw (empty dataset or ``n <= 0``) a
        short notice is printed and no figure is created.
    """
    count = min(n, len(ds))
    if count <= 0:
        # Without this guard the column count below would be 0 and the row
        # computation would divide by zero.
        print("No samples to display.")
        return

    idxs = np.random.choice(len(ds), size=count, replace=False)

    # Roughly square layout: floor(sqrt) columns, enough rows to fit the rest.
    cols = int(np.sqrt(count))
    rows = int(np.ceil(count / cols))

    class_names = ds.features["label"].names
    plt.figure(figsize=(cols * 2, rows * 2))
    for slot, i in enumerate(idxs, start=1):
        ex = ds[int(i)]
        plt.subplot(rows, cols, slot)
        plt.imshow(ex["img"])
        plt.axis("off")
        plt.title(class_names[int(ex["label"])], fontsize=8)
    plt.suptitle(title, fontsize=12)
    plt.tight_layout()
    plt.show()

# Draw 12 randomly picked training images as a quick visual sanity check.
show_cifar_grid(ds_train, n=12)

## AG News — class distribution and sample snippets

In [None]:
from datasets import load_dataset
from collections import Counter

# Small slices are enough for a sanity check: the first 2000 training rows and
# the first 1000 test rows (the latter used here as the validation set).
ds_tr_text = load_dataset("ag_news", split="train[:2000]")
ds_va_text = load_dataset("ag_news", split="test[:1000]")
# Read class names from the dataset metadata (as the CIFAR-10 cell does) so
# the table cannot drift from the label ids; for AG News these are
# World / Sports / Business / Sci/Tech.
names = ds_tr_text.features["label"].names

print(f"{'Class':<12} {'Name':<12} {'Train Count':<12} {'Val Count':<10} {'Train %':<8} {'Val %':<8}")
print("-" * 70)

# Count labels from the "label" column only; there is no need to load the
# article text of every row to build the histogram.
train_counts = Counter(int(label) for label in ds_tr_text["label"])
val_counts = Counter(int(label) for label in ds_va_text["label"])

train_total = len(ds_tr_text)
val_total = len(ds_va_text)

# One table row per class: id, name, absolute counts and share of each split.
for i, class_name in enumerate(names):
    train_count = train_counts.get(i, 0)
    val_count = val_counts.get(i, 0)
    train_pct = (train_count / train_total) * 100
    val_pct = (val_count / val_total) * 100

    print(f"{i:<12} {class_name:<12} {train_count:<12} {val_count:<10} {train_pct:<8.1f} {val_pct:<8.1f}")

print("-" * 70)
print(f"{'Total':<12} {'':<12} {train_total:<12} {val_total:<10} {100.0:<8.1f} {100.0:<8.1f}")

In [None]:
# Preview the first three training articles: at most 200 characters each,
# with newlines flattened to spaces so every snippet stays on one line.
for sample_no in range(1, 4):
    print(f"--- AG News sample {sample_no} ---")
    snippet = ds_tr_text[sample_no - 1]["text"][:200]
    print(snippet.replace("\n", " "))

## mini-UCF — folder stats and grid showing sample frames

In [None]:
from pathlib import Path
from collections import Counter

VID_ROOT = PROJ / "data" / "mini_ucf"
print("Looking for:", VID_ROOT)


def _visible_entries(folder):
    """Return the non-hidden entries directly inside *folder*.

    Dot-prefixed entries (e.g. ``.DS_Store``, ``.ipynb_checkpoints``) are
    skipped so they are not mistaken for classes or videos. A missing path or
    a path that is not a directory yields an empty list.
    """
    if not folder.is_dir():
        return []
    return [entry for entry in folder.iterdir() if not entry.name.startswith(".")]


if VID_ROOT.exists():
    # Expected layout: mini_ucf/{train,val}/<class_name>/<one entry per video>.
    train_dir = VID_ROOT / "train"
    val_dir = VID_ROOT / "val"

    if train_dir.exists() and val_dir.exists():
        # A class is any visible sub-directory present in either split.
        train_classes = [d.name for d in _visible_entries(train_dir) if d.is_dir()]
        val_classes = [d.name for d in _visible_entries(val_dir) if d.is_dir()]
        all_classes = sorted(set(train_classes) | set(val_classes))

        print(f"{'Class':<12} {'Name':<15} {'Train Count':<12} {'Val Count':<10} {'Train %':<8} {'Val %':<8}")
        print("-" * 75)

        # Every visible entry inside a class folder counts as one video.
        # NOTE(review): whether clips are stored as video files or as
        # per-clip frame folders is not visible here; both count as one.
        train_counts = {name: len(_visible_entries(train_dir / name)) for name in all_classes}
        val_counts = {name: len(_visible_entries(val_dir / name)) for name in all_classes}

        train_total = sum(train_counts.values())
        val_total = sum(val_counts.values())

        for i, class_name in enumerate(all_classes):
            train_count = train_counts[class_name]
            val_count = val_counts[class_name]
            # Guard against empty splits to avoid dividing by zero.
            train_pct = (train_count / train_total * 100) if train_total > 0 else 0
            val_pct = (val_count / val_total * 100) if val_total > 0 else 0

            print(f"{i:<12} {class_name:<15} {train_count:<12} {val_count:<10} {train_pct:<8.1f} {val_pct:<8.1f}")

        # An empty split totals 0%, not 100%, matching the per-class rows.
        train_total_pct = 100.0 if train_total > 0 else 0.0
        val_total_pct = 100.0 if val_total > 0 else 0.0

        print("-" * 75)
        print(f"{'Total':<12} {'':<15} {train_total:<12} {val_total:<10} {train_total_pct:<8.1f} {val_total_pct:<8.1f}")
    else:
        print("Train or val directory not found in mini_ucf")
else:
    print("mini_ucf not found. Create it with tools/make_mini_ucf.py")