In [4]:
# Random Baseline Compression Script
# Standalone script to evaluate random baseline compression on all three modalities

import os
import math
import time
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict
import torch
from transformers import AutoTokenizer
from tqdm.auto import tqdm

# Dataset libraries
from datasets import load_dataset
import requests
import zipfile
from PIL import Image

# Configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CHUNK_SIZE = 2048
NUM_CHUNKS = 2048
CACHE_DIR = Path("./compression_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

class RandomArithmeticCoder:
    """
    Implements arithmetic coding with random probability assignments as baseline.
    """
    
    def __init__(self, vocab_size: int = 50304, seed: int = 42):
        """
        Initialize with random probability distribution.
        
        Args:
            vocab_size: Size of vocabulary (Pythia default is 50304)
            seed: Random seed for reproducibility
        """
        self.vocab_size = vocab_size
        np.random.seed(seed)
        
        # Generate random probabilities that sum to 1
        # Use Dirichlet distribution for realistic probability distribution
        alpha = np.ones(vocab_size) * 0.1  # Small alpha for more varied distribution
        self.random_probs = np.random.dirichlet(alpha)
        
        # Ensure minimum probability to avoid log(0)
        self.random_probs = np.maximum(self.random_probs, 1e-10)
        self.random_probs = self.random_probs / self.random_probs.sum()  # Renormalize
        
    def encode_sequence_random(self, tokens: torch.Tensor) -> float:
        """
        Encode token sequence using random probabilities.
        
        Args:
            tokens: Input token sequence
            
        Returns:
            Total compressed bits using random probabilities
        """
        if len(tokens) <= 1:
            return len(tokens) * 16.0  # Fallback for short sequences
        
        total_bits = 0.0
        
        for token_id in tokens[1:]:  # Skip first token (no prediction needed)
            token_id = token_id.item() if isinstance(token_id, torch.Tensor) else token_id
            
            # Use random probability for this token
            if token_id < self.vocab_size:
                prob = self.random_probs[token_id]
            else:
                # Out of vocabulary - assign average probability
                prob = 1.0 / self.vocab_size
            
            # Arithmetic coding: -log2(probability)
            bits = -math.log2(prob)
            total_bits += bits
        
        return total_bits

class DatasetManager:
    """Manages dataset downloading and processing."""
    
    def __init__(self, chunk_size: int = CHUNK_SIZE, num_chunks: int = NUM_CHUNKS):
        self.chunk_size = chunk_size
        self.num_chunks = num_chunks
        self.cache_dir = CACHE_DIR
        
    def fetch_enwik8_chunks(self) -> List[bytes]:
        """Download and process enwik8 Wikipedia data."""
        print("📥 Downloading enwik8 Wikipedia XML...")
        
        try:
            enwik8_path = self.cache_dir / "enwik8"
            if not enwik8_path.exists():
                # Download enwik8
                url = "http://mattmahoney.net/dc/enwik8.zip"
                zip_path = self.cache_dir / "enwik8.zip"
                
                response = requests.get(url, stream=True)
                response.raise_for_status()
                
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(self.cache_dir)
                zip_path.unlink()
            
            # Read and chunk
            with open(enwik8_path, 'rb') as f:
                data = f.read()
            
            chunks = []
            for i in range(0, min(len(data), self.num_chunks * self.chunk_size), self.chunk_size):
                chunk = data[i:i + self.chunk_size]
                if len(chunk) == self.chunk_size:
                    chunks.append(chunk)
            
            print(f"✅ Loaded {len(chunks)} enwik8 chunks")
            return chunks[:self.num_chunks]
            
        except Exception as e:
            print(f"❌ Failed to load enwik8: {e}")
            return []
    
    def fetch_librispeech_chunks(self) -> List[bytes]:
        """Download and process LibriSpeech audio data."""
        print("📥 Downloading LibriSpeech audio...")
        try:
            ds = load_dataset(
                "librispeech_asr", "clean", split="train.100", streaming=True,
                cache_dir=str(CACHE_DIR)
            )
            chunks: List[bytes] = []
            buffer = bytearray()
            for item in ds:
                audio = item["audio"]
                data, sr = audio["array"], audio["sampling_rate"]
                if sr != 16000:
                    import librosa
                    data = librosa.resample(data, orig_sr=sr, target_sr=16000)
                pcm = np.clip(data * 32767, -32768, 32767).astype(np.int16).tobytes()
                buffer.extend(pcm)
                while len(buffer) >= self.chunk_size and len(chunks) < self.num_chunks:
                    chunks.append(bytes(buffer[:self.chunk_size]))
                    buffer = buffer[self.chunk_size:]
                if len(chunks) >= self.num_chunks:
                    break
            print(f"✅ Loaded {len(chunks)} LibriSpeech chunks")
            return chunks
        except Exception as e:
            print(f"❌ Failed to load LibriSpeech: {e}")
            return []
    
    def fetch_imagenet_chunks(self) -> List[bytes]:
        """Download and process ImageNet image patches."""
        print("📥 Downloading ImageNet images...")
        
        try:
            # Try multiple ImageNet sources
            sources = ["imagenet-1k", "ILSVRC/imagenet-1k"]
            
            for source in sources:
                try:
                    dataset = load_dataset(source, split="validation", streaming=True, trust_remote_code=True)
                    
                    chunks = []
                    processed = 0
                    
                    for item in dataset:
                        if len(chunks) >= self.num_chunks:
                            break
                        
                        try:
                            image = item['image']
                            
                            # Convert to grayscale
                            if image.mode != 'L':
                                image = image.convert('L')
                            
                            # Check size for 32x64 patch
                            if image.size[0] < 64 or image.size[1] < 32:
                                continue
                            
                            # Extract 32x64 patch
                            start_x = np.random.randint(0, max(1, image.size[0] - 64))
                            start_y = np.random.randint(0, max(1, image.size[1] - 32))
                            patch = image.crop((start_x, start_y, start_x + 64, start_y + 32))
                            
                            # Convert to bytes
                            patch_array = np.array(patch, dtype=np.uint8)
                            patch_bytes = patch_array.flatten().tobytes()
                            
                            if len(patch_bytes) >= self.chunk_size:
                                chunks.append(patch_bytes[:self.chunk_size])
                            else:
                                # Pad if needed
                                padding = self.chunk_size - len(patch_bytes)
                                chunks.append(patch_bytes + b'\x00' * padding)
                            
                            processed += 1
                            
                        except Exception:
                            continue
                        
                        if processed > self.num_chunks * 2:
                            break
                    
                    if len(chunks) >= self.num_chunks // 2:
                        print(f"✅ Loaded {len(chunks)} image chunks from {source}")
                        return chunks
                        
                except Exception as e:
                    continue
            
            # Fallback: Generate structured image data
            print("🔄 Generating structured image data...")
            return self._generate_image_data()
            
        except Exception as e:
            print(f"❌ Failed to load images: {e}")
            return self._generate_image_data()
    
    def _generate_image_data(self) -> List[bytes]:
        """Generate structured image-like data."""
        chunks = []
        np.random.seed(42)  # Reproducible
        
        for i in range(self.num_chunks):
            # Create 32x64 image with realistic structure
            patch = np.zeros((32, 64), dtype=np.uint8)
            
            for y in range(32):
                for x in range(64):
                    # Multiple frequency components like natural images
                    base_val = 128
                    low_freq = 40 * math.sin(x/20) * math.cos(y/15)
                    med_freq = 20 * math.sin(x/8) * math.sin(y/6)
                    high_freq = 10 * math.sin(x/3) * math.cos(y/4)
                    noise = np.random.normal(0, 15)
                    
                    pixel_val = base_val + low_freq + med_freq + high_freq + noise
                    patch[y, x] = np.clip(pixel_val, 0, 255)
            
            # Add some structure
            if i % 5 == 0:  # Vertical edges
                patch[:, 30:34] = np.clip(patch[:, 30:34] + 50, 0, 255)
            
            patch_bytes = patch.flatten().tobytes()[:self.chunk_size]
            if len(patch_bytes) < self.chunk_size:
                patch_bytes += b'\x00' * (self.chunk_size - len(patch_bytes))
            
            chunks.append(patch_bytes)
        
        return chunks

def bytes_to_ascii(data_bytes: bytes) -> str:
    """Convert bytes to ASCII string for tokenization."""
    return ''.join(chr(b % 128) for b in data_bytes)

def compute_random_compression_ratio(random_coder: RandomArithmeticCoder, 
                                   tokenizer, 
                                   raw_bytes: bytes) -> float:
    """
    Compute compression ratio using random probabilities.
    
    Returns:
        compression_ratio: compressed_bits / original_bits
    """
    # Convert bytes to ASCII text
    ascii_text = bytes_to_ascii(raw_bytes)
    
    # Tokenize
    tokens = tokenizer.encode(ascii_text, add_special_tokens=False, max_length=1024, truncation=True)
    tokens_tensor = torch.tensor(tokens)
    
    if len(tokens_tensor) < 2:
        return 1.0  # No compression possible
    
    # Compute compressed size using random probabilities
    compressed_bits = random_coder.encode_sequence_random(tokens_tensor)
    
    # Original size in bits
    original_bits = len(raw_bytes) * 8
    
    # Compression ratio
    compression_ratio = compressed_bits / original_bits
    
    return compression_ratio

def run_random_baseline_experiment():
    """Run random baseline compression experiment."""
    print("🎲 RANDOM BASELINE COMPRESSION EXPERIMENT")
    print("=" * 50)
    
    # Load tokenizer
    print("🔧 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m", trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Initialize random coder
    random_coder = RandomArithmeticCoder(vocab_size=tokenizer.vocab_size, seed=42)
    
    # Load datasets
    dataset_manager = DatasetManager()
    print("\n📥 Loading datasets...")
    datasets = {
        'enwik8': dataset_manager.fetch_enwik8_chunks(),
        'imagenet': dataset_manager.fetch_imagenet_chunks(),
        'librispeech': dataset_manager.fetch_librispeech_chunks(),
    }
    
    # Filter successful datasets
    datasets = {k: v for k, v in datasets.items() if v}
    
    if not datasets:
        print("❌ No datasets available!")
        return None
    
    print(f"✅ Loaded {len(datasets)} datasets")
    
    # Run experiments
    results = []
    total_experiments = sum(len(chunks) for chunks in datasets.values())
    
    print(f"\n🚀 Running {total_experiments} compression evaluations...")
    
    with tqdm(total=total_experiments, desc="Random Compression") as pbar:
        for dataset_name, chunks in datasets.items():
            print(f"\n📊 Processing {dataset_name} ({len(chunks)} chunks)...")
            
            compression_ratios = []
            
            for i, chunk_bytes in enumerate(chunks):
                try:
                    ratio = compute_random_compression_ratio(random_coder, tokenizer, chunk_bytes)
                    if 0.01 < ratio < 50.0:  # Sanity check
                        compression_ratios.append(ratio)
                        
                        # Store individual result
                        results.append({
                            'dataset': dataset_name,
                            'chunk_id': i,
                            'compression_ratio': ratio,
                            'model': 'random_baseline',
                            'timestamp': time.time()
                        })
                    
                except Exception as e:
                    continue
                
                pbar.update(1)
                
                # Update progress info
                if len(compression_ratios) > 0:
                    current_mean = np.mean(compression_ratios)
                    pbar.set_postfix({
                        'dataset': dataset_name,
                        'current_ratio': f'{current_mean:.3f}'
                    })
            
            # Print dataset summary
            if compression_ratios:
                mean_ratio = np.mean(compression_ratios)
                std_ratio = np.std(compression_ratios)
                print(f"   ✅ {dataset_name}: {mean_ratio:.4f} ± {std_ratio:.4f} ({len(compression_ratios)} chunks)")
            else:
                print(f"   ❌ {dataset_name}: No valid results")
    
    # Save results
    df = pd.DataFrame(results)
    df.to_csv("random_baseline_results.csv", index=False)
    
    print(f"\n🎉 Random baseline experiment completed!")
    print(f"📁 Results saved to: random_baseline_results.csv")
    print(f"📊 Total evaluations: {len(results)}")
    
    # Summary statistics
    print("\n📊 RANDOM BASELINE SUMMARY:")
    print("=" * 40)
    
    for dataset in df['dataset'].unique():
        subset = df[df['dataset'] == dataset]
        mean_cr = subset['compression_ratio'].mean()
        std_cr = subset['compression_ratio'].std()
        min_cr = subset['compression_ratio'].min()
        max_cr = subset['compression_ratio'].max()
        
        print(f"\n{dataset.upper()}:")
        print(f"   Mean: {mean_cr:.4f} ± {std_cr:.4f}")
        print(f"   Range: {min_cr:.4f} - {max_cr:.4f}")
        print(f"   Chunks: {len(subset)}")
    
    return df

if __name__ == "__main__":
    print("🎲 RANDOM BASELINE COMPRESSION FOR LLM SCALING LAWS")
    print("=" * 60)
    print("Generating random probability baseline for compression comparison")
    print(f"📊 Datasets: enwik8, imagenet, librispeech")
    print(f"📊 Chunks per dataset: {NUM_CHUNKS}")
    print(f"📊 Chunk size: {CHUNK_SIZE} bytes")
    print()
    
    try:
        results_df = run_random_baseline_experiment()
        
        if results_df is not None:
            print("\n✅ RANDOM BASELINE EXPERIMENT COMPLETED!")
            print("📈 Use these results to compare against your LLM compression data")
            print("📁 Results saved in: random_baseline_results.csv")
        else:
            print("❌ Experiment failed")
            
    except KeyboardInterrupt:
        print("\n⏸️  Experiment interrupted by user")
        
    except Exception as e:
        print(f"❌ Experiment failed: {e}")
        raise

🎲 RANDOM BASELINE COMPRESSION FOR LLM SCALING LAWS
Generating random probability baseline for compression comparison
📊 Datasets: enwik8, imagenet, librispeech
📊 Chunks per dataset: 2048
📊 Chunk size: 2048 bytes

🎲 RANDOM BASELINE COMPRESSION EXPERIMENT
🔧 Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]


📥 Loading datasets...
📥 Downloading enwik8 Wikipedia XML...
✅ Loaded 2048 enwik8 chunks
📥 Downloading ImageNet images...
🔄 Generating structured image data...
📥 Downloading LibriSpeech audio...


librispeech_asr.py: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

The repository for librispeech_asr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/librispeech_asr.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


❌ Failed to load LibriSpeech: To support encoding audio data, please install 'soundfile'.
✅ Loaded 2 datasets

🚀 Running 4096 compression evaluations...


Random Compression:   0%|          | 0/4096 [00:00<?, ?it/s]


📊 Processing enwik8 (2048 chunks)...


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



   ✅ imagenet: 1.5313 ± 0.0146 (2048 chunks)

🎉 Random baseline experiment completed!
📁 Results saved to: random_baseline_results.csv
📊 Total evaluations: 4096

📊 RANDOM BASELINE SUMMARY:

ENWIK8:
   Mean: 0.8183 ± 0.1906
   Range: 0.5366 - 1.7031
   Chunks: 2048

IMAGENET:
   Mean: 1.5313 ± 0.0146
   Range: 1.4874 - 1.5788
   Chunks: 2048

✅ RANDOM BASELINE EXPERIMENT COMPLETED!
📈 Use these results to compare against your LLM compression data
📁 Results saved in: random_baseline_results.csv
