# Dataset Size Comparison
Compare regular dataset vs clean dataset (without chat_text column)

In [1]:
import datasets
import os
import subprocess
import pandas as pd
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset paths
regular_path = "/scratch/klambert/dataset/tulu-3-sft-mixture-pretokenized"
clean_path = "/scratch/klambert/dataset/tulu-3-sft-mixture-pretokenized_clean"

print(f"Regular dataset path: {regular_path}")
print(f"Clean dataset path: {clean_path}")
print(f"Regular exists: {os.path.exists(regular_path)}")
print(f"Clean exists: {os.path.exists(clean_path)}")

Regular dataset path: /scratch/klambert/dataset/tulu-3-sft-mixture-pretokenized
Clean dataset path: /scratch/klambert/dataset/tulu-3-sft-mixture-pretokenized_clean
Regular exists: True
Clean exists: True


In [3]:
# Get disk usage for both datasets
def get_directory_size(path):
    """Get directory size in bytes using du command"""
    result = subprocess.run(['du', '-sb', path], capture_output=True, text=True)
    if result.returncode == 0:
        return int(result.stdout.split()[0])
    return None

def format_bytes(bytes_size):
    """Convert bytes to human readable format"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.2f} PB"

regular_size = get_directory_size(regular_path)
clean_size = get_directory_size(clean_path)

print("=== DISK USAGE COMPARISON ===")
print(f"Regular dataset: {format_bytes(regular_size)} ({regular_size:,} bytes)")
print(f"Clean dataset:   {format_bytes(clean_size)} ({clean_size:,} bytes)")
print(f"Space saved:     {format_bytes(regular_size - clean_size)} ({regular_size - clean_size:,} bytes)")
print(f"Reduction:       {((regular_size - clean_size) / regular_size * 100):.1f}%")

=== DISK USAGE COMPARISON ===
Regular dataset: 2.90 GB (3,114,887,811 bytes)
Clean dataset:   2.42 GB (2,603,123,108 bytes)
Space saved:     488.06 MB (511,764,703 bytes)
Reduction:       16.4%


In [4]:
# Load and compare dataset metadata
print("Loading datasets...")
regular_dataset = datasets.load_from_disk(regular_path)
clean_dataset = datasets.load_from_disk(clean_path)

print("\n=== DATASET COMPARISON ===")
print(f"Regular dataset splits: {list(regular_dataset.keys())}")
print(f"Clean dataset splits: {list(clean_dataset.keys())}")

for split in regular_dataset.keys():
    print(f"\n--- {split.upper()} SPLIT ---")
    print(f"Regular - Samples: {len(regular_dataset[split]):,}, Features: {list(regular_dataset[split].features.keys())}")
    print(f"Clean   - Samples: {len(clean_dataset[split]):,}, Features: {list(clean_dataset[split].features.keys())}")

Loading datasets...

=== DATASET COMPARISON ===
Regular dataset splits: ['train', 'test']
Clean dataset splits: ['train', 'test']

--- TRAIN SPLIT ---
Regular - Samples: 192,704, Features: ['id', 'chat_text', 'input_ids', 'attention_mask', 'labels']
Clean   - Samples: 192,704, Features: ['id', 'input_ids', 'attention_mask', 'labels']

--- TEST SPLIT ---
Regular - Samples: 1,943, Features: ['id', 'chat_text', 'input_ids', 'attention_mask', 'labels']
Clean   - Samples: 1,943, Features: ['id', 'input_ids', 'attention_mask', 'labels']


In [5]:
# Show example of what was removed
print("=== EXAMPLE OF REMOVED CONTENT ===")
print("\nRegular dataset sample (first 300 chars of chat_text):")
if 'chat_text' in regular_dataset['train'].features:
    sample_text = regular_dataset['train'][0]['chat_text']
    print(f"'{sample_text[:300]}...'")
    print(f"\nFull chat_text length: {len(sample_text)} characters")
else:
    print("chat_text not found in regular dataset")

print("\nClean dataset features:")
print(list(clean_dataset['train'].features.keys()))

=== EXAMPLE OF REMOVED CONTENT ===

Regular dataset sample (first 300 chars of chat_text):


KeyError: 'chat_text'

In [6]:
# Memory usage comparison when loading
import sys

def get_dataset_memory_usage(dataset):
    """Estimate memory usage of dataset"""
    total_size = 0
    for split_name, split_data in dataset.items():
        # This is an approximation
        total_size += sys.getsizeof(split_data)
    return total_size

print("=== MEMORY USAGE COMPARISON ===")
regular_memory = get_dataset_memory_usage(regular_dataset)
clean_memory = get_dataset_memory_usage(clean_dataset)

print(f"Regular dataset memory: {format_bytes(regular_memory)}")
print(f"Clean dataset memory:   {format_bytes(clean_memory)}")
print(f"Memory saved:           {format_bytes(regular_memory - clean_memory)}")

=== MEMORY USAGE COMPARISON ===
Regular dataset memory: 112.00 B
Clean dataset memory:   112.00 B
Memory saved:           0.00 B


In [7]:
# Summary table
summary_data = {
    'Dataset': ['Regular', 'Clean', 'Difference'],
    'Disk Size': [format_bytes(regular_size), format_bytes(clean_size), format_bytes(regular_size - clean_size)],
    'Train Samples': [f"{len(regular_dataset['train']):,}", f"{len(clean_dataset['train']):,}", "0"],
    'Test Samples': [f"{len(regular_dataset['test']):,}", f"{len(clean_dataset['test']):,}", "0"],
    'Features': [len(regular_dataset['train'].features), len(clean_dataset['train'].features), 
                len(regular_dataset['train'].features) - len(clean_dataset['train'].features)]
}

summary_df = pd.DataFrame(summary_data)
print("=== SUMMARY TABLE ===")
print(summary_df.to_string(index=False))

=== SUMMARY TABLE ===
   Dataset Disk Size Train Samples Test Samples  Features
   Regular   2.90 GB       192,704        1,943         5
     Clean   2.42 GB       192,704        1,943         4
Difference 488.06 MB             0            0         1


In [None]:
# Check which columns were removed
regular_features = set(regular_dataset['train'].features.keys())
clean_features = set(clean_dataset['train'].features.keys())

removed_features = regular_features - clean_features
print(f"\n=== REMOVED FEATURES ===")
print(f"Removed columns: {list(removed_features)}")
print(f"Remaining columns: {list(clean_features)}")