```json
{
    "name": "my_dataset",                  # Your display name
    "path": "username/dataset-name",        # HuggingFace path
    "config": None,                         # Subset name (or None)
    "split": "train",                       # train/test/validation
    "limit": 1000,                          # Max samples (or None for all)
    "category": "custom",                   # For organization
    
    # Tell it where to find the data:
    "instruction_field": "question",        # Where is the question?
    "output_field": "answer",               # Where is the answer?
    "input_field": None,                    # Extra context? (or None)
    
    # For multiple choice (optional):
    "choices_field": None,                  # Field with choices
    "answer_key_field": None,               # Field with answer key (A/B/C)
    "include_choices_in_instruction": False,
}
```


In [87]:
DATASET_CONFIGS = [
    {
        "name": "gsm8k",                    # Display name
        "path": "openai/gsm8k",              # HuggingFace dataset path
        "config": "main",                    # Dataset config/subset (None if not needed)
        "split": "train",                    # Which split to use
        "limit": None,                       # Max samples (None for all)
        "category": "math",                  # Category for organizing
        
        # Field mappings - tell us where to find the data
        "instruction_field": "question",     # Field containing the question/instruction
        "output_field": "answer",            # Field containing the answer/output
        "input_field": None,                 # Optional: additional context field (None if not needed)
        
        # Optional: For multiple choice questions
        "choices_field": None,               # Field containing choices (None if not applicable)
        "answer_key_field": None,            # Field containing answer key (A, B, C, etc.)
        
        # Optional: Format choices as part of instruction?
        "include_choices_in_instruction": False,
    },
    
    {
        "name": "openbookqa",
        "path": "allenai/openbookqa",
        "config": "main",
        "split": "train",
        "limit": None,
        "category": "science",
        
        "instruction_field": "question_stem",
        "output_field": "answerKey",         # Will be enhanced with choice text
        "input_field": None,
        "choices_field": "choices",          # Contains {"label": [...], "text": [...]}
        "answer_key_field": "answerKey",
        "include_choices_in_instruction": True,
    },
    
    {
        "name": "trivia_qa",
        "path": "mandarjoshi/trivia_qa",
        "config": "rc.wikipedia",
        "split": "train",
        "limit": None,
        "category": "general",
        
        "instruction_field": "question",
        "output_field": "answer.value",      # Use dot notation for nested fields
        "input_field": None,
    },
    
    {
        "name": "commonsense_qa",
        "path": "tau/commonsense_qa",
        "config": None,
        "split": "train",
        "limit": None,
        "category": "general",
        
        "instruction_field": "question",
        "output_field": "answerKey",
        "input_field": None,
        "choices_field": "choices",
        "answer_key_field": "answerKey",
        "include_choices_in_instruction": True,
    },
]

In [88]:
filename="combined_dataset"
OUTPUT_JSONL = f"data/{filename}.jsonl"

In [89]:
import json
import os
from typing import List, Dict, Any, Optional
from datasets import load_dataset

In [90]:
def get_nested_field(item: Dict, field_path: str) -> Any:
    """
    Extract field from item using dot notation for nested fields.
    Example: "answer.value" will get item["answer"]["value"]
    """
    if not field_path:
        return None
    
    try:
        value = item
        for key in field_path.split('.'):
            if isinstance(value, dict):
                value = value.get(key)
            else:
                return None
        return value
    except:
        return None

In [91]:
def extract_text_value(value: Any) -> str:
    """Convert any value to string, handling lists, dicts, etc."""
    if value is None:
        return ""
    elif isinstance(value, str):
        return value
    elif isinstance(value, list):
        # Take first non-empty item
        for item in value:
            text = extract_text_value(item)
            if text:
                return text
        return ""
    elif isinstance(value, dict):
        # Try common keys
        for key in ["value", "text", "answer", "normalized_value"]:
            if key in value:
                return extract_text_value(value[key])
        # If no common key, return first value
        for v in value.values():
            text = extract_text_value(v)
            if text:
                return text
        return ""
    else:
        return str(value)

In [92]:
def format_choices(choices: Any, answer_key: str = None) -> tuple:
    """
    Format multiple choice options and get answer text.
    Returns: (formatted_choices_text, answer_text)
    """
    if not choices:
        return "", ""
    
    formatted_lines = []
    answer_text = ""
    
    try:
        # Handle different choice formats
        if isinstance(choices, dict):
            # Format: {"label": ["A", "B", ...], "text": ["option1", "option2", ...]}
            if "label" in choices and "text" in choices:
                labels = choices["label"]
                texts = choices["text"]
                for label, text in zip(labels, texts):
                    formatted_lines.append(f"{label}. {text}")
                    if answer_key and label == answer_key:
                        answer_text = text
            # Format: {"A": "text1", "B": "text2", ...}
            else:
                for label, text in choices.items():
                    formatted_lines.append(f"{label}. {text}")
                    if answer_key and label == answer_key:
                        answer_text = text
        
        elif isinstance(choices, list):
            # Format: ["option1", "option2", ...]
            for i, text in enumerate(choices):
                label = chr(65 + i)  # A, B, C, ...
                formatted_lines.append(f"{label}. {text}")
                if answer_key and (label == answer_key or str(i) == str(answer_key)):
                    answer_text = text
        
        formatted_text = "\n".join(formatted_lines) if formatted_lines else ""
        return formatted_text, answer_text
        
    except Exception as e:
        print(f"    ‚ö†Ô∏è  Error formatting choices: {e}")
        return "", ""

In [93]:
def process_item(item: Dict, config: Dict) -> Optional[Dict]:
    """
    Process a single item using the configuration.
    Returns None if item is invalid.
    """
    try:
        # Extract instruction
        instruction = extract_text_value(
            get_nested_field(item, config.get("instruction_field"))
        )
        
        if not instruction:
            return None
        
        # Extract input (optional)
        input_text = ""
        if config.get("input_field"):
            input_text = extract_text_value(
                get_nested_field(item, config.get("input_field"))
            )
        
        # Extract output
        output = extract_text_value(
            get_nested_field(item, config.get("output_field"))
        )
        
        # Handle multiple choice questions
        if config.get("choices_field"):
            choices = get_nested_field(item, config.get("choices_field"))
            answer_key = extract_text_value(
                get_nested_field(item, config.get("answer_key_field"))
            ) if config.get("answer_key_field") else None
            
            choices_text, answer_text = format_choices(choices, answer_key)
            
            # Add choices to instruction if requested
            if config.get("include_choices_in_instruction") and choices_text:
                instruction = f"{instruction}\n\nChoices:\n{choices_text}"
            
            # Enhance output with answer text
            if answer_text:
                if answer_key:
                    output = f"The answer is {answer_key}: {answer_text}"
                else:
                    output = answer_text
        
        if not output:
            return None
        
        return {
            "instruction": instruction.strip(),
            "input": input_text.strip(),
            "output": output.strip()
        }
        
    except Exception as e:
        return None

In [94]:
def load_and_process_dataset(config: Dict) -> List[Dict]:
    """Load and process a single dataset using its configuration"""
    name = config["name"]
    path = config["path"]
    config_name = config.get("config")
    split = config.get("split", "train")
    limit = config.get("limit")
    
    print(f"\nüì¶ Loading {name}...")
    print(f"   Path: {path}")
    if config_name:
        print(f"   Config: {config_name}")
    print(f"   Split: {split}")
    
    try:
        # Load dataset
        if config_name:
            dataset = load_dataset(path, config_name, split=split)
        else:
            dataset = load_dataset(path, split=split)
        
        print(f"   ‚úì Loaded {len(dataset)} samples")
        
        # Apply limit
        if limit and len(dataset) > limit:
            dataset = dataset.select(range(limit))
            print(f"   ‚úì Limited to {len(dataset)} samples")
        
        # Process all items
        processed_data = []
        skipped = 0
        
        for item in dataset:
            processed = process_item(item, config)
            if processed:
                processed_data.append(processed)
            else:
                skipped += 1
        
        print(f"   ‚úì Successfully processed {len(processed_data)} samples")
        if skipped > 0:
            print(f"   ‚ö†Ô∏è  Skipped {skipped} invalid samples")
        
        return processed_data
        
    except Exception as e:
        print(f"   ‚úó Error loading {name}: {e}")
        return []

In [95]:
all_data = []
dataset_stats = {}

print("üöÄ Starting dataset processing...")
print("="*60)

for config in DATASET_CONFIGS:
    data = load_and_process_dataset(config)
    all_data.extend(data)
    dataset_stats[config["name"]] = {
        "count": len(data),
        "category": config.get("category", "unknown")
    }

print("\n" + "="*60)
print("üìä PROCESSING SUMMARY")
print("="*60)
print(f"Total samples: {len(all_data):,}\n")

# Group by category
categories = {}
for name, stats in dataset_stats.items():
    cat = stats["category"]
    if cat not in categories:
        categories[cat] = []
    categories[cat].append((name, stats["count"]))

for category, datasets in sorted(categories.items()):
    total = sum(count for _, count in datasets)
    print(f"{category.upper()}:")
    for name, count in datasets:
        percentage = (count / len(all_data) * 100) if all_data else 0
        print(f"  ‚Ä¢ {name:20} {count:>6,} samples ({percentage:>5.1f}%)")
    print(f"  Subtotal: {total:,}\n")

üöÄ Starting dataset processing...

üì¶ Loading gsm8k...
   Path: openai/gsm8k
   Config: main
   Split: train
   ‚úì Loaded 7473 samples
   ‚úì Successfully processed 7473 samples

üì¶ Loading openbookqa...
   Path: allenai/openbookqa
   Config: main
   Split: train
   ‚úì Loaded 4957 samples
   ‚úì Successfully processed 4957 samples

üì¶ Loading trivia_qa...
   Path: mandarjoshi/trivia_qa
   Config: rc.wikipedia
   Split: train


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

   ‚úì Loaded 61888 samples
   ‚úì Successfully processed 61888 samples

üì¶ Loading commonsense_qa...
   Path: tau/commonsense_qa
   Split: train
   ‚úì Loaded 9741 samples
   ‚úì Successfully processed 9741 samples

üìä PROCESSING SUMMARY
Total samples: 84,059

GENERAL:
  ‚Ä¢ trivia_qa            61,888 samples ( 73.6%)
  ‚Ä¢ commonsense_qa        9,741 samples ( 11.6%)
  Subtotal: 71,629

MATH:
  ‚Ä¢ gsm8k                 7,473 samples (  8.9%)
  Subtotal: 7,473

SCIENCE:
  ‚Ä¢ openbookqa            4,957 samples (  5.9%)
  Subtotal: 4,957



In [96]:
def save_to_jsonl(data: List[Dict], filepath: str):
    """Save data to JSONL format"""
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    
    with open(filepath, 'w', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    
    file_size = os.path.getsize(filepath) / (1024 * 1024)  # Size in MB
    print(f"‚úÖ Saved {len(data):,} samples to: {filepath}")
    print(f"   File size: {file_size:.2f} MB")

In [97]:
if all_data:
    # Save locally
    local_path = OUTPUT_JSONL
    save_to_jsonl(all_data, local_path)
else:
    print("‚ö†Ô∏è  No data to save!")

‚úÖ Saved 84,059 samples to: data/combined_dataset.jsonl
   File size: 15.42 MB


In [98]:
if all_data:
    print("\n" + "="*60)
    print("üîç DATA PREVIEW")
    print("="*60)
    
    # Show examples from each category
    shown_categories = set()
    examples_shown = 0
    
    for config in DATASET_CONFIGS:
        cat = config.get("category", "unknown")
        if cat not in shown_categories and examples_shown < 5:
            # Find first example from this dataset
            for sample in all_data:
                # Try to match by checking instruction content
                examples_shown += 1
                shown_categories.add(cat)
                
                print(f"\n--- Example {examples_shown} ({config['name']} - {cat}) ---")
                print(f"Instruction: {sample['instruction'][:200]}{'...' if len(sample['instruction']) > 200 else ''}")
                if sample['input']:
                    print(f"Input: {sample['input'][:100]}{'...' if len(sample['input']) > 100 else ''}")
                print(f"Output: {sample['output'][:200]}{'...' if len(sample['output']) > 200 else ''}")
                break
        
        if examples_shown >= 5:
            break


üîç DATA PREVIEW

--- Example 1 (gsm8k - math) ---
Instruction: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Output: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

--- Example 2 (openbookqa - science) ---
Instruction: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Output: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

--- Example 3 (trivia_qa - general) ---
Instruction: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Output: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72

In [99]:
def validate_jsonl(filepath: str) -> bool:
    """Validate that the JSONL file is correctly formatted"""
    print(f"\nüîç Validating {filepath}...")
    
    if not os.path.exists(filepath):
        print(f"   ‚úó File not found!")
        return False
    
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        print(f"   ‚úì Total lines: {len(lines):,}")
        
        # Check each line
        valid_count = 0
        errors = []
        
        for i, line in enumerate(lines, 1):
            try:
                data = json.loads(line)
                if "instruction" in data and "output" in data and "input" in data:
                    if data["instruction"] and data["output"]:
                        valid_count += 1
                    else:
                        errors.append(f"Line {i}: Empty instruction or output")
                else:
                    errors.append(f"Line {i}: Missing required fields")
            except json.JSONDecodeError as e:
                errors.append(f"Line {i}: Invalid JSON - {e}")
        
        print(f"   ‚úì Valid samples: {valid_count:,}/{len(lines):,}")
        
        if errors and len(errors) <= 5:
            print("\n   Errors found:")
            for error in errors:
                print(f"     ‚Ä¢ {error}")
        elif errors:
            print(f"\n   ‚ö†Ô∏è  {len(errors)} errors found (showing first 5):")
            for error in errors[:5]:
                print(f"     ‚Ä¢ {error}")
        
        if valid_count == len(lines):
            print("   ‚úÖ All samples are valid!")
            return True
        else:
            print(f"   ‚ö†Ô∏è  {len(lines) - valid_count} invalid samples found")
            return False
        
    except Exception as e:
        print(f"   ‚úó Error validating file: {e}")
        return False

if all_data:
    validate_jsonl(local_path)


üîç Validating data/combined_dataset.jsonl...
   ‚úì Total lines: 84,059
   ‚úì Valid samples: 84,059/84,059
   ‚úÖ All samples are valid!
