In [3]:
import nltk
import torch
from datasets import load_dataset
from pathlib import Path
import random
import json
from huggingface_hub import create_repo, upload_folder
import re

# Tokenizer models used by nltk.word_tokenize. Recent NLTK releases (3.9+)
# load the 'punkt_tab' resource instead of the pickled 'punkt' models, so
# fetch both to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Create directory
# The save/upload cells write into 'custom_mini_mix/', so that directory has
# to exist. 'LitFactTechMix' is still created to keep the previous side effect.
Path('LitFactTechMix').mkdir(exist_ok=True)
Path('custom_mini_mix').mkdir(exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/siddhantmedar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Load datasets with streaming
pg19 = load_dataset('deepmind/pg19', split='train', streaming=True)
wikitext = load_dataset('wikitext', 'wikitext-103-raw-v1', split='train', streaming=True)
arxiv = load_dataset('MaartenGr/arxiv_nlp', split='train', streaming=True)

# Sample subsets
# NOTE(review): the range(...) bounds are assumed to be upper bounds on the
# length of each stream. Positions past the real end of a stream never match,
# which silently yields fewer samples than requested - confirm the bounds
# against the dataset cards.
random.seed(42)
pg19_indices = random.sample(range(11000), k=100)
wikitext_indices = random.sample(range(100000), k=2000)
arxiv_indices = random.sample(range(1700000), k=1000)


def sample_stream(stream, indices, keep=None):
    """Collect the entries of ``stream`` found at the given positions.

    Args:
        stream: Iterable of dataset entries (consumed lazily, in order).
        indices: Zero-based positions to pick.
        keep: Optional predicate; a picked entry is dropped when it returns
            a falsy value.

    Returns:
        List of selected entries in stream order.
    """
    wanted = set(indices)  # O(1) membership instead of scanning a list per entry
    if not wanted:
        return []
    last_wanted = max(wanted)
    picked = []
    for position, entry in enumerate(stream):
        if position in wanted and (keep is None or keep(entry)):
            picked.append(entry)
        if position >= last_wanted:
            # Nothing further can match; stop instead of streaming the
            # remainder of the dataset over the network.
            break
    return picked


pg19_sample = sample_stream(pg19, pg19_indices)
# Entries whose text is blank are skipped, so fewer than 2000 may be returned.
wikitext_sample = sample_stream(wikitext, wikitext_indices, keep=lambda entry: entry['text'].strip())
arxiv_sample = sample_stream(arxiv, arxiv_indices)

Got disconnected from remote data host. Retrying in 5sec [1/20]


KeyboardInterrupt: 

In [None]:
# Cleaning functions
def clean_pg19(text):
    """Strip Project Gutenberg boilerplate from a book and normalise it.

    The header up to and including the ``*** START OF THIS/THE PROJECT
    GUTENBERG EBOOK <title> ***`` line and the footer from the matching
    ``*** END OF ...`` marker onwards are removed. Each marker is handled
    independently, so a text that contains only one of them is still trimmed.
    The remaining text is lower-cased, runs of newlines are collapsed to a
    single space, and surrounding whitespace is stripped.

    Args:
        text: Raw book text.

    Returns:
        The cleaned, lower-cased, single-line text.
    """
    # Also consume the rest of the marker (title + closing '***') when it is on
    # the same line, so the title does not leak into the body.
    start_marker = re.search(
        r'\*\*\* ?START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK(?:[^\n*]*\*\*\*)?',
        text,
    )
    if start_marker:
        text = text[start_marker.end():]

    # Searched after the header has been cut, so an END marker can never be
    # located before the START marker.
    end_marker = re.search(r'\*\*\* ?END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK', text)
    if end_marker:
        text = text[:end_marker.start()]

    return re.sub(r'\n+', ' ', text.lower()).strip()

def clean_wikitext(text):
    """Normalise a WikiText entry.

    Removes ``@...@`` tokens (an ``@``, one or more non-space characters, and a
    closing ``@``), lower-cases the result, replaces each run of newlines with
    one space and strips leading/trailing whitespace.
    """
    without_markers = re.sub(r'@\S+@', '', text)
    lowered = without_markers.lower()
    single_line = re.sub(r'\n+', ' ', lowered)
    return single_line.strip()

def clean_arxiv(text):
    """Normalise an arXiv abstract.

    Deletes every span that starts with a literal ``\\{`` and ends at the next
    literal ``\\}`` on the same line (plain ``{...}`` groups are left alone),
    then lower-cases the text, turns each run of newlines into a single space
    and strips surrounding whitespace.
    """
    without_escaped_braces = re.sub(r'\\{.*?\\}', '', text)
    flattened = re.sub(r'\n+', ' ', without_escaped_braces.lower())
    return flattened.strip()

In [None]:

# Extract and clean texts
def _arxiv_abstract(entry):
    """Return the abstract text of an arXiv entry.

    NOTE(review): the published MaartenGr/arxiv_nlp dataset appears to expose
    this column as 'Abstracts' rather than 'abstract' - confirm against the
    dataset card. Both spellings are accepted; 'abstract' is tried first.

    Raises:
        KeyError: if the entry has neither column.
    """
    for key in ('abstract', 'Abstracts'):
        if key in entry:
            return entry[key]
    raise KeyError("arXiv entry has neither 'abstract' nor 'Abstracts'; columns: %s" % sorted(entry))


# Entries with empty text are skipped before cleaning.
pg19_texts = [clean_pg19(entry['text']) for entry in pg19_sample if entry['text']]
wikitext_texts = [clean_wikitext(entry['text']) for entry in wikitext_sample if entry['text']]
arxiv_texts = [clean_arxiv(abstract) for abstract in map(_arxiv_abstract, arxiv_sample) if abstract]

In [None]:

# Combine and shuffle
# Merge the three cleaned corpora into one list, then shuffle it in place
# using the module-level `random` generator.
all_texts = [*pg19_texts, *wikitext_texts, *arxiv_texts]
random.shuffle(all_texts)

# Tokenize with NLTK
def tokenize_text(text):
    """Split ``text`` into word tokens with ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

# Tokenize every text once; the flat token list (in `all_texts` order) feeds
# the frequency count below.
all_words = []
for text in all_texts:
    all_words.extend(tokenize_text(text))

# Cap vocab at 50,000
# Only the most frequent tokens get an id; everything else maps to <UNK>.
# (An uncapped, alphabetically sorted vocabulary used to be built here first
# and was immediately overwritten - that dead pass has been dropped.)
from collections import Counter
word_counts = Counter(all_words)
vocab = [w for w, _ in word_counts.most_common(50000)]
vocab_size = len(vocab)
wtoi = {w: i for i, w in enumerate(vocab)}
itow = {i: w for i, w in enumerate(vocab)}

# Add <UNK>
# <UNK> takes the next free id, right after the capped vocabulary. Note that
# `vocab` itself does not contain '<UNK>'; only wtoi/itow/vocab_size account for it.
wtoi['<UNK>'] = len(vocab)
itow[len(vocab)] = '<UNK>'
vocab_size += 1

def encode(text):
    """Convert ``text`` into a list of vocabulary ids.

    The text is tokenized with ``tokenize_text``; tokens missing from ``wtoi``
    are mapped to the id of ``'<UNK>'``.
    """
    ids = []
    for token in tokenize_text(text):
        ids.append(wtoi.get(token, wtoi['<UNK>']))
    return ids

def decode(indices):
    """Convert a sequence of vocabulary ids back into a space-joined string.

    Args:
        indices: Iterable of ids. Objects exposing ``tolist()`` (such as the
            torch tensors saved by this notebook) are converted to plain
            Python values first.

    Returns:
        The words joined by single spaces; ids missing from ``itow`` are
        rendered as ``'<UNK>'``.
    """
    # Iterating a torch tensor yields 0-d tensors, which hash by identity and
    # therefore never match the int keys of `itow` - every id would decode to
    # '<UNK>'. Converting to Python ints first makes the lookup work.
    if hasattr(indices, 'tolist'):
        indices = indices.tolist()
    return ' '.join(itow.get(i, '<UNK>') for i in indices)

# Tokenize dataset
# `all_words` already holds the tokens of every text in `all_texts` order
# (it was built for the vocabulary count), so map those tokens to ids
# directly instead of running the NLTK tokenizer over the corpus a second
# time. Blank texts contribute no tokens, so no emptiness check is needed.
unk_id = wtoi['<UNK>']
data = torch.tensor([wtoi.get(w, unk_id) for w in all_words], dtype=torch.long)

# Split train/val
# First 90% of the token stream is training data, the remainder validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Save dataset
# Make sure the target directory exists first: torch.save does not create
# missing parent directories.
Path('custom_mini_mix').mkdir(parents=True, exist_ok=True)
torch.save(train_data, 'custom_mini_mix/train.pt')
torch.save(val_data, 'custom_mini_mix/val.pt')
# The vocabulary is stored as a (wtoi, itow, vocab_size) tuple.
torch.save((wtoi, itow, vocab_size), 'custom_mini_mix/vocab.pt')

# Save metadata
# Static description of the dataset plus the vocabulary size and split lengths
# computed above, written as pretty-printed JSON next to the tensors.
metadata = {
    "name": "custom_mini_mix",
    "description": "A dataset for training language models, combining 100 books from PG-19, 2000 Wikipedia articles from WikiText-103, and 1000 NLP abstracts from ArXiv NLP. Preprocessed for word-based tokenization with a 50,000-word vocabulary plus <UNK>.",
    "source_datasets": [
        dict(zip(("name", "license", "size"), row))
        for row in (
            ("deepmind/pg19", "Apache 2.0", "100 books (~20 MB)"),
            ("wikitext/wikitext-103-raw-v1", "CC BY-SA 4.0", "2000 articles (~15 MB)"),
            ("MaartenGr/arxiv_nlp", "Not specified, academic use", "1000 abstracts (~15 MB)"),
        )
    ],
    "size": "Approx. 50 MB uncompressed, 15 MB compressed",
    "vocab_size": vocab_size,
    "splits": dict(train=len(train_data), validation=len(val_data)),
    "usage": "Load train.pt, val.pt, and vocab.pt using torch.load. Use wtoi/itow for encoding/decoding.",
    "license": "CC BY-SA 4.0",
}
with open('custom_mini_mix/metadata.json', 'w') as f:
    f.write(json.dumps(metadata, indent=2))

# Create README
# Template for the dataset card. It is rendered with str.format, which fills
# the {train_size}, {val_size}, {vocab_size} and {repo_id} placeholders; any
# literal brace added to this text must therefore be doubled ({{ or }}).
# The usage snippet converts each index with int() because train.pt/val.pt are
# torch tensors, whose elements would otherwise never match the int keys of itow.
readme_content = """---
dataset_info:
  dataset_name: custom_mini_mix
  description: A dataset for training language models, combining 100 books from PG-19, 2000 Wikipedia articles from WikiText-103, and 1000 NLP abstracts from ArXiv NLP. Preprocessed for word-based tokenization with a 50,000-word vocabulary plus <UNK>.
  license: CC BY-SA 4.0
  size: Approx. 50 MB uncompressed, 15 MB compressed
  splits:
    train: {train_size} tokens
    validation: {val_size} tokens
  source_datasets:
    - name: deepmind/pg19
      license: Apache 2.0
      size: 100 books (~20 MB)
    - name: wikitext/wikitext-103-raw-v1
      license: CC BY-SA 4.0
      size: 2000 articles (~15 MB)
    - name: MaartenGr/arxiv_nlp
      license: Not specified, academic use
      size: 1000 abstracts (~15 MB)
  vocab_size: {vocab_size}
---

# Custom Mini Mix Dataset

## Overview
This dataset is designed for training small to medium language models, offering a diverse mix of literary, factual, and technical text. It combines:
- **PG-19**: 100 books from classic literature for narrative and poetic styles.
- **WikiText-103**: 2000 Wikipedia articles for factual, structured text.
- **ArXiv NLP**: 1000 abstracts from NLP papers for technical, academic language.

The dataset is preprocessed for word-based tokenization, with a vocabulary of 50,000 words plus an `<UNK>` token for unknown words. It is split into training ({train_size} tokens) and validation ({val_size} tokens) sets.

## Files
- `train.pt`: Training data (PyTorch tensor of word indices).
- `val.pt`: Validation data (PyTorch tensor of word indices).
- `vocab.pt`: Vocabulary (tuple of word-to-index and index-to-word dictionaries, vocab size).
- `metadata.json`: Dataset metadata.

## Usage
Load the dataset in Python using PyTorch:
```python
import torch
train_data = torch.load('train.pt')
val_data = torch.load('val.pt')
wtoi, itow, vocab_size = torch.load('vocab.pt')

def decode(indices):
    return ' '.join([itow.get(int(i), '<UNK>') for i in indices])
```

## Size
- **Uncompressed**: ~50 MB
- **Compressed**: ~15 MB
- **Download Time**: ~1–2 minutes on a 10 Mbps connection
- **Disk Usage**: ~50 MB (plus temporary space during training)

## License
Licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/), inherited from WikiText-103 (most restrictive). Source datasets include Apache 2.0 (PG-19) and unspecified academic use (ArXiv NLP).

## Source Datasets
- [deepmind/pg19](https://huggingface.co/datasets/deepmind/pg19)
- [wikitext/wikitext-103-raw-v1](https://huggingface.co/datasets/wikitext)
- [MaartenGr/arxiv_nlp](https://huggingface.co/datasets/MaartenGr/arxiv_nlp)

## Creation Process
The dataset was created by:
1. Sampling 100 books from PG-19, 2000 articles from WikiText-103, and 1000 abstracts from ArXiv NLP.
2. Cleaning texts to remove boilerplate, markup, and LaTeX.
3. Tokenizing with NLTK for word-based tokenization, capping vocabulary at 50,000 words.
4. Saving as PyTorch tensors for training and validation.

## Intended Use
Ideal for training small language models (e.g., GPT-like architectures) for research or educational purposes. Suitable for generating literary, factual, or technical text.

## Contact
For issues or questions, open an issue on the [Hugging Face repository](https://huggingface.co/datasets/{repo_id}) or contact the creator.

---
Created on May 10, 2025.
"""

# Write README
# Single source of truth for the Hub repository: the same id is rendered into
# the README link and used for the upload below.
repo_id = "your_username/custom_mini_mix"  # Replace with your username

# Explicit UTF-8: the README contains non-ASCII characters (e.g. an en dash)
# and the platform default encoding is not UTF-8 everywhere.
with open('custom_mini_mix/README.md', 'w', encoding='utf-8') as f:
    f.write(readme_content.format(
        train_size=len(train_data),
        val_size=len(val_data),
        vocab_size=vocab_size,
        repo_id=repo_id
    ))

# Create repository and upload
# exist_ok=True keeps this cell re-runnable once the repository has been created.
create_repo(repo_id, repo_type="dataset", private=False, exist_ok=True)
upload_folder(
    repo_id=repo_id,
    folder_path="custom_mini_mix",
    path_in_repo="",
    repo_type="dataset"
)