### Training Data Collection and Curation

### <font color='red'> Data Collection and Curation Pipeline </font> 
The following Python script demonstrates how to:
- Gather text corpora from multiple sources
- Enforce simple license checks
- Track provenance metadata for each document
- Consolidate and save the curated collection

In [1]:
import os
import json
from datetime import datetime
from datasets import load_dataset
import arxiv
from github import Github
from dotenv import load_dotenv

# Load GitHub token (for private/public-domain repo access)
# load_dotenv() runs first so that a token kept in a local .env file
# (presumably the intended setup -- confirm) is visible to os.getenv().
# os.getenv() yields None when GITHUB_TOKEN is not set.
load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# File that save_document() appends to, one JSON object per line.
OUTPUT_PATH = "corpus.jsonl"
# License labels the collectors compare against before saving a document.
# NOTE(review): collect_wikipedia() writes "CC-BY-SA-3.0", which is not in
# this set, and "Public domain" is a free-text label rather than an SPDX
# identifier, so it cannot match the spdx_id read in collect_github_repos().
ALLOWED_LICENSES = {"Public domain", "CC0-1.0", "CC-BY-4.0"}

def save_document(doc):
    """Append *doc* to OUTPUT_PATH as a single JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written verbatim rather than as ``\\uXXXX`` escapes.
    """
    with open(OUTPUT_PATH, mode="a", encoding="utf-8") as sink:
        record = json.dumps(doc, ensure_ascii=False)
        sink.write(f"{record}\n")

# 1) Wikipedia (English) -- used in place of Common Crawl
def collect_wikipedia(n_samples=500):
    """Append the first *n_samples* English Wikipedia articles to the corpus.

    Articles are read from the ``20220301.en`` configuration of the Hugging
    Face ``wikipedia`` dataset. Articles whose text is empty after stripping
    surrounding whitespace are skipped.

    NOTE(review): records are labelled ``CC-BY-SA-3.0``, which is not listed
    in ALLOWED_LICENSES, and no license check is applied here -- confirm
    that this is intended.
    """
    articles = load_dataset("wikipedia", "20220301.en", split=f"train[:{n_samples}]")
    for article in articles:
        body = article["text"].strip()
        if body:
            page_id = article["id"]
            record = {
                "text": body,
                "source": "wikipedia",
                "url": f"https://en.wikipedia.org/wiki?curid={page_id}",
                "license": "CC-BY-SA-3.0",
                "fetched_at": datetime.utcnow().isoformat(),
            }
            save_document(record)

# 2) arXiv abstracts (assume CC-BY)
def collect_arxiv(query="machine learning", max_results=100):
    """Append arXiv abstracts matching *query* to the corpus.

    Args:
        query: Search string passed to ``arxiv.Search``.
        max_results: Maximum number of results requested from the API.

    Each saved record carries the abstract as ``text`` plus the paper's PDF
    URL, title and author names.

    NOTE(review): the ``CC-BY-4.0`` label is an assumption applied to every
    result; it is not read from the paper's own license metadata.
    """
    license_id = "CC-BY-4.0"
    # The label is identical for every result, so test it once up front.
    # The earlier per-result test looked for "CC-BY", which is never a member
    # of ALLOWED_LICENSES, so every abstract was silently skipped.
    if license_id not in ALLOWED_LICENSES:
        return

    search = arxiv.Search(query=query, max_results=max_results)
    # Client.results() is the replacement for the deprecated Search.results().
    for res in arxiv.Client().results(search):
        abstract = (res.summary or "").strip()
        if not abstract:
            # Skip empty abstracts, mirroring the Wikipedia collector.
            continue
        save_document({
            "text": abstract,
            "source": "arxiv",
            "url": res.pdf_url,
            "license": license_id,
            "fetched_at": datetime.utcnow().isoformat(),
            "title": res.title,
            "authors": [a.name for a in res.authors]
        })

# 3) Project Gutenberg (public domain)
def collect_gutenberg(book_ids=(1342, 11)):
    """Append the Project Gutenberg books listed in *book_ids* to the corpus.

    Args:
        book_ids: Iterable of Gutenberg ebook ids to keep. The default is an
            immutable tuple so it cannot be altered between calls.

    NOTE(review): assumes a Hugging Face dataset named ``gutenberg`` whose
    rows expose ``id``, ``text`` and (optionally) ``title`` -- confirm the
    dataset id and schema before enabling this collector.
    """
    # Build the lookup set once instead of scanning a list for every row.
    wanted = set(book_ids)
    if not wanted:
        # Nothing requested: avoid loading the dataset at all.
        return

    ds = load_dataset("gutenberg", split="train")
    for ex in ds:
        if ex["id"] not in wanted:
            continue
        save_document({
            "text": ex["text"],
            "source": "gutenberg",
            "url": f"https://www.gutenberg.org/ebooks/{ex['id']}",
            "license": "Public domain",
            "fetched_at": datetime.utcnow().isoformat(),
            "title": ex.get("title")
        })

# 4) GitHub READMEs (public-domain/CC0/CC-BY)
def collect_github_repos(user="pallets", max_repos=3):
    """Append READMEs of *user*'s repositories that carry an allowed license.

    Args:
        user: GitHub account name whose repositories are scanned.
        max_repos: Stop after this many READMEs have been saved.

    Repositories whose license cannot be determined, whose SPDX id is not in
    ALLOWED_LICENSES, or whose README cannot be fetched or decoded as UTF-8
    are skipped. Only saved READMEs count towards *max_repos*.
    """
    gh = Github(GITHUB_TOKEN)
    owner = gh.get_user(user)
    count = 0
    for repo in owner.get_repos():
        if count >= max_repos:
            break

        # Best effort: any failure to read the license means "unknown".
        # `except Exception` (rather than a bare `except`) keeps that
        # behaviour without swallowing KeyboardInterrupt / SystemExit.
        try:
            lic = repo.get_license().license.spdx_id
        except Exception:
            lic = None
        if lic not in ALLOWED_LICENSES:
            continue

        # Same best-effort policy for a missing or undecodable README.
        try:
            readme = repo.get_readme().decoded_content.decode("utf-8")
        except Exception:
            continue

        save_document({
            "text": readme,
            "source": "github",
            "url": repo.html_url,
            "license": lic,
            "fetched_at": datetime.utcnow().isoformat(),
            "repo": repo.full_name
        })
        count += 1

if __name__ == "__main__":
    # Start fresh. save_document() appends, so a leftover file from an
    # earlier run would otherwise be extended. Removing and tolerating
    # FileNotFoundError avoids the check-then-delete race of
    # os.path.exists() followed by os.remove().
    try:
        os.remove(OUTPUT_PATH)
    except FileNotFoundError:
        pass

    collect_wikipedia(n_samples=500)
    collect_arxiv(query="deep learning", max_results=100)
    # Gutenberg collection is disabled in this run; uncomment to include it.
    # collect_gutenberg(book_ids=[1342, 11, 84])
    collect_github_repos(user="pallets", max_repos=3)

    print(f"Corpus assembled → {OUTPUT_PATH}")

  from .autonotebook import tqdm as notebook_tqdm
  for res in search.results():


Corpus assembled → corpus.jsonl


##### How It Works
- Wikipedia via Hugging Face: samples articles from the 2022-03-01 English dump (used here in place of Common Crawl) and tags them with the CC-BY-SA-3.0 license.
- arXiv API: pulls paper abstracts/summaries, assumes CC-BY licensing.
- Gutenberg: uses public-domain ebook texts (this step is commented out in the example run above).
- GitHub: selects repositories with approved licenses, fetches README content.
    
##### Each document is annotated with:
- source: origin of the text
- url: link to original
- license: license label
- fetched_at: timestamp for provenance
- (optional) metadata like title, authors

All documents are appended to corpus.jsonl for downstream cleaning, deduplication, and training.
