# Wikipedia Data Extraction

This notebook extracts Korean and English Wikipedia articles for building a bilingual synonym dataset.

**Updated**: Now using direct Wikipedia XML dumps from Wikimedia for the latest data (November 2025).

## Steps
1. Load Wikipedia data from Wikimedia dumps
2. Parse XML and extract article text
3. Clean and filter articles  
4. Save processed data

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../..')

from src.data.wikipedia_xml_parser import WikipediaXMLParser
from pathlib import Path
import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Setup Paths

In [None]:
# Upper bound on the number of articles written to a single output file;
# splitting the output this way keeps individual files from growing very large
ARTICLES_PER_FILE = 50_000  # 50K articles per file

# Folder that receives the extracted data (created with any missing parents;
# an already existing folder is left as is)
output_dir = Path("../../dataset/wikipedia")
output_dir.mkdir(exist_ok=True, parents=True)

## 2. Extract Korean Wikipedia Articles

**Processing all Korean Wikipedia articles** (no limit)

Files will be saved in chunks of 50,000 articles each to avoid very large files.

**Note**: The first run downloads the Wikipedia dump (on the order of gigabytes). Subsequent runs reuse the cached file.

In [None]:
# Stream the Korean Wikipedia dump, convert each article to plain text, and
# write the articles that pass the filter to numbered JSONL chunk files.
from tqdm import tqdm

# Initialize Korean parser (using latest dump)
ko_parser = WikipediaXMLParser(
    language="ko",
    date="latest",  # Will automatically use the most recent dump
    cache_dir="../../dataset/wikipedia/cache"
)

# Download the dump first; the returned path is fed to iter_articles() below
dump_path = ko_parser.download_dump()

print("\n" + "="*80)
print("Processing ALL Korean Wikipedia articles")
# Derive the number from ARTICLES_PER_FILE so the banner cannot drift from
# the chunk size that is actually used
print(f"Files will be split into chunks of {ARTICLES_PER_FILE:,} articles")
print("="*80 + "\n")

# NOTE(review): ko_articles_total keeps every accepted article in memory
# because the sample-inspection and statistics cells read from it; only
# current_chunk is released after each save.
ko_articles_total = []
chunk_num = 0
current_chunk = []

iterator = ko_parser.iter_articles(dump_path)

# The context manager closes the progress bar even if parsing raises midway
with tqdm(iterator, desc="Processing Korean Wikipedia") as pbar:
    for raw_article in pbar:
        # Parse wikitext to plain text
        text = ko_parser.parse_wikitext(raw_article["wikitext"])

        article = {
            "id": raw_article["id"],
            "url": raw_article["url"],
            "title": raw_article["title"],
            "text": text,
            "language": "ko",
        }

        # Skip articles rejected by the parser's filter
        if not ko_parser.filter_article(article, min_length=200, max_length=100000):
            continue

        current_chunk.append(article)
        ko_articles_total.append(article)

        # Save chunk when it reaches the limit
        if len(current_chunk) >= ARTICLES_PER_FILE:
            chunk_num += 1
            output_file = output_dir / f"ko_articles_chunk_{chunk_num:03d}.jsonl"
            ko_parser.save_articles(current_chunk, output_file)
            pbar.set_postfix({
                'chunks': chunk_num,
                'articles': len(ko_articles_total),
                'current_chunk': len(current_chunk)
            })
            # Start a fresh list so the saved chunk can be released
            current_chunk = []

# Save remaining articles in last chunk (it may hold fewer than ARTICLES_PER_FILE)
if current_chunk:
    chunk_num += 1
    output_file = output_dir / f"ko_articles_chunk_{chunk_num:03d}.jsonl"
    ko_parser.save_articles(current_chunk, output_file)

print(f"\n✓ Processed {len(ko_articles_total):,} Korean articles")
print(f"✓ Saved in {chunk_num} chunk files")
if ko_articles_total:
    print(f"✓ Sample article: {ko_articles_total[0]['title']}")

## 3. Extract English Wikipedia Articles

**Processing all English Wikipedia articles** (no limit)

Files will be saved in chunks of 50,000 articles each.

In [None]:
# Stream the English Wikipedia dump, convert each article to plain text, and
# write the articles that pass the filter to numbered JSONL chunk files.

# Imported here so this cell also runs on its own; without it the cell
# only works if an earlier cell has already imported tqdm
from tqdm import tqdm

# Initialize English parser (using latest dump)
en_parser = WikipediaXMLParser(
    language="en",
    date="latest",  # Will automatically use the most recent dump
    cache_dir="../../dataset/wikipedia/cache"
)

# Download the dump first; the returned path is fed to iter_articles() below
dump_path = en_parser.download_dump()

print("\n" + "="*80)
print("Processing ALL English Wikipedia articles")
# Derive the number from ARTICLES_PER_FILE so the banner cannot drift from
# the chunk size that is actually used
print(f"Files will be split into chunks of {ARTICLES_PER_FILE:,} articles")
print("="*80 + "\n")

# NOTE(review): en_articles_total keeps every accepted article in memory
# because the sample-inspection and statistics cells read from it; only
# current_chunk is released after each save.
en_articles_total = []
chunk_num = 0
current_chunk = []

iterator = en_parser.iter_articles(dump_path)

# The context manager closes the progress bar even if parsing raises midway
with tqdm(iterator, desc="Processing English Wikipedia") as pbar:
    for raw_article in pbar:
        # Parse wikitext to plain text
        text = en_parser.parse_wikitext(raw_article["wikitext"])

        article = {
            "id": raw_article["id"],
            "url": raw_article["url"],
            "title": raw_article["title"],
            "text": text,
            "language": "en",
        }

        # Skip articles rejected by the parser's filter
        if not en_parser.filter_article(article, min_length=200, max_length=100000):
            continue

        current_chunk.append(article)
        en_articles_total.append(article)

        # Save chunk when it reaches the limit
        if len(current_chunk) >= ARTICLES_PER_FILE:
            chunk_num += 1
            output_file = output_dir / f"en_articles_chunk_{chunk_num:03d}.jsonl"
            en_parser.save_articles(current_chunk, output_file)
            pbar.set_postfix({
                'chunks': chunk_num,
                'articles': len(en_articles_total),
                'current_chunk': len(current_chunk)
            })
            # Start a fresh list so the saved chunk can be released
            current_chunk = []

# Save remaining articles in last chunk (it may hold fewer than ARTICLES_PER_FILE)
if current_chunk:
    chunk_num += 1
    output_file = output_dir / f"en_articles_chunk_{chunk_num:03d}.jsonl"
    en_parser.save_articles(current_chunk, output_file)

print(f"\n✓ Processed {len(en_articles_total):,} English articles")
print(f"✓ Saved in {chunk_num} chunk files")
if en_articles_total:
    print(f"✓ Sample article: {en_articles_total[0]['title']}")

## 4. Inspect Sample Articles

In [None]:
# Print one extracted Korean article as a quick sanity check
if not ko_articles_total:
    print("No articles found. Check filtering criteria.")
else:
    # Take the article at index 10 (the 11th one); when fewer than 11
    # articles exist, fall back to the last available article
    sample_idx = min(10, len(ko_articles_total) - 1)
    sample_ko = ko_articles_total[sample_idx]

    ko_report = (
        "=" * 80,
        f"Article #{sample_idx + 1} of {len(ko_articles_total):,}",
        f"Title: {sample_ko['title']}",
        f"URL: {sample_ko['url']}",
        f"Language: {sample_ko['language']}",
        f"Text length: {len(sample_ko['text'])} characters",
        "\nFirst 300 characters:",
        sample_ko['text'][:300],
        "=" * 80,
    )
    # One entry per output line
    print(*ko_report, sep="\n")

In [None]:
# Print one extracted English article as a quick sanity check
if not en_articles_total:
    print("No articles found. Check filtering criteria.")
else:
    # Take the article at index 10 (the 11th one); when fewer than 11
    # articles exist, fall back to the last available article
    sample_idx = min(10, len(en_articles_total) - 1)
    sample_en = en_articles_total[sample_idx]

    en_report = (
        "=" * 80,
        f"Article #{sample_idx + 1} of {len(en_articles_total):,}",
        f"Title: {sample_en['title']}",
        f"URL: {sample_en['url']}",
        f"Language: {sample_en['language']}",
        f"Text length: {len(sample_en['text'])} characters",
        "\nFirst 300 characters:",
        sample_en['text'][:300],
        "=" * 80,
    )
    # One entry per output line
    print(*en_report, sep="\n")

## 5. Statistics

In [None]:
import numpy as np

# Character-length summary of the extracted Korean articles
if not ko_articles_total:
    print("Korean Wikipedia Articles: No articles found")
else:
    ko_lengths = [len(entry['text']) for entry in ko_articles_total]
    print(
        "Korean Wikipedia Articles:",
        f"  Total: {len(ko_articles_total):,}",
        f"  Mean length: {np.mean(ko_lengths):.0f} chars",
        f"  Median length: {np.median(ko_lengths):.0f} chars",
        f"  Min length: {np.min(ko_lengths):.0f} chars",
        f"  Max length: {np.max(ko_lengths):.0f} chars",
        sep="\n",
    )

# Blank line between the two language sections
print()

# Character-length summary of the extracted English articles
if not en_articles_total:
    print("English Wikipedia Articles: No articles found")
else:
    en_lengths = [len(entry['text']) for entry in en_articles_total]
    print(
        "English Wikipedia Articles:",
        f"  Total: {len(en_articles_total):,}",
        f"  Mean length: {np.mean(en_lengths):.0f} chars",
        f"  Median length: {np.median(en_lengths):.0f} chars",
        f"  Min length: {np.min(en_lengths):.0f} chars",
        f"  Max length: {np.max(en_lengths):.0f} chars",
        sep="\n",
    )

## 6. Verify Saved Files

In [None]:
import os
import glob


def _count_lines(path):
    """Return the number of newline-delimited lines in the file at *path*.

    The file is read in binary mode, so counting does not depend on the
    platform's default text encoding, and the handle is closed as soon as
    counting finishes.
    """
    with open(path, "rb") as handle:
        return sum(1 for _ in handle)


def _report_chunks(label, chunk_paths):
    """Print total and per-file size and line count for *chunk_paths*.

    Args:
        label: Language name shown in the report (e.g. "Korean").
        chunk_paths: Paths of the chunk files to summarize.
    """
    if not chunk_paths:
        print(f"  {label}: No chunk files found")
        return

    # Measure every file once and reuse the numbers for totals and details
    measured = [(p, os.path.getsize(p), _count_lines(p)) for p in chunk_paths]
    total_size = sum(size for _, size, _ in measured)
    total_lines = sum(lines for _, _, lines in measured)

    print(f"\n  {label}: {len(chunk_paths)} chunk files")
    print(f"    Total size: {total_size / 1024 / 1024:.2f} MB")
    # Assumes one article per line in each chunk file — TODO confirm against save_articles
    print(f"    Total articles: {total_lines:,}")
    print("    Files:")
    for path, size, lines in measured:
        size_mb = size / 1024 / 1024
        print(f"      - {os.path.basename(path):30s} ({size_mb:>6.2f} MB, {lines:>6,} articles)")


print("Saved chunk files:")

# Find all Korean chunk files
ko_chunks = sorted(glob.glob(str(output_dir / "ko_articles_chunk_*.jsonl")))
_report_chunks("Korean", ko_chunks)

# Find all English chunk files
en_chunks = sorted(glob.glob(str(output_dir / "en_articles_chunk_*.jsonl")))
_report_chunks("English", en_chunks)

## Summary

We've successfully processed the complete Korean and English Wikipedia dumps and extracted and cleaned every article that passes the filters (200 to 100,000 characters of text).

**Key Features:**
- ✅ Processes complete Wikipedia dumps (no article limit)
- ✅ Saves data in manageable chunks (50,000 articles per file)
- ✅ Filters out redirects, special pages, and low-quality articles
- ✅ Cleans MediaWiki markup to plain text
- ✅ Ready for synonym extraction and model training

**Output Structure** (the last chunk for each language holds the remaining articles and may contain fewer than 50,000):
```
dataset/wikipedia/
├── ko_articles_chunk_001.jsonl  (50,000 articles)
├── ko_articles_chunk_002.jsonl  (50,000 articles)
├── ...
├── en_articles_chunk_001.jsonl  (50,000 articles)
├── en_articles_chunk_002.jsonl  (50,000 articles)
└── ...
```

**Next steps:**
- Extract inter-language links from chunks
- Extract synonym pairs from article text
- Build comprehensive bilingual dictionary