# CC-NEWS WARC Explorer

This notebook fetches and analyzes Common Crawl News (CC-NEWS) WARC files in a Google Colab environment using the `cc_news_analyzer` library.

It automatically downloads the current month's WARC index, lists available files, and downloads the most recent WARC file to `/content/data/` for analysis.

In [1]:
# Install from GitHub — change 'main' to a branch name to test unreleased features
BRANCH = "main"
!pip install -q "cc-news-analyzer @ git+https://github.com/richardm/cc_news.git@{BRANCH}"

import os
from datetime import datetime, timezone

from cc_news_analyzer.index import (
    build_index_url,
    build_warc_urls,
    download_warc,
    fetch_warc_paths,
)
from cc_news_analyzer.warc import count_articles, count_records, list_warc_files

# Data directory in the Colab environment (ephemeral, not persisted across sessions)
DATA_DIR = "/content/data"
os.makedirs(DATA_DIR, exist_ok=True)

# Derive the current year and month from UTC rather than the machine's local
# time zone, so the selected index does not depend on where the notebook runs.
# NOTE(review): assumes the CC-NEWS monthly index is keyed by UTC date — confirm.
now = datetime.now(timezone.utc)
YEAR = now.year
MONTH = now.month

print(f"Data directory: {DATA_DIR}")
print(f"Current month index: {build_index_url(YEAR, MONTH)}")

[31mERROR: Could not find a version that satisfies the requirement cc-news-analyzer (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cc-news-analyzer[0m[31m
[0m

ModuleNotFoundError: No module named 'cc_news_analyzer'

In [None]:
# Fetch the current month's WARC index and build a URL list from its paths.
warc_paths = fetch_warc_paths(YEAR, MONTH, DATA_DIR)
warc_urls = build_warc_urls(warc_paths)

print(f"Found {len(warc_urls)} WARC file(s) for {YEAR}-{MONTH:02d}:\n")

# The final entry of the list is the one labelled as most recent.
last_index = len(warc_urls) - 1
for i, url in enumerate(warc_urls):
    if i == last_index:
        marker = " <-- most recent"
    else:
        marker = ""
    print(f"  [{i}] {os.path.basename(url)}{marker}")

Found 97 WARC file(s) for 2026-02:

  [0] CC-NEWS-20260201022924-06627.warc.gz
  [1] CC-NEWS-20260201054311-06628.warc.gz
  [2] CC-NEWS-20260201080436-06629.warc.gz
  [3] CC-NEWS-20260201101223-06630.warc.gz
  [4] CC-NEWS-20260201120545-06631.warc.gz
  [5] CC-NEWS-20260201140019-06632.warc.gz
  [6] CC-NEWS-20260201155838-06633.warc.gz
  [7] CC-NEWS-20260201180040-06634.warc.gz
  [8] CC-NEWS-20260201202721-06635.warc.gz
  [9] CC-NEWS-20260201230325-06636.warc.gz
  [10] CC-NEWS-20260202015611-06637.warc.gz
  [11] CC-NEWS-20260202045006-06638.warc.gz
  [12] CC-NEWS-20260202065605-06639.warc.gz
  [13] CC-NEWS-20260202083608-06640.warc.gz
  [14] CC-NEWS-20260202100546-06641.warc.gz
  [15] CC-NEWS-20260202112041-06642.warc.gz
  [16] CC-NEWS-20260202123035-06643.warc.gz
  [17] CC-NEWS-20260202134409-06644.warc.gz
  [18] CC-NEWS-20260202145304-06645.warc.gz
  [19] CC-NEWS-20260202161229-06646.warc.gz
  [20] CC-NEWS-20260202173258-06647.warc.gz
  [21] CC-NEWS-20260202185402-06648.warc.gz
  [22]

In [None]:
# Download the most recent WARC file (last entry in the index).
# An empty index would make warc_urls[-1] raise IndexError, so report that
# case explicitly instead of failing with an opaque traceback.
if warc_urls:
    download_url = warc_urls[-1]
    print(f"Downloading most recent WARC file: {os.path.basename(download_url)}")
    print(f"URL: {download_url}\n")

    # download_warc returns the local path of the downloaded file.
    warc_dest = download_warc(download_url, DATA_DIR)

    # Convert the on-disk size from bytes to mebibytes for display.
    size_mb = os.path.getsize(warc_dest) / (1024 * 1024)
    print(f"Download complete: {os.path.basename(warc_dest)} ({size_mb:.1f} MB)")
else:
    print("The WARC index is empty; nothing to download.")

Downloading most recent WARC file: CC-NEWS-20260207200705-06723.warc.gz
URL: https://data.commoncrawl.org/crawl-data/CC-NEWS/2026/02/CC-NEWS-20260207200705-06723.warc.gz
Destination: /content/data/CC-NEWS-20260207200705-06723.warc.gz


Download complete: CC-NEWS-20260207200705-06723.warc.gz (1023.0 MB)


In [None]:
# Inventory the WARC files currently present in DATA_DIR.
warc_file_info = list_warc_files(DATA_DIR)

header = f"Found {len(warc_file_info)} WARC file(s) in {DATA_DIR}:"
print(header)

# Each entry is read through its 'name' and 'size_mb' keys.
for info in warc_file_info:
    line = f"  - {info['name']} ({info['size_mb']} MB)"
    print(line)

Collecting warcio
  Downloading warcio-1.7.5-py2.py3-none-any.whl.metadata (16 kB)
Downloading warcio-1.7.5-py2.py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: warcio
Successfully installed warcio-1.7.5


In [None]:
# Tally the WARC records in the first entry of warc_file_info.
if not warc_file_info:
    # Nothing was listed, so there is no file to read.
    print("No WARC files found. Place .warc or .warc.gz files in the data/ directory.")
else:
    target = warc_file_info[0]
    print(f"Reading: {target['path']}")

    warc_path = target["path"]
    total = count_records(warc_path)
    print(f"Total WARC records with WARC-Record-ID: {total}")

Found 1 WARC file(s) in /content/data:
  - CC-NEWS-20260207200705-06723.warc.gz (1023.0 MB)


In [None]:
# Count articles (HTML response records) in the first listed WARC file and
# report them alongside the total record count.
if not warc_file_info:
    print("No WARC files found. Place .warc or .warc.gz files in the data/ directory.")
else:
    target = warc_file_info[0]
    print(f"Reading: {target['path']}")

    warc_path = target["path"]
    articles = count_articles(warc_path)
    total = count_records(warc_path)
    print(f"Total articles (HTML responses): {articles}")
    print(f"Total WARC records: {total}")

    # Only compute the ratio when there is at least one record, which avoids
    # dividing by zero.
    if total:
        print(f"Article ratio: {articles / total * 100:.1f}%")
    else:
        print("No records found.")

Reading: /content/data/CC-NEWS-20260207200705-06723.warc.gz
Total WARC records with WARC-Record-ID: 45741
