In [None]:
# Setting up username and environment
# NOTE(review): USERNAME presumably identifies the owner of the resources this
# notebook writes to (e.g. a per-user S3 bucket) — confirm against the Load cell.
USERNAME = "sabira"

import datetime
import json

import boto3
import requests

Extract (Top viewed pages for a given day)

In [8]:
# Extract: fetch the most-viewed English Wikipedia pages for one day from the
# Wikimedia Pageviews REST API and keep the parsed payload in `data`.

# Try different dates to see how the data changes
DATE_PARAM = "2025-11-22"

# strptime doubles as validation: a malformed DATE_PARAM raises ValueError here,
# before any request is made.
date = datetime.datetime.strptime(DATE_PARAM, "%Y-%m-%d")

# Pageviews top endpoint (daily)
# project: en.wikipedia.org
# access: all-access
url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/{date.strftime('%Y/%m/%d')}"
print(f"Requesting REST API URL: {url}")

# requests applies no timeout by default, so without one a stalled connection
# would hang this cell indefinitely.
# NOTE(review): this sends a generic curl User-Agent; Wikimedia's User-Agent
# policy asks clients to identify themselves — consider a descriptive value.
resp = requests.get(url, headers={"User-Agent": "curl/7.68.0"}, timeout=30)
status = resp.status_code
body_text = resp.text

print(f"Wikipedia REST API Response body: {body_text[:500]}...")
print(f"Wikipedia REST API Response Code: {status}")

if status != 200:
    raise Exception(f"Received non-OK status code from Wiki Server: {status} - {body_text[:200]}")

data = resp.json()

# Fail with a clear message if the payload has no articles, instead of an
# opaque KeyError/IndexError here or in the Transform cell.
items = data.get("items") or []
if not items or not items[0].get("articles"):
    raise Exception(f"Wikipedia REST API returned no articles for {DATE_PARAM}")

print("Top-level keys:", list(data.keys()))
print("First item keys:", list(items[0].keys()))
print("First article example:", items[0]["articles"][0])

Requesting REST API URL: https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/2025/11/22
Wikipedia REST API Response body: {"items":[{"project":"en.wikipedia","access":"all-access","year":"2025","month":"11","day":"22","articles":[{"article":"Main_Page","views":6050032,"rank":1},{"article":"Special:Search","views":792194,"rank":2},{"article":"Marjorie_Taylor_Greene","views":324661,"rank":3},{"article":"Google_Chrome","views":293360,"rank":4},{"article":"Wikipedia:Featured_pictures","views":245330,"rank":5},{"article":"Wicked:_For_Good","views":227960,"rank":6},{"article":"Tatiana_Schlossberg","views":175274,"rank":7...
Wikipedia REST API Response Code: 200
Top-level keys: ['items']
First item keys: ['project', 'access', 'year', 'month', 'day', 'articles']
First article example: {'article': 'Main_Page', 'views': 6050032, 'rank': 1}


Transform (to JSON Lines)

In [9]:
# Transform: flatten the day's article list into JSON Lines
# (one JSON object per line, each line newline-terminated).
articles = data["items"][0]["articles"]

current_time = datetime.datetime.now(datetime.timezone.utc)

# These two values are the same for every record, so format them once.
view_date = date.strftime("%Y-%m-%d")
# keep same style as edits pipeline: UTC timestamp with the offset stripped
retrieved_at = current_time.replace(tzinfo=None).isoformat()

serialized_records = []
for entry in articles:
    serialized_records.append(
        json.dumps(
            {
                "title": entry["article"],  # note: pageviews API calls it "article"
                "views": entry["views"],
                "rank": entry["rank"],
                "date": view_date,
                "retrieved_at": retrieved_at,
            }
        )
    )

# Every record, including the last, ends with a newline.
json_lines = "".join(line + "\n" for line in serialized_records)

print(f"Transformed {len(articles)} records to JSON Lines")
print("First few lines:\n", json_lines[:500], "...")

Transformed 1000 records to JSON Lines
First few lines:
 {"title": "Main_Page", "views": 6050032, "rank": 1, "date": "2025-11-22", "retrieved_at": "2025-12-19T00:30:17.616724"}
{"title": "Special:Search", "views": 792194, "rank": 2, "date": "2025-11-22", "retrieved_at": "2025-12-19T00:30:17.616724"}
{"title": "Marjorie_Taylor_Greene", "views": 324661, "rank": 3, "date": "2025-11-22", "retrieved_at": "2025-12-19T00:30:17.616724"}
{"title": "Google_Chrome", "views": 293360, "rank": 4, "date": "2025-11-22", "retrieved_at": "2025-12-19T00:30:17.616724"}
{ ...


Load (upload to S3 under raw-views/)

In [10]:
# Load: upload the JSON Lines payload to the per-user bucket under raw-views/,
# one object per date (re-running for the same date overwrites that object).

# Derive the bucket from USERNAME (set in the first cell) instead of repeating
# the name in a placeholder-less f-string.
S3_WIKI_BUCKET = f"{USERNAME}-wikidata"
s3 = boto3.client("s3")

s3_key = f"raw-views/raw-views-{date.strftime('%Y-%m-%d')}.json"
s3.put_object(
    Bucket=S3_WIKI_BUCKET,
    Key=s3_key,
    # Encode explicitly so the bytes written to S3 are unambiguous UTF-8.
    Body=json_lines.encode("utf-8"),
)

print(f"Uploaded {len(articles)} records to s3://{S3_WIKI_BUCKET}/{s3_key}")

# quick check: head_object raises if the object cannot be found
s3.head_object(Bucket=S3_WIKI_BUCKET, Key=s3_key)
print("Upload verified.")

Uploaded 1000 records to s3://sabira-wikidata/raw-views/raw-views-2025-11-22.json
Upload verified.


I re-executed this notebook two more times.
I changed DATE_PARAM to 2025-11-21 and 2025-11-22 and re-ran all cells, so I ended up with 3 files under the raw-views/ prefix of the S3 bucket.