For a description of the dataset used in this notebook, see its README: https://analytics.wikimedia.org/published/datasets/country_project_page/00_README.html

In [14]:
import datetime as dt
import csv
import pandas as pd
import requests

from pathlib import Path

In [10]:
# First and last day of the analysis window (both ends are included by
# pd.date_range, which is how the cells below expand this window).
start_date = dt.date(2024, 2, 1)
end_date = dt.date(2024, 2, 12)

Download the data for each day in the date range into files in the current directory. You only need to run this cell once, unless you change the date range.

In [12]:
# Fetch one TSV per day of the date range into the current directory.
for d in pd.date_range(start_date, end_date):
    fn = d.strftime('%Y-%m-%d') + '.tsv'
    target = Path(fn)

    # Skip days that are already on disk so re-running the cell is cheap.
    # Delete the file to force a fresh download.
    if target.exists() and target.stat().st_size > 0:
        continue

    # A timeout keeps a stalled connection from hanging the kernel forever.
    tsv = requests.get(
        f'https://analytics.wikimedia.org/published/datasets/country_project_page/{fn}',
        timeout=60,
    )
    # Fail loudly on 4xx/5xx instead of saving an error page as if it were data.
    tsv.raise_for_status()
    target.write_bytes(tsv.content)

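Create a dictionary for the en.wikipedia project that maps each (country, page title) pair to a list of its DP page view counts, with one entry for each row in which the pair appears in the downloaded files.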

In [23]:
# Map (country, page_title) -> list of counts for en.wikipedia rows.
# Files are read in date order, so each list is in date order too.
data = {}
for d in pd.date_range(start_date, end_date):
    fn = d.strftime('%Y-%m-%d') + '.tsv'
    # Decode explicitly rather than relying on the platform's locale encoding
    # (page titles can contain non-ASCII characters; assumes the published
    # files are UTF-8 -- TODO confirm against the dataset README).
    # newline='' lets the csv module do its own line splitting, and streaming
    # the file avoids holding the whole day in memory.
    with open(fn, encoding='utf-8', newline='') as f:
        for row in csv.reader(f, delimiter='\t', escapechar='\\'):
            if not row:
                # csv.reader yields [] for a blank line; nothing to record.
                continue

            # Column positions used by this notebook: 0 = country,
            # 2 = project, 4 = page title, 5 = the count stored per day.
            country = row[0]
            project = row[2]
            page_title = row[4]
            gbc = row[5]

            if project != 'en.wikipedia':
                continue

            key = (country, page_title)
            # setdefault creates the list on first sight of the key.
            data.setdefault(key, []).append(int(gbc))

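Identify (country, page) pairs that have a count for every day in the range and whose daily views are "small" (every day between MIN_VIEWS and MAX_VIEWS) and "fairly consistent" (the largest and smallest daily counts differ by at most ACCEPTABLE_RANGE).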

In [33]:
# Thresholds: every daily count must lie in [MIN_VIEWS, MAX_VIEWS], and the
# spread between the largest and smallest daily count must not exceed
# ACCEPTABLE_RANGE.
MIN_VIEWS = 100
MAX_VIEWS = 500
ACCEPTABLE_RANGE = 30

# Number of days in the window; a pair qualifies only with one count per day.
days = len(pd.date_range(start_date, end_date))
for k, v in data.items():
    gbc_min, gbc_max = min(v), max(v)

    # Reject pairs that are missing from at least one day.
    if len(v) != days:
        continue
    # Reject pairs with any day outside the "small" band.
    if not (MIN_VIEWS <= gbc_min and gbc_max <= MAX_VIEWS):
        continue
    # Reject pairs whose counts swing too much across the window.
    if gbc_max - gbc_min > ACCEPTABLE_RANGE:
        continue

    print(k, v)

('United States', 'Ford_Models') [133, 120, 123, 145, 120, 140, 127, 134, 133, 118, 125, 120]
('India', 'Kaushiki_Chakraborty') [171, 176, 172, 186, 187, 157, 185, 178, 170, 184, 162, 166]
('United States', 'Imagine_(John_Lennon_album)') [161, 151, 149, 150, 158, 140, 157, 150, 153, 145, 163, 150]
('United States', 'Pontiac_Sunbird') [128, 137, 122, 135, 115, 122, 128, 143, 119, 139, 130, 132]
('United States', 'Morris_dance') [124, 116, 122, 134, 138, 132, 134, 146, 140, 137, 139, 123]
('United States', 'Brearley_School') [135, 128, 108, 118, 124, 133, 122, 133, 136, 130, 118, 117]
('United States', 'Rooting_(Android)') [127, 139, 151, 146, 143, 150, 122, 133, 146, 147, 130, 126]
('United States', 'Black_separatism') [164, 151, 171, 150, 160, 149, 173, 167, 148, 152, 152, 176]
('India', '2016_United_States_presidential_election') [138, 160, 162, 154, 138, 162, 140, 144, 147, 160, 155, 145]
('Ireland', 'Barents_Sea') [298, 305, 288, 292, 297, 302, 284, 296, 308, 280, 306, 283]
('United