# Wikipedia Pageviews Analysis

Explore patterns, summary statistics, and trends in Wikipedia pageview data.

## Setup

In [None]:
import json
import os
from pathlib import Path
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults shared by every figure below: seaborn grid theme, wide canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Directory searched for pageviews_*.json files; the relative path is
# resolved against the current working directory.
DATA_DIR = Path('../data')

def load_pageviews(data_dir: Path) -> pd.DataFrame:
    """Load all pageview JSON files into a single DataFrame.

    Reads every ``pageviews_*.json`` file directly inside *data_dir* in
    sorted filename order, concatenates their records, and parses the
    ``date`` column into pandas datetimes.

    Args:
        data_dir: Directory containing the ``pageviews_*.json`` files
            (a ``Path`` or a plain string). Each file is expected to hold
            a JSON array of record objects that include a ``date`` field.

    Returns:
        One row per record, with ``date`` converted to a datetime column.

    Raises:
        FileNotFoundError: If *data_dir* contains no ``pageviews_*.json``
            file.
        ValueError: If the matching files contain no records at all.
    """
    paths = sorted(Path(data_dir).glob('pageviews_*.json'))
    if not paths:
        # Fail with a clear message instead of the KeyError('date') that an
        # empty DataFrame would trigger further down.
        raise FileNotFoundError(
            f"No pageviews_*.json files found in {data_dir}"
        )

    records = []
    for path in paths:
        # JSON text is UTF-8; do not depend on the platform's default
        # encoding (article titles are frequently non-ASCII).
        with open(path, encoding='utf-8') as fp:
            records.extend(json.load(fp))

    if not records:
        raise ValueError(
            f"pageviews_*.json files in {data_dir} contain no records"
        )

    df = pd.DataFrame(records)
    df['date'] = pd.to_datetime(df['date'])
    return df

# Load the dataset, then report its size and date coverage.
df = load_pageviews(DATA_DIR)
record_dates = df['date']
print(f"Loaded {len(df):,} records from {record_dates.nunique()} days")
print(f"Date range: {record_dates.min().date()} to {record_dates.max().date()}")
df.head(5)

## Filter Content Pages

Exclude non-article pages (Main_Page, namespaced pages such as Special:* and User:*, and talk pages) to focus on actual articles.

In [None]:
# Namespace prefixes that mark a title as something other than an article.
# 'Talk:' is the discussion namespace attached to articles; the remaining
# talk namespaces ('User_talk:', 'Wikipedia_talk:', ...) are matched by the
# '_talk:' test inside is_content().
NON_CONTENT_PREFIXES = (
    'Special:', 'User:', 'Wikipedia:', 'Template:',
    'Category:', 'Portal:', 'Draft:', 'Help:',
    'Module:', 'MediaWiki:', 'File:', 'TimedText:',
    'Talk:',
)


def is_content(article: str) -> bool:
    """Return True if article is actual content (not a special page).

    A title is rejected when it is ``Main_Page``, starts with one of
    ``NON_CONTENT_PREFIXES``, or contains ``_talk:`` (talk namespaces such
    as ``User_talk:``). Anything that is not a string, such as a missing
    value (``None``/NaN) in the ``article`` column, is also rejected rather
    than raising.

    Args:
        article: Page title as it appears in the pageview records.

    Returns:
        True for titles treated as articles, False otherwise.
    """
    if not isinstance(article, str):
        # Missing titles cannot be articles; avoids AttributeError in .apply()
        return False
    if article == 'Main_Page':
        return False
    if article.startswith(NON_CONTENT_PREFIXES):
        return False
    return '_talk:' not in article

# Flag every row, then keep an independent copy of the content-only rows.
df['is_content'] = df['article'].apply(is_content)
content = df.loc[df['is_content']].copy()

n_total, n_content = len(df), len(content)
print(f"Content pages: {n_content:,} ({100*n_content/n_total:.1f}%)")
print(f"Filtered out: {n_total - n_content:,} non-content records")

## Summary Statistics

In [None]:
# Per-day aggregates over content pages, rounded to whole numbers
views_by_date = content.groupby('date')
daily = views_by_date.agg(
    total_views=pd.NamedAgg(column='views', aggfunc='sum'),
    unique_articles=pd.NamedAgg(column='article', aggfunc='nunique'),
    avg_views=pd.NamedAgg(column='views', aggfunc='mean'),
    max_views=pd.NamedAgg(column='views', aggfunc='max'),
)
daily = daily.round(0)

print("Daily Statistics:")
daily.describe().round(0)

In [None]:
# Rank articles by their views summed over every day in the dataset
views_per_article = content.groupby('article')['views'].sum()
top_overall = views_per_article.sort_values(ascending=False)

print("Top 20 Articles (by total views across all days):")
top_overall.head(20)

## Trends Over Time

In [None]:
# Line chart of total content-page views per day, y axis labelled in millions
fig, ax = plt.subplots()
total_views = daily['total_views']
total_views.plot(ax=ax, marker='o')
ax.set(
    title='Total Daily Pageviews (Content Pages)',
    xlabel='Date',
    ylabel='Views',
)
in_millions = plt.FuncFormatter(lambda x, p: f'{x/1e6:.1f}M')
ax.yaxis.set_major_formatter(in_millions)
fig.tight_layout()

In [None]:
# Daily views of the five articles with the highest overall totals
top5 = list(top_overall.index[:5])
top5_rows = content[content['article'].isin(top5)]
# One column per article, one row per date
top5_daily = top5_rows.pivot(index='date', columns='article', values='views')

fig, ax = plt.subplots()
top5_daily.plot(ax=ax, marker='o')
ax.set(title='Top 5 Articles Over Time', xlabel='Date', ylabel='Views')
# Anchor the legend just outside the right edge of the axes
ax.legend(title='Article', bbox_to_anchor=(1.02, 1), loc='upper left')
fig.tight_layout()

## Interesting Patterns

In [None]:
# Find articles with biggest day-over-day spikes
# (articles that suddenly became popular)

def find_spikes(df, min_views=50000, min_ratio=3.0):
    """Find articles with sudden popularity spikes.

    Each article's views on a date are compared with its views on the
    previous date present in ``df``. A spike is recorded when the previous
    value is positive, the current value is at least ``min_views``, and
    current / previous is at least ``min_ratio``. Dates on which an article
    has no record count as 0 views, so they can never serve as the baseline
    of a spike.

    NOTE(review): "previous date" means the previous row of the pivoted
    table; if whole days are missing from ``df`` the comparison spans the
    gap rather than exactly one day.

    Args:
        df: Frame with ``date``, ``article`` and ``views`` columns, at most
            one row per (date, article) pair.
        min_views: Minimum views on the spike date.
        min_ratio: Minimum ratio of current to previous views.

    Returns:
        DataFrame with columns ``article``, ``date``, ``views``,
        ``prev_views`` and ``spike_ratio`` (rounded to one decimal), sorted
        by ``spike_ratio`` descending. Empty, with the same columns, when
        nothing qualifies.

    Raises:
        ValueError: From ``DataFrame.pivot`` if ``df`` holds duplicate
            (date, article) pairs.
    """
    columns = ['article', 'date', 'views', 'prev_views', 'spike_ratio']

    curr = df.pivot(index='date', columns='article', values='views').fillna(0)
    if curr.empty:
        return pd.DataFrame(columns=columns)

    # Row i of `prev` holds row i-1 of `curr`; the first row becomes NaN and
    # therefore fails the `prev > 0` test below.
    prev = curr.shift(1)
    ratio = curr / prev

    is_spike = (prev > 0) & (curr >= min_views) & (ratio >= min_ratio)

    # unstack() flattens each date-by-article table into a Series indexed by
    # (article, date), i.e. grouped by article with dates in order.
    events = pd.DataFrame({
        'views': curr.unstack(),
        'prev_views': prev.unstack(),
        'spike_ratio': ratio.unstack(),
    })
    events = events[is_spike.unstack()].reset_index()

    events['views'] = events['views'].astype('int64')
    events['prev_views'] = events['prev_views'].astype('int64')
    events['spike_ratio'] = events['spike_ratio'].round(1)

    # Stable sort keeps tied ratios in article/date order
    return events[columns].sort_values(
        'spike_ratio', ascending=False, kind='stable'
    )

# Search with the default thresholds and preview the first 15 rows
spikes = find_spikes(content)
n_spikes = len(spikes)
print(f"Found {n_spikes} spike events")
spikes.head(15)

In [None]:
# Scatter of rank against views, with views on a logarithmic axis
fig, ax = plt.subplots()
rank_values, view_counts = content['rank'], content['views']
ax.scatter(rank_values, view_counts, s=5, alpha=0.3)
ax.set_yscale('log')
ax.set(
    title='Views vs Rank (Content Pages)',
    xlabel='Rank',
    ylabel='Views (log scale)',
)
fig.tight_layout()

In [None]:
# Mean of the daily totals for each weekday, shown Monday through Sunday
daily['day_of_week'] = daily.index.day_name()
dow_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

mean_by_dow = daily.groupby('day_of_week')['total_views'].mean()
# groupby sorts weekday names alphabetically; put them in calendar order
mean_by_dow = mean_by_dow.reindex(dow_order)

fig, ax = plt.subplots()
mean_by_dow.plot.bar(ax=ax, color='steelblue')
ax.set(
    title='Average Views by Day of Week',
    xlabel='',
    ylabel='Average Daily Views',
)
in_millions = plt.FuncFormatter(lambda x, p: f'{x/1e6:.1f}M')
ax.yaxis.set_major_formatter(in_millions)
plt.xticks(rotation=45)
fig.tight_layout()

## Consistency Analysis

Which articles consistently rank high vs. which are one-hit wonders?

In [None]:
# How many days each article spent in the top 100, with its mean rank and views
n_days = content['date'].nunique()
top100_per_day = content.loc[content['rank'].le(100)]

consistency = (
    top100_per_day
    .groupby('article')
    .agg(
        days_in_top100=pd.NamedAgg(column='date', aggfunc='nunique'),
        avg_rank=pd.NamedAgg(column='rank', aggfunc='mean'),
        avg_views=pd.NamedAgg(column='views', aggfunc='mean'),
        total_views=pd.NamedAgg(column='views', aggfunc='sum'),
    )
    .sort_values('days_in_top100', ascending=False)
)

# Share of all days in the dataset on which the article was in the top 100
share_of_days = 100 * consistency['days_in_top100'] / n_days
consistency['pct_days'] = share_of_days.round(1)

print(f"Most Consistent Top-100 Articles ({n_days} days):")
consistency.head(20)

In [None]:
# One-hit wonders: a single day in the top 100, yet more than 100k views
single_day = consistency['days_in_top100'] == 1
high_volume = consistency['total_views'] > 100000
one_hit = consistency[single_day & high_volume]
one_hit = one_hit.sort_values('total_views', ascending=False)

print("One-Hit Wonders (top 100 for only 1 day, >100k views):")
one_hit.head(15)