In [1]:
# Author: Harshit Rai
# Importing libraries
import pandas as pd
import requests
import json
import time
from tqdm import tqdm
import urllib.parse

In [2]:
# Setup constants
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

API_LATENCY_ASSUMED = 0.002       
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

REQUEST_HEADERS = {
    'User-Agent': '<harshit@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      None,         # This value will be replaced by the access type
    "agent":       "user",
    "article":     None,         # This value will be replaced by the article title
    "granularity": "monthly",
    "start":       "20150701",   # Start date is 1st July, 2015
    "end":         "20230930"    # End date is 30th September, 2023
}

In [3]:
# Read the CSV file from which we need the article titles
df = pd.read_csv('thank_the_academy.AUG.2023.csv')

In [4]:
# Check if data was read correctly
df.head()

Unnamed: 0,name,url
0,Everything Everywhere All at Once,https://en.wikipedia.org/wiki/Everything_Every...
1,All Quiet on the Western Front (2022 film),https://en.wikipedia.org/wiki/All_Quiet_on_the...
2,The Whale (2022 film),https://en.wikipedia.org/wiki/The_Whale_(2022_...
3,Top Gun: Maverick,https://en.wikipedia.org/wiki/Top_Gun:_Maverick
4,Black Panther: Wakanda Forever,https://en.wikipedia.org/wiki/Black_Panther:_W...


In [5]:
# Order using article title
df.sort_values(by=['name'], inplace=True)

In [6]:
# Confirm if the order is correct
df.head()

Unnamed: 0,name,url
137,12 Years a Slave (film),https://en.wikipedia.org/wiki/12_Years_a_Slave...
53,1917 (2019 film),https://en.wikipedia.org/wiki/1917_(2019_film)
148,20 Feet from Stardom,https://en.wikipedia.org/wiki/20_Feet_from_Sta...
819,"20,000 Leagues Under the Sea (1954 film)","https://en.wikipedia.org/wiki/20,000_Leagues_U..."
766,2001: A Space Odyssey (film),https://en.wikipedia.org/wiki/2001:_A_Space_Od...


In [7]:
# Function to request pageviews for a given article
def request_pageviews_per_article(article_title = None, 
                                  access_type = None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):

    # Access type can be as a parameter to the call or in the request_template
    if access_type:
        request_template['access'] = access_type
    else:
        raise Exception("Must supply an access type to make a pageviews request.")
    
    # Article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['article'] = article_title

    if not request_template['article']:
        raise Exception("Must supply an article title to make a pageviews request.")

    # Titles are supposed to have spaces replaced with "_" and be URL encoded
    article_title_encoded = urllib.parse.quote(request_template['article'].replace(' ','_'))
    request_template['article'] = article_title_encoded
    
    # Create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # Make the request
    try:
        # Wait first to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        
        # Convert the response to JSON
        json_response = response.json()

        if 'items' in json_response:
            pageviews_per_month = {}
            for item in json_response['items']:
                month = item['timestamp'][:6]
                pageviews = item['views']
                pageviews_per_month[month] = pageviews
            return {article_title: pageviews_per_month}
        else:
            print(f"No 'items' in the response for {article_title}. Response: {json_response}")
            return {article_title: None}

    except Exception as e:
        print(f"Error Occured for {article_title}: {e}")
        json_response = None
    return json_response


In [8]:
# Loop through each row of the dataframe and request pageviews for each article for cumulative access type
cumulative_views = {}
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    cumulative_views.update(request_pageviews_per_article(
                                    article_title = row[0], 
                                    access_type = 'all-access',
                                    endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                    endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                    request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                    headers = REQUEST_HEADERS
                                ))

# Remove any entries that have None as the value
cumulative_views = {k: v for k, v in cumulative_views.items() if v is not None}

with open("academy_monthly_cumulative_201507-202310.json", "w") as cumulative_file:
    json.dump(cumulative_views, cumulative_file)

  0%|          | 5/1359 [00:01<04:55,  4.59it/s]

 95%|█████████▌| 1292/1359 [04:18<00:15,  4.42it/s]

No 'items' in the response for Victor/Victoria. Response: {'type': 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found#route', 'title': 'Not found.', 'method': 'get', 'uri': '/wikimedia.org/v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/Victor/Victoria/monthly/20150701/20230930'}


100%|██████████| 1359/1359 [04:58<00:00,  4.55it/s]


In [9]:
# Loop through each row of the dataframe and request pageviews for each article for desktop access type
desktop_views = {}
for index, row in tqdm(df.iterrows()):
    desktop_views.update(request_pageviews_per_article(
                                    article_title = row[0], 
                                    access_type = 'desktop',
                                    endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                    endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                    request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                    headers = REQUEST_HEADERS
                                ))
    
# Remove any entries that have None as the value
desktop_views = {k: v for k, v in desktop_views.items() if v is not None}

with open("academy_monthly_desktop_201507-202310.json", "w") as desktop_file:
    json.dump(desktop_views, desktop_file)

1292it [13:04,  2.11it/s]

No 'items' in the response for Victor/Victoria. Response: {'type': 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found#route', 'title': 'Not found.', 'method': 'get', 'uri': '/wikimedia.org/v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/Victor/Victoria/monthly/20150701/20230930'}


1359it [13:46,  1.64it/s]


In [10]:
# Loop through each row of the dataframe and request pageviews for each article for mobile-app access type
mobile_app_views = {}
for index, row in tqdm(df.iterrows()):
    mobile_app_views.update(request_pageviews_per_article(
                                    article_title = row[0], 
                                    access_type = 'mobile-app',
                                    endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                    endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                    request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                    headers = REQUEST_HEADERS
                                ))

# Remove any entries that have None as the value
mobile_app_views = {k: v for k, v in mobile_app_views.items() if v is not None}

1292it [13:10,  2.03it/s]

No 'items' in the response for Victor/Victoria. Response: {'type': 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found#route', 'title': 'Not found.', 'method': 'get', 'uri': '/wikimedia.org/v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/Victor/Victoria/monthly/20150701/20230930'}


1359it [13:52,  1.63it/s]


In [11]:
# Loop through each row of the dataframe and request pageviews for each article for mobile-web access type
mobile_web_views = {}
for index, row in tqdm(df.iterrows()):
    mobile_web_views.update(request_pageviews_per_article(
                                    article_title = row[0], 
                                    access_type = 'mobile-web',
                                    endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                    endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                    request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                    headers = REQUEST_HEADERS
                                ))

# Remove any entries that have None as the value
mobile_web_views = {k: v for k, v in mobile_web_views.items() if v is not None}

1292it [13:21,  2.01it/s]

No 'items' in the response for Victor/Victoria. Response: {'type': 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found#route', 'title': 'Not found.', 'method': 'get', 'uri': '/wikimedia.org/v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-web/user/Victor/Victoria/monthly/20150701/20230930'}


1359it [14:04,  1.61it/s]


In [12]:
# Add view counts from mobile-app and mobile-web together
for key in mobile_web_views:
        # Check if the key exists in the second dictionary
        if key in mobile_app_views:
            # Now, for each month-key inside the movie's dictionary:
            for sub_key in mobile_web_views[key]:
                # Add the values of the matching keys together
                mobile_web_views[key][sub_key] += mobile_app_views[key].get(sub_key, 0)

with open("academy_monthly_mobile_201507-202310.json", "w") as mobile_file:
    json.dump(mobile_web_views, mobile_file)