# Article Page Views API

This notebook extracts monthly page view data using the Wikimedia Pageviews API. We use a subset of movie titles to limit the number of articles requested from the API. The extracted data is saved in three different files: mobile data, desktop data and cumulative data. 

In [1]:
#importing python modules
import pandas as pd
import json, time, urllib.parse
import requests

### Defining the subset of the articles

Provided is a CSV file (academy_movie_titles.csv) containing the titles of all Academy Award winning films. We use these titles to request page view data for only those movies.

In [2]:
# ---------------------------- Subset Movie Names ---------------------------- #
# Load the provided list of Academy Award winning film titles.
df = pd.read_csv('subset-data/academy_movie_titles.csv')

# Extract the 'name' column as a plain Python list of article titles.
academy_award_titles = df.loc[:, 'name'].tolist()

# Show how many titles were loaded.
len(academy_award_titles)

1359

### API Function

This part defines the API constants and the function that uses the API to extract the data.

In [3]:
# ------------------------------- API CONSTANTS ------------------------------ #

# Base URL of the pageviews REST endpoint.
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# Path suffix for a per-article request; the placeholders are filled from the
# parameter template defined at the bottom of this cell.
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# Pause between requests: 1/100 of a second minus the latency we assume each
# request already costs.
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (1.0 / 100.0) - API_LATENCY_ASSUMED

# Headers sent with every request.
REQUEST_HEADERS = {'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023'}

# Article titles to request: the subset loaded from the CSV file.
ARTICLE_TITLES = academy_award_titles

# The access and article params are left empty in the template for
# reproducibility; they are filled in per request, so the code does not have
# to be re-run over and over for the different access types.
#
# The start and end dates are set to July 01, 2015 and Sept 30, 2023.
# Format for dates: YYYYMMDDHH (where HH indicates the hour).
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = dict(
    project="en.wikipedia.org",
    access="",               # three options: desktop, mobile-app, mobile-web
    agent="user",
    article="",              # title name - changes with every request
    granularity="monthly",
    start="2015070100",
    end="2023093000",
)

In [4]:
# ----------------------------- FUNCTION FOR API ----------------------------- #

def request_pageviews_per_article(article_title = None,
                                  access = None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT,
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS,
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request the pageviews series of one article for one access type.

    This code is taken from the example notebook provided, with a few changes
    to make the access type selectable per call.

    Parameters
    ----------
    article_title : str, optional
        Article title as written (spaces allowed). Falls back to
        ``request_template['article']`` when omitted.
    access : str, optional
        Access type (desktop, mobile-app or mobile-web). Falls back to
        ``request_template['access']`` when omitted.
    endpoint_url : str
        Base URL of the pageviews endpoint.
    endpoint_params : str
        Format string for the per-article path; formatted with the request
        parameters.
    request_template : dict
        Default request parameters. It is copied, never modified, so the
        shared module-level template stays pristine between calls.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    dict or None
        The decoded JSON body of the response, or ``None`` when the request
        or the JSON decoding raised (the error is printed).

    Raises
    ------
    Exception
        If no article title or no access type is available from either the
        arguments or the template.
    """
    # Work on a private copy: writing into the shared default template would
    # leak this call's encoded title and access type into every later call.
    params = dict(request_template)

    # article title can be a parameter to the call or in the request_template
    if article_title:
        params['article'] = article_title

    if not params.get('article'):
        raise Exception("Must supply an article title to make a pageviews request.")

    # same rule for the access type; without this check a missing access
    # would be formatted into the URL as the literal text "None"
    if access:
        params['access'] = access

    if not params.get('access'):
        raise Exception("Must supply an access type to make a pageviews request.")

    # Titles are supposed to have spaces replaced with "_" and be URL encoded.
    # safe='' also percent-encodes "/" (as %2F) so a title containing a slash
    # stays a single path segment; rewriting "/" to "_" would request a
    # different article title.
    params['article'] = urllib.parse.quote(params['article'].replace(' ', '_'), safe='')

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url + endpoint_params.format(**params)
    print(request_url)

    try:
        # throttle before each request so we stay under the request rate limit
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # best effort: report the problem and let the caller handle None
        print(e)
        json_response = None
    return json_response

## Saving the Json Files

In this part we define the file names and use the function from the above step to call the API with the different access types, storing the results in three JSON files:

1. Monthly Mobile Access: Combines two access types - mobile-app and mobile-web
2. Monthly Desktop Access: Contains only one access type - desktop
3. Monthly Cumulative Access: Combines monthly mobile and monthly desktop

In [5]:
# Defining file names
# The first six characters (YYYYMM) of the request window bounds label the files.
start_date_str, end_date_str = (
    ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE[bound][:6] for bound in ('start', 'end')
)

# One output file per data set: mobile, desktop and cumulative.
mobile_filename = f"json-data/academy_monthly_mobile_{start_date_str}-{end_date_str}.json"
desktop_filename = f"json-data/academy_monthly_desktop_{start_date_str}-{end_date_str}.json"
cumulative_filename = f"json-data/academy_monthly_cumulative_{start_date_str}-{end_date_str}.json"

In [6]:
# ----------------------------- MOBILE VIEWS ----------------------------- #

# Mobile App Views
mobile_app = {}
for movie in ARTICLE_TITLES:
    mobile_app[movie] = request_pageviews_per_article(movie, 'mobile-app')

# Mobile Web Views
mobile_web = {}
for movie in ARTICLE_TITLES:
    mobile_web[movie] = request_pageviews_per_article(movie, 'mobile-web')

# ---------------------- COMBINING THE TWO ACCESS TYPES ---------------------- #
mobile_data = {}
mobile_skipped = []  # titles for which at least one of the two requests gave no items

for key in mobile_app:
    if key not in mobile_web:
        continue

    app_response = mobile_app[key]
    web_response = mobile_web[key]

    # request_pageviews_per_article returns None on failure, and an error
    # response carries no "items"; skip such articles instead of crashing or
    # writing a total that is missing one access type.
    if not (isinstance(app_response, dict) and "items" in app_response
            and isinstance(web_response, dict) and "items" in web_response):
        mobile_skipped.append(key)
        continue

    # Sum the views per timestamp rather than per list position: the two
    # series are not guaranteed to cover the same months, and pairing them
    # positionally would add up different months and drop the tail of the
    # longer series. A month present in only one series keeps that series'
    # count.
    merged_by_timestamp = {}
    for item in app_response["items"] + web_response["items"]:
        timestamp = item["timestamp"]
        if timestamp not in merged_by_timestamp:
            merged_by_timestamp[timestamp] = {
                "project": item["project"],
                "article": item["article"],
                "granularity": item["granularity"],
                "timestamp": timestamp,
                "agent": item["agent"],
                "views": 0
            }
        merged_by_timestamp[timestamp]["views"] += item["views"]  # Sum the views

    # YYYYMMDDHH strings sort chronologically
    mobile_data[key] = {
        "items": [merged_by_timestamp[timestamp] for timestamp in sorted(merged_by_timestamp)]
    }

with open(mobile_filename, "w") as mobile_file:
    json.dump(mobile_data, mobile_file, indent=4)

if mobile_skipped:
    print(f"skipped {len(mobile_skipped)} article(s) without usable mobile data: {mobile_skipped}")

print("file saved successfully")

https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/Everything_Everywhere_All_at_Once/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/All_Quiet_on_the_Western_Front_%282022_film%29/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/The_Whale_%282022_film%29/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/Top_Gun%3A_Maverick/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/Black_Panther%3A_Wakanda_Forever/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/Avatar%3A_The_Way_of_Water/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews

In [7]:
# ------------------------------- DESKTOP VIEWS ------------------------------ #

# Fetch the desktop series for every title.
desktop_responses = {}
for movie in ARTICLE_TITLES:
    desktop_responses[movie] = request_pageviews_per_article(movie, 'desktop')

desktop_data = {}
desktop_skipped = []  # titles whose request gave no items

for key, response in desktop_responses.items():
    # request_pageviews_per_article returns None on failure, and an error
    # response carries no "items"; skip such articles instead of crashing.
    if not (isinstance(response, dict) and "items" in response):
        desktop_skipped.append(key)
        continue

    # Drop the "access" field from every item. Building new dicts (instead of
    # deleting in place) keeps this step safe to re-run on the same responses.
    desktop_data[key] = {
        "items": [
            {field: value for field, value in item.items() if field != "access"}
            for item in response["items"]
        ]
    }

with open(desktop_filename, "w") as desktop_file:
    json.dump(desktop_data, desktop_file, indent=4)

if desktop_skipped:
    print(f"skipped {len(desktop_skipped)} article(s) without usable desktop data: {desktop_skipped}")

print("file saved successfully")

https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/Everything_Everywhere_All_at_Once/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/All_Quiet_on_the_Western_Front_%282022_film%29/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/The_Whale_%282022_film%29/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/Top_Gun%3A_Maverick/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/Black_Panther%3A_Wakanda_Forever/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/Avatar%3A_The_Way_of_Water/monthly/2015070100/2023093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wi

In [8]:
# ----------------------------- CUMULATIVE VIEWS ----------------------------- #

# Read back the two files written above. Using the filename variables (rather
# than hard-coded paths) keeps this cell in sync with the configured date
# range, and json.load returns the same plain dict structure that was saved.
with open(mobile_filename) as mobile_file:
    mobile_data = json.load(mobile_file)
with open(desktop_filename) as desktop_file:
    desktop_data = json.load(desktop_file)

cumulative_data = {}

for key in mobile_data:
    if key in desktop_data:
        # Sum the views per timestamp rather than per list position: the two
        # series are not guaranteed to cover the same months, and pairing
        # them positionally would add up different months and drop the tail
        # of the longer series. A month present in only one series keeps
        # that series' count.
        merged_by_timestamp = {}
        for item in mobile_data[key]["items"] + desktop_data[key]["items"]:
            timestamp = item["timestamp"]
            if timestamp not in merged_by_timestamp:
                merged_by_timestamp[timestamp] = {
                    "project": item["project"],
                    "article": item["article"],
                    "granularity": item["granularity"],
                    "timestamp": timestamp,
                    "agent": item["agent"],
                    "views": 0
                }
            merged_by_timestamp[timestamp]["views"] += item["views"]  # Sum the views

        # YYYYMMDDHH strings sort chronologically
        cumulative_data[key] = {
            "items": [merged_by_timestamp[timestamp] for timestamp in sorted(merged_by_timestamp)]
        }

with open(cumulative_filename, "w") as cumulative_file:
    json.dump(cumulative_data, cumulative_file, indent=4)

print("File saved successfully")


File saved successfully
