# Step 2 - Retrieving ORES Data

This short notebook is designed to query ORES, a model that rates Wikipedia articles' quality and was trained on user-made evaluations, for its predictions regarding the quality of articles on American cities. As input, it takes in the individual JSON files obtained by the Step 1 notebook, and it outputs another set of individual JSONs containing all of the ORES results, as well as the data used to make ORES requests.

In [2]:
#This simple cell takes in the list of files in the output folder of Step 1.
#Only *.json entries are kept, so that stray directory entries (for example
#.ipynb_checkpoints or .DS_Store) are never handed to the JSON parser later on,
#and the names are sorted so the processing order is the same on every run.
import os
pageInfo_files_path = "raw_api_data/Pageinfo/"
file_list = sorted(
    name for name in os.listdir(pageInfo_files_path)
    if name.lower().endswith(".json")
)

In [3]:
#This cell loops through the list of files in Step 1's output folder
#and makes a list of dicts containing all the relevant article revIDs from each JSON
#that we can use to make ORES requests, as well as additional useful fields.
#The resulting list is also saved to revid_list.json in the working directory.

import json
import os

revid_list = []
for filename in file_list:
    json_path = os.path.join(pageInfo_files_path, filename)
    # 'with' guarantees the handle is closed even when the file is not valid JSON
    with open(json_path) as file:
        example_json = json.load(file)

    pages_object = example_json['query']['pages']

    for page_id, page in pages_object.items():
        # An entry without 'lastrevid' cannot be scored by ORES; report it and move on
        # instead of aborting the whole run with a KeyError.
        if 'lastrevid' not in page:
            print("Skipping " + filename + ": page entry " + str(page_id) + " has no lastrevid")
            continue
        revid_list.append({
            'page_title': page['title'],
            'lastrevid' : page['lastrevid'],
            'url': page['canonicalurl'],
            'touched': page['touched']
        })

with open("revid_list.json", "w") as outfile:
    json.dump(revid_list, outfile)


The code below was developed by Dr. David W. McDonald for use in DATA 512, a course in the UW MS Data Science degree program, and is made available under the [Creative Commons](https://creativecommons.org/) [CC-BY license](https://creativecommons.org/licenses/by/4.0/) (Revision 1.2 - August 14, 2023). It defines a function and the accompanying constants needed to query ORES for the article quality evaluation of an individual revid. The username and Access Token constants were modified to use my Wikimedia account and Personal Access Token.

In [15]:
import json
import os
import time

import requests

#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the access token that you are granted. A personal access token
#    carries a 'ratelimit' claim of 5000 requests per HOUR, so the wait between requests is derived from
#    3600 seconds, not 60: dividing 60 by 5000 would allow 5000 requests per minute and exceed the limit.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_REQUESTS_PER_HOUR = 5000.0    # rate limit granted to the access token
API_THROTTLE_WAIT = (3600.0/API_REQUESTS_PER_HOUR)-API_LATENCY_ASSUMED

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ARTICLE_REVISIONS = { 'Bison':1085687913 , 'Northern flicker':1086582504 , 'Red squirrel':1083787665 , 'Chinook salmon':1085406228 , 'Horseshoe bat':1060601936 }

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    Credentials used later when making requests.
#
#    The access token is a secret and must never be stored in the notebook: export it as the
#    WIKIMEDIA_ACCESS_TOKEN environment variable before starting the kernel. When the variable is not
#    set the value is an empty string. Any token that has ever been committed to a notebook should be
#    treated as compromised and revoked.
#
USERNAME = "RamirostUW"
ACCESS_TOKEN = os.environ.get("WIKIMEDIA_ACCESS_TOKEN", "")
#

#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE,
                                   request_timeout = 60.0):
    """POST a single scoring request to the LiftWing ORES endpoint and return the parsed JSON.

    Parameters:
        article_revid:   revision id to score; when given it overrides request_data['rev_id'].
        email_address:   contact email placed in the User-Agent header; when given it overrides
                         header_params['email_address'].
        access_token:    bearer token placed in the Authorization header; when given it overrides
                         header_params['access_token'].
        endpoint_url:    URL template containing a {model_name} placeholder.
        model_name:      model name substituted into endpoint_url.
        request_data:    template dict for the JSON payload. It is copied, never modified.
        header_format:   dict of header name -> format string.
        header_params:   dict of values substituted into the header format strings.
                         It is copied, never modified.
        request_timeout: seconds passed to requests.post as 'timeout'; None disables the timeout.

    Returns:
        The decoded JSON body of the response, or None when sending the request or decoding
        the response raised an exception (the exception is printed).

    Raises:
        ValueError: when no revision id, email address or access token is available.
    """
    # Work on private copies so values from one call never leak into the shared
    # module-level templates (and therefore into later calls).
    request_data = dict(request_data)
    header_params = dict(header_params)

    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise ValueError("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise ValueError("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise ValueError("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = {str(key): template.format(**header_params)
               for key, template in header_format.items()}
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # A timeout keeps one stalled connection from hanging a long download loop forever
        response = requests.post(request_url, headers=headers,
                                 data=json.dumps(request_data), timeout=request_timeout)
        json_response = response.json()
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response


In [5]:
#This line binds a second name to the list of article data dicts we will use to query ORES.
#It is an alias, not a copy: both names refer to the same list object.
#The resume cell below rebinds revid_list from this name before filtering.
original_revid_list = revid_list

In [30]:
#This cell scans the output folder for the ORES JSONs that the next cell outputs, and filters
#out all of the articles for which existing ORES data was found in order to not download it again. 
#This was made because the ORES download loop was prone to returning errors and takes a long time to 
#run, and this allows it to resume where it left off. 

import os
ores_files_path = "raw_api_data/ores/"
# Create the folder on a first run so listing it (and writing into it later) cannot fail
os.makedirs(ores_files_path, exist_ok=True)
revid_list = original_revid_list
existing_file_list = os.listdir(ores_files_path)
existing_file_list = [sub.replace('.json', '') for sub in existing_file_list]
# Set membership is constant time; counting in a list for every article is quadratic overall
existing_revids = set(existing_file_list)
print(len(revid_list))

def needs_download(revid_object):
    """Return True when no ORES file named after this article's lastrevid exists yet."""
    return str(revid_object['lastrevid']) not in existing_revids

def already_downloaded(revid_object):
    """Legacy name kept for compatibility.

    Despite the name, this returns True when the article has NOT been downloaded yet,
    exactly as it always did. Prefer needs_download().
    """
    return needs_download(revid_object)

filtered_revid_list = [revid_object for revid_object in revid_list if needs_download(revid_object)]

print(len(filtered_revid_list))



21519


0


In [29]:
#This cell loops through a list of article data dicts that includes a revid 
#and queries ORES for its article quality evaluation of that revid.
#It then stores the article quality data, as well as the input dict the list read in,
#to an individual JSON file named after the revid.
#A failure on one article is printed and the loop carries on with the next one.

#NOTE(review): the email address sent below looks like the course instructor's address
#rather than the notebook author's - confirm and replace it with your own contact email.

ores_data_folder = "raw_api_data/ores/"
ores_json = []
for revid_object in filtered_revid_list:
    try:
        revid = revid_object['lastrevid']
        score = request_ores_score_per_article(article_revid=revid,
                                               email_address="dwmc@uw.edu",
                                               access_token=ACCESS_TOKEN)
        # The request helper returns None on failure, and an error reply has no 'enwiki' key.
        # Say so explicitly, including the reply, instead of surfacing a bare TypeError/KeyError.
        if not isinstance(score, dict) or 'enwiki' not in score:
            raise ValueError("Unexpected ORES response: " + str(score))
        score_data = score['enwiki']
        score_data['request_info'] = revid_object
        # Serialize before opening the file so a serialization error cannot leave behind
        # an empty or truncated file that a later resume would mistake for a finished download.
        serialized = json.dumps(score_data)
        with open(ores_data_folder + str(revid) + ".json", "w") as outfile:
            outfile.write(serialized)

    except Exception as exception:
        print(exception)
        print("Error on " + revid_object['page_title'] + ", revid: " + str(revid_object['lastrevid']))

