# Considering Bias in Data

## Import Required PAckages

In [1]:
# standard python packages 
import json, time, urllib.parse
import requests

# packages for data manipulation
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime

# packages for visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
plt.rcParams["figure.figsize"] = (26,12)

# package to ignore the warnings
import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

Making an API call to get the articles revisionid

In [2]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<vpragya@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

In [16]:
# importing the politicians name list
politicians = pd.read_excel('./data/politicians_by_country_SEPT.2022.csv.xlsx')
politicians.head()

Unnamed: 0,name,url,country
0,Shahjahan Noori,https://en.wikipedia.org/wiki/Shahjahan_Noori,Afghanistan
1,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan
2,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
3,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
4,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan


In [5]:
# list of politicians Wikipedia article titles
ARTICLE_TITLES = politicians['name'].tolist()

In [6]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [7]:
all_info = []
for i in range(0, len(ARTICLE_TITLES), 50):
    info = request_pageinfo_per_article("|".join(ARTICLE_TITLES[i:i+50]))
    all_info.extend(list(info['query']['pages'].values()))

In [8]:
# converting the list to dataframe
df_info = pd.DataFrame.from_dict(all_info)[['title', 'lastrevid']]
df_info.head()

Unnamed: 0,title,lastrevid
0,Abas Basir,1098420000.0
1,Abdul Baqi Turkistani,889226500.0
2,Abdul Ghafar Lakanwal,943562300.0
3,Abdul Ghani Ghani,1072442000.0
4,Abdul Malik Hamwar,1100875000.0


In [9]:
df_info[df_info.isnull().any(axis=1)]

Unnamed: 0,title,lastrevid
2400,Prince Ofosu Sefah,
2950,Harjit Kaur Talwandi,
3200,Abd al-Razzaq al-Hasani,
3750,Kang Sun-nam,


In [10]:
df_info.dropna(inplace=True)

In [11]:
#########
#
#    CONSTANTS
#

# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
#API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"
API_ORES_SCORE_PARAMS = "/scores/{context}/?models={model}&revids={revid}"

# Use some delays so that we do not hammer the API with our requests
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<vpragya@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022'
}

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}

In [12]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [13]:
# A list revision IDs that can be used for this ORES scoring example
ARTICLE_REVISIONS = df_info['lastrevid'].map(int).map(str)

In [81]:
idx = []
all_scores = []
for i in range(0, len(ARTICLE_REVISIONS), 50):
    score = request_ores_score_per_article("|".join(ARTICLE_REVISIONS[i:i+50]))
    keys = list((score['enwiki']['scores'].keys()))
    idx.extend(keys)
    #print(keys)
    all_scores.extend(list(map(lambda key: score['enwiki']['scores'][key]['articlequality']['score']['prediction'], keys)))

In [82]:
# converting the list to dataframe
df_scores = pd.DataFrame(list(zip(idx, all_scores)), columns=['lastrevid', 'score'])
df_scores['lastrevid'] = df_scores['lastrevid'].astype(int)
df_scores.head()

Unnamed: 0,lastrevid,score
0,1013838830,Stub
1,1033383351,Stub
2,1038918070,Start
3,1041460606,B
4,1060707209,Start


In [84]:
df = pd.merge(politicians, df_info, left_on='name', right_on='title', how='right').merge(df_scores, on='lastrevid')
df.head()

Unnamed: 0,name,url,country,title,lastrevid,score
0,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Abas Basir,1098420000.0,C
1,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub
2,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start
3,Abdul Ghani Ghani,https://en.wikipedia.org/wiki/Abdul_Ghani_Ghani,Afghanistan,Abdul Ghani Ghani,1072442000.0,Stub
4,Abdul Malik Hamwar,https://en.wikipedia.org/wiki/Abdul_Malik_Hamwar,Afghanistan,Abdul Malik Hamwar,1100875000.0,Stub


In [46]:
# importing the population name list
population = pd.read_excel('./data/population_by_country_2022.csv.xlsx')
population.head()

Unnamed: 0,Geography,Population (millions)
0,WORLD,7963.0
1,AFRICA,1419.0
2,NORTHERN AFRICA,251.0
3,Algeria,44.9
4,Egypt,103.5


In [57]:
population['region'] = population.apply(lambda x: x['Geography'] if x['Geography'].isupper() else None, axis = 1)
population['region'] = population['region'].fillna(method="ffill")
dropIndex = population[(population['Geography'].str.isupper())].index
population = population.drop(dropIndex)
population.head()

Unnamed: 0,Geography,Population (millions),region
3,Algeria,44.9,NORTHERN AFRICA
4,Egypt,103.5,NORTHERN AFRICA
5,Libya,6.8,NORTHERN AFRICA
6,Morocco,36.7,NORTHERN AFRICA
7,Sudan,46.9,NORTHERN AFRICA


In [85]:
df = pd.merge(df, population, left_on='country', right_on='Geography', how='outer')
df.head()

Unnamed: 0,name,url,country,title,lastrevid,score,Geography,Population (millions),region
0,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Abas Basir,1098420000.0,C,Afghanistan,41.1,SOUTH ASIA
1,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub,Afghanistan,41.1,SOUTH ASIA
2,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start,Afghanistan,41.1,SOUTH ASIA
3,Abdul Ghani Ghani,https://en.wikipedia.org/wiki/Abdul_Ghani_Ghani,Afghanistan,Abdul Ghani Ghani,1072442000.0,Stub,Afghanistan,41.1,SOUTH ASIA
4,Abdul Malik Hamwar,https://en.wikipedia.org/wiki/Abdul_Malik_Hamwar,Afghanistan,Abdul Malik Hamwar,1100875000.0,Stub,Afghanistan,41.1,SOUTH ASIA


In [87]:
not_exist = df[df.isnull().any(axis=1)]['Geography'].unique()

In [90]:
# open file in write mode
with open(r'./output_data/wp_countries-no_match.txt', 'w') as fp:
    for item in not_exist:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')

Done


In [91]:
df.dropna(inplace=True)
df = df.drop(columns=['name', 'url', 'Geography'])

In [95]:
df = df.rename(columns={'title':'title_name', 'lastrevid':'revision_id', 'score':'article_quality', 'Population (millions)':'population'})
df.head()

Unnamed: 0,country,title_name,revision_id,article_quality,population,region
0,Afghanistan,Abas Basir,1098420000.0,C,41.1,SOUTH ASIA
1,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub,41.1,SOUTH ASIA
2,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start,41.1,SOUTH ASIA
3,Afghanistan,Abdul Ghani Ghani,1072442000.0,Stub,41.1,SOUTH ASIA
4,Afghanistan,Abdul Malik Hamwar,1100875000.0,Stub,41.1,SOUTH ASIA


In [96]:
df.to_csv('./output_data/wp_politicians_by_country.csv')