# Identifying bias in the politicians' Wikipedia articles

## Import Required Packages

In [1]:
# standard python packages 
import json, time, urllib.parse
import requests

# packages for data manipulation
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

# package to ignore the warnings
import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

First, making an API call to Wikipedia info API to get the politicians' articles `lastrevid`.

In [2]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<vpragya@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

Now we will import the dataset that contains name of politicians and the country of politiicans, and using this we dataset we will fetch the `lastrevid` of politicians' article.

In [3]:
# importing the politicians name list
politicians = pd.read_excel('./data/politicians_by_country_SEPT.2022.csv.xlsx')
politicians.head()

Unnamed: 0,name,url,country
0,Shahjahan Noori,https://en.wikipedia.org/wiki/Shahjahan_Noori,Afghanistan
1,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan
2,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
3,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
4,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan


In [4]:
# list of politicians Wikipedia article titles
ARTICLE_TITLES = politicians['name'].tolist()

In [5]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [6]:
# fetching the articles lastrevid
all_info = []
for i in range(0, len(ARTICLE_TITLES), 50):

    # joining 50 articles name in one string and then passing it through the API call function
    info = request_pageinfo_per_article("|".join(ARTICLE_TITLES[i:i+50]))

    # appending the output to the final list
    all_info.extend(list(info['query']['pages'].values()))

In [7]:
# converting the list to dataframe
df_info = pd.DataFrame.from_dict(all_info)[['title', 'lastrevid']]
df_info.head()

Unnamed: 0,title,lastrevid
0,Abas Basir,1098420000.0
1,Abdul Baqi Turkistani,889226500.0
2,Abdul Ghafar Lakanwal,943562300.0
3,Abdul Ghani Ghani,1072442000.0
4,Abdul Malik Hamwar,1100875000.0


In [8]:
# checking if any atricles doesn't have revid
df_info[df_info.isnull().any(axis=1)]

Unnamed: 0,title,lastrevid
2400,Prince Ofosu Sefah,
2950,Harjit Kaur Talwandi,
3200,Abd al-Razzaq al-Hasani,
3750,Kang Sun-nam,
4850,Abiodun Abimbola Orekoya,
5800,Roman Konoplev,


In [9]:
# dropping rows with no revid
df_info.dropna(inplace=True)

Now we have the `lastrevid` of each article, so making an API call to ORES to get the Score of article.

In [10]:
#########
#
#    CONSTANTS
#

# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
#API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"
API_ORES_SCORE_PARAMS = "/scores/{context}/?models={model}&revids={revid}"

# Use some delays so that we do not hammer the API with our requests
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<vpragya@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022'
}

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}

In [11]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [12]:
# A list revision IDs that can be used for this ORES scoring example
ARTICLE_REVISIONS = df_info['lastrevid'].map(int).map(str)

In [13]:
idx = []
all_scores = []
for i in range(0, len(ARTICLE_REVISIONS), 50):
    
    # joining 50 articles name in one string and then passing it through the API call function
    score = request_ores_score_per_article("|".join(ARTICLE_REVISIONS[i:i+50]))
    keys = list((score['enwiki']['scores'].keys()))

    # appending the output to the final list
    idx.extend(keys)
    #print(keys)
    all_scores.extend(list(map(lambda key: score['enwiki']['scores'][key]['articlequality']['score']['prediction'], keys)))

In [14]:
# converting the list to dataframe
df_scores = pd.DataFrame(list(zip(idx, all_scores)), columns=['lastrevid', 'score'])
df_scores['lastrevid'] = df_scores['lastrevid'].astype(int)
df_scores.head()

Unnamed: 0,lastrevid,score
0,1013838830,Stub
1,1033383351,Stub
2,1038918070,Start
3,1041460606,B
4,1060707209,Start


## Combining the Datasets

We have fetched the required datasets. So now, we will merge all the dataframes to create one final data.

In [15]:
df = pd.merge(politicians, df_info, left_on='name', right_on='title', how='right').merge(df_scores, on='lastrevid')
df.head()

Unnamed: 0,name,url,country,title,lastrevid,score
0,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Abas Basir,1098420000.0,C
1,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub
2,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start
3,Abdul Ghani Ghani,https://en.wikipedia.org/wiki/Abdul_Ghani_Ghani,Afghanistan,Abdul Ghani Ghani,1072442000.0,Stub
4,Abdul Malik Hamwar,https://en.wikipedia.org/wiki/Abdul_Malik_Hamwar,Afghanistan,Abdul Malik Hamwar,1100875000.0,Stub


In [16]:
# importing the population name list
population = pd.read_excel('./data/population_by_country_2022.csv.xlsx')
population.head()

Unnamed: 0,Geography,Population (millions)
0,WORLD,7963.0
1,AFRICA,1419.0
2,NORTHERN AFRICA,251.0
3,Algeria,44.9
4,Egypt,103.5


Now, the above population data have countries, region and continent. But we only need countries population. So we will remove the region and continent. However, we will map the region row-wise to each country entry, as region is needed for further analysis.

In [17]:
# removing the continents and region from the population data
population['region'] = population.apply(lambda x: x['Geography'] if x['Geography'].isupper() else None, axis = 1)
population['region'] = population['region'].fillna(method="ffill")
dropIndex = population[(population['Geography'].str.isupper())].index
population = population.drop(dropIndex)
population.head()

Unnamed: 0,Geography,Population (millions),region
3,Algeria,44.9,NORTHERN AFRICA
4,Egypt,103.5,NORTHERN AFRICA
5,Libya,6.8,NORTHERN AFRICA
6,Morocco,36.7,NORTHERN AFRICA
7,Sudan,46.9,NORTHERN AFRICA


After mapping the countries name and population with its respective region, we will now add the region's population as well.

In [18]:
# extracting regions's population
region_population = population.groupby('region')['Population (millions)'].sum().to_frame(name='region_population').reset_index()
region_population.head()

Unnamed: 0,region,region_population
0,CARIBBEAN,43.5
1,CENTRAL AMERICA,177.9
2,CENTRAL ASIA,78.0
3,EAST ASIA,1673.9
4,EASTERN AFRICA,472.8


In [19]:
# merging the above two dataframes
population = population.merge(region_population, on = 'region', how='left')
population.head()

Unnamed: 0,Geography,Population (millions),region,region_population
0,Algeria,44.9,NORTHERN AFRICA,251.2
1,Egypt,103.5,NORTHERN AFRICA,251.2
2,Libya,6.8,NORTHERN AFRICA,251.2
3,Morocco,36.7,NORTHERN AFRICA,251.2
4,Sudan,46.9,NORTHERN AFRICA,251.2


Now after getting the region and countries population we will now merge this with the article and its score dataset.

In [20]:
# combining two dataframes
df = pd.merge(df, population, left_on='country', right_on='Geography', how='outer')
df.head()

Unnamed: 0,name,url,country,title,lastrevid,score,Geography,Population (millions),region,region_population
0,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Abas Basir,1098420000.0,C,Afghanistan,41.1,SOUTH ASIA,2008.6
1,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub,Afghanistan,41.1,SOUTH ASIA,2008.6
2,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start,Afghanistan,41.1,SOUTH ASIA,2008.6
3,Abdul Ghani Ghani,https://en.wikipedia.org/wiki/Abdul_Ghani_Ghani,Afghanistan,Abdul Ghani Ghani,1072442000.0,Stub,Afghanistan,41.1,SOUTH ASIA,2008.6
4,Abdul Malik Hamwar,https://en.wikipedia.org/wiki/Abdul_Malik_Hamwar,Afghanistan,Abdul Malik Hamwar,1100875000.0,Stub,Afghanistan,41.1,SOUTH ASIA,2008.6


It is possible that there maybe some entries which doesn't exist for particular countries. So we will store those names in a separate file in the path `./output_data/wp_countries-no_match.txt`. 

In [21]:
# fetching rows with null geography field
not_exist = df[df.isnull().any(axis=1)]['Geography'].unique()

In [22]:
# open file in write mode
with open(r'./output_data/wp_countries-no_match.txt', 'w') as fp:
    for item in not_exist:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')

Done


Now dropping the empty country rows and redundant columns.

In [23]:
# drop empty rows
df.dropna(inplace=True)

# drop columns
df = df.drop(columns=['name', 'url', 'Geography'])

In [24]:
# restructuring the dataframe as needed
df = df.rename(columns={'title':'title_name', 'lastrevid':'revision_id', 'score':'article_quality', 'Population (millions)':'population'})
df = df.drop_duplicates()
df.head()

Unnamed: 0,country,title_name,revision_id,article_quality,population,region,region_population
0,Afghanistan,Abas Basir,1098420000.0,C,41.1,SOUTH ASIA,2008.6
1,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub,41.1,SOUTH ASIA,2008.6
2,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start,41.1,SOUTH ASIA,2008.6
3,Afghanistan,Abdul Ghani Ghani,1072442000.0,Stub,41.1,SOUTH ASIA,2008.6
4,Afghanistan,Abdul Malik Hamwar,1100875000.0,Stub,41.1,SOUTH ASIA,2008.6


In [25]:
# exporting the required dataframe to csv
df.to_csv('./output_data/wp_politicians_by_country.csv')

## Analysis

### Total Articles Per Population

This is the ratio representing the number of articles per person. We are calculating it on country-by-country and regional basis.


*Note: The population given in the dataset is in millions. So we will multiply population by 10^6 to get the result per capita.*

In [26]:
# article capita by country
articles_per_capita_country = (df.groupby('country')['title_name'].count() / (df.groupby('country')['population'].mean() * 1000000)).to_frame(name='article_per_capita').reset_index()
articles_per_capita_country.head()

Unnamed: 0,country,article_per_capita
0,Afghanistan,2.871046e-06
1,Albania,2.964286e-05
2,Algeria,7.572383e-07
3,Andorra,0.0001
4,Angola,1.179775e-06


In [27]:
# article capita by region
articles_per_capita_region = (df.groupby('region')['title_name'].count() / (df.groupby('region')['region_population'].mean() * 1000000)).to_frame(name='article_per_capita').reset_index()
articles_per_capita_region.head()

Unnamed: 0,region,article_per_capita
0,CARIBBEAN,4.62069e-06
1,CENTRAL AMERICA,1.096121e-06
2,CENTRAL ASIA,1.358974e-06
3,EAST ASIA,1.463648e-07
4,EASTERN AFRICA,1.370558e-06


### High Quality Articles Per Population

This is the ratio representing the number of good quality articles per person. We are calculating it on country-by-country and regional basis.

High Quality Articles have the ORES predicted score as "FA"(featured article) or "GA"(good article).

*Note: The population given in the dataset is in millions. So we will multiply population by 10^6 to get the result per capita.*

In [28]:
# fetching high quality articles
high_quality_articles = df[(df['article_quality'] == 'GA') | (df['article_quality'] == 'FA')]

In [29]:
# article capita by country
quality_articles_country = (high_quality_articles.groupby('country')['title_name'].count() / (high_quality_articles.groupby('country')['population'].mean() * 1000000)).to_frame(name='article_per_capita').reset_index()
quality_articles_country.head()

Unnamed: 0,country,article_per_capita
0,Afghanistan,1.459854e-07
1,Albania,2.142857e-06
2,Andorra,2e-05
3,Armenia,3.333333e-07
4,Azerbaijan,9.803922e-08


In [30]:
# article capita by region
quality_articles_region = (high_quality_articles.groupby('region')['title_name'].count() / (high_quality_articles.groupby('region')['region_population'].mean() * 1000000)).to_frame(name='article_per_capita').reset_index()
quality_articles_region.head()

Unnamed: 0,region,article_per_capita
0,CARIBBEAN,1.83908e-07
1,CENTRAL AMERICA,5.621135e-08
2,CENTRAL ASIA,3.846154e-08
3,EAST ASIA,9.558516e-09
4,EASTERN AFRICA,3.172589e-08


## Results

### Top 10 countries by coverage

The 10 countries with the highest total articles per capita (in descending order) .


In [31]:
articles_per_capita_country.sort_values(by='article_per_capita', ascending=False).replace([np.inf, -np.inf], np.nan).dropna().head(10)

Unnamed: 0,country,article_per_capita
5,Antigua and Barbuda,0.00017
54,Federated States of Micronesia,0.00013
3,Andorra,0.0001
13,Barbados,9.3e-05
104,Marshall Islands,9e-05
110,Montenegro,6e-05
143,Seychelles,6e-05
97,Luxembourg,5.3e-05
18,Bhutan,5.1e-05
64,Grenada,5e-05


### Bottom 10 countries by coverage

The 10 countries with the lowest total articles per capita (in ascending order).

In [32]:
articles_per_capita_country.sort_values(by='article_per_capita').head(10)

Unnamed: 0,country,article_per_capita
32,China,1.392176e-09
106,Mexico,7.843137e-09
140,Saudi Arabia,8.174387e-08
134,Romania,1.052632e-07
73,India,1.255998e-07
153,Sri Lanka,1.339286e-07
48,Egypt,1.352657e-07
53,Ethiopia,2.025932e-07
161,Taiwan,2.155172e-07
180,Vietnam,2.716298e-07


### Top 10 countries by high quality

The 10 countries with the highest high quality articles per capita (in descending order) .

In [33]:
quality_articles_country.sort_values(by='article_per_capita', ascending=False).head(10)

Unnamed: 0,country,article_per_capita
86,Tuvalu,inf
2,Andorra,2e-05
53,Montenegro,5e-06
1,Albania,2.142857e-06
80,Suriname,1.666667e-06
9,Bosnia-Herzegovina,1.470588e-06
49,Lithuania,1.071429e-06
19,Croatia,1.052632e-06
74,Slovenia,9.52381e-07
61,Palestinian Territory,9.259259e-07


### Bottom 10 countries by high quality

 The 10 countries with the lowest high quality articles per capita (in ascending order).


In [34]:
quality_articles_country.sort_values(by='article_per_capita').head(10)

Unnamed: 0,country,article_per_capita
35,India,4.2337e-09
84,Thailand,1.497006e-08
39,Japan,1.601281e-08
58,Nigeria,1.830664e-08
91,Vietnam,2.012072e-08
17,Colombia,2.03666e-08
87,Uganda,2.118644e-08
60,Pakistan,2.120441e-08
79,Sudan,2.132196e-08
37,Iran,2.257336e-08


### Geographic regions by total coverage

A rank ordered list of geographic regions (in descending order) by total articles per capita.


In [35]:
articles_per_capita_region.sort_values(by='article_per_capita', ascending=False)

Unnamed: 0,region,article_per_capita
14,SOUTHERN EUROPE,5.897946e-06
0,CARIBBEAN,4.62069e-06
17,WESTERN EUROPE,3.550025e-06
5,EASTERN EUROPE,2.557411e-06
8,NORTHERN EUROPE,2.460094e-06
16,WESTERN ASIA,2.330955e-06
9,OCEANIA,1.977011e-06
13,SOUTHERN AFRICA,1.702742e-06
4,EASTERN AFRICA,1.370558e-06
2,CENTRAL ASIA,1.358974e-06


### Geographic regions by high quality coverage

Rank ordered list of geographic regions (in descending order) by high quality articles per capita.


In [36]:
quality_articles_region.sort_values(by='article_per_capita', ascending = False)

Unnamed: 0,region,article_per_capita
14,SOUTHERN EUROPE,3.048376e-07
0,CARIBBEAN,1.83908e-07
5,EASTERN EUROPE,1.322199e-07
17,WESTERN EUROPE,1.117318e-07
16,WESTERN ASIA,9.514101e-08
8,NORTHERN EUROPE,7.511737e-08
13,SOUTHERN AFRICA,5.772006e-08
1,CENTRAL AMERICA,5.621135e-08
9,OCEANIA,4.597701e-08
2,CENTRAL ASIA,3.846154e-08
