# A2: Bias in the Data

**Objective**: The goal of this assignment is to explore the concept of bias through data on Wikipedia articles - specifically, articles on political figures from a variety of countries.

*Import the necessary libraries*

In [95]:
import json
import requests
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from pprint import pprint

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

*Define the function for the API call.*

Here we also collect the predictions of available data from the API and ensure to label the ones we don't get data for as `Not Found`.

In [96]:
#function for API endpoint call: ORES
def api_call(rids):
    """
    Takes the rev_ids to give us the ORES data.
    Also checks for case where required data is not returned
    """
    
    #building the url for the API call
    headers = {
        'User-Agent': 'https://github.com/smuktevi',
        'From': 'vmuktev1@uw.edu'
    }
    
    endpoint = "https://ores.wikimedia.org/v3/scores/enwiki/?models={model}&revids={rids}"

    parameters = {
    'context' : 'enwiki',
    'rids' : '|'.join(str(x) for x in rids),
    'model' : 'articlequality'
    } 
    
    #call the API
    call = requests.get(endpoint.format(**parameters), headers=headers)
    res = call.json()
    
    #store predictions and check for data
    predictions = []
    for rid in rids:
        rev_id = str(rid)
        if res['enwiki']['scores'][rev_id]['articlequality'].get('score') is None:
            predictions.append("NotFound")
        else:
            predictions.append(res['enwiki']['scores'][rev_id]['articlequality']['score']['prediction'])
    return predictions

## Step 1: Get Data

The first step is getting the data, which lives in several different places. The Wikipedia politicians by country dataset can be found on Figshare. Read through the documentation for this repository, then download and unzip it to extract the data file, which is called `page_data.csv`.  

The population data is available in CSV format as `WPDS_2020_data.csv`. This dataset is drawn from the world population data sheet published by the Population Reference Bureau.


In [97]:
page_data = pd.read_csv("./data_raw/page_data.csv")
population_data = pd.read_csv("./data_raw/WPDS_2020_data.csv")

display(page_data.head())
display(population_data.head())

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000


## Step 2: Cleaning the Data

*Remove pages starting with `Template: ` and filtering out only Countries*

In [102]:
page_data_filtered = page_data[~page_data['page'].astype(str).str.startswith('Template:')].reset_index().drop(columns='index')
country_pop_data = population_data[population_data['Type']=='Country']

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000
5,LY,Libya,Country,2019,6.891,6891000
6,MA,Morocco,Country,2019,35.952,35952000
7,SD,Sudan,Country,2019,43.849,43849000
...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.200,200000
230,SB,Solomon Islands,Country,2019,0.715,715000
231,TO,Tonga,Country,2019,0.099,99000
232,TV,Tuvalu,Country,2019,0.010,10000


## Step 3: Getting Article Quality Predictions

*Call API to get data in batches of size 50.*

In [None]:
batch_size=50
rev_id_list = page_data_filtered["rev_id"].tolist()
predictions = []

#create batches of 50
batches = [rev_id_list[i:i + batch_size] for i in range(0, len(rev_id_list), batch_size)]
for batch in batches:
    res = api_call(batch)
    if len(res)==0:
        print("Error!")
        break
    predictions.extend(res)
len(predictions)
predictions

*Save data retrieved in intermediary files as the API calls run for a long time.*

In [98]:
#saving data
page_data_filtered.to_csv("page_data_filtered_final.csv")
country_pop_data.to_csv("country_pop_data.csv")

page_data_filtered[page_data_filtered["predictions"]=="NotFound"].to_csv("page_data_not_found.csv")
page_data_found = page_data_filtered[page_data_filtered["predictions"]!="NotFound"]

## Step 4: Combining the Datasets

*Store the data with `no match`.*

In [245]:
left = page_data_found.merge(country_pop_data,  how='left', left_on='country', right_on='Name', indicator=True)
nomatch_df = left[left['_merge'] == 'left_only']
nomatch_df.to_csv('data_result/wp_wpds_countries-no_match.csv')

*Store resulting merged data with population and article information.*

In [246]:
#check schema
merged_df = page_data_found.merge(country_pop_data, how='inner', left_on='country', right_on='Name').drop(columns=['Name','Type', 'FIPS', 'TimeFrame', 'Data (M)'])
merged_df.columns = ['article_name', 'country', 'revision_id', 'article_quality_est', 'population']
merged_df = merged_df[['country', 'article_name', 'revision_id', 'article_quality_est', 'population']]
merged_df.to_csv("data_result/wp_wpds_politicians_by_country.csv")
merged_df

Unnamed: 0,country,article_name,revision_id,article_quality_est,population
0,Chad,Bir I of Kanem,355319463,Stub,16877000
1,Chad,Abdullah II of Kanem,498683267,Stub,16877000
2,Chad,Salmama II of Kanem,565745353,Stub,16877000
3,Chad,Kuri I of Kanem,565745365,Stub,16877000
4,Chad,Mohammed I of Kanem,565745375,Stub,16877000
...,...,...,...,...,...
44563,Seychelles,Rita Sinon,800323154,Stub,98000
44564,Seychelles,Sylvette Frichot,800323798,Stub,98000
44565,Seychelles,May De Silva,800969960,Start,98000
44566,Seychelles,Vincent Meriton,802051093,Stub,98000


## Step 5: Analysis

*Get article per population proportion*

In [210]:
#number of articles per country
num_articles_per_country = pd.DataFrame(merged_df.groupby("country")["article_name"].count()).reset_index()
population = country_pop_data[["Name", "Population"]].reset_index().drop(columns= ["index"])

percentage_per_population_df = num_articles_per_country.merge(population, how="inner", left_on="country", right_on="Name").drop(columns="Name")
percentage_per_population_df["percentage_per_population"] = percentage_per_population_df["article_name"]/percentage_per_population_df["Population"] * 100
percentage_per_population_df.columns=["country", "article_count", "population", "percentage_per_population"]
percentage_per_population_df

Unnamed: 0,country,article_count,population,percentage_per_population
0,Afghanistan,319,38928000,0.000819
1,Albania,456,2838000,0.016068
2,Algeria,116,44357000,0.000262
3,Andorra,34,82000,0.041463
4,Angola,106,32522000,0.000326
...,...,...,...,...
178,Venezuela,130,28645000,0.000454
179,Vietnam,187,96209000,0.000194
180,Yemen,116,29826000,0.000389
181,Zambia,25,18384000,0.000136


*Get FA and GA quality proportion.*

In [230]:
fa_ga_bool= merged_df[(merged_df["article_quality_est"]=="FA") | (merged_df["article_quality_est"]=="GA")].groupby("country")["article_name"].count()
high_quality_articles_df = pd.DataFrame(fa_ga_bool).reset_index().merge(num_articles_per_country, how="inner", on="country")
high_quality_articles_df["high_quality_proportion"] =  high_quality_articles_df["article_name_x"]/high_quality_articles_df["article_name_y"]
high_quality_articles_df.columns = ["country", "num_fa_ga", "num_articles", "high_quality_proportion"]
high_quality_articles_prop_df = high_quality_articles_df[["country", "high_quality_proportion"]] 
high_quality_articles_prop_df

Unnamed: 0,country,high_quality_proportion
0,Afghanistan,0.040752
1,Albania,0.006579
2,Algeria,0.017241
3,Argentina,0.032587
4,Armenia,0.025907
...,...,...
141,Vanuatu,0.051724
142,Venezuela,0.023077
143,Vietnam,0.069519
144,Yemen,0.025862


*Similar analysis at regional level.*

In [222]:
#map countries to region
regions = []
prev = population_data.iloc[0, :]
for i in range(len(population_data)):
    curr = population_data.iloc[i, :]
    prev = population_data.iloc[i-1, :]
    if curr["Type"] != "Country":
        regions.append(curr["Name"])
    else:
        regions.append(regions[-1])
population_data["region"] = regions
region_country_mapping = population_data[["Name", "region"]]

#regional 
region_pop_df_prop = percentage_per_population_df.merge(region_country_mapping, how="left", left_on="country", right_on="Name").drop(columns="Name")
region_pop_df = pd.DataFrame(region_pop_df_prop.groupby("region")[["article_count","population"]].sum()).reset_index()
region_pop_df["regional_prop"] =  region_pop_df["article_count"]/region_pop_df["population"] * 100
region_pop_df

Unnamed: 0,Name,region
0,WORLD,WORLD
1,AFRICA,AFRICA
2,NORTHERN AFRICA,NORTHERN AFRICA
3,Algeria,NORTHERN AFRICA
4,Egypt,NORTHERN AFRICA
...,...,...
229,Samoa,OCEANIA
230,Solomon Islands,OCEANIA
231,Tonga,OCEANIA
232,Tuvalu,OCEANIA


In [240]:
temp_region_df = high_quality_articles_df.merge(region_country_mapping, how="left", left_on="country", right_on="Name").drop(columns="Name")
region_quality_df = temp_region_df.groupby("region")[["num_fa_ga", "num_articles"]].sum().reset_index()
region_quality_df["region_quality_prop"] = region_quality_df["num_fa_ga"]/region_quality_df["num_articles"]
region_quality_df

Unnamed: 0,region,num_fa_ga,num_articles,region_quality_prop
0,CARIBBEAN,13,552,0.023551
1,CENTRAL AMERICA,23,1380,0.016667
2,CENTRAL ASIA,7,135,0.051852
3,Channel Islands,102,3046,0.033487
4,EAST ASIA,76,2473,0.030732
5,EASTERN AFRICA,35,2294,0.015257
6,EASTERN EUROPE,118,3311,0.035639
7,MIDDLE AFRICA,16,538,0.02974
8,NORTHERN AFRICA,19,761,0.024967
9,NORTHERN AMERICA,104,1901,0.054708


## Step 6: Results

### 1. Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [218]:
percentage_per_population_df.sort_values(by='percentage_per_population', ascending=False)[:10]

Unnamed: 0,country,article_count,population,percentage_per_population
169,Tuvalu,54,10000,0.54
117,Nauru,52,11000,0.472727
138,San Marino,81,34000,0.238235
110,Monaco,40,38000,0.105263
95,Liechtenstein,28,39000,0.071795
104,Marshall Islands,37,57000,0.064912
164,Tonga,63,99000,0.063636
70,Iceland,201,368000,0.05462
3,Andorra,34,82000,0.041463
52,Federated States of Micronesia,36,106000,0.033962


### 2. Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population 

In [219]:
percentage_per_population_df.sort_values(by='percentage_per_population', ascending=False)[-10:]

Unnamed: 0,country,article_count,population,percentage_per_population
13,Bangladesh,317,169809000,0.000187
114,Mozambique,58,31166000,0.000186
162,Thailand,112,66534000,0.000168
84,"Korea, North",36,25779000,0.00014
181,Zambia,25,18384000,0.000136
51,Ethiopia,101,114916000,8.8e-05
176,Uzbekistan,28,34174000,8.2e-05
34,China,1129,1402385000,8.1e-05
72,Indonesia,209,271739000,7.7e-05
71,India,968,1400100000,6.9e-05


### 3. Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality  

In [242]:
high_quality_articles_prop_df.sort_values(by='high_quality_proportion', ascending=False)[:10]

Unnamed: 0,country,high_quality_proportion
63,"Korea, North",0.222222
109,Saudi Arabia,0.128205
106,Romania,0.122449
23,Central African Republic,0.121212
140,Uzbekistan,0.107143
82,Mauritania,0.104167
46,Guatemala,0.084337
33,Dominica,0.083333
125,Syria,0.078125
11,Benin,0.076923


### 4. Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality 

In [243]:
high_quality_articles_prop_df.sort_values(by='high_quality_proportion', ascending=False)[-10:]

Unnamed: 0,country,high_quality_proportion
87,Morocco,0.004854
73,Lithuania,0.004098
27,Colombia,0.003509
104,Portugal,0.003145
94,Nigeria,0.002959
101,Peru,0.002857
89,Nepal,0.002809
124,Switzerland,0.002488
128,Tanzania,0.002475
10,Belgium,0.001927


### 5. Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population  

In [225]:
pd.set_option("display.max_rows", 30, "display.max_columns", 30)
region_pop_df.sort_values(by="regional_prop", ascending=False).reset_index().drop(columns="index")

Unnamed: 0,region,article_count,population,regional_prop
0,OCEANIA,3126,42031000,0.007437
1,Channel Islands,3763,105680000,0.003561
2,SOUTHERN EUROPE,3710,151136000,0.002455
3,WESTERN EUROPE,4560,195479000,0.002333
4,CARIBBEAN,695,39056000,0.001779
5,EASTERN EUROPE,3732,281186000,0.001327
6,SOUTHERN AFRICA,634,66628000,0.000952
7,CENTRAL AMERICA,1543,162267000,0.000951
8,WESTERN ASIA,2563,272499000,0.000941
9,MIDDLE AFRICA,665,90189000,0.000737


### 6. Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality  

In [241]:
region_quality_df.sort_values(by="region_quality_prop", ascending=False).reset_index().drop(columns="index")

Unnamed: 0,region,num_fa_ga,num_articles,region_quality_prop
0,NORTHERN AMERICA,104,1901,0.054708
1,CENTRAL ASIA,7,135,0.051852
2,SOUTHEAST ASIA,73,2020,0.036139
3,EASTERN EUROPE,118,3311,0.035639
4,WESTERN ASIA,89,2521,0.035303
5,Channel Islands,102,3046,0.033487
6,EAST ASIA,76,2473,0.030732
7,MIDDLE AFRICA,16,538,0.02974
8,NORTHERN AFRICA,19,761,0.024967
9,CARIBBEAN,13,552,0.023551
