# Import Statements

In [1]:
import pandas as pd
import numpy as np
import requests
import json
from urllib.parse import urlencode
from IPython.core.interactiveshell import InteractiveShell

# This allows for multiple outputs in a single jupyter notebook codeblock.
InteractiveShell.ast_node_interactivity = "all"

# Aquire Data

The page_data.csv is the Wikimedia [politicians by county dataset](https://figshare.com/articles/Untitled_Item/5513449) and was downloaded on Figshare. I unzipped the folder and stored page_data.csv in the same working directory as my notebook. 

The [WPDS_2020_data.csv](https://docs.google.com/spreadsheets/d/1CFJO2zna2No5KqNm9rPK5PCACoXKzb-nycJFhV689Iw/edit?usp=sharing) file is published by Population Reference Bureau and can be drawn from the [world population data sheet](https://www.prb.org/international/indicator/population/table/). I downloaded the WPDS_2020_data.csv from the google spreadsheet and stored it in the same working directory as my notebook.


In [2]:
# use pandas to read the csv files as dataframes

# politicians by country (pbc)
pbc = pd.read_csv('raw_data/page_data.csv')
# world population data sheet (wpds)
wpds = pd.read_csv('raw_data/WPDS_2020_data.csv')

# Data cleaning

### Clean pbc data by removing the Template data.

These pages are not Wikipedia articles, and should not be included in the analysis.


In [3]:
pbc.shape
pbc = pbc[~pbc['page'].str.contains('Template:')]
pbc.shape

(47197, 3)

(46701, 3)

In [4]:
# sanity check data
pbc.head()

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568


### Clean WPDS data by separating out the cumulative regional population count rows. 

These rows are distinguished by having ALL CAPS values in the 'Name' field.

In [5]:
wpds.shape
wpds_orig = wpds
wpds_regional = wpds[wpds['Name'].str.isupper()]
wpds = wpds[~wpds['Name'].str.isupper()]
wpds.shape

(234, 6)

(210, 6)

# Get Article Quality Predictions

In [6]:
# I use requests package to make the calls. 
def api_call(endpoint):
    call = requests.get(endpoint)
    response = call.json()
    return response

In [7]:
endpoint = 'https://ores.wikimedia.org/v3/scores/{context}?'
context_value = 'enwiki'
model = 'articlequality'

pbc[model] = np.NaN
pbc.set_index('rev_id', inplace=True)
revids = pbc.index.to_list()

#batch the calls into batches of 50. 50 works for me, feel free to change this.

batched_revids = list(map(list, np.array_split(revids, round(len(revids)/50))))
revids_batched = np.array_split(revids, round(len(revids)/50))

#### Make multiple batch calls until all revids have been passed to ORES for a prediction.

This step takes about 10 minutes to run on my machine. YMMV.

In [8]:
no_match_revids = []

for batch in revids_batched:    
    
    parameters = {
        'revids': '|'.join(str(int(x)) for x in batch),
        'models': model
    }
    final_endpoint = endpoint.format(context=context_value) + urlencode(parameters)
    response = api_call(final_endpoint)
    try:
        scores = response[context_value]['scores']
    except:
        continue
    
    for revid in scores.keys():
        
        try:
            prediction = scores[revid][model]['score']['prediction']
        except:
            no_match_revids.append(revid)
            continue
        
        pbc.loc[int(revid), model] = prediction

## We save the articles with missing revids under articles_with_missing_revids.csv

In [9]:
no_match_df = pd.DataFrame(no_match_revids, columns=['rev_id'])
no_match_df.to_csv('processed_data/articles_with_missing_revids.csv')

# Combining the Datasets

I remove any rows that do not have matching data, and output them to the CSV file called:
wp_wpds_countries-no_match.csv

I consolidate the remaining data into a single CSV file called:
wp_wpds_politicians_by_country.csv

In [10]:
pbc.reset_index(inplace=True)

In [11]:
wpds_pbc = pd.merge(left=pbc, right=wpds, left_on='country', right_on='Name')

#rename columns to match requirements of the assignment
wpds_pbc.rename(columns={'page':'article_name', 
                         'rev_id':'revision_id',
                         'articlequality': 'article_quality_est',
                         'Population':'population'}, inplace=True)
wpds_pbc.drop(columns=['FIPS', 'Name', 'Type', 'TimeFrame', 'Data (M)'], inplace=True)

# write the combined data to csv.
wpds_pbc.to_csv('processed_data/wp_wpds_politicians_by_country.csv')

In [12]:
pbc_country_no_match = pbc[~pbc['country'].isin(wpds_pbc['country'])].country.unique()
wpds_country_no_match = wpds[~wpds['Name'].isin(wpds_pbc['country'])].Name.unique()
all_country_no_match = np.unique(np.append(pbc_country_no_match, wpds_country_no_match))

all_country_no_match_df = pd.DataFrame(all_country_no_match, columns=['country'])

#write the no-match data to a csv.
all_country_no_match_df.to_csv('processed_data/wp_wpds_countries-no_match.csv')

# Analysis

Pivot tables are used to summarize the data. I'm primarily interested in the occurance of articles and good articles compared to population size.

* FA - Featured article
* GA - Good article
* B - B-class article
* C - C-class article
* Start - Start-class article
* Stub - Stub-class article



In [13]:
# Article rankings are explained in the readme.
article_rankings = ['B', 'C', 'FA', 'GA', 'Start', 'Stub']
hq_article_rankings = ['FA', 'GA']

The pivot table below summarized the number of articles in each ORES category for each country in the data.

In [14]:
analysis_df = pd.pivot_table(wpds_pbc,
                             index=['country'],
                             columns=['article_quality_est'],
                             aggfunc={'article_quality_est': 'count'},
                             fill_value=0
                            )
analysis_df.columns = analysis_df.columns.droplevel() #clean up multilevel index
analysis_df.head()

article_quality_est,B,C,FA,GA,Start,Stub
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,8,46,1,12,99,153
Albania,3,59,0,3,147,244
Algeria,3,10,0,2,44,57
Andorra,0,2,0,0,8,24
Angola,2,6,0,0,23,74


I add population data to the table and calulate the various metrics I want to observe in the Results section

In [15]:
country_pop = wpds_pbc.groupby(['country'])['population'].mean()
analysis_df = pd.merge(left=analysis_df, 
                       right=country_pop, 
                       left_index=True, 
                       right_index=True)

analysis_df['article_count'] = analysis_df[article_rankings].sum(axis=1)
analysis_df['percent_articles_per_person'] = (analysis_df['article_count'] / analysis_df['population']) * 100
analysis_df['hq_article_count'] = analysis_df[hq_article_rankings].sum(axis=1)
analysis_df['percent_hq_articles_per_person'] = (analysis_df['hq_article_count'] / analysis_df['article_count']) * 100
analysis_df.head()

Unnamed: 0_level_0,B,C,FA,GA,Start,Stub,population,article_count,percent_articles_per_person,hq_article_count,percent_hq_articles_per_person
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,8,46,1,12,99,153,38928000,319,0.000819,13,4.075235
Albania,3,59,0,3,147,244,2838000,456,0.016068,3,0.657895
Algeria,3,10,0,2,44,57,44357000,116,0.000262,2,1.724138
Andorra,0,2,0,0,8,24,82000,34,0.041463,0,0.0
Angola,2,6,0,0,23,74,32522000,105,0.000323,0,0.0


Next I want to repeat this analysis but for regions instead of countries.

In [16]:
# This links countries to their respective region.
# Need to set region to the first value in wpds_orig
region = wpds_orig.Name[0]
regions = []
for i in range(len(wpds_orig)):
    if wpds_orig.iloc[i]['Type'] == 'Sub-Region':
        region = wpds_orig.iloc[i]['Name']
    regions.append(region)

wpds_orig['region'] = regions

In [17]:
# Merge the per country population and articles by country datatset
wpds_country_region = pd.merge(left=wpds_pbc,
                                            right=wpds_orig,
                                            left_on='country', 
                                            right_on='Name',
                                           how='left')



In [18]:
#Drop unneeded columns
wpds_country_region.drop(columns={'FIPS', 'Name', 'Type', 'TimeFrame', 'Data (M)', 'Population'}, inplace=True)


I add regional population data to the table and calulate the various metrics I want to observe in the Results section

In [19]:
regional_analysis_df = pd.pivot_table(wpds_country_region,
                             index=['region'],
                             columns=['article_quality_est'],
                             aggfunc={'article_quality_est': 'count'},
                             fill_value=0
                            )
regional_analysis_df.columns = regional_analysis_df.columns.droplevel()


region_pop_lookup = dict(zip(wpds_regional.Name, wpds_regional.Population))
regional_analysis_df['population'] = regional_analysis_df.index.map(region_pop_lookup)

In [20]:
regional_analysis_df['article_count'] = regional_analysis_df[article_rankings].sum(axis=1)
regional_analysis_df['percent_articles_per_person'] = (regional_analysis_df['article_count'] / regional_analysis_df['population']) * 100
regional_analysis_df['hq_article_count'] = regional_analysis_df[hq_article_rankings].sum(axis=1)
regional_analysis_df['percent_hq_articles_per_person'] = (regional_analysis_df['hq_article_count'] / regional_analysis_df['article_count']) * 100

# Results

### Top 10 countries by coverage: 

10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [21]:
analysis_df.nlargest(10,'percent_articles_per_person')[['population', 'article_count', 'percent_articles_per_person']]

Unnamed: 0_level_0,population,article_count,percent_articles_per_person
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuvalu,10000,54,0.54
Nauru,11000,52,0.472727
San Marino,34000,81,0.238235
Monaco,38000,40,0.105263
Liechtenstein,39000,28,0.071795
Marshall Islands,57000,37,0.064912
Tonga,99000,63,0.063636
Iceland,368000,201,0.05462
Andorra,82000,34,0.041463
Federated States of Micronesia,106000,36,0.033962


### Bottom 10 countries by coverage: 

10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [22]:
analysis_df.nsmallest(10,'percent_articles_per_person')[['population', 'article_count', 'percent_articles_per_person']]

Unnamed: 0_level_0,population,article_count,percent_articles_per_person
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
India,1400100000,967,6.9e-05
Indonesia,271739000,209,7.7e-05
China,1402385000,1126,8e-05
Uzbekistan,34174000,28,8.2e-05
Ethiopia,114916000,101,8.8e-05
Zambia,18384000,25,0.000136
"Korea, North",25779000,36,0.00014
Thailand,66534000,112,0.000168
Mozambique,31166000,58,0.000186
Bangladesh,169809000,317,0.000187


### Top 10 countries by relative quality: 

10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [23]:
analysis_df.nlargest(10,'percent_hq_articles_per_person')[['population', 'article_count','hq_article_count', 'percent_hq_articles_per_person']]

Unnamed: 0_level_0,population,article_count,hq_article_count,percent_hq_articles_per_person
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Korea, North",25779000,36,8,22.222222
Saudi Arabia,35041000,117,15,12.820513
Romania,19241000,343,42,12.244898
Central African Republic,4830000,66,8,12.121212
Uzbekistan,34174000,28,3,10.714286
Mauritania,4650000,48,5,10.416667
Guatemala,18066000,83,7,8.433735
Dominica,72000,12,1,8.333333
Syria,19398000,128,10,7.8125
Benin,12209000,91,7,7.692308


In [24]:
analysis_df.nsmallest(10,'percent_hq_articles_per_person')[['population', 'article_count','hq_article_count', 'percent_hq_articles_per_person']]

Unnamed: 0_level_0,population,article_count,hq_article_count,percent_hq_articles_per_person
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Andorra,82000,34,0,0.0
Angola,32522000,105,0,0.0
Antigua and Barbuda,98000,24,0,0.0
Bahamas,393000,20,0,0.0
Bahrain,1465000,42,0,0.0
Barbados,287000,14,0,0.0
Belize,419000,16,0,0.0
Cape Verde,556000,36,0,0.0
Comoros,870000,51,0,0.0
Costa Rica,5111000,147,0,0.0


### Top regions by coverage (all articles): 

Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population

In [25]:
regional_analysis_df.nlargest(len(regional_analysis_df),'percent_articles_per_person')[['population', 'article_count', 'percent_articles_per_person']]

article_quality_est,population,article_count,percent_articles_per_person
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OCEANIA,43155000.0,3126,0.007244
SOUTHERN EUROPE,153251000.0,3705,0.002418
WESTERN EUROPE,195479000.0,4559,0.002332
CARIBBEAN,43233000.0,695,0.001608
EASTERN EUROPE,291902000.0,3729,0.001277
SOUTHERN AFRICA,67732000.0,633,0.000935
WESTERN ASIA,280927000.0,2560,0.000911
CENTRAL AMERICA,178611000.0,1542,0.000863
SOUTH AMERICA,429191000.0,3031,0.000706
EASTERN AFRICA,444970000.0,2499,0.000562


### Top regions by coverage (quality articles): 

Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality

In [26]:
regional_analysis_df.nlargest(len(regional_analysis_df),'percent_hq_articles_per_person')[['population', 'article_count', 'percent_hq_articles_per_person']]

article_quality_est,population,article_count,percent_hq_articles_per_person
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NORTHERN AMERICA,368193000.0,1896,5.485232
SOUTHEAST ASIA,661845000.0,2017,3.619236
WESTERN ASIA,280927000.0,2560,3.476562
EASTERN EUROPE,291902000.0,3729,3.164387
EAST ASIA,1641063000.0,2469,3.078169
CENTRAL ASIA,74961000.0,245,2.857143
Channel Islands,,3761,2.712045
MIDDLE AFRICA,179757000.0,659,2.427921
NORTHERN AFRICA,244344000.0,898,2.115813
OCEANIA,43155000.0,3126,2.015355
