In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Data acquisition - politician wikipedia data

In [38]:
# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests

#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<shwet695@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}


# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}


## Loading Politician csv data

In [39]:
politician_df = pd.read_csv(r"./data/politicians_by_country_SEPT.2022.csv")
politician_df

Unnamed: 0,name,url,country
0,Shahjahan Noori,https://en.wikipedia.org/wiki/Shahjahan_Noori,Afghanistan
1,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan
2,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
3,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
4,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan
...,...,...,...
7579,Rekayi Tangwena,https://en.wikipedia.org/wiki/Rekayi_Tangwena,Zimbabwe
7580,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe
7581,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe
7582,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe


In [40]:
# No null data in politician_df
politician_df.isna().sum()

name       0
url        0
country    0
dtype: int64

In [41]:
# This is just a list of politicians Wikipedia article titles
ARTICLE_TITLES = list(politician_df.name)

In [51]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


# Calling API for article names in batches of 25
rev_id_data = []
for i in range(0, len(ARTICLE_TITLES), 25):
    data = request_pageinfo_per_article("|".join(ARTICLE_TITLES[i:i+25]))
    #print(data)
    rev_id_data.extend(list(data['query']['pages'].values()))

In [52]:
rev_id_data

[{'pageid': 65412901,
  'ns': 0,
  'title': 'Abas Basir',
  'contentmodel': 'wikitext',
  'pagelanguage': 'en',
  'pagelanguagehtmlcode': 'en',
  'pagelanguagedir': 'ltr',
  'touched': '2022-10-11T01:20:40Z',
  'lastrevid': 1098419766,
  'length': 19306,
  'talkid': 65415333,
  'fullurl': 'https://en.wikipedia.org/wiki/Abas_Basir',
  'editurl': 'https://en.wikipedia.org/w/index.php?title=Abas_Basir&action=edit',
  'canonicalurl': 'https://en.wikipedia.org/wiki/Abas_Basir'},
 {'pageid': 27428272,
  'ns': 0,
  'title': 'Abdul Baqi Turkistani',
  'contentmodel': 'wikitext',
  'pagelanguage': 'en',
  'pagelanguagehtmlcode': 'en',
  'pagelanguagedir': 'ltr',
  'touched': '2022-10-11T03:06:55Z',
  'lastrevid': 889226470,
  'length': 1297,
  'talkid': 27595416,
  'fullurl': 'https://en.wikipedia.org/wiki/Abdul_Baqi_Turkistani',
  'editurl': 'https://en.wikipedia.org/w/index.php?title=Abdul_Baqi_Turkistani&action=edit',
  'canonicalurl': 'https://en.wikipedia.org/wiki/Abdul_Baqi_Turkistani'},


In [57]:
# List to dataframe conversion and keeping only title & lastrevid information
rev_id_df = pd.DataFrame.from_dict(rev_id_data)[['title', 'lastrevid']]
rev_id_df.head()

Unnamed: 0,title,lastrevid
0,Abas Basir,1098420000.0
1,Abdul Baqi Turkistani,889226500.0
2,Abdul Ghafar Lakanwal,943562300.0
3,Abdul Rahim Ayoubi,1108886000.0
4,Ahmad Behzad,1103948000.0


In [58]:
rev_id_df.shape

(7584, 2)

In [59]:
# 7 titles with no revid.
rev_id_df.isna().sum()

title        0
lastrevid    7
dtype: int64

In [60]:
# dropping 7 titles with no revid.
rev_id_df.dropna(inplace=True)

In [61]:
rev_id_df.shape

(7577, 2)

# Data acquisition - ORES data scores for wikipedia articles on politicians

## Calling ORES API for wikipedia article quality prediction

In [62]:
# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests

#########
#
#    CONSTANTS
#

# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
#API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"
API_ORES_SCORE_PARAMS = "/scores/{context}/?models={model}&revids={revid}"

# Use some delays so that we do not hammer the API with our requests
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<shwet695@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022'
}

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}


#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [68]:
rev_id_df.lastrevid

0       1.098420e+09
1       8.892265e+08
2       9.435623e+08
3       1.108886e+09
4       1.103948e+09
            ...     
7579    9.042468e+08
7580    1.112149e+09
7581    1.095960e+09
7582    1.108127e+09
7583    1.073819e+09
Name: lastrevid, Length: 7577, dtype: float64

In [77]:
idx = []
all_scores = []
for i in range(0, len(ARTICLE_REVISIONS), 50):
    
    # joining 50 articles name in one string and then passing it through the API call function
    score = request_ores_score_per_article("|".join(ARTICLE_REVISIONS[i:i+50]))
    keys = list((score['enwiki']['scores'].keys()))

    # appending the output to the final list
    idx.extend(keys)
    #print(keys)
    all_scores.extend(list(map(lambda key: score['enwiki']['scores'][key]['articlequality']['score']['prediction'], keys)))
    

In [80]:
# Calling ORES API to fecth ORES scores and prediction for article revids in batches of 25

ARTICLE_REVISIONS = rev_id_df.lastrevid.astype(int).astype(str)

id_info = []
ores_data = []
for i in range(0, len(ARTICLE_REVISIONS), 25):
    data = request_ores_score_per_article("|".join(ARTICLE_REVISIONS[i:i+25]))
    #print(data)
    #print(data['enwiki']['scores'])
    keys_info = list((data['enwiki']['scores'].keys()))

    # appending each batch output to the final ores data
    id_info.extend(keys_info)
    #print(keys)
    ores_data.extend(list(map(lambda key: data['enwiki']['scores'][key]['articlequality']['score']['prediction'], keys_info)))
        

### The article quality estimates are, from best to worst:

FA - Featured article

GA - Good article

B - B-class article

C - C-class article

Start - Start-class article

Stub - Stub-class article

In [83]:
ores_data[:10]

['Stub',
 'Start',
 'Start',
 'Start',
 'Start',
 'Start',
 'Start',
 'B',
 'Start',
 'C']

In [84]:
# converting the list to dataframe
ores_df = pd.DataFrame(list(zip(id_info, ores_data)), columns=['lastrevid', 'score'])
ores_df['lastrevid'] = ores_df['lastrevid'].astype(int)
ores_df.head()

Unnamed: 0,lastrevid,score
0,1013838830,Stub
1,1038918070,Start
2,1060707209,Start
3,1069322182,Start
4,1082489593,Start


# Creating 1 dataframe from combining all data so far

In [285]:
# merging politician and rev id data
df = pd.merge(politician_df, rev_id_df, left_on='name', right_on='title', how='right')
df.head()

Unnamed: 0,name,url,country,title,lastrevid
0,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Abas Basir,1098420000.0
1,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Abdul Baqi Turkistani,889226500.0
2,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Abdul Ghafar Lakanwal,943562300.0
3,Abdul Rahim Ayoubi,https://en.wikipedia.org/wiki/Abdul_Rahim_Ayoubi,Afghanistan,Abdul Rahim Ayoubi,1108886000.0
4,Ahmad Behzad,https://en.wikipedia.org/wiki/Ahmad_Behzad,Afghanistan,Ahmad Behzad,1103948000.0


In [286]:
df = pd.merge(df, ores_df, on='lastrevid')
df.head()

Unnamed: 0,name,url,country,title,lastrevid,score
0,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Abas Basir,1098420000.0,C
1,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub
2,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start
3,Abdul Rahim Ayoubi,https://en.wikipedia.org/wiki/Abdul_Rahim_Ayoubi,Afghanistan,Abdul Rahim Ayoubi,1108886000.0,Start
4,Ahmad Behzad,https://en.wikipedia.org/wiki/Ahmad_Behzad,Afghanistan,Ahmad Behzad,1103948000.0,Start


In [287]:
df.shape

(7919, 6)

In [288]:
# No null data
df.isna().sum()

name         0
url          0
country      0
title        0
lastrevid    0
score        0
dtype: int64

## Loading population data

In [289]:
population_df = pd.read_csv(r"./data/population_by_country_2022.csv")
population_df

Unnamed: 0,Geography,Population (millions)
0,WORLD,7963.0
1,AFRICA,1419.0
2,NORTHERN AFRICA,251.0
3,Algeria,44.9
4,Egypt,103.5
...,...,...
228,Samoa,0.2
229,Solomon Islands,0.7
230,Tonga,0.1
231,Tuvalu,0.0


In [290]:
population_df.shape

(233, 2)

Please note: The population_by_country_2022.csv contains some rows that provide cumulative regional population counts. 

These rows are distinguished by having ALL CAPS values in the 'geography' field (e.g. AFRICA, OCEANIA). These rows won't match the country values in politicians_by_country.SEPT.2022.csv, but we will retain some of them so that we can report coverage and quality by region as specified in the analysis section below.

In [291]:
# Adding continents/region info to each country data row
population_df['region'] = population_df.apply(lambda x: x['Geography'] if x['Geography'].isupper() else None, axis = 1)
population_df['region'] = population_df['region'].fillna(method="ffill")
population_df

Unnamed: 0,Geography,Population (millions),region
0,WORLD,7963.0,WORLD
1,AFRICA,1419.0,AFRICA
2,NORTHERN AFRICA,251.0,NORTHERN AFRICA
3,Algeria,44.9,NORTHERN AFRICA
4,Egypt,103.5,NORTHERN AFRICA
...,...,...,...
228,Samoa,0.2,OCEANIA
229,Solomon Islands,0.7,OCEANIA
230,Tonga,0.1,OCEANIA
231,Tuvalu,0.0,OCEANIA


In [292]:
#drop rows in Geography column which have continent/region info
population_df.drop(population_df[population_df['Geography'].str.isupper()].index, inplace = True)
population_df.reset_index(drop=True, inplace = True)

In [293]:
population_df

Unnamed: 0,Geography,Population (millions),region
0,Algeria,44.9,NORTHERN AFRICA
1,Egypt,103.5,NORTHERN AFRICA
2,Libya,6.8,NORTHERN AFRICA
3,Morocco,36.7,NORTHERN AFRICA
4,Sudan,46.9,NORTHERN AFRICA
...,...,...,...
204,Samoa,0.2,OCEANIA
205,Solomon Islands,0.7,OCEANIA
206,Tonga,0.1,OCEANIA
207,Tuvalu,0.0,OCEANIA


In [294]:
population_df.isna().sum()

Geography                0
Population (millions)    0
region                   0
dtype: int64

### Group by region to sum population

In [295]:
region_population = population_df.groupby('region')['Population (millions)'].sum().reset_index().rename(columns = {'Population (millions)':'region_population'})
region_population

Unnamed: 0,region,region_population
0,CARIBBEAN,43.5
1,CENTRAL AMERICA,177.9
2,CENTRAL ASIA,78.0
3,EAST ASIA,1673.9
4,EASTERN AFRICA,472.8
5,EASTERN EUROPE,287.4
6,MIDDLE AFRICA,196.1
7,NORTHERN AFRICA,251.2
8,NORTHERN AMERICA,371.6
9,NORTHERN EUROPE,106.5


In [296]:
region_population.isna().sum()

region               0
region_population    0
dtype: int64

In [297]:
# merging region_population data with country population dataframe
population_df = population_df.merge(region_population, on = 'region', how='left')
population_df.head()

Unnamed: 0,Geography,Population (millions),region,region_population
0,Algeria,44.9,NORTHERN AFRICA,251.2
1,Egypt,103.5,NORTHERN AFRICA,251.2
2,Libya,6.8,NORTHERN AFRICA,251.2
3,Morocco,36.7,NORTHERN AFRICA,251.2
4,Sudan,46.9,NORTHERN AFRICA,251.2


In [298]:
population_df.isna().sum()

Geography                0
Population (millions)    0
region                   0
region_population        0
dtype: int64

### COMBINING ALL DATAFRAMES

In [345]:
df_final = pd.merge(df, population_df, left_on='country', right_on='Geography', how='outer')
df_final.head()

Unnamed: 0,name,url,country,title,lastrevid,score,Geography,Population (millions),region,region_population
0,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Abas Basir,1098420000.0,C,Afghanistan,41.1,SOUTH ASIA,2008.6
1,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Abdul Baqi Turkistani,889226500.0,Stub,Afghanistan,41.1,SOUTH ASIA,2008.6
2,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Abdul Ghafar Lakanwal,943562300.0,Start,Afghanistan,41.1,SOUTH ASIA,2008.6
3,Abdul Rahim Ayoubi,https://en.wikipedia.org/wiki/Abdul_Rahim_Ayoubi,Afghanistan,Abdul Rahim Ayoubi,1108886000.0,Start,Afghanistan,41.1,SOUTH ASIA,2008.6
4,Ahmad Behzad,https://en.wikipedia.org/wiki/Ahmad_Behzad,Afghanistan,Ahmad Behzad,1103948000.0,Start,Afghanistan,41.1,SOUTH ASIA,2008.6


In [346]:
df_final.shape

(7944, 10)

In [347]:
df_final.isna().sum()

name                     25
url                      25
country                  25
title                    25
lastrevid                25
score                    25
Geography                76
Population (millions)    76
region                   76
region_population        76
dtype: int64

## Identifying all countries for which there are no matches 

In [348]:
df_final[df_final.isna().any(axis=1)]['country'].dropna().unique().tolist()

['Korean']

In [349]:
df_final[df_final.isna().any(axis=1)]['Geography'].dropna().unique().tolist()

['Western Sahara',
 'Mauritius',
 'Mayotte',
 'Reunion',
 'Sao Tome and Principe',
 'eSwatini',
 'Canada',
 'United States',
 'Curacao',
 'Guadeloupe',
 'Martinique',
 'Puerto Rico',
 'French Guiana',
 'Brunei',
 'Philippines',
 'China,  Hong Kong SAR',
 'China,  Macao SAR',
 'Ireland',
 'United Kingdom',
 'Australia',
 'French Polynesia',
 'Guam',
 'Kiribati',
 'New Caledonia',
 'New Zealand']

In [350]:
no_countrymatches = df_final[df_final.isna().any(axis=1)]['country'].dropna().unique().tolist() + df_final[df_final.isna().any(axis=1)]['Geography'].dropna().unique().tolist()
no_countrymatches

['Korean',
 'Western Sahara',
 'Mauritius',
 'Mayotte',
 'Reunion',
 'Sao Tome and Principe',
 'eSwatini',
 'Canada',
 'United States',
 'Curacao',
 'Guadeloupe',
 'Martinique',
 'Puerto Rico',
 'French Guiana',
 'Brunei',
 'Philippines',
 'China,  Hong Kong SAR',
 'China,  Macao SAR',
 'Ireland',
 'United Kingdom',
 'Australia',
 'French Polynesia',
 'Guam',
 'Kiribati',
 'New Caledonia',
 'New Zealand']

## Saving output of list of these countries, with each country on a separate line called -
wp_countries-no_match.txt

In [351]:
with open(r'./output/wp_countries-no_match.txt', 'w') as fp:
    for i in no_countrymatches:
        fp.write("%s\n" % i)

## Final dataframe -
wp_politicians_by_country.csv

In [352]:
## dropping rows with null information
df_final.dropna(subset=['name'], inplace = True)
df_final.isna().sum()

name                      0
url                       0
country                   0
title                     0
lastrevid                 0
score                     0
Geography                76
Population (millions)    76
region                   76
region_population        76
dtype: int64

In [353]:
df_final = df_final[['country','region','Population (millions)','region_population','title','lastrevid','score']]
df_final.head()

Unnamed: 0,country,region,Population (millions),region_population,title,lastrevid,score
0,Afghanistan,SOUTH ASIA,41.1,2008.6,Abas Basir,1098420000.0,C
1,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Baqi Turkistani,889226500.0,Stub
2,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Ghafar Lakanwal,943562300.0,Start
3,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Rahim Ayoubi,1108886000.0,Start
4,Afghanistan,SOUTH ASIA,41.1,2008.6,Ahmad Behzad,1103948000.0,Start


In [354]:
df_final.columns = ['country','region','population','region_population','article_title','revision_id','article_quality']
df_final.head()

Unnamed: 0,country,region,population,region_population,article_title,revision_id,article_quality
0,Afghanistan,SOUTH ASIA,41.1,2008.6,Abas Basir,1098420000.0,C
1,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Baqi Turkistani,889226500.0,Stub
2,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Ghafar Lakanwal,943562300.0,Start
3,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Rahim Ayoubi,1108886000.0,Start
4,Afghanistan,SOUTH ASIA,41.1,2008.6,Ahmad Behzad,1103948000.0,Start


In [355]:
df_final.isna().sum()

country               0
region               76
population           76
region_population    76
article_title         0
revision_id           0
article_quality       0
dtype: int64

In [356]:
## Region & population is null for country name "Korean"
df_final[df_final['region'].isna()]['country'].unique()

array(['Korean'], dtype=object)

It is difficult to identify population of country name "Korean", it is North Korea/South Korea.

Therefore we will drop rows with Korean country name

In [357]:
df_final.dropna(inplace = True)

In [358]:
df_final

Unnamed: 0,country,region,population,region_population,article_title,revision_id,article_quality
0,Afghanistan,SOUTH ASIA,41.1,2008.6,Abas Basir,1.098420e+09,C
1,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Baqi Turkistani,8.892265e+08,Stub
2,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Ghafar Lakanwal,9.435623e+08,Start
3,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Rahim Ayoubi,1.108886e+09,Start
4,Afghanistan,SOUTH ASIA,41.1,2008.6,Ahmad Behzad,1.103948e+09,Start
...,...,...,...,...,...,...,...
7914,Zimbabwe,EASTERN AFRICA,16.3,472.8,Langton Towungana,9.042468e+08,Stub
7915,Zimbabwe,EASTERN AFRICA,16.3,472.8,Leopold Takawira,1.112149e+09,Start
7916,Zimbabwe,EASTERN AFRICA,16.3,472.8,Maxwell Zeb Shumba,1.095960e+09,Start
7917,Zimbabwe,EASTERN AFRICA,16.3,472.8,Nkululeko Mkastos Sibanda,1.108127e+09,Start


## Handling duplicate article entries

In [359]:
df_final[df_final.article_title.duplicated()]

Unnamed: 0,country,region,population,region_population,article_title,revision_id,article_quality
195,Albania,SOUTHERN EUROPE,2.8,150.9,Visar Ymeri,1.036757e+09,Stub
196,Albania,SOUTHERN EUROPE,2.8,150.9,Visar Ymeri,1.036757e+09,Stub
197,Albania,SOUTHERN EUROPE,2.8,150.9,Visar Ymeri,1.036757e+09,Stub
204,Kosovo,SOUTHERN EUROPE,1.8,150.9,Visar Ymeri,1.036757e+09,Stub
205,Kosovo,SOUTHERN EUROPE,1.8,150.9,Visar Ymeri,1.036757e+09,Stub
...,...,...,...,...,...,...,...
6315,Suriname,SOUTH AMERICA,0.6,433.9,Grace Schneiders-Howard,1.095801e+09,GA
6932,Ukraine,EASTERN EUROPE,41.0,287.4,Sergey Abisov,1.113304e+09,Start
6933,Ukraine,EASTERN EUROPE,41.0,287.4,Sergey Abisov,1.113304e+09,Start
6934,Ukraine,EASTERN EUROPE,41.0,287.4,Sergey Abisov,1.113304e+09,Start


In [360]:
#dropping duplicate titles
df_final.drop_duplicates(subset = ['article_title'], inplace = True)

In [361]:
df_final[df_final.duplicated()]

Unnamed: 0,country,region,population,region_population,article_title,revision_id,article_quality


In [362]:
# saving the final dataframe to csv
df_final.to_csv('./output/wp_politicians_by_country.csv')

# ANALYSIS

In [373]:
df_final.head()

Unnamed: 0,country,region,population,region_population,article_title,revision_id,article_quality
0,Afghanistan,SOUTH ASIA,41.1,2008.6,Abas Basir,1098420000.0,C
1,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Baqi Turkistani,889226500.0,Stub
2,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Ghafar Lakanwal,943562300.0,Start
3,Afghanistan,SOUTH ASIA,41.1,2008.6,Abdul Rahim Ayoubi,1108886000.0,Start
4,Afghanistan,SOUTH ASIA,41.1,2008.6,Ahmad Behzad,1103948000.0,Start


## Calculating total-articles-per-population
(a ratio representing the number of articles per population)

population in millions so we multiply population by 10^6

All of these values are “per capita”.

In [374]:
#By country
ratio_of_articles_per_population_c = (df_final.groupby('country')['article_title'].count() / (df_final.groupby('country')['population'].mean() * 1000000)).to_frame(name='ratio_of_articles_per_population_country').reset_index()
ratio_of_articles_per_population_c.head()


Unnamed: 0,country,ratio_of_articles_per_population_country
0,Afghanistan,2.871046e-06
1,Albania,2.964286e-05
2,Algeria,7.572383e-07
3,Andorra,0.0001
4,Angola,1.179775e-06


In [375]:
#By region
ratio_of_articles_per_population_r = (df_final.groupby('region')['article_title'].count() / (df_final.groupby('region')['region_population'].mean() * 1000000)).to_frame(name='ratio_of_articles_per_population_region').reset_index()
ratio_of_articles_per_population_r.head()


Unnamed: 0,region,ratio_of_articles_per_population_region
0,CARIBBEAN,4.62069e-06
1,CENTRAL AMERICA,1.079258e-06
2,CENTRAL ASIA,1.320513e-06
3,EAST ASIA,1.469622e-07
4,EASTERN AFRICA,1.366328e-06


## High-quality-articles-per-population
(a ratio representing the number of high quality articles per person)

All of these values are “per capita”.

### High quality articles - 

FA - Featured article

GA - Good article

In [376]:
# high quality articles dataframe
high_quality_articles_df = df_final[(df_final['article_quality'] == 'FA') | (df_final['article_quality'] == 'GA')]

# high quality articles by country
high_quality_articles_country = (high_quality_articles_df.groupby('country')['article_title'].count() / (high_quality_articles_df.groupby('country')['population'].mean() * 1000000)).to_frame(name='high_quality_article_ratio_countrywise').reset_index()
high_quality_articles_country.head()

Unnamed: 0,country,high_quality_article_ratio_countrywise
0,Afghanistan,1.459854e-07
1,Albania,2.142857e-06
2,Andorra,2e-05
3,Armenia,3.333333e-07
4,Azerbaijan,9.803922e-08


In [377]:
# high quality articles by region
high_quality_articles_region = (high_quality_articles_df.groupby('region')['article_title'].count() / (high_quality_articles_df.groupby('region')['region_population'].mean() * 1000000)).to_frame(name='high_quality_article_ratio_regionwise').reset_index()
high_quality_articles_region.head()

Unnamed: 0,region,high_quality_article_ratio_regionwise
0,CARIBBEAN,1.83908e-07
1,CENTRAL AMERICA,5.621135e-08
2,CENTRAL ASIA,3.846154e-08
3,EAST ASIA,9.558516e-09
4,EASTERN AFRICA,3.172589e-08


# RESULTS

## Top 10 countries by coverage:
The 10 countries with the highest total articles per capita (in descending order)

In [378]:
ratio_of_articles_per_population_c.sort_values(by = ['ratio_of_articles_per_population_country'], ascending = False).replace([np.inf, -np.inf], np.nan).dropna().head(10)


Unnamed: 0,country,ratio_of_articles_per_population_country
5,Antigua and Barbuda,0.00017
54,Federated States of Micronesia,0.00013
3,Andorra,0.0001
13,Barbados,9.3e-05
104,Marshall Islands,9e-05
143,Seychelles,6e-05
110,Montenegro,5.5e-05
97,Luxembourg,5.3e-05
18,Bhutan,5.1e-05
64,Grenada,5e-05


## Bottom 10 countries by coverage:
The 10 countries with the lowest total articles per capita (in ascending order)

In [379]:
ratio_of_articles_per_population_c.sort_values(by = ['ratio_of_articles_per_population_country'], ascending = True).head(10)


Unnamed: 0,country,ratio_of_articles_per_population_country
32,China,1.392176e-09
106,Mexico,7.843137e-09
140,Saudi Arabia,8.174387e-08
134,Romania,1.052632e-07
73,India,1.248942e-07
153,Sri Lanka,1.339286e-07
48,Egypt,1.352657e-07
53,Ethiopia,1.944895e-07
161,Taiwan,2.155172e-07
180,Vietnam,2.716298e-07


## Top 10 countries by high quality:
The 10 countries with the highest high quality articles per capita (in descending order)

In [380]:
high_quality_articles_country.sort_values(by = ['high_quality_article_ratio_countrywise'], ascending = False).replace([np.inf, -np.inf], np.nan).dropna().head(10)


Unnamed: 0,country,high_quality_article_ratio_countrywise
2,Andorra,2e-05
53,Montenegro,5e-06
1,Albania,2.142857e-06
9,Bosnia-Herzegovina,1.470588e-06
49,Lithuania,1.071429e-06
19,Croatia,1.052632e-06
74,Slovenia,9.52381e-07
61,Palestinian Territory,9.259259e-07
28,Gabon,8.333333e-07
25,Estonia,7.692308e-07


## Bottom 10 countries by high quality:
The 10 countries with the lowest high quality articles per capita (in ascending order).

In [381]:
high_quality_articles_country.sort_values(by = ['high_quality_article_ratio_countrywise'], ascending = True).head(10)


Unnamed: 0,country,high_quality_article_ratio_countrywise
35,India,4.2337e-09
83,Thailand,1.497006e-08
39,Japan,1.601281e-08
58,Nigeria,1.830664e-08
90,Vietnam,2.012072e-08
17,Colombia,2.03666e-08
86,Uganda,2.118644e-08
60,Pakistan,2.120441e-08
79,Sudan,2.132196e-08
37,Iran,2.257336e-08


## Geographic regions by total coverage:
A rank ordered list of geographic regions (in descending order) by total articles per capita.

In [384]:
ratio_of_articles_per_population_r.sort_values(by = ['ratio_of_articles_per_population_region'], ascending = False)

Unnamed: 0,region,ratio_of_articles_per_population_region
14,SOUTHERN EUROPE,5.805169e-06
0,CARIBBEAN,4.62069e-06
17,WESTERN EUROPE,3.539868e-06
5,EASTERN EUROPE,2.522617e-06
8,NORTHERN EUROPE,2.441315e-06
16,WESTERN ASIA,2.324159e-06
9,OCEANIA,1.977011e-06
13,SOUTHERN AFRICA,1.688312e-06
4,EASTERN AFRICA,1.366328e-06
10,SOUTH AMERICA,1.327495e-06


## Geographic regions by high quality coverage:
Rank ordered list of geographic regions (in descending order) by high quality articles per capita.

In [385]:
high_quality_articles_region.sort_values(by = ['high_quality_article_ratio_regionwise'], ascending = False)

Unnamed: 0,region,high_quality_article_ratio_regionwise
14,SOUTHERN EUROPE,3.048376e-07
0,CARIBBEAN,1.83908e-07
5,EASTERN EUROPE,1.322199e-07
17,WESTERN EUROPE,1.117318e-07
16,WESTERN ASIA,9.514101e-08
8,NORTHERN EUROPE,7.511737e-08
13,SOUTHERN AFRICA,5.772006e-08
1,CENTRAL AMERICA,5.621135e-08
9,OCEANIA,4.597701e-08
2,CENTRAL ASIA,3.846154e-08
