### Getting the data from csvs and putting it into a DataFrame

In [1]:
import csv
import pandas as pd
import math

# Load the two raw inputs: the article list and the per-country population table.
page_data = pd.read_csv('page_data.csv')
population_data = pd.read_csv('Population_Mid_2015.csv')

# Keep only the country name and its population.
# The 'Data' column is handled as text containing ',' separators, so the commas
# are stripped (as a literal, not a regex) before the numeric conversion.
population_data['country'] = population_data['Location']
population_data['Data'] = population_data['Data'].str.replace(',', '', regex=False)
# Convert strictly. errors='ignore' is deprecated in recent pandas and, on any
# unparseable value, silently hands back the whole column as strings, which only
# fails later when the population is used in a division. Failing here points
# straight at the offending input instead.
population_data['population'] = pd.to_numeric(population_data['Data'])
population_data = population_data[['country', 'population']]

# Inner join: articles whose country has no row in the population table are dropped.
overall_data = page_data.merge(population_data, on='country')
# Positional rename -- assumes page_data has exactly three columns ordered as
# (article name, country, revision id); TODO confirm against page_data.csv.
overall_data.columns = ['article_name', 'country', 'revision_id', 'population']


### Using ORES APIs to extract the quality of articles

In [2]:
import requests
import json

def get_ores_data(rev_ids, headers):
    """Fetch the ORES ``wp10`` quality prediction for a batch of revisions.

    Parameters
    ----------
    rev_ids : iterable
        Revision IDs to score. Each one is converted with ``str()`` and the
        results are joined with ``|`` into a single request.
    headers : dict
        HTTP headers sent with the request (e.g. ``User-Agent`` / ``From``).

    Returns
    -------
    list of str
        Exactly one entry per element of ``rev_ids``, in the same order: the
        ``score.prediction`` value reported for that revision, or
        ``'No Revision'`` when the response holds an error, or no score, for it.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    project = 'enwiki'
    model = 'wp10'

    # Materialise the IDs once: they are needed both to build the URL and to
    # look the results up again in request order.
    rev_id_strs = [str(x) for x in rev_ids]
    if not rev_id_strs:
        # Nothing to score; avoid sending a request with an empty revids list.
        return []

    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    # Specify the parameters - all revision IDs are joined together with | marks.
    params = {'project' : project,
              'model'   : model,
              'revids'  : '|'.join(rev_id_strs)
              }
    # The headers must actually be passed along, otherwise the caller's
    # identification is never sent to the service.
    api_call = requests.get(endpoint.format(**params), headers=headers)
    api_call.raise_for_status()
    scores = api_call.json()[project]['scores']

    # The response is a mapping keyed by revision ID; its key order is not
    # guaranteed to match the request. Callers assign the returned list
    # positionally, so look every requested ID up explicitly.
    articlesQuality = []
    for rev_id in rev_id_strs:
        result = scores.get(rev_id, {}).get(model, {})
        if 'error' in result or 'score' not in result:
            articlesQuality.append('No Revision')
        else:
            articlesQuality.append(result['score']['prediction'])

    return articlesQuality


#### Iterating over the articles and obtaining corresponding article quality using ORES API

In [None]:
# Identification headers handed to every get_ores_data call.
headers = {'User-Agent' : 'https://github.com/r1rajiv92', 'From' : 'rajiv92@uw.edu'}

numRows = len(overall_data)
articlesQuality = []

# Walk over the articles in slices of 50 revisions so that the request URL
# stays short enough for the API call to work properly.
for start in range(0, numRows, 50):
    rev_ids = overall_data['revision_id'].iloc[start:start + 50]
    articlesQuality.extend(get_ores_data(rev_ids, headers))

# Attach the collected quality class to every article row (positional assignment).
overall_data['article_quality'] = articlesQuality

#### Calculating the number of articles per country and joining the population table to compute articles per population

In [7]:
# Count the articles of every country.
numArticlesPerCountry = (
    overall_data
    .groupby('country')
    .size()
    .reset_index(name='numArticles')
)

# Attach each country's population to its article count.
numArticlesPopulationPerCountry = pd.merge(numArticlesPerCountry, population_data, on='country')

# Articles per inhabitant, expressed as a percentage.
numArticlesPopulationPerCountry['pct articles per population'] = (
    numArticlesPopulationPerCountry['numArticles'] * 100
    / numArticlesPopulationPerCountry['population']
)

# Rank the countries from the highest to the lowest percentage.
numArticlesPopulationPerCountry = numArticlesPopulationPerCountry.sort_values(
    by='pct articles per population', ascending=False
)


#### 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [61]:
# The table is sorted in descending order, so its first ten rows are the top ten.
topTEN_articles_per_population = numArticlesPopulationPerCountry.head(10)
topTEN_articles_per_population

Unnamed: 0,country,numArticles,population,pct articles per population
120,Nauru,53,10860,0.488029
173,Tuvalu,55,11800,0.466102
141,San Marino,82,33000,0.248485
113,Monaco,40,38088,0.10502
97,Liechtenstein,29,37570,0.077189
107,Marshall Islands,37,55000,0.067273
72,Iceland,206,330828,0.062268
168,Tonga,63,103300,0.060987
3,Andorra,34,78000,0.04359
54,Federated States of Micronesia,38,103000,0.036893


#### 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [66]:
# The table is sorted in descending order, so its last ten rows are the bottom ten.
lowestTEN_articles_per_population = numArticlesPopulationPerCountry.tail(10)
lowestTEN_articles_per_population

Unnamed: 0,country,numArticles,population,pct articles per population
13,Bangladesh,324,160411000,0.000202
38,"Congo, Dem. Rep. of",142,73340200,0.000194
166,Thailand,112,65121250,0.000172
185,Zambia,26,15473900,0.000168
86,"Korea, North",39,24983000,0.000156
53,Ethiopia,105,98148000,0.000107
180,Uzbekistan,29,31290791,9.3e-05
74,Indonesia,215,255741973,8.4e-05
34,China,1138,1371920000,8.3e-05
73,India,990,1314097616,7.5e-05


#### Calculating percentage of GA/FA articles 

In [None]:
## Row-wise flag: is the article rated GA or FA?
def Is_GA_FA(row):
    """Return 1 when ``row['article_quality']`` equals 'FA' or 'GA', otherwise 0."""
    high_quality_classes = ('FA', 'GA')
    return 1 if row['article_quality'] in high_quality_classes else 0

# Flag every article row with 1 (GA/FA) or 0 (anything else).
overall_data['Is_GA_FA'] = overall_data.apply(Is_GA_FA, axis=1)

# Summing the flags per country gives the number of GA/FA articles.
num_GA_FA_acticles_per_country = (
    overall_data
    .groupby('country')['Is_GA_FA']
    .sum()
    .reset_index(name='num_GA_FA')
)

# Put the total article count and the GA/FA count side by side.
num_GA_FA_and_num_articles_per_country = pd.merge(
    numArticlesPerCountry, num_GA_FA_acticles_per_country, on='country'
)

# Share of a country's articles that are GA/FA, as a percentage.
num_GA_FA_and_num_articles_per_country['pct_GA_FA_articles'] = (
    num_GA_FA_and_num_articles_per_country['num_GA_FA'] * 100
    / num_GA_FA_and_num_articles_per_country['numArticles']
)

# Rank the countries from the highest to the lowest percentage.
num_GA_FA_and_num_articles_per_country = num_GA_FA_and_num_articles_per_country.sort_values(
    by='pct_GA_FA_articles', ascending=False
)

#### 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [79]:
# First ten rows of the descending-sorted table: highest GA/FA percentages.
num_GA_FA_and_num_articles_per_country.head(10)

Unnamed: 0,country,numArticles,num_GA_FA,pct_GA_FA_articles
50,Equatorial Guinea,32,5,15.625
86,"Korea, North",39,5,12.820513
138,Romania,348,41,11.781609
164,Tajikistan,40,4,10.0
18,Benin,94,9,9.574468
183,Vietnam,191,17,8.900524
143,Saudi Arabia,119,10,8.403361
80,Jamaica,85,7,8.235294
59,Gabon,103,8,7.76699
178,United States,1098,81,7.377049


#### 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [80]:
# Last ten rows of the descending-sorted table: lowest GA/FA percentages.
num_GA_FA_and_num_articles_per_country.tail(10)

Unnamed: 0,country,numArticles,num_GA_FA,pct_GA_FA_articles
94,Lesotho,30,0,0.0
97,Liechtenstein,29,0,0.0
14,Barbados,14,0,0.0
100,Macedonia,65,0,0.0
159,Swaziland,32,0,0.0
17,Belize,16,0,0.0
36,Comoros,51,0,0.0
142,Sao Tome and Principe,22,0,0.0
30,Cape Verde,37,0,0.0
89,Kuwait,37,0,0.0


### Reflection on the work

I believe there is inherent bias in the dataset. Firstly, there is a huge population bias in the data. Countries such as India and China have very large populations (extremely skewed compared to the rest of the countries), and hence the number of articles per population we calculated for them is bound to be very low. Similarly, countries with very small populations appear among the top 10 highest-ranked countries by articles per population.

Also, since we are using ORES, Wikipedia's machine learning service, we are already biased by what Wikipedia thinks is a good or bad article. If the machine learning algorithm behind ORES was trained only on data from certain countries, then our results inherit that bias. The algorithm can also incorrectly categorize articles from countries it has not seen before, and hence make systematic mistakes that could make our conclusions invalid.

Question: How do we know if the ORES algorithm is doing a good job on our dataset to be sure of our results?
