# Bias in Data

In [172]:
import pandas as pd
import requests
import json

## Data Acquisition

#### Getting data from the CSV files

In [173]:
# Load the Wikipedia article metadata (page title, country, revision ID)

wiki_df = pd.read_csv(filepath_or_buffer='data/page_data.csv')
wiki_df.head(n=5)

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [174]:
# Load the per-country population data and expose its "Geography" column
# as "country", matching the column name used by the articles dataframe

country_df = (
    pd.read_csv('data/WPDS_2018_data.csv')
    .rename(columns={"Geography": "country"})
)
country_df.head(n=5)

Unnamed: 0,country,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


#### ORES API
#### The function below is adapted from this [GitHub repository](https://github.com/Ironholds/data-512-a2)

In [175]:
headers = {'User-Agent' : 'https://github.com/saylidighde', 'From' : 'sayli@uw.edu'}

def get_ores_data(revision_ids, headers):
    """Request ORES 'wp10' article-quality scores for a batch of revisions.

    Parameters
    ----------
    revision_ids : iterable
        Revision IDs to score. Each one is converted with str() and the
        results are joined with '|' into a single request.
    headers : dict
        HTTP headers to send with the request (e.g. a User-Agent / From pair
        identifying the caller).

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status code.

    Notes
    -----
    The quality predicted for an article is one of these six classes:
        FA    - Featured article
        GA    - Good article
        B     - B-class article
        C     - C-class article
        Start - Start-class article
        Stub  - Stub-class article

    Example
    -------
        get_ores_data([1, 2, 3], headers)
    """

    # Define API endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    # All revision IDs go into one request, separated by | marks
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }

    # Send the caller-supplied headers so the request identifies its origin
    api_call = requests.get(endpoint.format(**params), headers=headers)

    # Fail loudly on an HTTP error instead of decoding an error payload
    api_call.raise_for_status()

    return api_call.json()


In [176]:
def generate_chunkwise_ores_data(revision_ids_list, chunkSize):
    """Query ORES for the revision IDs in consecutive slices of chunkSize.

    The list is cut into slices of at most chunkSize IDs and one API call is
    made per slice, using the module-level ``headers``. The chunk size trades
    request size against the number of network calls.

    Returns a list holding one decoded response per slice, in slice order.
    """
    chunk_starts = range(0, len(revision_ids_list), chunkSize)
    return [
        get_ores_data(revision_ids_list[start:start + chunkSize], headers)
        for start in chunk_starts
    ]

In [177]:
def extract_article_quality(ores_data):
    """Flatten ORES responses into a single list of predicted quality labels.

    Parameters
    ----------
    ores_data : iterable of dict
        Decoded ORES responses, one per chunk of revision IDs. Each response
        is read as ``response['enwiki']['scores'][<revision id>]['wp10']``.

    Returns
    -------
    list of str
        One label per scored revision: the value found under
        ``['score']['prediction']``, or 'NA' when the revision's 'wp10' entry
        contains an 'error' key or is missing the prediction altogether.

    Raises
    ------
    KeyError
        If a response has no 'enwiki' or 'scores' key. A whole chunk without
        scores cannot be represented in the output, so this is not masked.

    Notes
    -----
    Labels are emitted in the iteration order of each response's 'scores'
    mapping, chunk after chunk.
    NOTE(review): callers that assign the result positionally rely on that
    order matching the order of the revision IDs they requested - confirm.
    """
    quality_list = []

    for response in ores_data:
        scores_by_revision = response['enwiki']['scores']

        for revision_scores in scores_by_revision.values():
            wp10_result = revision_scores.get('wp10') or {}

            # Mark article quality as NA for revision IDs returning an error
            # or carrying no usable prediction
            if 'error' in wp10_result:
                article_quality = 'NA'
            else:
                score = wp10_result.get('score') or {}
                article_quality = score.get('prediction', 'NA')

            quality_list.append(article_quality)

    return quality_list

In [178]:
# Pull the revision IDs out of the articles dataframe as a plain list
revision_ids_list = wiki_df.rev_id.tolist()

# Query ORES in batches of 100 revision IDs per request
ores_data = generate_chunkwise_ores_data(revision_ids_list, chunkSize=100)

# Flatten the per-batch responses into one list of quality labels
quality_list = extract_article_quality(ores_data)


In [179]:
# Attach the predicted quality labels to the articles dataframe as a new column
wiki_df = wiki_df.assign(article_quality=quality_list)

wiki_df.head(n=5)

Unnamed: 0,page,country,rev_id,article_quality
0,Template:ZambiaProvincialMinisters,Zambia,235107991,
1,Bir I of Kanem,Chad,355319463,Stub
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046,Stub
3,Template:Uganda-politician-stub,Uganda,391862070,Stub
4,Template:Namibia-politician-stub,Namibia,391862409,Stub


## Data Pre-processing

In [180]:
# Keep only the articles that received a quality prediction.
# Rows labelled 'NA' (the API returned an error) are dropped, so some data is lost.

wiki_df = wiki_df.loc[wiki_df['article_quality'].ne('NA')]
print(wiki_df.shape)

# Build the final dataframe by joining articles and population data on 'country'.
# Rows present in only one of the two tables are dropped.

final_df = wiki_df.merge(country_df, on='country')
print(final_df.shape)

(47092, 4)
(44973, 5)


In [181]:
# Give the columns descriptive names
final_df = final_df.rename(
    columns={
        "page": "article_name",
        "Population mid-2018 (millions)": "population",
        "rev_id": "revision_id",
    }
)
print(final_df.head(n=5))

# Write the dataframe to a CSV file
final_df.to_csv('analyses_data.csv', index=False)


           article_name country  revision_id article_quality population
0        Bir I of Kanem    Chad    355319463            Stub       15.4
1  Abdullah II of Kanem    Chad    498683267            Stub       15.4
2   Salmama II of Kanem    Chad    565745353            Stub       15.4
3       Kuri I of Kanem    Chad    565745365            Stub       15.4
4   Mohammed I of Kanem    Chad    565745375            Stub       15.4


## Data Analysis

In [182]:
#Proportion (as a percentage) of articles-per-population

def generate_analysis_table(grouped_df):
    """Build the per-country analysis table.

    Parameters
    ----------
    grouped_df : pandas groupby object
        Articles grouped by country. Every group must provide the columns
        'population' (in millions, either a number or a string that may
        contain thousands separators such as "1,371.3") and 'article_quality'.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns:
        - 'country': the group key
        - 'total_articles': number of rows in the group
        - 'population (in millions)': the group's first 'population' value,
          stored exactly as found in the input
        - 'total_quality_articles': number of rows whose 'article_quality'
          is 'FA' or 'GA'
        - 'articles_per_population (%)': articles divided by the absolute
          population, as a percentage
        - 'quality_articles_proportion (%)': FA/GA articles divided by all
          articles of the group, as a percentage

    Raises
    ------
    ZeroDivisionError
        If a group's population evaluates to 0.
    """
    columns = ['country', 'total_articles',
               'population (in millions)', 'total_quality_articles',
               'articles_per_population (%)', 'quality_articles_proportion (%)']
    high_quality_classes = ['FA', 'GA']

    rows = []

    # Iterate over every country's group
    for country, group_value in grouped_df:
        no_of_articles = len(group_value)

        # Population is constant within a country, so the first row suffices
        raw_population = group_value['population'].iloc[0]

        # Go through str() so that both "1,371.3" and an already-numeric
        # value are accepted, then scale from millions to an absolute count
        population = float(str(raw_population).replace(",", "")) * 1000000

        no_of_quality_articles = int(
            group_value['article_quality'].isin(high_quality_classes).sum()
        )

        # Values are listed in the same order as `columns`
        rows.append((
            country,
            no_of_articles,
            raw_population,
            no_of_quality_articles,
            (no_of_articles / population) * 100,
            (no_of_quality_articles / no_of_articles) * 100,
        ))

    # Build the frame once from the collected rows; an empty input still
    # yields a frame with the expected columns
    return pd.DataFrame(rows, columns=columns)


In [183]:
# Group the final dataframe by country
grouped_df = final_df.groupby(by='country')

# Build the per-country analysis table from the groups
analysis_table = generate_analysis_table(grouped_df)
analysis_table.head(n=10)


Unnamed: 0,country,total_articles,population (in millions),total_quality_articles,articles_per_population (%),quality_articles_proportion (%)
0,Afghanistan,326,36.5,10,0.000893,3.067485
1,Albania,460,2.9,4,0.015862,0.869565
2,Algeria,119,42.7,2,0.000279,1.680672
3,Andorra,34,0.08,0,0.0425,0.0
4,Angola,110,30.4,0,0.000362,0.0
5,Antigua and Barbuda,25,0.1,0,0.025,0.0
6,Argentina,496,44.5,15,0.001115,3.024194
7,Armenia,198,3.0,5,0.0066,2.525253
8,Australia,1566,24.1,42,0.006498,2.681992
9,Austria,340,8.8,3,0.003864,0.882353


## Specialized Tables

#### 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [184]:
# Ten countries with the most articles relative to population
(
    analysis_table
    .sort_values(by='articles_per_population (%)', ascending=False)
    .head(n=10)
    .reset_index(drop=True)
)

Unnamed: 0,country,total_articles,population (in millions),total_quality_articles,articles_per_population (%),quality_articles_proportion (%)
0,Tuvalu,55,0.01,5,0.55,9.090909
1,Nauru,53,0.01,0,0.53,0.0
2,San Marino,82,0.03,0,0.273333,0.0
3,Monaco,40,0.04,0,0.1,0.0
4,Liechtenstein,29,0.04,0,0.0725,0.0
5,Tonga,63,0.1,1,0.063,1.587302
6,Marshall Islands,37,0.06,0,0.061667,0.0
7,Iceland,206,0.4,2,0.0515,0.970874
8,Andorra,34,0.08,0,0.0425,0.0
9,Federated States of Micronesia,38,0.1,0,0.038,0.0


#### 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [185]:
# Ten countries with the fewest articles relative to population
(
    analysis_table
    .sort_values(by='articles_per_population (%)', ascending=True)
    .head(n=10)
    .reset_index(drop=True)
)

Unnamed: 0,country,total_articles,population (in millions),total_quality_articles,articles_per_population (%),quality_articles_proportion (%)
0,India,986,1371.3,14,7.2e-05,1.419878
1,Indonesia,214,265.2,8,8.1e-05,3.738318
2,China,1135,1393.8,33,8.1e-05,2.907489
3,Uzbekistan,29,32.9,1,8.8e-05,3.448276
4,Ethiopia,105,107.5,1,9.8e-05,0.952381
5,Zambia,25,17.7,0,0.000141,0.0
6,"Korea, North",39,25.6,7,0.000152,17.948718
7,Thailand,112,66.2,3,0.000169,2.678571
8,Bangladesh,323,166.4,3,0.000194,0.928793
9,Mozambique,60,30.5,0,0.000197,0.0


#### 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [186]:
# Ten countries with the highest share of FA/GA articles
(
    analysis_table
    .sort_values(by='quality_articles_proportion (%)', ascending=False)
    .head(n=10)
    .reset_index(drop=True)
)

Unnamed: 0,country,total_articles,population (in millions),total_quality_articles,articles_per_population (%),quality_articles_proportion (%)
0,"Korea, North",39,25.6,7,0.000152,17.948718
1,Saudi Arabia,119,33.4,16,0.000356,13.445378
2,Central African Republic,68,4.7,8,0.001447,11.764706
3,Romania,348,19.5,40,0.001785,11.494253
4,Mauritania,52,4.5,5,0.001156,9.615385
5,Bhutan,33,0.8,3,0.004125,9.090909
6,Tuvalu,55,0.01,5,0.55,9.090909
7,Dominica,12,0.07,1,0.017143,8.333333
8,United States,1092,328.0,82,0.000333,7.509158
9,Benin,94,11.5,7,0.000817,7.446809


#### 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [187]:
# Ten countries with the lowest share of FA/GA articles
(
    analysis_table
    .sort_values(by='quality_articles_proportion (%)', ascending=True)
    .head(n=10)
    .reset_index(drop=True)
)

Unnamed: 0,country,total_articles,population (in millions),total_quality_articles,articles_per_population (%),quality_articles_proportion (%)
0,Sao Tome and Principe,22,0.2,0,0.011,0.0
1,Mozambique,60,30.5,0,0.000197,0.0
2,Cameroon,105,25.6,0,0.00041,0.0
3,Guyana,20,0.8,0,0.0025,0.0
4,Turkmenistan,33,5.9,0,0.000559,0.0
5,Monaco,40,0.04,0,0.1,0.0
6,Moldova,426,3.5,0,0.012171,0.0
7,Comoros,51,0.8,0,0.006375,0.0
8,Marshall Islands,37,0.06,0,0.061667,0.0
9,Costa Rica,150,5.0,0,0.003,0.0


In [188]:
# All ten lowest-ranked countries have 0% high-quality articles, so look further:
# the first 37 countries in this ordering have 0% high-quality articles.

(
    analysis_table
    .sort_values(by='quality_articles_proportion (%)', ascending=True)
    .head(n=37)
    .reset_index(drop=True)
)

Unnamed: 0,country,total_articles,population (in millions),total_quality_articles,articles_per_population (%),quality_articles_proportion (%)
0,Sao Tome and Principe,22,0.2,0,0.011,0.0
1,Mozambique,60,30.5,0,0.000197,0.0
2,Cameroon,105,25.6,0,0.00041,0.0
3,Guyana,20,0.8,0,0.0025,0.0
4,Turkmenistan,33,5.9,0,0.000559,0.0
5,Monaco,40,0.04,0,0.1,0.0
6,Moldova,426,3.5,0,0.012171,0.0
7,Comoros,51,0.8,0,0.006375,0.0
8,Marshall Islands,37,0.06,0,0.061667,0.0
9,Costa Rica,150,5.0,0,0.003,0.0


#### Statistics of the Analysis Table

In [189]:
analysis_table.describe()

Unnamed: 0,total_articles,total_quality_articles,articles_per_population (%),quality_articles_proportion (%)
count,180.0,180.0,180.0,180.0
mean,249.85,5.444444,0.013616,2.271225
std,290.009795,9.982791,0.06085,2.655663
min,12.0,0.0,7.2e-05,0.0
25%,63.75,1.0,0.000643,0.552783
50%,134.0,2.0,0.001862,1.529667
75%,339.25,6.0,0.004929,2.943345
max,1689.0,82.0,0.55,17.948718
