# Considering Bias in Data - Homework: 2

## Data loading

In [3]:
# Importing libraries

import pandas as pd
import requests
import os
from tqdm.notebook import tqdm  # for progress bars
import json



## Step 1: Getting the Article and Population Data

### Population Data

In [9]:
# Load the politicians by country dataset
politicians_df = pd.read_csv('politicians_by_country_AUG.2024.csv')

# Load the population by country dataset
population_df = pd.read_csv('population_by_country_AUG.2024.csv')

# Rename columns for population dataset for consistency
population_df = population_df.rename(columns={'Geography': 'country', 'Population': 'population'})

# Preview the datasets
print(politicians_df.head())
print(population_df.head())


                   name                                                url  \
0        Majah Ha Adrif       https://en.wikipedia.org/wiki/Majah_Ha_Adrif   
1     Haroon al-Afghani    https://en.wikipedia.org/wiki/Haroon_al-Afghani   
2           Tayyab Agha          https://en.wikipedia.org/wiki/Tayyab_Agha   
3  Khadija Zahra Ahmadi  https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...   
4        Aziza Ahmadyar       https://en.wikipedia.org/wiki/Aziza_Ahmadyar   

       country  
0  Afghanistan  
1  Afghanistan  
2  Afghanistan  
3  Afghanistan  
4  Afghanistan  
           country  population
0            WORLD      8009.0
1           AFRICA      1453.0
2  NORTHERN AFRICA       256.0
3          Algeria        46.8
4            Egypt       105.2


In [10]:
# Split population data into country and region-level data
region_population_df = population_df[population_df['country'].str.isupper()]
country_population_df = population_df[~population_df['country'].str.isupper()]

# Preview region and country level data
print("Region level data:")
print(region_population_df.head())
print("Country level data:")
print(country_population_df.head())

Region level data:
            country  population
0             WORLD      8009.0
1            AFRICA      1453.0
2   NORTHERN AFRICA       256.0
10   WESTERN AFRICA       442.0
27   EASTERN AFRICA       483.0
Country level data:
   country  population
3  Algeria        46.8
4    Egypt       105.2
5    Libya         6.9
6  Morocco        37.0
7    Sudan        48.1


### Article Data

In [62]:
# Extract article title from the URL (last part of the URL after '/')
politicians_df['article_title'] = politicians_df['url'].apply(lambda x: x.split('/')[-1])

# Function to retrieve the curr ent revision ID for a given Wikipedia article
def get_revision_id(article_title):
    """Retrieve the current revision ID for a given Wikipedia article."""
    url = f"https://en.wikipedia.org/w/api.php?action=query&titles={article_title}&prop=revisions&rvprop=ids&format=json"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        page_id = next(iter(data['query']['pages'].keys()))
        if page_id != "-1":  # -1 means page not found
            revision_id = data['query']['pages'][page_id]['revisions'][0]['revid']
            return revision_id
    return None

# Apply the get_revision_id function to each article title
politicians_df['revision_id'] = politicians_df['article_title'].apply(get_revision_id)

# Log missing revision IDs
missing_revision_ids = politicians_df[politicians_df['revision_id'].isnull()]
print(f"Missing revision IDs for {len(missing_revision_ids)} articles.")
missing_revision_ids.to_csv('missing_revision_ids.csv', index=False)

Missing revision IDs for 8 articles.


## Step 2: Getting Article Quality Predictions

### ORES Data

In [64]:
import requests
import time
import os

# get token form .env file
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN")

# API endpoint for the ORES LiftWing model
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"
#ACCESS_TOKEN = ""  # Replace this with your valid access token

# Function to get ORES article quality score
def get_ores_quality_score(rev_id, access_token):
    # Set up the request header with authentication token
    headers = {
        'Authorization': f'Bearer {access_token}',
        'User-Agent': "<sparshna@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024"
    }

    # Prepare the API URL for the request
    url = API_ORES_LIFTWING_ENDPOINT.format(model_name=API_ORES_EN_QUALITY_MODEL)

    # Define the request payload with revision ID
    payload = {
        "lang": "en",
        "rev_id": rev_id,
        "features": True
    }

    try:
        # Make the API request
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()  # Check for HTTP errors

        # Parse the response JSON to get the quality prediction
        data = response.json()
        #print(data)
        prediction = data['enwiki']['scores'][str(rev_id)]['articlequality']['score']['prediction']
        
        return prediction

    except requests.exceptions.RequestException as e:
        print(f"Error fetching ORES quality score for rev_id {rev_id}: {e}")
        return None

# Example: Test the function with one revision ID (rev_id is a sample)
sample_rev_id = 1085687913
quality_score = get_ores_quality_score(sample_rev_id, ACCESS_TOKEN)
print(f"Sample ORES quality score: {quality_score}")


Sample ORES quality score: FA


In [67]:
from tqdm import tqdm

# Define the assumed API throttle wait time (5000 requests per hour)
API_LATENCY_ASSUMED = 0.002  # Assume roughly 2ms latency
API_THROTTLE_WAIT = ((60.0 * 10.0) / 5000.0) - API_LATENCY_ASSUMED

# Function with throttling
def get_ores_quality_score_throttled(rev_id, access_token):
    # Get the score
    score = get_ores_quality_score(rev_id, access_token)
    
    # Throttle to avoid exceeding request limits
    time.sleep(API_THROTTLE_WAIT)
    
    return score

# Ensure the 'revision_id' column exists and apply the ORES function to fetch the quality score
politicians_df['article_quality'] = None
#for index, row in politicians_df.iterrows():
for index, row in tqdm(politicians_df.iterrows(), total=politicians_df.shape[0], desc="Processing articles"):
    try:
        rev_id = row['revision_id']
        if rev_id:
            politicians_df.at[index, 'article_quality'] = get_ores_quality_score_throttled(int(rev_id), ACCESS_TOKEN)
            if politicians_df.at[index, 'article_quality'] == None:
                print(f"Error processing row {index}: {e}")
    except Exception as e:
        print(f"Error processing row {index}: {e}")

Processing articles:   0%|          | 0/7155 [00:00<?, ?it/s]

Processing articles:   6%|▌         | 430/7155 [04:24<1:13:54,  1.52it/s]

Error processing row 430: cannot convert float NaN to integer


Processing articles:   7%|▋         | 516/7155 [05:13<1:07:46,  1.63it/s]

Error processing row 516: cannot convert float NaN to integer


Processing articles:  17%|█▋        | 1200/7155 [12:54<59:59,  1.65it/s]  

Error processing row 1200: cannot convert float NaN to integer


Processing articles:  19%|█▉        | 1342/7155 [14:21<47:04,  2.06it/s]  

Error processing row 1342: cannot convert float NaN to integer


Processing articles:  27%|██▋       | 1955/7155 [20:45<55:26,  1.56it/s]  

Error processing row 1955: cannot convert float NaN to integer


Processing articles:  28%|██▊       | 1994/7155 [21:09<46:24,  1.85it/s]  

Error fetching ORES quality score for rev_id 1244463510: 502 Server Error: Bad Gateway for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Error processing row 1993: name 'e' is not defined


Processing articles:  34%|███▍      | 2427/7155 [25:24<50:07,  1.57it/s]  

Error processing row 2427: cannot convert float NaN to integer


Processing articles:  63%|██████▎   | 4496/7155 [46:01<30:51,  1.44it/s]  

Error processing row 4496: cannot convert float NaN to integer


Processing articles:  80%|███████▉  | 5719/7155 [57:30<10:43,  2.23it/s]

Error processing row 5719: cannot convert float NaN to integer


Processing articles:  81%|████████  | 5810/7155 [58:14<11:19,  1.98it/s]

Error fetching ORES quality score for rev_id 1244353459: 503 Server Error: Service Unavailable for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Error processing row 5809: name 'e' is not defined


Processing articles: 100%|██████████| 7155/7155 [1:11:56<00:00,  1.66it/s]


### Error rate

In [112]:
# Print error rate
error_rate = politicians_df['article_quality'].isnull().mean()

print(f"Error rate: {error_rate:.2%}")

Error rate: 0.14%


### Testing data

In [84]:
# Example: Test the function with one revision ID (rev_id is a sample)
sample_rev_id = 1244353459
quality_score = get_ores_quality_score(sample_rev_id, ACCESS_TOKEN)
print(f"Sample ORES quality score: {quality_score}")

Sample ORES quality score: C


In [69]:
politicians_df.head()

Unnamed: 0,name,url,country,article_title,revision_id,article_quality
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Majah_Ha_Adrif,1233203000.0,Start
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,Haroon_al-Afghani,1230460000.0,B
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Tayyab_Agha,1225662000.0,Start
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,Khadija_Zahra_Ahmadi,1234742000.0,Stub
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,Aziza_Ahmadyar,1195651000.0,Start


In [70]:
# Save the updated dataset with article quality scores
politicians_df.to_csv('wiki_data_ores.csv', index=False)

## Step 3: Combining the Datasets

### wp_countries-no_match.txt


In [72]:
# Merge datasets on the country field
merged_df = pd.merge(politicians_df, population_df, how='left', left_on='country', right_on='country')

# Identify and log countries that could not be merged
no_match_countries = merged_df[merged_df['population'].isnull()]['country'].unique()
with open('wp_countries-no_match.txt', 'w') as f:
    for country in no_match_countries:
        f.write(f"{country}\n")

### wp_politicians_by_country.csv

In [194]:
def assign_regions(country_region_list):
    # Dictionary to store regions and their corresponding countries
    region_dict = {}
    
    # Variable to store the most recent uppercase region
    current_region = None
    
    # Loop through the words in the list
    for place in country_region_list:
        # Check if the word is all uppercase (indicating it's a region)
        if place.isupper():
            current_region = place  # Update the current region
            if current_region not in region_dict:
                region_dict[current_region] = []  # Initialize the list for the region
        else:
            if current_region:
                region_dict[current_region].append(place)  # Add the country to the region's list
    
    return region_dict

country_region_list = list(population_df['country'])

# Call the function and print the result
region_country_mapping = assign_regions(country_region_list)
print(region_country_mapping)

{'WORLD': [], 'AFRICA': [], 'NORTHERN AFRICA': ['Algeria', 'Egypt', 'Libya', 'Morocco', 'Sudan', 'Tunisia', 'Western Sahara'], 'WESTERN AFRICA': ['Benin', 'Burkina Faso', 'Cape Verde', "Cote d'Ivoire", 'Gambia', 'Ghana', 'Guinea', 'GuineaBissau', 'Liberia', 'Mali', 'Mauritania', 'Niger', 'Nigeria', 'Senegal', 'Sierra Leone', 'Togo'], 'EASTERN AFRICA': ['Burundi', 'Comoros', 'Djibouti', 'Eritrea', 'Ethiopia', 'Kenya', 'Madagascar', 'Malawi', 'Mauritius', 'Mayotte', 'Mozambique', 'Reunion', 'Rwanda', 'Seychelles', 'Somalia', 'South Sudan', 'Tanzania', 'Uganda', 'Zambia', 'Zimbabwe'], 'MIDDLE AFRICA': ['Angola', 'Cameroon', 'Central African Republic', 'Chad', 'Congo', 'Congo DR', 'Equatorial Guinea', 'Gabon', 'Sao Tome and Principe'], 'SOUTHERN AFRICA': ['Botswana', 'eSwatini', 'Lesotho', 'Namibia', 'South Africa'], 'NORTHERN AMERICA': ['Canada', 'United States'], 'LATIN AMERICA AND THE CARIBBEAN': [], 'CENTRAL AMERICA': ['Belize', 'Costa Rica', 'El Salvador', 'Guatemala', 'Honduras', 'Me

In [195]:
# Add region information to merged_df
merged_df['region'] = None
for region, countries in region_country_mapping.items():
    merged_df.loc[merged_df['country'].isin(countries), 'region'] = region

In [196]:
# Ensure the column names match the required output
merged_df2 = merged_df.rename(columns={'article_title': 'article_title', 'Population': 'population'})

# Select the relevant columns and save to a new CSV file
merged_df2[['country', 'region', 'population', 'article_title', 'revision_id', 'article_quality']].to_csv('wp_politicians_by_country.csv', index=False)

print("Merged dataset saved as 'wp_politicians_by_country.csv' with required columns.")

merged_df2.head()

Merged dataset saved as 'wp_politicians_by_country.csv' with required columns.


Unnamed: 0,name,url,country,article_title,revision_id,article_quality,population,region
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Majah_Ha_Adrif,1233203000.0,Start,42.4,SOUTH ASIA
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,Haroon_al-Afghani,1230460000.0,B,42.4,SOUTH ASIA
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Tayyab_Agha,1225662000.0,Start,42.4,SOUTH ASIA
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,Khadija_Zahra_Ahmadi,1234742000.0,Stub,42.4,SOUTH ASIA
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,Aziza_Ahmadyar,1195651000.0,Start,42.4,SOUTH ASIA


In [6]:
# load wp_politicians_by_country.csv in dataframe merged_df
merged_df2 = pd.read_csv('wp_politicians_by_country.csv')

## Step 4: Analysis

### Total articles per capita

#### Total-articles-per-capita for countries

In [11]:
# total-articles-per-capita for countries

articles_per_country = merged_df2['country'].value_counts().to_dict()

#print(articles_per_country)

# map articles_per_country keys to population data in merged_df2 

articles_per_country_per_capita = {}
for country, article_count in articles_per_country.items():
    population = merged_df2[merged_df2['country'] == country]['population'].values[0]
    if country == 'Marshall Islands':
        print(country, article_count, population)
    articles_per_country_per_capita[country] = article_count / population

#print(articles_per_country_per_capita)


# Make articles_per_region_per_capita a datafrae
articles_per_country_per_capita_df = pd.DataFrame(articles_per_country_per_capita.items(), columns=['country', 'Articles Per Capita Per Million Population'])
articles_per_country_per_capita_df.head()

# sort by Articles Per Capita Per Million Population and eliminate inf values
articles_per_country_per_capita_df = articles_per_country_per_capita_df.replace([float('inf'), float('-inf')], 0)
articles_per_country_per_capita_df = articles_per_country_per_capita_df.sort_values(by='Articles Per Capita Per Million Population', ascending=False)
articles_per_country_per_capita_df.head()

# note: Tuvalu and Monaco have the highest number of articles per capita per million population 
# but they have very small populations (0.0 million here) 
# and Article Per Capita Per Million Population is not a good metric for them

Marshall Islands 13 0.1


  articles_per_country_per_capita[country] = article_count / population


Unnamed: 0,country,Articles Per Capita Per Million Population
82,Antigua and Barbuda,330.0
129,Federated States of Micronesia,140.0
131,Marshall Islands,130.0
138,Tonga,100.0
101,Barbados,83.333333


#### Total-articles-per-capita for region

In [12]:
# total-articles-per-capita for region

articles_per_region = merged_df2['region'].value_counts().to_dict()

print(articles_per_region)

# map articles_per_region keys to population data in merged_df2 

articles_per_region_per_capita = {}
for region, article_count in articles_per_region.items():
    #population = merged_df2[merged_df2['region'] == region]['population'].values[0]
    # map region name to population_df
    population = region_population_df[region_population_df['country'] == region]['population'].values[0]
    articles_per_region_per_capita[region] = article_count / population

# Make articles_per_region_per_capita a datafrae
articles_per_region_per_capita_df = pd.DataFrame(articles_per_region_per_capita.items(), columns=['region', 'Articles Per Capita Per Million Population'])
articles_per_region_per_capita_df.sort_values(by='Articles Per Capita Per Million Population', ascending=False).head()


{'SOUTHERN EUROPE': 797, 'EASTERN EUROPE': 709, 'SOUTH ASIA': 670, 'EASTERN AFRICA': 665, 'WESTERN ASIA': 610, 'SOUTH AMERICA': 569, 'WESTERN AFRICA': 515, 'WESTERN EUROPE': 498, 'SOUTHEAST ASIA': 396, 'NORTHERN AFRICA': 302, 'MIDDLE AFRICA': 231, 'CARIBBEAN': 219, 'NORTHERN EUROPE': 191, 'CENTRAL AMERICA': 188, 'EAST ASIA': 152, 'SOUTHERN AFRICA': 123, 'CENTRAL ASIA': 106, 'OCEANIA': 72}


Unnamed: 0,region,Articles Per Capita Per Million Population
0,SOUTHERN EUROPE,5.243421
11,CARIBBEAN,4.977273
7,WESTERN EUROPE,2.502513
1,EASTERN EUROPE,2.487719
4,WESTERN ASIA,2.040134


#### Total-high-quality-articles-per-capita for countries

In [13]:
# article quality rated as FA by country 

#fa_articles_per_country =merged_df2[merged_df2['article_quality'] == 'FA']['country'].value_counts().reset_index()
fa_articles_per_country = merged_df2[merged_df2['article_quality'].isin(['FA', 'GA'])]['country'].value_counts().reset_index()

fa_articles_per_country.columns = ['country', 'High Quality Articles Per Million Population']

fa_articles_per_country = fa_articles_per_country.reset_index(drop=True)

fa_articles_per_country.head()

Unnamed: 0,country,High Quality Articles Per Million Population
0,Spain,18
1,Indonesia,15
2,"Korea, South",10
3,Russia,9
4,Ukraine,8


#### Total-high-quality-articles-per-capita for region

In [14]:
# article quality rated as FA by region

fa_articles_per_region = merged_df2[merged_df2['article_quality'].isin(['FA', 'GA'])]['region'].value_counts().reset_index()

fa_articles_per_region.columns = ['region', 'High Quality Articles Per Million Population']

fa_articles_per_region = fa_articles_per_region.reset_index(drop=True)

fa_articles_per_region.head()

Unnamed: 0,region,High Quality Articles Per Million Population
0,SOUTHERN EUROPE,53
1,EASTERN EUROPE,38
2,WESTERN ASIA,27
3,SOUTHEAST ASIA,25
4,WESTERN EUROPE,21


## Step 5: Results

### Top 10 countries by coverage: The 10 countries with the highest total articles per capita (in descending order)

In [15]:
top_10_countries_by_coverage = articles_per_country_per_capita_df.sort_values(by='Articles Per Capita Per Million Population', ascending=False).head(10)
top_10_countries_by_coverage

Unnamed: 0,country,Articles Per Capita Per Million Population
82,Antigua and Barbuda,330.0
129,Federated States of Micronesia,140.0
131,Marshall Islands,130.0
138,Tonga,100.0
101,Barbados,83.333333
72,Montenegro,60.0
148,Seychelles,60.0
81,Maldives,55.0
62,Bhutan,55.0
145,Samoa,40.0


### Bottom 10 countries by coverage: The 10 countries with the lowest total articles per capita (in ascending order) .

In [16]:
bottom_10_countries_by_coverage = articles_per_country_per_capita_df.sort_values(by='Articles Per Capita Per Million Population', ascending=True).head(10)
bottom_10_countries_by_coverage

Unnamed: 0,country,Articles Per Capita Per Million Population
168,Tuvalu,0.0
139,Monaco,0.0
118,China,0.011337
2,India,0.105698
155,Ghana,0.117302
153,Saudi Arabia,0.135501
162,Zambia,0.148515
166,Norway,0.181818
163,Israel,0.204082
85,Egypt,0.304183


### Top 10 countries by high quality: The 10 countries with the highest high quality articles per capita (in descending order) .

In [17]:
top_10_countries_by_quality = fa_articles_per_country.sort_values(by='High Quality Articles Per Million Population', ascending=False).head(10)
top_10_countries_by_quality

Unnamed: 0,country,High Quality Articles Per Million Population
0,Spain,18
1,Indonesia,15
2,"Korea, South",10
3,Russia,9
4,Ukraine,8
5,South Africa,8
8,Poland,7
9,Switzerland,7
7,Iraq,7
6,Albania,7


### Bottom 10 countries by high quality: The 10 countries with the lowest high quality articles per capita (in ascending order).

In [22]:
all_countries = merged_df2['country'].unique()
all_countries_df = pd.DataFrame(all_countries, columns=['country'])
complete_articles_per_country = pd.merge(all_countries_df, fa_articles_per_country, on='country', how='left').fillna(0)

complete_articles_per_country['High Quality Articles Per Million Population'] = complete_articles_per_country['High Quality Articles Per Million Population'].astype(int)

# Reset the index
complete_articles_per_country = complete_articles_per_country.reset_index(drop=True)

complete_articles_per_country.head()

# sort complete_articles_per_country by High Quality Articles Per Million Population to find bottom 10
bottom_10_countries_by_quality = complete_articles_per_country.sort_values(by='High Quality Articles Per Million Population', ascending=True).head(10)
bottom_10_countries_by_quality

Unnamed: 0,country,High Quality Articles Per Million Population
168,Zimbabwe,0
106,Namibia,0
108,Nicaragua,0
51,Estonia,0
50,Eritrea,0
49,Equatorial Guinea,0
110,Niger,0
105,Mozambique,0
111,Norway,0
45,Timor Leste,0


### Geographic regions by total coverage: A rank ordered list of geographic regions (in descending order) by total articles per capita.

In [23]:
articles_per_region_per_capita_df.sort_values(by='Articles Per Capita Per Million Population', ascending=False)


Unnamed: 0,region,Articles Per Capita Per Million Population
0,SOUTHERN EUROPE,5.243421
11,CARIBBEAN,4.977273
7,WESTERN EUROPE,2.502513
1,EASTERN EUROPE,2.487719
4,WESTERN ASIA,2.040134
12,NORTHERN EUROPE,1.768519
15,SOUTHERN AFRICA,1.757143
17,OCEANIA,1.6
3,EASTERN AFRICA,1.376812
5,SOUTH AMERICA,1.335681


### Geographic regions by high quality coverage: Rank ordered list of geographic regions (in descending order) by high quality articles per capita.

In [24]:
fa_articles_per_region.sort_values(by='High Quality Articles Per Million Population', ascending=False)

Unnamed: 0,region,High Quality Articles Per Million Population
0,SOUTHERN EUROPE,53
1,EASTERN EUROPE,38
2,WESTERN ASIA,27
3,SOUTHEAST ASIA,25
4,WESTERN EUROPE,21
5,SOUTH ASIA,21
6,SOUTH AMERICA,19
7,NORTHERN AFRICA,17
8,EASTERN AFRICA,17
9,WESTERN AFRICA,13
