In [2]:
import pandas as pd
import numpy as np
import os
import json
import time

Remove rows in the population dataset that represent regions (rows with all-uppercase "Geography" values). We will focus only on countries and rename 'Geography' to 'country' for merging.

## Step 1: Getting Article Quality Predictions

In [5]:

# construct path file
politicians_df = pd.read_csv('politicians_by_country_AUG.2024.csv')
population_df = pd.read_csv('population_by_country_AUG.2024.csv')

# identify the countries (without the region rows)
country_population_df = population_df[~population_df['Geography'].str.isupper()]
country_population_df.rename(columns={'Geography': 'country'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_population_df.rename(columns={'Geography': 'country'}, inplace=True)


Unnamed: 0,country,Population
3,Algeria,46.8
4,Egypt,105.2
5,Libya,6.9
6,Morocco,37.0
7,Sudan,48.1
...,...,...
228,Samoa,0.2
229,Solomon Islands,0.8
230,Tonga,0.1
231,Tuvalu,0.0


This step extracts the article titles from the URLs in the politicians dataset and applies the function to fetch the revision IDs for each article.

In [6]:
import requests

def get_revision_id(article_title):
    """
    Fetches the current revision ID for a Wikipedia article using the Wikipedia API.
    Handles errors and missing pages.
    """
    url = "https://en.wikipedia.org/w/api.php"

    params = {
        "action": "query",
        "prop": "revisions",
        "titles": article_title,
        "rvprop": "ids",
        "format": "json"
    }

    try:
        # Make the API request
        response = requests.get(url, params=params)
        response.raise_for_status()  # Check if the request was successful
        
        # Parse the JSON response and return the revision ID
        data = response.json()
        pages = data.get("query", {}).get("pages", {})
        
        for page_id, page_info in pages.items():
            if 'missing' in page_info:
                return f"Error: The article '{article_title}' does not exist."
            return page_info.get("revisions", [{}])[0].get("revid", "No revision ID found.")

    except requests.exceptions.RequestException as e:
        # Handle any request exceptions (e.g., network issues, invalid response)
        return f"Error: {e}"

politicians_df['article_title'] = politicians_df['url'].str.split('/').str[-1]

# Apply the get_revision_id function to the 'article_title' column
politicians_df['revision_id'] = politicians_df['article_title'].apply(get_revision_id)

politicians_df


Unnamed: 0,name,url,country,article_title,revision_id
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Majah_Ha_Adrif,1233202991
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,Haroon_al-Afghani,1230459615
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Tayyab_Agha,1225661708
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,Khadija_Zahra_Ahmadi,1234741562
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,Aziza_Ahmadyar,1195651393
...,...,...,...,...,...
7150,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe,Josiah_Tongogara,1203429435
7151,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe,Langton_Towungana,1246280093
7152,Sengezo Tshabangu,https://en.wikipedia.org/wiki/Sengezo_Tshabangu,Zimbabwe,Sengezo_Tshabangu,1228478288
7153,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe,Herbert_Ushewokunze,959111842


## Step 2: Getting Article Quality Predictions 

This step sets up the necessary constants for making ORES API requests to fetch the quality score predictions for each article based on its revision ID.

In [8]:
USERNAME = "salahelbakri"
ACCESS_TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiI5OTdmMzMxYjI5NjNmYmM2YmYyODY3MWRiODM3YzQzMiIsImp0aSI6IjY4MmIzMzQ5YzNhMWNhMzU5YjY3ZTQ2YzE5MWY2NzRlZDU5ZmVlMzRkNDRkZDM5ZTE5MjI2ZDExMDQ3MzAzN2Y0NTEyN2RlY2JkZmM5YzAxIiwiaWF0IjoxNzI4ODQ4MjU0LjQ5MDMyNywibmJmIjoxNzI4ODQ4MjU0LjQ5MDMzMSwiZXhwIjoxNzI4ODYyNjU0LjQ4NTk4Miwic3ViIjoiNzY2ODgzNDgiLCJpc3MiOiJodHRwczovL21ldGEud2lraW1lZGlhLm9yZyIsInJhdGVsaW1pdCI6eyJyZXF1ZXN0c19wZXJfdW5pdCI6NTAwMCwidW5pdCI6IkhPVVIifSwic2NvcGVzIjpbImJhc2ljIiwiY3JlYXRlZWRpdG1vdmVwYWdlIiwiZWRpdHByb3RlY3RlZCJdfQ.V1um1Wl_MDE3W1PWld8JIK9SjQvo-BJnNfnO7sZ8s9ggNo9Ou2GWmoJIvfkVPQuwpLoOOAaEYw7Qp_fSD2b_wBokhj8UuAQ_QvcLtDEDikGPZOs7V7yMRalKmlT12QWmWobGmC00mpaZa6-Caux81EimrX3YDdpDjVS9odXfWfmQuovmpHj2QvH020nm7CK3ZCgBofrqhaetchDMXOH891MRSPWPLdE3fkUyx_v16sfSHq5IGrxN81-8jZmgTvaZ-U-AMjZo5A_W0jffZzOPqW68j3yx-pPsN_OJegO6QTJWHI5wBbFYpBbXaYmHR_MkVpqfkLfTOP8_qBwoSjVn8zE52pZ-nfezsliraNQmUX1FNNoY6T7n6av-iYhxKra0SPy9crjUcx95aQoO704mNDYCc1m46bkpcVAYH1FiGZlri6cy7wCmjsET7cUe8RnWcskxhg4DciHGVZjwfy_C2wwVKJbsoFdyLSSGmwWMD5wl-vb4wzLa-Q0v_xQ_Ok2KOKxNZQCcs5fKCFsf82cHvFSIxcIOdl1JplMoXNflff92DtGPgDg7aLAYspZsjBpbz4QqrjMciJTfag6sDkYX_9NRx8O4n72hkrL3kX8-DPLnbqJ2Ww4FMyiCLp84zMUm6SrMcZ35CYPdaCvqI8SKLb9gxp6vLjWHtQ1rxeGtvRE"

#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = ((60.0*60.0)/5000.0)-API_LATENCY_ASSUMED  # The key authorizes 5000 requests per hour

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ARTICLE_REVISIONS = { 'Bison':1085687913 , 'Northern flicker':1086582504 , 'Red squirrel':1083787665 , 'Chinook salmon':1085406228 , 'Horseshoe bat':1060601936 }

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = ""
ACCESS_TOKEN = ""
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [9]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Wikipedia API URL to get revision ID
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"

# ORES API URL template
ORES_API_URL = "https://ores.wikimedia.org/v3/scores/{wiki}/{revid}?models=articlequality"

# Session for making requests
session = requests.Session()

def get_quality_score(revision_id):
    """
    Fetches the quality score from ORES for a given revision ID.
    Ensures the revision ID is passed as an integer to avoid unprocessable entity errors.
    Handles missing (NaN) revision IDs.
    """
    if pd.isna(revision_id):
        return "No revision ID"
    
    try:
        # Convert revision_id to an integer
        revision_id = int(revision_id)
    except ValueError:
        return "Invalid revision ID"
    
    # Clear any session cookies to avoid state issues
    session.cookies.clear()
    
    # Format the URL with the correct revision ID
    url = ORES_API_URL.format(wiki='enwiki', revid=revision_id)
    
    try:
        response = session.get(url)
        response.raise_for_status()  # Check if the request was successful

        data = response.json()
        score = data.get('enwiki', {}).get('scores', {}).get(str(revision_id), {}).get('articlequality', {}).get('score', {}).get('prediction', None)
        
        if score:
            return score
        else:
            return "No quality score found"
    except requests.exceptions.RequestException as e:
        return "Error fetching score"

# Function to handle parallel requests
def parallel_requests(df, max_workers=10):
    """
    Use ThreadPoolExecutor to fetch quality scores in parallel.
    """
    quality_scores = []
    
    # Use ThreadPoolExecutor to handle parallel requests
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        futures = {executor.submit(get_quality_score, revision_id): revision_id for revision_id in df['revision_id']}
        
        for future in as_completed(futures):
            try:
                quality_scores.append(future.result())
            except Exception as exc:
                print(f"Exception occurred: {exc}")
    
    return quality_scores

# Apply the parallel requests function to the 'revision_id' column
politicians_df['quality_score'] = parallel_requests(politicians_df)

# Final dataFrame with revision IDs and quality scores
politicians_df



Unnamed: 0,name,url,country,article_title,revision_id,quality_score
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Majah_Ha_Adrif,1233202991,Start
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,Haroon_al-Afghani,1230459615,Start
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Tayyab_Agha,1225661708,Stub
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,Khadija_Zahra_Ahmadi,1234741562,Start
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,Aziza_Ahmadyar,1195651393,Start
...,...,...,...,...,...,...
7150,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe,Josiah_Tongogara,1203429435,Stub
7151,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe,Langton_Towungana,1246280093,C
7152,Sengezo Tshabangu,https://en.wikipedia.org/wiki/Sengezo_Tshabangu,Zimbabwe,Sengezo_Tshabangu,1228478288,Stub
7153,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe,Herbert_Ushewokunze,959111842,Start


Now we find missing scores and compute the error rate.

In [10]:
# Define what counts as a missing or invalid score
def is_missing_score(quality_score):
    """
    Identifies missing or invalid quality scores.
    """
    return quality_score in ["No quality score found", "No revision ID", "Error fetching score", "Invalid revision ID"]

# Filter articles for which ORES score could not be retrieved
missing_score_df = politicians_df[politicians_df['quality_score'].apply(is_missing_score)]

# Compute the error rate
total_articles = len(politicians_df)
articles_with_missing_scores = len(missing_score_df)
error_rate = articles_with_missing_scores / total_articles

# error rate is
print(f"\nError rate: {error_rate * 100:.2f}% ({articles_with_missing_scores} out of {total_articles} articles)")



Error rate: 0.24% (17 out of 7155 articles)


Below are the articles with missing ORES scores

In [11]:
missing_score_df

Unnamed: 0,name,url,country,article_title,revision_id,quality_score
421,Leopold Berchtold,https://en.wikipedia.org/wiki/Leopold_Berchtold,Austria,Leopold_Berchtold,1228353209,Invalid revision ID
507,Bakhish bey Rustambeyov,https://en.wikipedia.org/wiki/Bakhish_bey_Rust...,Azerbaijan,Bakhish_bey_Rustambeyov,1246060690,Invalid revision ID
1003,Alércio Dias,https://en.wikipedia.org/wiki/Alércio_Dias,Brazil,Alércio_Dias,1101206546,Error fetching score
1004,Délio dos Santos,https://en.wikipedia.org/wiki/Délio_dos_Santos,Brazil,Délio_dos_Santos,1159365191,Error fetching score
1011,Francisco Julião,https://en.wikipedia.org/wiki/Francisco_Julião,Brazil,Francisco_Julião,1228999761,Error fetching score
1018,Camila Jourdan,https://en.wikipedia.org/wiki/Camila_Jourdan,Brazil,Camila_Jourdan,1240746354,Error fetching score
1160,Christophe Kalenzaga,https://en.wikipedia.org/wiki/Christophe_Kalen...,Burkina Faso,Christophe_Kalenzaga,1188339738,Error fetching score
1191,Khin Kyaw Han,https://en.wikipedia.org/wiki/Khin_Kyaw_Han,Myanmar,Khin_Kyaw_Han,1036478390,Invalid revision ID
1333,John Ebong Ngole,https://en.wikipedia.org/wiki/John_Ebong_Ngole,Cameroon,John_Ebong_Ngole,1151361877,Invalid revision ID
1676,Alberto Fait Lizano,https://en.wikipedia.org/wiki/Alberto_Fait_Lizano,Costa Rica,Alberto_Fait_Lizano,1222118382,No quality score found


## Step 3: Combining the Datasets

In [12]:
region_df = population_df[population_df['Geography'].str.isupper()].copy()
region_df.rename(columns={'Geography': 'region'}, inplace=True)


# Create a dictionary to map countries to their regions
region_mapping = {}

# Step 5: Map each country to the corresponding region
for i, row in region_df.iterrows():
    region = row['region']
    # Get the countries that appear before the next region
    country_indices = country_population_df.index[(country_population_df.index > i) & 
                                                  ((country_population_df.index < region_df.index[region_df.index > i].min()) 
                                                   | pd.isna(region_df.index[region_df.index > i].min()))]
    country_population_df.loc[country_indices, 'region'] = region

# Step 6: Merge the Wikipedia data (politicians_df) with the population data (country_population_df) on the 'country' field
merged_df = pd.merge(politicians_df, country_population_df, on='country', how='outer')

# Step 7: Select the required columns and rename 'quality_score' to 'article_quality' for clarity
final_df = merged_df[['country', 'region', 'Population', 'article_title', 'revision_id', 'quality_score']]

# Rename 'quality_score' to 'article_quality'
final_df.rename(columns={'quality_score': 'article_quality'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_population_df.loc[country_indices, 'region'] = region
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'quality_score': 'article_quality'}, inplace=True)


In [13]:
# Perform an outer join to keep all rows and add the _merge column
merged_df = pd.merge(politicians_df, country_population_df, on='country', how='outer', indicator=True)

# Debug: Print the merged DataFrame and its columns to verify
print(merged_df.head())
print("Columns in merged_df:", merged_df.columns)

# Identify unmatched countries
wp_unmatched_countries = merged_df[merged_df['_merge'] != 'both']

# Extract unmatched Wikipedia countries (countries that were in Wikipedia dataset but not in the population dataset)
wp_no_match_countries = wp_unmatched_countries[wp_unmatched_countries['_merge'] == 'left_only']['country'].dropna().unique()

# Output the list of unmatched countries to a text file
with open('wp_countries-no_match.txt', 'w') as f:
    for country in wp_no_match_countries:
        f.write(f"{country}\n")

# Filter the successfully merged entries (rows that matched in both datasets)
merged_successful_df = merged_df[merged_df['_merge'] == 'both']

# Select only the necessary columns for the final output
final_df = merged_successful_df[['country', 'region', 'Population', 'article_title', 'revision_id', 'quality_score']]

# Rename 'quality_score' to 'article_quality'
final_df.rename(columns={'quality_score': 'article_quality'}, inplace=True)
final_df.rename(columns={'Population': 'population'}, inplace=True)

# Save the final consolidated data to a CSV file
final_df.to_csv('wp_politicians_by_country.csv', index=False)

# Print the final DataFrame for verification
final_df


                   name                                                url  \
0        Majah Ha Adrif       https://en.wikipedia.org/wiki/Majah_Ha_Adrif   
1     Haroon al-Afghani    https://en.wikipedia.org/wiki/Haroon_al-Afghani   
2           Tayyab Agha          https://en.wikipedia.org/wiki/Tayyab_Agha   
3  Khadija Zahra Ahmadi  https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...   
4        Aziza Ahmadyar       https://en.wikipedia.org/wiki/Aziza_Ahmadyar   

       country         article_title revision_id quality_score  Population  \
0  Afghanistan        Majah_Ha_Adrif  1233202991         Start        42.4   
1  Afghanistan     Haroon_al-Afghani  1230459615         Start        42.4   
2  Afghanistan           Tayyab_Agha  1225661708          Stub        42.4   
3  Afghanistan  Khadija_Zahra_Ahmadi  1234741562         Start        42.4   
4  Afghanistan        Aziza_Ahmadyar  1195651393         Start        42.4   

       region _merge  
0  SOUTH ASIA   both  
1  SOUTH ASIA   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'quality_score': 'article_quality'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'Population': 'population'}, inplace=True)


Unnamed: 0,country,region,population,article_title,revision_id,article_quality
0,Afghanistan,SOUTH ASIA,42.4,Majah_Ha_Adrif,1233202991,Start
1,Afghanistan,SOUTH ASIA,42.4,Haroon_al-Afghani,1230459615,Start
2,Afghanistan,SOUTH ASIA,42.4,Tayyab_Agha,1225661708,Stub
3,Afghanistan,SOUTH ASIA,42.4,Khadija_Zahra_Ahmadi,1234741562,Start
4,Afghanistan,SOUTH ASIA,42.4,Aziza_Ahmadyar,1195651393,Start
...,...,...,...,...,...,...
7150,Zimbabwe,EASTERN AFRICA,16.7,Josiah_Tongogara,1203429435,Stub
7151,Zimbabwe,EASTERN AFRICA,16.7,Langton_Towungana,1246280093,C
7152,Zimbabwe,EASTERN AFRICA,16.7,Sengezo_Tshabangu,1228478288,Stub
7153,Zimbabwe,EASTERN AFRICA,16.7,Herbert_Ushewokunze,959111842,Start


## Step 4: Analysis 

In [14]:
# Define high-quality articles ("FA" and "GA" are considered high quality)
high_quality = ['FA', 'GA']

# Convert population from millions to actual population (multiply by 1,000,000)
final_df['population'] = final_df['population'] * 1_000_000

# Calculate high-quality articles (1 if high quality, 0 otherwise)
final_df['is_high_quality'] = final_df['article_quality'].apply(lambda x: 1 if x in high_quality else 0)

# Group by country and region to calculate the sums for each
analysis_df = final_df.groupby(['country', 'region', 'population'], as_index=False).agg(
    total_articles=('article_title', 'count'),
    high_quality_articles=('is_high_quality', 'sum')
)

# Calculate total articles per capita (per person)
analysis_df['total_articles_per_capita'] = analysis_df['total_articles'] / analysis_df['population']
analysis_df['high_quality_articles_per_capita'] = analysis_df['high_quality_articles'] / analysis_df['population']

# Display the updated analysis
analysis_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['population'] = final_df['population'] * 1_000_000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['is_high_quality'] = final_df['article_quality'].apply(lambda x: 1 if x in high_quality else 0)


Unnamed: 0,country,region,population,total_articles,high_quality_articles,total_articles_per_capita,high_quality_articles_per_capita
0,Afghanistan,SOUTH ASIA,42400000.0,85,4,2.004717e-06,9.433962e-08
1,Albania,SOUTHERN EUROPE,2700000.0,70,6,2.592593e-05,2.222222e-06
2,Algeria,NORTHERN AFRICA,46800000.0,71,1,1.517094e-06,2.136752e-08
3,Angola,MIDDLE AFRICA,36700000.0,58,2,1.580381e-06,5.449591e-08
4,Antigua and Barbuda,CARIBBEAN,100000.0,33,0,3.300000e-04,0.000000e+00
...,...,...,...,...,...,...,...
161,Venezuela,SOUTH AMERICA,28800000.0,56,1,1.944444e-06,3.472222e-08
162,Vietnam,SOUTHEAST ASIA,98900000.0,36,1,3.640040e-07,1.011122e-08
163,Yemen,WESTERN ASIA,34400000.0,32,1,9.302326e-07,2.906977e-08
164,Zambia,EASTERN AFRICA,20200000.0,3,0,1.485149e-07,0.000000e+00


## Step 5: Results

In [15]:
# Sort by total articles per capita and select the top 10
top_10_coverage = analysis_df.sort_values(by='total_articles_per_capita', ascending=False).head(10)
top_10_coverage


Unnamed: 0,country,region,population,total_articles,high_quality_articles,total_articles_per_capita,high_quality_articles_per_capita
96,Monaco,WESTERN EUROPE,0.0,10,1,inf,inf
154,Tuvalu,OCEANIA,0.0,1,0,inf,
4,Antigua and Barbuda,CARIBBEAN,100000.0,33,0,0.00033,0.0
51,Federated States of Micronesia,OCEANIA,100000.0,14,0,0.00014,0.0
93,Marshall Islands,OCEANIA,100000.0,13,0,0.00013,0.0
149,Tonga,OCEANIA,100000.0,10,0,0.0001,0.0
12,Barbados,CARIBBEAN,300000.0,25,1,8.3e-05,3e-06
98,Montenegro,SOUTHERN EUROPE,600000.0,36,3,6e-05,5e-06
125,Seychelles,EASTERN AFRICA,100000.0,6,0,6e-05,0.0
90,Maldives,SOUTH ASIA,600000.0,33,0,5.5e-05,0.0


In [16]:
# Sort by total articles per capita and select the bottom 10
bottom_10_coverage = analysis_df.sort_values(by='total_articles_per_capita', ascending=True).head(10)
bottom_10_coverage


Unnamed: 0,country,region,population,total_articles,high_quality_articles,total_articles_per_capita,high_quality_articles_per_capita
31,China,EAST ASIA,1411300000.0,16,1,1.133707e-08,7.085666e-10
66,India,SOUTH ASIA,1428600000.0,151,0,1.056979e-07,0.0
57,Ghana,WESTERN AFRICA,34100000.0,4,0,1.173021e-07,0.0
122,Saudi Arabia,WESTERN ASIA,36900000.0,5,1,1.355014e-07,2.710027e-08
164,Zambia,EASTERN AFRICA,20200000.0,3,0,1.485149e-07,0.0
108,Norway,NORTHERN EUROPE,5500000.0,1,0,1.818182e-07,0.0
70,Israel,WESTERN ASIA,9800000.0,2,0,2.040816e-07,0.0
45,Egypt,NORTHERN AFRICA,105200000.0,32,1,3.041825e-07,9.505703e-09
37,Cote d'Ivoire,WESTERN AFRICA,30900000.0,10,0,3.236246e-07,0.0
50,Ethiopia,EASTERN AFRICA,126500000.0,44,2,3.478261e-07,1.581028e-08


In [17]:
# Sort by high quality articles per capita and select the top 10
top_10_high_quality = analysis_df.sort_values(by='high_quality_articles_per_capita', ascending=False).head(10)
top_10_high_quality

Unnamed: 0,country,region,population,total_articles,high_quality_articles,total_articles_per_capita,high_quality_articles_per_capita
96,Monaco,WESTERN EUROPE,0.0,10,1,inf,inf
59,Grenada,CARIBBEAN,100000.0,2,1,2e-05,1e-05
98,Montenegro,SOUTHERN EUROPE,600000.0,36,3,6e-05,5e-06
137,St. Lucia,CARIBBEAN,200000.0,3,1,1.5e-05,5e-06
86,Luxembourg,WESTERN EUROPE,700000.0,27,3,3.9e-05,4e-06
12,Barbados,CARIBBEAN,300000.0,25,1,8.3e-05,3e-06
62,Guyana,SOUTH AMERICA,800000.0,17,2,2.1e-05,3e-06
76,Kosovo,SOUTHERN EUROPE,1700000.0,26,4,1.5e-05,2e-06
1,Albania,SOUTHERN EUROPE,2700000.0,70,6,2.6e-05,2e-06
85,Lithuania,NORTHERN EUROPE,2900000.0,58,4,2e-05,1e-06


In [18]:
# Sort by high quality articles per capita and select the bottom 10
bottom_10_high_quality = analysis_df.sort_values(by='high_quality_articles_per_capita', ascending=True).head(10)
bottom_10_high_quality


Unnamed: 0,country,region,population,total_articles,high_quality_articles,total_articles_per_capita,high_quality_articles_per_capita
82,Lesotho,SOUTHERN AFRICA,2300000.0,5,0,2.173913e-06,0.0
93,Marshall Islands,OCEANIA,100000.0,13,0,0.00013,0.0
92,Malta,SOUTHERN EUROPE,600000.0,1,0,1.666667e-06,0.0
90,Maldives,SOUTH ASIA,600000.0,33,0,5.5e-05,0.0
89,Malaysia,SOUTHEAST ASIA,33400000.0,68,0,2.035928e-06,0.0
88,Malawi,EASTERN AFRICA,19800000.0,16,0,8.080808e-07,0.0
83,Liberia,WESTERN AFRICA,5400000.0,25,0,4.62963e-06,0.0
164,Zambia,EASTERN AFRICA,20200000.0,3,0,1.485149e-07,0.0
79,Laos,SOUTHEAST ASIA,7500000.0,5,0,6.666667e-07,0.0
77,Kuwait,WESTERN ASIA,4400000.0,17,0,3.863636e-06,0.0


In [19]:
# Group by region and calculate total articles per capita for each region
region_coverage = analysis_df.groupby('region', as_index=False).agg(
    total_articles=('total_articles', 'sum'),
    population=('population', 'sum')
)

# Calculate total articles per capita for each region
region_coverage['total_articles_per_capita'] = region_coverage['total_articles'] / region_coverage['population']

# Sort by total articles per capita (descending)
region_coverage_sorted = region_coverage.sort_values(by='total_articles_per_capita', ascending=False)
print("Regions by total articles (per capita):")
print(region_coverage_sorted[['region', 'total_articles_per_capita']])


Regions by total articles (per capita):
             region  total_articles_per_capita
8   NORTHERN EUROPE               6.870504e-06
9           OCEANIA               6.486486e-06
0         CARIBBEAN               5.983607e-06
14  SOUTHERN EUROPE               5.260726e-06
1   CENTRAL AMERICA               3.664717e-06
17   WESTERN EUROPE               2.746828e-06
5    EASTERN EUROPE               2.663411e-06
16     WESTERN ASIA               2.064997e-06
13  SOUTHERN AFRICA               1.800878e-06
4    EASTERN AFRICA               1.382824e-06
10    SOUTH AMERICA               1.338824e-06
2      CENTRAL ASIA               1.318408e-06
7   NORTHERN AFRICA               1.180148e-06
15   WESTERN AFRICA               1.170189e-06
6     MIDDLE AFRICA               1.144698e-06
12   SOUTHEAST ASIA               7.020032e-07
11       SOUTH ASIA               3.301794e-07
3         EAST ASIA               9.726755e-08


In [20]:
# Group by region and calculate high quality articles per capita for each region
region_high_quality_coverage = analysis_df.groupby('region', as_index=False).agg(
    high_quality_articles=('high_quality_articles', 'sum'),
    population=('population', 'sum')
)

# Calculate high quality articles per capita for each region
region_high_quality_coverage['high_quality_articles_per_capita'] = region_high_quality_coverage['high_quality_articles'] / region_high_quality_coverage['population']

# Sort by high quality articles per capita (descending)
region_high_quality_coverage_sorted = region_high_quality_coverage.sort_values(by='high_quality_articles_per_capita', ascending=False)
print("Regions by high quality articles (per capita):")
print(region_high_quality_coverage_sorted[['region', 'high_quality_articles_per_capita']])


Regions by high quality articles (per capita):
             region  high_quality_articles_per_capita
8   NORTHERN EUROPE                      3.597122e-07
14  SOUTHERN EUROPE                      3.300330e-07
0         CARIBBEAN                      3.005464e-07
1   CENTRAL AMERICA                      1.559454e-07
5    EASTERN EUROPE                      1.427498e-07
17   WESTERN EUROPE                      1.323773e-07
13  SOUTHERN AFRICA                      1.024890e-07
16     WESTERN ASIA                      9.140149e-08
9           OCEANIA                      9.009009e-08
2      CENTRAL ASIA                      6.218905e-08
7   NORTHERN AFRICA                      5.861665e-08
12   SOUTHEAST ASIA                      4.609112e-08
10    SOUTH AMERICA                      4.235294e-08
6     MIDDLE AFRICA                      3.468781e-08
15   WESTERN AFRICA                      2.953874e-08
4    EASTERN AFRICA                      2.911208e-08
11       SOUTH ASIA                