# Politician Analysis and Results

This file completes the data analysis and results sections of Homework 2.

In [50]:
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import pandas as pd
import numpy as np

Read in the data first.

In [51]:
data = pd.read_csv('data/wp_politicians_by_country.csv')
data.head()

Unnamed: 0,article_title,country,article_quality,revision_id,population,region
0,Shahjahan Noori,Afghanistan,"""GA""",1099689043,41.1,SOUTH ASIA
1,Abdul Ghafar Lakanwal,Afghanistan,"""Start""",943562276,41.1,SOUTH ASIA
2,Majah Ha Adrif,Afghanistan,"""Start""",852404094,41.1,SOUTH ASIA
3,Haroon al-Afghani,Afghanistan,"""B""",1095102390,41.1,SOUTH ASIA
4,Tayyab Agha,Afghanistan,"""Start""",1104998382,41.1,SOUTH ASIA


First we will need to transform the data to perform the analysis. This will involve creating a column to determine whether the article quality is high or not.

In [52]:
# Method to make a column to signify if article is high quality.
def is_high_quality (row):
    if row in ['"FA"', '"GA"']:
        return 1
    else:
        return 0

In [53]:
# Adding the high_quality column
data['high_quality'] = data.apply(lambda x: is_high_quality(x['article_quality']), axis=1)
data.head()

Unnamed: 0,article_title,country,article_quality,revision_id,population,region,high_quality
0,Shahjahan Noori,Afghanistan,"""GA""",1099689043,41.1,SOUTH ASIA,1
1,Abdul Ghafar Lakanwal,Afghanistan,"""Start""",943562276,41.1,SOUTH ASIA,0
2,Majah Ha Adrif,Afghanistan,"""Start""",852404094,41.1,SOUTH ASIA,0
3,Haroon al-Afghani,Afghanistan,"""B""",1095102390,41.1,SOUTH ASIA,0
4,Tayyab Agha,Afghanistan,"""Start""",1104998382,41.1,SOUTH ASIA,0


Now we will aggregate the data by country and get total number of articles per capita and total number of high quality articles per capita.

In [54]:
# Creating the country aggregation
agg_data = data.groupby(['country', 'population', 'region']).agg({'high_quality': 'sum', 'article_title': 'count'})
agg_data = agg_data.reset_index()
agg_data['articles_per_population'] = agg_data['article_title']/agg_data['population']
agg_data['hq_articles_per_population'] = agg_data['high_quality']/agg_data['population']
agg_data = agg_data.drop(['high_quality', 'article_title'], axis=1)
agg_data.head()

Unnamed: 0,country,population,region,articles_per_population,hq_articles_per_population
0,Afghanistan,41.1,SOUTH ASIA,2.871046,0.145985
1,Albania,2.8,SOUTHERN EUROPE,29.642857,2.142857
2,Algeria,44.9,NORTHERN AFRICA,0.757238,0.0
3,Andorra,0.1,SOUTHERN EUROPE,100.0,20.0
4,Angola,35.6,MIDDLE AFRICA,1.179775,0.0


Now we will aggregate the data by region and get total number of articles per capita and total number of high quality articles per capita.

In [55]:
# Bring back the population data for region populations
world_pop = pd.read_csv('population_by_country_2022.csv')

In [56]:
# Creating the region aggregation
reg_agg_data = data.groupby(['region']).agg({'high_quality': 'sum', 'article_title': 'count'})
reg_agg_data = reg_agg_data.reset_index()
reg_agg_data = reg_agg_data.merge(world_pop, how='left', left_on='region', right_on='Geography')
reg_agg_data = reg_agg_data.rename(columns={'Population (millions)': 'population'})
reg_agg_data['articles_per_population'] = reg_agg_data['article_title']/reg_agg_data['population']
reg_agg_data['hq_articles_per_population'] = reg_agg_data['high_quality']/reg_agg_data['population']
reg_agg_data = reg_agg_data.drop(['high_quality', 'article_title', 'Geography'], axis=1)
reg_agg_data.head(20)

Unnamed: 0,region,population,articles_per_population,hq_articles_per_population
0,CARIBBEAN,44.0,4.568182,0.181818
1,CENTRAL AMERICA,178.0,1.095506,0.05618
2,CENTRAL ASIA,78.0,1.358974,0.038462
3,EAST ASIA,1674.0,0.146356,0.009558
4,EASTERN AFRICA,473.0,1.374207,0.031712
5,EASTERN EUROPE,287.0,2.56446,0.135889
6,MIDDLE AFRICA,196.0,1.035714,0.02551
7,NORTHERN AFRICA,251.0,0.904382,0.023904
8,NORTHERN EUROPE,107.0,2.448598,0.074766
9,OCEANIA,44.0,1.954545,0.045455


## Now we can get the results.

First off the 10 countries with the most articles per capita. There are six countries without population values so we will exclude them from this analysis.

In [57]:
# Not using .head() to skip the countries without population
agg_data.sort_values(by='articles_per_population', ascending=False, ignore_index=True).loc[6:15]

Unnamed: 0,country,population,region,articles_per_population,hq_articles_per_population
6,Antigua and Barbuda,0.1,CARIBBEAN,170.0,0.0
7,Federated States of Micronesia,0.1,OCEANIA,130.0,0.0
8,Andorra,0.1,SOUTHERN EUROPE,100.0,20.0
9,Barbados,0.3,CARIBBEAN,93.333333,0.0
10,Marshall Islands,0.1,OCEANIA,90.0,0.0
11,Montenegro,0.6,SOUTHERN EUROPE,60.0,5.0
12,Seychelles,0.1,EASTERN AFRICA,60.0,0.0
13,Luxembourg,0.7,WESTERN EUROPE,52.857143,0.0
14,Bhutan,0.8,SOUTH ASIA,51.25,0.0
15,Grenada,0.1,CARIBBEAN,50.0,0.0


Now the 10 countries with the least articles per capita.

In [58]:
agg_data.sort_values(by='articles_per_population').head(10)

Unnamed: 0,country,population,region,articles_per_population,hq_articles_per_population
32,China,1436.6,EAST ASIA,0.001392,0.0
106,Mexico,127.5,CENTRAL AMERICA,0.007843,0.0
140,Saudi Arabia,36.7,WESTERN ASIA,0.081744,0.054496
134,Romania,19.0,EASTERN EUROPE,0.105263,0.105263
73,India,1417.2,SOUTH ASIA,0.1256,0.004234
153,Sri Lanka,22.4,SOUTH ASIA,0.133929,0.0
48,Egypt,103.5,NORTHERN AFRICA,0.135266,0.0
53,Ethiopia,123.4,EASTERN AFRICA,0.202593,0.024311
161,Taiwan,23.2,EAST ASIA,0.215517,0.0
180,Vietnam,99.4,SOUTHEAST ASIA,0.27163,0.020121


Now the 10 countries with the most high quality articles per capita. Again, there is a country without population so we will exclude it from this analysis.

In [59]:
# Not using .head() to skip the country without population
agg_data.sort_values(by='hq_articles_per_population', ascending=False, ignore_index=True).loc[1:10]

Unnamed: 0,country,population,region,articles_per_population,hq_articles_per_population
1,Andorra,0.1,SOUTHERN EUROPE,100.0,20.0
2,Montenegro,0.6,SOUTHERN EUROPE,60.0,5.0
3,Albania,2.8,SOUTHERN EUROPE,29.642857,2.142857
4,Suriname,0.6,SOUTH AMERICA,38.333333,1.666667
5,Bosnia-Herzegovina,3.4,SOUTHERN EUROPE,15.294118,1.470588
6,Lithuania,2.8,NORTHERN EUROPE,26.785714,1.071429
7,Croatia,3.8,SOUTHERN EUROPE,14.210526,1.052632
8,Slovenia,2.1,SOUTHERN EUROPE,20.47619,0.952381
9,Palestinian Territory,5.4,WESTERN ASIA,13.148148,0.925926
10,Gabon,2.4,MIDDLE AFRICA,2.5,0.833333


Now the 10 countries with the least high quality articles per capita. There are actually 86 countries with no high quality articles. The below two queries display all countries without a high quality article.

In [60]:
agg_data.sort_values(by='hq_articles_per_population', ignore_index=True).head(50)

Unnamed: 0,country,population,region,articles_per_population,hq_articles_per_population
0,Laos,7.5,SOUTHEAST ASIA,0.8,0.0
1,Mongolia,3.4,EAST ASIA,3.235294,0.0
2,Moldova,3.5,EASTERN EUROPE,5.142857,0.0
3,Mexico,127.5,CENTRAL AMERICA,0.007843,0.0
4,Marshall Islands,0.1,OCEANIA,90.0,0.0
5,Malta,0.5,SOUTHERN EUROPE,14.0,0.0
6,Maldives,0.6,SOUTH ASIA,36.666667,0.0
7,Malawi,20.4,EASTERN AFRICA,0.343137,0.0
8,Madagascar,29.6,EASTERN AFRICA,1.013514,0.0
9,Luxembourg,0.7,WESTERN EUROPE,52.857143,0.0


In [61]:
agg_data.sort_values(by='hq_articles_per_population', ignore_index=True).loc[50:85]

Unnamed: 0,country,population,region,articles_per_population,hq_articles_per_population
50,Qatar,2.7,WESTERN ASIA,5.925926,0.0
51,Paraguay,6.8,SOUTH AMERICA,2.352941,0.0
52,Oman,4.6,WESTERN ASIA,1.956522,0.0
53,North Macedonia,1.8,SOUTHERN EUROPE,15.555556,0.0
54,Gambia,2.7,WESTERN AFRICA,7.777778,0.0
55,Finland,5.6,NORTHERN EUROPE,7.142857,0.0
56,Zimbabwe,16.3,EASTERN AFRICA,3.865031,0.0
57,Bangladesh,171.2,SOUTH ASIA,0.327103,0.0
58,Argentina,46.2,SOUTH AMERICA,1.168831,0.0
59,Austria,9.0,WESTERN EUROPE,9.555556,0.0


Now the regions ordered by number of articles per population.

In [62]:
reg_agg_data.sort_values(by='articles_per_population', ascending=False)

Unnamed: 0,region,population,articles_per_population,hq_articles_per_population
14,SOUTHERN EUROPE,151.0,5.89404,0.304636
0,CARIBBEAN,44.0,4.568182,0.181818
17,WESTERN EUROPE,197.0,3.548223,0.111675
5,EASTERN EUROPE,287.0,2.56446,0.135889
8,NORTHERN EUROPE,107.0,2.448598,0.074766
16,WESTERN ASIA,294.0,2.333333,0.095238
9,OCEANIA,44.0,1.954545,0.045455
13,SOUTHERN AFRICA,69.0,1.710145,0.057971
4,EASTERN AFRICA,473.0,1.374207,0.031712
2,CENTRAL ASIA,78.0,1.358974,0.038462


Finally, regions ordered by number of high quality articles per population

In [63]:
reg_agg_data.sort_values(by='hq_articles_per_population', ascending=False)

Unnamed: 0,region,population,articles_per_population,hq_articles_per_population
14,SOUTHERN EUROPE,151.0,5.89404,0.304636
0,CARIBBEAN,44.0,4.568182,0.181818
5,EASTERN EUROPE,287.0,2.56446,0.135889
17,WESTERN EUROPE,197.0,3.548223,0.111675
16,WESTERN ASIA,294.0,2.333333,0.095238
8,NORTHERN EUROPE,107.0,2.448598,0.074766
13,SOUTHERN AFRICA,69.0,1.710145,0.057971
1,CENTRAL AMERICA,178.0,1.095506,0.05618
9,OCEANIA,44.0,1.954545,0.045455
2,CENTRAL ASIA,78.0,1.358974,0.038462
