## Name : Sindhu Madhadi

### Assignment A2: The goal of this assignment is to explore the concept of bias through data on Wikipedia articles 

## Step 1: Getting the Article and Population Data

In [1]:
import json
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt

### Wikipedia Politicians by country dataset:
### https://figshare.com/articles/dataset/Untitled_Item/5513449


In [2]:
df_article = pd.read_csv('A2_data/page_data.csv')
df_article.head()


Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


### The population data is available in CSV format as WPDS_2020_data.csv : https://docs.google.com/spreadsheets/d/1CFJO2zna2No5KqNm9rPK5PCACoXKzb-nycJFhV689Iw/edit#gid=283125346

In [3]:
df_population = pd.read_csv('A2_data/WPDS_2020_data.csv')
df_population.head()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000


## Step 2: Cleaning the Data

### Ignore rows that provide cumulative regional population counts, rather than country-level counts. But remain them for future use[Rows with capital letters in name]

In [4]:
df_population[df_population["Name"].str.isupper()]

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
10,WESTERN AFRICA,WESTERN AFRICA,Sub-Region,2019,401.115,401115000
27,EASTERN AFRICA,EASTERN AFRICA,Sub-Region,2019,444.97,444970000
48,MIDDLE AFRICA,MIDDLE AFRICA,Sub-Region,2019,179.757,179757000
58,SOUTHERN AFRICA,SOUTHERN AFRICA,Sub-Region,2019,67.732,67732000
64,NORTHERN AMERICA,NORTHERN AMERICA,Sub-Region,2019,368.193,368193000
67,LATIN AMERICA AND THE CARIBBEAN,LATIN AMERICA AND THE CARIBBEAN,Sub-Region,2019,651.036,651036000
68,CENTRAL AMERICA,CENTRAL AMERICA,Sub-Region,2019,178.611,178611000


In [5]:
#Save them for future use:
Capital_population_name = df_population[df_population["Name"].str.isupper()]
Capital_population_name


Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
10,WESTERN AFRICA,WESTERN AFRICA,Sub-Region,2019,401.115,401115000
27,EASTERN AFRICA,EASTERN AFRICA,Sub-Region,2019,444.97,444970000
48,MIDDLE AFRICA,MIDDLE AFRICA,Sub-Region,2019,179.757,179757000
58,SOUTHERN AFRICA,SOUTHERN AFRICA,Sub-Region,2019,67.732,67732000
64,NORTHERN AMERICA,NORTHERN AMERICA,Sub-Region,2019,368.193,368193000
67,LATIN AMERICA AND THE CARIBBEAN,LATIN AMERICA AND THE CARIBBEAN,Sub-Region,2019,651.036,651036000
68,CENTRAL AMERICA,CENTRAL AMERICA,Sub-Region,2019,178.611,178611000


In [6]:
df_population.drop(df_population[df_population["Name"].str.isupper()].index,inplace=True)

### The dataset  of Wikipedia Politicians contains some page names that start with the string "Template:". These pages are not Wikipedia articles, and should not be included in your analysis. 


In [7]:
template_article_name=df_article[df_article["page"].str.startswith("Template:")]
template_article_name

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409
5,Template:Nigeria-politician-stub,Nigeria,391862819
...,...,...,...
44916,Template:New Zealand prime minister electoral ...,New Zealand,806286945
44966,Template:Current New Zealand political party l...,New Zealand,806301302
45587,Template:Lists of US Presidents and Vice Presi...,United States,806668141
45823,Template:Prime Ministers of Australia,Australia,806799996


In [8]:
df_article.drop(df_article[df_article["page"].str.startswith("Template:")].index,inplace=True)
df_article

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


## Step 3: Getting Article Quality Predictions

#### Get the predicted quality scores for each article in the Wikipedia dataset. We're using a machine learning system called ORES
#### The article quality estimates are, from best to worst:
#### FA - Featured article 
#### GA - Good article
#### B - B-class article
#### C - C-class article
####  Start - Start-class article
####  Stub - Stub-class article


In [9]:
def api_ores_data(revision_ids):
    
    #Defining headers:
    HEADERS = {'User-Agent': 'https://github.com/sindhumadhadi09', 'From': 'sindhu09@uw.edu'}
    
    #endpoint:
    endpoint="https://ores.wikimedia.org/v3/scores/{context}/?models={model}&revids={revid}"
   
    #parameters:
    params = {'context': 'enwiki',
              'model'  : 'articlequality',
              'revid'  : '|'.join(str(x) for x in revision_ids)
             }

    api_call = requests.get(endpoint.format(**params), headers=HEADERS)
    response = api_call.json()
    
    # Stripping out the predictions:
    rev_prediction_arr = []
    pred_notfound_arr = []
    for rev_id in revision_ids:
        try:
            prediction = response['enwiki']["scores"][str(rev_id)]["articlequality"]["score"]["prediction"]
            rev_prediction_arr.append({'rev_id':rev_id,
                                  'preditcion':prediction})
        except:
            # Storing the rev_ids for which we couldn't get any prediction.
            pred_notfound_arr.append(rev_id)
    return rev_prediction_arr, pred_notfound_arr


In [10]:
# Call API:

revs_prediction_arr = []
log_pred_notfound = []
for i, rev_ids in enumerate(np.array_split(df_article, 1000)):
    # Getting the prediction and storing the results in arrays.
    rev_ids=rev_ids['rev_id'].tolist()
    
    rev_prediction_value, pred_notfound_value = api_ores_data(rev_ids)

    revs_prediction_arr.extend(rev_prediction_value)
    log_pred_notfound.extend(pred_notfound_value)
    
    
    



In [11]:

len(revs_prediction_arr),len(log_pred_notfound)



(46425, 276)

### Step 4: Combining the Datasets

In [12]:
# Convert the prediction array to a dataframe:
rev_prediction_df=pd.DataFrame(revs_prediction_arr)
rev_prediction_df.head()


#combine the aarticle data with predciltion value:
df_article = df_article.merge(rev_prediction_df, on='rev_id')
df_article.head()


Unnamed: 0,page,country,rev_id,preditcion
0,Bir I of Kanem,Chad,355319463,Stub
1,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub
2,Yos Por,Cambodia,393822005,Stub
3,Julius Gregr,Czech Republic,395521877,Stub
4,Edvard Gregr,Czech Republic,395526568,Stub


In [13]:
# Saving the data.
df_article.to_csv('A2_data/page_data_prediction.csv')


In [15]:
# Combine the two data frames:

output_data = df_article.merge(df_population, how='outer', left_on ="country" ,right_on="Name")
output_data

Unnamed: 0,page,country,rev_id,preditcion,FIPS,Name,Type,TimeFrame,Data (M),Population
0,Bir I of Kanem,Chad,355319463.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
1,Abdullah II of Kanem,Chad,498683267.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
2,Salmama II of Kanem,Chad,565745353.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
3,Kuri I of Kanem,Chad,565745365.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
4,Mohammed I of Kanem,Chad,565745375.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
...,...,...,...,...,...,...,...,...,...,...
46447,,,,,PF,French Polynesia,Country,2019.0,0.280,280000.0
46448,,,,,GU,Guam,Country,2019.0,0.175,175000.0
46449,,,,,NC,New Caledonia,Country,2019.0,0.295,295000.0
46450,,,,,PW,Palau,Country,2019.0,0.018,18000.0


### consiering the edge cases:
### Either the population dataset does not have an entry for the equivalent Wikipedia country, or vise versa.


In [16]:

output_countries_no_match = output_data[(output_data["country"].isnull())|(output_data["Name"].isnull())]

In [17]:
#save to file:
output_countries_no_match.to_csv("A2_data/wp_wpds_countries-no_match.csv")

In [28]:
# remaining data:
remaining_data = output_data[(output_data["country"].isnull()==False)&(output_data["Name"].isnull()==False)]
# renaming column names:
remaining_data.columns = ['article_name', 'country', 'revision_id', 'article_quality_est.',
       'FIPS', 'Name', 'Type', 'TimeFrame', 'Data (M)',
       'population']
remaining_data

Unnamed: 0,article_name,country,revision_id,article_quality_est.,FIPS,Name,Type,TimeFrame,Data (M),population
0,Bir I of Kanem,Chad,355319463.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
1,Abdullah II of Kanem,Chad,498683267.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
2,Salmama II of Kanem,Chad,565745353.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
3,Kuri I of Kanem,Chad,565745365.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
4,Mohammed I of Kanem,Chad,565745375.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
...,...,...,...,...,...,...,...,...,...,...
46414,Rita Sinon,Seychelles,800323154.0,Stub,SC,Seychelles,Country,2019.0,0.098,98000.0
46415,Sylvette Frichot,Seychelles,800323798.0,Stub,SC,Seychelles,Country,2019.0,0.098,98000.0
46416,May De Silva,Seychelles,800969960.0,Start,SC,Seychelles,Country,2019.0,0.098,98000.0
46417,Vincent Meriton,Seychelles,802051093.0,Stub,SC,Seychelles,Country,2019.0,0.098,98000.0


In [69]:
#Finalising Schema:
remaining_data = remaining_data[["country","article_name","revision_id","article_quality_est.","population"]]
remaining_data




Unnamed: 0,country,article_name,revision_id,article_quality_est.,population
0,Chad,Bir I of Kanem,355319463.0,Stub,16877000.0
1,Chad,Abdullah II of Kanem,498683267.0,Stub,16877000.0
2,Chad,Salmama II of Kanem,565745353.0,Stub,16877000.0
3,Chad,Kuri I of Kanem,565745365.0,Stub,16877000.0
4,Chad,Mohammed I of Kanem,565745375.0,Stub,16877000.0
...,...,...,...,...,...
44563,Seychelles,Rita Sinon,800323154.0,Stub,98000.0
44564,Seychelles,Sylvette Frichot,800323798.0,Stub,98000.0
44565,Seychelles,May De Silva,800969960.0,Start,98000.0
44566,Seychelles,Vincent Meriton,802051093.0,Stub,98000.0


In [30]:
# save to file :
remaining_data.to_csv("A2_data/wp_wpds_politicians_by_country.csv")

## Step 5: Analysis
### articles-per-population and high-quality articles for each country AND for each geographic region. 

### Articles-per-population

In [72]:
articles_per_population = remaining_data.groupby(["country"]).apply(lambda s: (s.article_name.count()/s.population.max())*100)
articles_per_population


country
Afghanistan    0.000819
Albania        0.016068
Algeria        0.000262
Andorra        0.041463
Angola         0.000326
                 ...   
Venezuela      0.000454
Vietnam        0.000194
Yemen          0.000389
Zambia         0.000136
Zimbabwe       0.001097
Length: 183, dtype: float64

### High - Quality Articles:

In [74]:

remaining_data["hight_quality_article"]= (remaining_data["article_quality_est."]=="FA")|(remaining_data["article_quality_est."]=="GA")



In [75]:
high_quality_articles = remaining_data.groupby(["country"]).apply(lambda s: (s.hight_quality_article.sum()/s.article_name.count())*100)
high_quality_articles

country
Afghanistan    4.075235
Albania        0.657895
Algeria        1.724138
Andorra        0.000000
Angola         0.000000
                 ...   
Venezuela      2.307692
Vietnam        6.951872
Yemen          2.586207
Zambia         0.000000
Zimbabwe       1.226994
Length: 183, dtype: float64

# Step 6: Results

## 1.Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [79]:
articles_per_population.to_frame("articles_per_population_per_country").reset_index().sort_values("articles_per_population_per_country",ascending = False).head(10)


Unnamed: 0,country,articles_per_population_per_country
169,Tuvalu,0.54
117,Nauru,0.472727
138,San Marino,0.238235
110,Monaco,0.105263
95,Liechtenstein,0.071795
104,Marshall Islands,0.064912
164,Tonga,0.063636
70,Iceland,0.05462
3,Andorra,0.041463
52,Federated States of Micronesia,0.033962


## 2.Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population


In [80]:
articles_per_population.to_frame("articles_per_population_per_country").reset_index().sort_values("articles_per_population_per_country",ascending = True).head(10)




Unnamed: 0,country,articles_per_population_per_country
71,India,6.9e-05
72,Indonesia,7.7e-05
34,China,8.1e-05
176,Uzbekistan,8.2e-05
51,Ethiopia,8.8e-05
181,Zambia,0.000136
84,"Korea, North",0.00014
162,Thailand,0.000168
114,Mozambique,0.000186
13,Bangladesh,0.000187


### 3.Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality


In [81]:
high_quality_articles.to_frame("high_articles_per_count").reset_index().sort_values("high_articles_per_count",ascending = False).head(10)


Unnamed: 0,country,high_articles_per_count
84,"Korea, North",22.222222
140,Saudi Arabia,12.820513
135,Romania,12.244898
31,Central African Republic,12.121212
176,Uzbekistan,10.714286
106,Mauritania,10.416667
64,Guatemala,8.433735
44,Dominica,8.333333
158,Syria,7.8125
18,Benin,7.692308


### 4.Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [97]:
high_quality_articles.to_frame("high_articles_per_count").reset_index().sort_values("high_articles_per_count",ascending = True).head(10)


Unnamed: 0,country,high_articles_per_count
148,Solomon Islands,0.0
164,Tonga,0.0
117,Nauru,0.0
116,Namibia,0.0
43,Djibouti,0.0
114,Mozambique,0.0
110,Monaco,0.0
49,Eritrea,0.0
50,Estonia,0.0
109,Moldova,0.0


In [84]:
# To answer furter questions we need:
data_WPDS = pd.read_csv("A2_data/WPDS_2020_data.csv")

In [85]:
country_region = data_WPDS["Type"]
country_name = data_WPDS["Name"]
population = data_WPDS["Population"]
set_regions_country = {}
set_regions_population ={}

In [88]:
#create region country mapping
for p,cr,cn in zip(population,country_region,country_name):
    if cr=="Sub-Region":
        set_regions_country[cn]=[]
        current = cn
        set_regions_population[cn]=p
    if cr=="Country":
        if current!= None:
            set_regions_country[current].append(cn)

In [90]:
#country---> region:
set_country_region ={}
for r,c in set_regions_country.items():
    for i in c :
        set_country_region[i]=r

In [93]:
remaining_data["region"] = remaining_data["country"].replace(set_country_region)

In [94]:

#create a region for region population
remaining_data["region_population"] = remaining_data["region"].replace(set_regions_population)


### 5.Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population

In [95]:
remaining_data.groupby(["region"]).apply(lambda s: (s.article_name.count()/s.population.max())*100).to_frame("articles_per_regional_pop_per_country").sort_values("articles_per_regional_pop_per_country",ascending = False)



Unnamed: 0_level_0,articles_per_regional_pop_per_country
region,Unnamed: 1_level_1
OCEANIA,0.012138
SOUTHERN EUROPE,0.006153
CARIBBEAN,0.006095
Channel Islands,0.005603
WESTERN EUROPE,0.005474
WESTERN ASIA,0.003061
EASTERN EUROPE,0.002543
EASTERN AFRICA,0.002177
MIDDLE AFRICA,0.002045
SOUTH AMERICA,0.001431


## 6.Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality

In [98]:
remaining_data.groupby(["region"]).apply(lambda s: (s.high_quality_article.sum()/s.article_name.count())*100).to_frame("high_articles_per_count").sort_values("high_articles_per_count",ascending = False)



Unnamed: 0_level_0,high_articles_per_count
region,Unnamed: 1_level_1
NORTHERN AMERICA,5.470805
SOUTHEAST ASIA,3.613861
WESTERN ASIA,3.472493
EASTERN EUROPE,3.161844
EAST ASIA,3.07319
CENTRAL ASIA,2.857143
Channel Islands,2.710603
MIDDLE AFRICA,2.406015
NORTHERN AFRICA,2.113459
OCEANIA,2.015355
