# Data Scaping & Aggregation 
Submitted by: Samantha Roska, Rebecca Hailperin-Lausch, and Samantha Russel

This file contains the steps we did to scrape, aggregate, and manipulate the raw data.

In [1]:
import numpy as np
import pandas as pd
import glob
import re
import warnings

In [2]:
def column_rename(col_name):
    '''helps with renaming columns'''
    t = col_name.strip()
    t = re.sub('%', 'percent ', t)
    t = re.sub(' ', '_', t)
    t = str.lower(t)
    return t

### Obtaining Dataset 1: Women in Parliments 

A seperate script was written to scrape the data from the web. This script reads in the data from each month and each year into individual csv files.

In [3]:
# This line uses the script to pull in the data from the IPU website and save each file to a csv in the data/world_data/ folder. 
# It is commented out because we already ran this script and want to avoid unecessarily pulling from their website
#!python scripts\obtaining_world_data.py

First, the seperate csv files needed to be read in and aggregated into one dataframe.

In [4]:
sheets = []

pat= r'(\d{4})'
for filename in glob.glob("./data/world_data/wd_*.csv"):
    ls = re.split(pat,filename)
    year = int(ls[1])
    if year > 2019:
        temp = pd.read_csv(filename,delimiter=',',skiprows=5, index_col=None,header=None)
        sheets.append(temp)
    elif year < 2019 and year > 2008:
        temp = pd.read_csv(filename,delimiter=',',skiprows=2, index_col=None,header=None)
        sheets.append(dataframe)
    else:
        temp = pd.read_csv(filename,delimiter=',',skiprows=1, index_col=None,header=None)
        dataframe = pd.DataFrame(temp)
        sheets.append(dataframe)

df = pd.concat(sheets,axis=0,ignore_index=True)
df = df.drop(columns=[10])
df.columns = ['Rank','Country','Lower single House Elections','Lower single House Seats','Lower single House Women','Lower single House %W','Upper House Senate Elections', 'Upper House Senate Seats','Upper House Senate Women','Upper House Senate %W']
df.head(15)

Unnamed: 0,Rank,Country,Lower single House Elections,Lower single House Seats,Lower single House Women,Lower single House %W,Upper House Senate Elections,Upper House Senate Seats,Upper House Senate Women,Upper House Senate %W
0,1,Sweden,09 1998,349,149,42.7,---,---,---,---
1,2,Denmark,03 1998,179,67,37.4,---,---,---,---
2,3,Finland,03 1999,200,73,36.5,---,---,---,---
3,4,Netherlands,05 1998,150,54,36.0,05 1999,75,20,26.7
4,5,Norway,09 2001,165,59,35.8,---,---,---,---
5,6,Iceland,05 1999,63,22,34.9,---,---,---,---
6,7,Germany,09 1998,669,207,30.9,N.A.,69,17,24.6
7,8,New Zealand,11 1999,120,37,30.8,---,---,---,---
8,9,Mozambique,12 1999,250,75,30.0,---,---,---,---
9,10,South Africa,06 1999,399,119,29.8,06 1999,89,17,31.5*


In [5]:
# Rename columns to be consistent
text_transform_df = df.replace(regex=r'---|\?',value=0).replace('N.A.',value='0')
text_transform_df = text_transform_df.rename(columns=lambda x: column_rename(x))

##### Series data conversion & manipulation

In [6]:
# Convert dates that had period to same format as other dates
text_transform_df['lower_single_house_elections'] = text_transform_df['lower_single_house_elections'].str.replace('.',' ',regex=False)
text_transform_df['upper_house_senate_elections'] = text_transform_df['upper_house_senate_elections'].str.replace('.',' ',regex=False)

In [7]:
# Convert dates into correct format
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    date_tranform_df = text_transform_df.copy()
    date_tranform_df.country = date_tranform_df.country.replace(regex=r'\(|\)|\*|\d',value='')

    date_tranform_df.lower_single_house_elections = pd.to_datetime(date_tranform_df.lower_single_house_elections, errors='coerce',infer_datetime_format=True)
    date_tranform_df.upper_house_senate_elections = pd.to_datetime(date_tranform_df.upper_house_senate_elections, errors='coerce',infer_datetime_format=True)


##### Conversion of numeric fields

In [8]:
num_transform_df = date_tranform_df.copy()

# Convert to numeric
num_transform_df.lower_single_house_seats = pd.to_numeric(num_transform_df.lower_single_house_seats.replace(regex='\D',value=''))
num_transform_df.lower_single_house_women = pd.to_numeric(num_transform_df.lower_single_house_women.replace(regex='\D',value=''))

num_transform_df.upper_house_senate_seats = pd.to_numeric(num_transform_df.upper_house_senate_seats.replace(regex='\D',value=''))
num_transform_df.upper_house_senate_women = pd.to_numeric(num_transform_df.upper_house_senate_women.replace(regex='\D',value=''))

# Fill nas with zeros
num_transform_df.upper_house_senate_women.fillna(0, inplace=True)
num_transform_df.upper_house_senate_seats.fillna(0, inplace=True)

In [9]:
# calculate the percent of women in both upper and lower houses and add these as new columns
num_transform_df['lower_single_house_percent_w'] = num_transform_df.lower_single_house_women / num_transform_df.lower_single_house_seats
num_transform_df['upper_house_senate_percent_w'] = num_transform_df.upper_house_senate_women / num_transform_df.upper_house_senate_seats

We then checked that the columns are all in the correct format.

In [10]:
num_transform_df.dtypes

rank                                    object
country                                 object
lower_single_house_elections    datetime64[ns]
lower_single_house_seats               float64
lower_single_house_women               float64
lower_single_house_percent_w           float64
upper_house_senate_elections    datetime64[ns]
upper_house_senate_seats               float64
upper_house_senate_women               float64
upper_house_senate_percent_w           float64
dtype: object

In [11]:
# Drop duplicate rows
drop_dupp = num_transform_df.drop_duplicates()
drop_dupp.reset_index(drop=True)

Unnamed: 0,rank,country,lower_single_house_elections,lower_single_house_seats,lower_single_house_women,lower_single_house_percent_w,upper_house_senate_elections,upper_house_senate_seats,upper_house_senate_women,upper_house_senate_percent_w
0,1,Sweden,1998-09-01,349.0,149.0,0.426934,NaT,0.0,0.0,
1,2,Denmark,1998-03-01,179.0,67.0,0.374302,NaT,0.0,0.0,
2,3,Finland,1999-03-01,200.0,73.0,0.365000,NaT,0.0,0.0,
3,4,Netherlands,1998-05-01,150.0,54.0,0.360000,1999-05-01,75.0,20.0,0.266667
4,5,Norway,2001-09-01,165.0,59.0,0.357576,NaT,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...
9791,111,Cambodia,2018-07-01,125.0,26.0,0.208000,2018-02-01,62.0,10.0,0.161290
9792,112,Kyrgyzstan,2021-11-01,88.0,18.0,0.204545,NaT,0.0,0.0,
9793,116,Mauritius,2019-11-01,70.0,14.0,0.200000,NaT,0.0,0.0,
9794,118,Burkina Faso,2022-03-01,71.0,14.0,0.197183,NaT,0.0,0.0,


Finally, we used the lower_single_house_elections_year and upper_house_senate_elections_year columns to add two new columns that contain just the year of that election. These new columns will be used in the visualization file in order to join this dataset with the other dataset. 

In [12]:
# Add a year as a seperate column
drop_dupp_add_year = drop_dupp.copy()
drop_dupp_add_year['lower_single_house_elections_year'] = drop_dupp_add_year['lower_single_house_elections'].astype(str).apply(lambda x:x[:4])
drop_dupp_add_year['upper_house_senate_elections_year'] = drop_dupp_add_year['upper_house_senate_elections'].astype(str).apply(lambda x:x[:4])

In [13]:
# Save file to csv for use in analysis & visualizations
drop_dupp_add_year.to_csv('./data/world_data_final.csv', index=False)

### Obtaining Dataset 2: Democracy Index

This data was read in directly using pandas.read_html().

In [14]:
# Read in the data directly
url = 'https://en.wikipedia.org/wiki/Democracy_Index'
webdata = pd.read_html(url)
country_scores = webdata[5]
country_scores

Unnamed: 0,Region,2022 rank,Country,Regime type,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2008,2006
0,North America,12,Canada,Full democracy,8.88,8.87,9.24,9.22,9.15,9.15,9.15,9.08,9.08,9.08,9.08,9.08,9.08,9.07,9.07
1,North America,30,United States,Flawed democracy,7.85,7.85,7.92,7.96,7.96,7.98,7.98,8.05,8.11,8.11,8.11,8.11,8.18,8.22,8.22
2,Western Europe,20,Austria,Full democracy,8.20,8.07,8.16,8.29,8.29,8.42,8.41,8.54,8.54,8.48,8.62,8.49,8.49,8.49,8.69
3,Western Europe,36,Belgium,Flawed democracy,7.64,7.51,7.51,7.64,7.78,7.78,7.77,7.93,7.93,8.05,8.05,8.05,8.05,8.16,8.15
4,Western Europe,37,Cyprus,Flawed democracy,7.38,7.43,7.56,7.59,7.59,7.59,7.65,7.53,7.40,7.29,7.29,7.29,7.29,7.70,7.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,Sub-Saharan Africa,92,Tanzania,Hybrid regime,5.10,5.10,5.10,5.16,5.41,5.47,5.76,5.58,5.77,5.77,5.88,5.64,5.64,5.28,5.18
163,Sub-Saharan Africa,130,Togo,Authoritarian,2.99,2.80,2.80,3.30,3.10,3.05,3.32,3.41,3.45,3.45,3.45,3.45,3.45,2.43,1.75
164,Sub-Saharan Africa,99,Uganda,Hybrid regime,4.55,4.48,4.94,5.02,5.20,5.09,5.26,5.22,5.22,5.22,5.16,5.13,5.05,5.03,5.14
165,Sub-Saharan Africa,78,Zambia,Hybrid regime,5.80,5.72,4.86,5.09,5.61,5.68,5.99,6.28,6.39,6.26,6.26,6.19,5.68,5.25,5.25


### Data Cleaning & Manipulation: Democracy Index

In [15]:
# Rename columns to be consistent
country_scores.columns = [col.lower().replace(' ','_') for col in country_scores.columns]

In [16]:
# Add the 5 year mean 
country_scores['five_year_mean'] = country_scores[['2022', '2021', '2020', '2019', '2018']].mean(axis=1)

In [17]:
# Saving dataset to csv for use in analysis
country_scores.to_csv('data/democracy_index_data.csv', index=False)