# Data Scraping, Manipulation & Aggregation
Submitted by: Samantha Roska, Rebecca Hailperin-Lausch, and Samantha Russel

This file contains the steps we took to scrape, aggregate, and manipulate the data.

In [32]:
import numpy as np
import pandas as pd
import glob
import re
import warnings

In [29]:
def column_rename(col_name):
    '''Normalise a column label to snake_case.

    Surrounding whitespace is trimmed, every '%' becomes 'percent '
    (with a trailing space), every space becomes an underscore, and the
    result is lower-cased.
    '''
    trimmed = col_name.strip()
    return trimmed.replace('%', 'percent ').replace(' ', '_').lower()

### Obtaining Dataset 1: Women in Parliaments

A separate script was written to scrape the data from the web. This script is included in the zip file and project repository; it scrapes the data for each month of each year into individual csv files.

In [8]:
# This line uses the script to pull in the data from the IPU website and save each file to csv in the data/world_data/ folder.
# NOTE(review): the original note said this line was commented out because the script had already been run, but the
# command below is still active. Comment it out to avoid unnecessarily pulling from their website again.
!python scripts\obtaining_world_data.py

### Data Cleaning & Manipulation: Women in Parliaments

First, the separate csv files needed to be aggregated into one dataframe.

In [None]:
# Read every scraped csv and stack them into one dataframe.
# The number of leading rows skipped depends on the year embedded in the file name.
sheets = []

# The first run of four digits in the path is taken as the file's year.
pat = r'(\d{4})'
# sorted() keeps the concatenation order reproducible; glob's own order is not guaranteed.
for filename in sorted(glob.glob("./data/world_data/wd_*.csv")):
    ls = re.split(pat, filename)
    year = int(ls[1])
    if year > 2019:
        rows_to_skip = 5
    elif 2008 < year < 2019:
        rows_to_skip = 2
    else:
        # NOTE(review): files from 2019 itself land here together with 2008 and earlier,
        # because neither range above includes 2019 - confirm that is the intended layout.
        rows_to_skip = 1
    temp = pd.read_csv(filename, delimiter=',', skiprows=rows_to_skip, index_col=None, header=None)
    # Always append the frame that was just read (the 2009-2018 branch used to append a
    # stale frame left over from a previous iteration).
    sheets.append(temp)

df = pd.concat(sheets, axis=0, ignore_index=True)
# Column 10 is dropped so that the ten names below line up with the remaining columns.
df = df.drop(columns=[10])
df.columns = ['Rank','Country','Lower single House Elections','Lower single House Seats','Lower single House Women','Lower single House %W','Upper House Senate Elections', 'Upper House Senate Seats','Upper House Senate Women','Upper House Senate %W']
df.head(15)

In [None]:
# Cells matching the placeholder pattern ('---' or a question mark) are replaced with 0,
# then every column label is passed through column_rename so the names are consistent.
placeholder_pattern = r'---|\?'
text_transform_df = df.replace(regex=placeholder_pattern, value=0)
text_transform_df = text_transform_df.rename(columns=column_rename)

##### Series data conversion & manipulation

In [None]:
# Election dates written with '.' separators get spaces instead, so they share
# one format with the other dates (literal replacement, not a regex).
for election_col in ('lower_single_house_elections', 'upper_house_senate_elections'):
    text_transform_df[election_col] = text_transform_df[election_col].str.replace('.', ' ', regex=False)

In [None]:
# Clean the country names and parse both election-date columns into datetimes.
# All warnings raised inside this block are silenced.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    date_tranform_df = text_transform_df.copy()
    # Remove parentheses, asterisks and digits from the country names.
    # NOTE(review): surrounding whitespace is not trimmed here - check whether that matters for later joins.
    date_tranform_df['country'] = date_tranform_df['country'].replace(regex=r'\(|\)|\*|\d', value='')

    # errors='coerce' turns values that cannot be parsed into NaT instead of raising.
    for date_col in ('lower_single_house_elections', 'upper_house_senate_elections'):
        date_tranform_df[date_col] = pd.to_datetime(date_tranform_df[date_col], errors='coerce', infer_datetime_format=True)


##### Conversion of numeric fields

In [None]:
# Convert the seat and women counts to numeric columns.
num_transform_df = date_tranform_df.copy()

count_columns = [
    'lower_single_house_seats',
    'lower_single_house_women',
    'upper_house_senate_seats',
    'upper_house_senate_women',
]
for count_col in count_columns:
    # r'\D' (raw string, so the backslash is not an invalid escape) strips every
    # non-digit character from string cells before the numeric conversion.
    digits_only = num_transform_df[count_col].replace(regex=r'\D', value='')
    num_transform_df[count_col] = pd.to_numeric(digits_only)

# Fill missing upper-house counts with zeros. The result is assigned back to the
# frame instead of calling fillna(inplace=True) on an attribute-accessed column,
# which is chained assignment and does not update the frame under copy-on-write.
for senate_col in ('upper_house_senate_women', 'upper_house_senate_seats'):
    num_transform_df[senate_col] = num_transform_df[senate_col].fillna(0)

In [None]:
# Fill missing upper-house counts with zeros. The filled column is assigned back
# explicitly: fillna(inplace=True) on an attribute-accessed column is chained
# assignment and does not update the frame under copy-on-write.
num_transform_df['upper_house_senate_women'] = num_transform_df['upper_house_senate_women'].fillna(0)
num_transform_df['upper_house_senate_seats'] = num_transform_df['upper_house_senate_seats'].fillna(0)

In [None]:
# Share of women per chamber = women / seats. The value is stored as a fraction
# (it is not multiplied by 100).
lower_share = num_transform_df['lower_single_house_women'].div(num_transform_df['lower_single_house_seats'])
upper_share = num_transform_df['upper_house_senate_women'].div(num_transform_df['upper_house_senate_seats'])
num_transform_df['lower_single_house_percent_w'] = lower_share
num_transform_df['upper_single_house_percent_w'] = upper_share

We checked that all the columns are in the correct format.

In [None]:
# Show the dtype of every column after the date and numeric conversions
num_transform_df.dtypes

In [None]:
# Drop duplicate rows and renumber the index. reset_index returns a new frame,
# so its result has to be kept for the renumbering to take effect.
drop_dupp = num_transform_df.drop_duplicates().reset_index(drop=True)
drop_dupp

In [None]:
# Add the election year as a separate column for each chamber
drop_dupp_add_year = drop_dupp.copy()
for source_col in ('lower_single_house_elections', 'upper_house_senate_elections'):
    # The year is the first four characters of each value's string form.
    # NOTE(review): a missing date presumably stringifies to 'NaT', which would be kept as-is - confirm.
    as_text = drop_dupp_add_year[source_col].astype(str)
    drop_dupp_add_year[source_col + '_year'] = as_text.apply(lambda text: text[:4])

In [None]:
# Save file to csv for use in analysis & visualizations
# (index=False leaves the row index out of the written file)
drop_dupp_add_year.to_csv('./data/world_data_final.csv', index=False)

### Obtaining Dataset 2: Democracy Index

This data was read in directly using pandas.read_html().

In [None]:
# Read in the data directly
url = 'https://en.wikipedia.org/wiki/Democracy_Index'
# read_html returns a list with one DataFrame per HTML table found on the page
webdata = pd.read_html(url)
# NOTE(review): index 5 is tied to the page's table order at the time of writing -
# confirm it still selects the per-country scores table.
country_scores = webdata[5]
country_scores

### Data Cleaning & Manipulation: Democracy Index

In [None]:
# Make the column labels consistent: lower-case, with underscores instead of spaces
country_scores.columns = country_scores.columns.map(
    lambda label: label.lower().replace(' ', '_')
)

In [None]:
# Add the 5 year mean: row-wise average of the 2018-2022 score columns
recent_years = ['2022', '2021', '2020', '2019', '2018']
recent_scores = country_scores[recent_years]
country_scores['five_year_mean'] = recent_scores.mean(axis=1)

In [None]:
# Saving dataset to csv for use in analysis
# (index=False leaves the row index out of the written file)
country_scores.to_csv('data/democracy_index_data.csv', index=False)

#### Data Exploration

First, we looked at the number of countries grouped by regime type. Full Democracies had the fewest number of countries. 

In [None]:
# Count of non-null values in every column, grouped by regime type
country_scores.groupby(['regime_type']).count()

Next, we filtered the data to look at only democracies (flawed and full).

In [None]:
# Keep the rows whose five-year mean score is above 6
is_democracy = country_scores['five_year_mean'] > 6
democracies = country_scores.loc[is_democracy]
# Report how many distinct country names remain
distinct_countries = set(democracies['country'])
print(f'Number of democracies in dataset: {len(distinct_countries)}')
democracies

## Joining the two datasets together

In order to run the correlation and produce some of the visualizations, the datasets need to be joined together into one dataframe.

In [None]:
# Read in democracy index data
democracy_index = pd.read_csv('data/democracy_index_data.csv')
# Read in the percent women in parliaments data
women_in_parliments =  pd.read_csv('data/world_data_final.csv')

First the Democracy Index dataset needs to be converted from wide to long. 

In [None]:
# Display the wide-format frame (one column per year) before it is melted
democracy_index

In [None]:
# Year columns to unpivot: 2006, 2008, then every year from 2010 through 2022
years_list = ['2006', '2008'] + [str(year) for year in range(2010, 2023)]
# Melt dataframe from wide to long: one row per (country, year) with its score
democracy_index_melted = pd.melt(
    democracy_index,
    id_vars=['region','2022_rank','country','regime_type','five_year_mean'],
    value_vars=years_list,
    var_name='year',
    value_name='democracy_index_score',
)

Because many countries do not have two legislative bodies, there is significantly more missing data for the upper house. For this reason, we will focus on the lower house data only. 

In [None]:
# Keep the country plus the election-year and share-of-women columns, and rename
# the lower-house ones to the names used for the join ('year', 'percent_W').
# The election-year column names match the ones this notebook writes to
# world_data_final.csv: 'lower_single_house_elections_year' and
# 'upper_house_senate_elections_year'.
women_lower_house = women_in_parliments[[
    'country',
    'lower_single_house_elections_year',
    'lower_single_house_percent_w',
    'upper_house_senate_elections_year',
    'upper_single_house_percent_w',
]]
women_lower_house = women_lower_house.rename(
    {'lower_single_house_elections_year': 'year', 'lower_single_house_percent_w': 'percent_W'},
    axis=1,
)
# The melted democracy index carries its years as strings (they come from column
# labels), so the election year is normalised to a 4-digit string as well.
# Values that are not a number (e.g. a missing election date) become NaN and
# will simply not match any year in a join.
election_year = pd.to_numeric(women_lower_house['year'], errors='coerce')
women_lower_house = women_lower_house.assign(
    year=election_year.map(lambda value: str(int(value)), na_action='ignore').astype(object)
)

In [None]:
# Join the two datasets on country and year (merge's default inner join, so only
# country/year pairs present in both frames are kept)
join_keys = ['country', 'year']
joined_df = pd.merge(democracy_index_melted, women_lower_house, on=join_keys)

In [None]:
# Display the merged result
joined_df

In [None]:
# Saving dataset to csv for use in analysis
# (index=False leaves the row index out of the written file)
joined_df.to_csv('data/joined_dataset.csv',index=False)