In [1]:
import requests
import json
import datetime as dt
import time
import requests
import regex as re
import collections
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
from termcolor import colored

import os
from os import path
from wordcloud import WordCloud

import scipy.stats as stats
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Infrastructure EDA

### Goals
The goal of EDA here will be to examine which datasets are feasible for our problem at hand. We need a dataset that covers a wide range of countries and data for at least 20 years. Additionally we need to carefully examine the units for each indicator as we will most likely need to convert them to a standard unit. 

Once cleaned we will need to calculate the Coeffecient of Variation (COV) for each country across each of the indicators to see which countries have very unstable infrastructure and most likely political instability.

From there we will select countries that are specific to a region that also show a high COV in our disease datasets.

There is also the issue of multicolinearity between healthcare spending and the healthcare inputs of each dataset. I will likely just include healthcare spending since that dataset has the most robust data compared to the inputs dataset. Ideally we would like to have data on specific inputs for every country however countries that are relatively unstable politically and infrastructure wise have only recently started reporting these statistics.

In [2]:
%autosave 120

Autosaving every 120 seconds


In [3]:
pd.set_option('display.max_rows', 500)

In [4]:
infra = pd.read_csv('../Data/Infrastructure/infrastructure_1.csv')

In [5]:
health = pd.read_csv('../Data/Infrastructure/Health_1.csv')

In [6]:
hosp_count = pd.read_csv('../Data/Infrastructure/healthcare_hospitalscount.csv')

In [7]:
h_tech = pd.read_csv('../Data/Infrastructure/healthcare_tech.csv')

In [8]:
h_workers = pd.read_csv('../Data/Infrastructure/healthcare_workers.csv')

In [9]:
h_spend = pd.read_csv('../Data/Infrastructure/healthcare_expenditure.csv')

## Infrastructure 

In [10]:
infra.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,ICT goods exports (% of total goods exports),TX.VAL.ICTG.ZS.UN,,,,,,,...,0.404431,0.419583,0.713791,0.412533,0.93362,0.542872,1.038984,1.18568,,
1,Aruba,ABW,ICT goods imports (% total goods imports),TM.VAL.ICTG.ZS.UN,,,,,,,...,4.764945,4.627908,4.420786,5.268659,5.243941,5.734922,4.918148,4.85433,,
2,Aruba,ABW,Individuals using the Internet (% of population),IT.NET.USER.ZS,,,,,,,...,62.0,69.0,74.0,78.9,83.78,88.661227,93.542454,97.17,,
3,Aruba,ABW,Secure Internet servers (per 1 million people),IT.NET.SECR.P6,,,,,,,...,88.522559,127.393528,253.51014,426.526042,568.54318,757.132862,1172.858342,977.544939,1152.628844,
4,Aruba,ABW,Secure Internet servers,IT.NET.SECR,,,,,,,...,9.0,13.0,26.0,44.0,59.0,79.0,123.0,103.0,122.0,


In [11]:
#infra.dtypes

In [12]:
#counting nulls for each
#infra.isnull().sum()

In [13]:
infra['Country Name'].unique()

array(['Aruba', 'Afghanistan', 'Angola', 'Albania', 'Andorra',
       'Arab World', 'United Arab Emirates', 'Argentina', 'Armenia',
       'American Samoa', 'Antigua and Barbuda', 'Australia', 'Austria',
       'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso',
       'Bangladesh', 'Bulgaria', 'Bahrain', 'Bahamas, The',
       'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda',
       'Bolivia', 'Brazil', 'Barbados', 'Brunei Darussalam', 'Bhutan',
       'Botswana', 'Central African Republic', 'Canada',
       'Central Europe and the Baltics', 'Switzerland', 'Channel Islands',
       'Chile', 'China', "Cote d'Ivoire", 'Cameroon', 'Congo, Dem. Rep.',
       'Congo, Rep.', 'Colombia', 'Comoros', 'Cabo Verde', 'Costa Rica',
       'Caribbean small states', 'Cuba', 'Curacao', 'Cayman Islands',
       'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Dominica',
       'Denmark', 'Dominican Republic', 'Algeria',
       'East Asia & Pacific (excluding high income)',
       '

In [14]:
infra['Indicator Name'].unique()

array(['ICT goods exports (% of total goods exports)',
       'ICT goods imports (% total goods imports)',
       'Individuals using the Internet (% of population)',
       'Secure Internet servers (per 1 million people)',
       'Secure Internet servers',
       'Fixed broadband subscriptions (per 100 people)',
       'Fixed broadband subscriptions',
       'Fixed telephone subscriptions (per 100 people)',
       'Fixed telephone subscriptions',
       'Mobile cellular subscriptions (per 100 people)',
       'Mobile cellular subscriptions',
       'Container port traffic (TEU: 20 foot equivalent units)',
       'Liner shipping connectivity index (maximum value in 2004 = 100)',
       'Rail lines (total route-km)',
       'Railways, passengers carried (million passenger-km)',
       'Railways, goods transported (million ton-km)',
       'Air transport, passengers carried',
       'Air transport, freight (million ton-km)',
       'Air transport, registered carrier departures worldwide',

In [15]:
#dropping columns I don't need
infra.drop(axis = 0, columns = ['Country Code', 'Indicator Code'], inplace = True)

In [16]:
#Setting the index to Country Name
infra = infra.set_index('Country Name')

In [17]:
infra.head()

Unnamed: 0_level_0,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,ICT goods exports (% of total goods exports),,,,,,,,,,...,0.404431,0.419583,0.713791,0.412533,0.93362,0.542872,1.038984,1.18568,,
Aruba,ICT goods imports (% total goods imports),,,,,,,,,,...,4.764945,4.627908,4.420786,5.268659,5.243941,5.734922,4.918148,4.85433,,
Aruba,Individuals using the Internet (% of population),,,,,,,,,,...,62.0,69.0,74.0,78.9,83.78,88.661227,93.542454,97.17,,
Aruba,Secure Internet servers (per 1 million people),,,,,,,,,,...,88.522559,127.393528,253.51014,426.526042,568.54318,757.132862,1172.858342,977.544939,1152.628844,
Aruba,Secure Internet servers,,,,,,,,,,...,9.0,13.0,26.0,44.0,59.0,79.0,123.0,103.0,122.0,


In [18]:
infra.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/infra_clean.csv', index = True)

**Notes**: Lots of nulls for early data because they likely did not have the means to record such data in the 70s and 80s. Should also be noted there are a lot of indicators here, will likely have to handpick which ones I will use.

### Infrastructure COV

In [19]:
#finding total COV for a row, this does COV for each individual indicator
infra['COV'] = (infra.std(axis = 1))/(infra.mean(axis=1))

In [20]:
#we are grouping each country and summing all their COV for each indicator
infra_total_cov = infra.groupby('Country Name')['COV'].sum()

In [21]:
#converting to dataframe
infra_total_cov = pd.DataFrame(data=infra_total_cov)

In [22]:
infra_order = infra_total_cov.sort_values(by ='COV' , ascending=False)

In [23]:
infra_order.head(40)

Unnamed: 0_level_0,COV
Country Name,Unnamed: 1_level_1
Cambodia,38.780142
Indonesia,35.355332
China,35.069553
Cameroon,33.977386
Vietnam,33.07464
Brazil,33.037385
India,32.921431
Sri Lanka,32.328129
Nigeria,32.291814
Mozambique,31.382685


In [24]:
infra_order.tail(20)

Unnamed: 0_level_0,COV
Country Name,Unnamed: 1_level_1
British Virgin Islands,8.038518
Guam,7.938291
Monaco,7.891493
Puerto Rico,7.708636
Bermuda,7.37669
Faroe Islands,7.302297
Gibraltar,6.981661
Cayman Islands,6.809587
American Samoa,6.745444
Greenland,6.691395


## Healthcare Total Expenditure

In [25]:
h_spend.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,HEALTHEXP,TOT,PC_GDP,A,1971,4.547,
1,AUS,HEALTHEXP,TOT,PC_GDP,A,1972,4.547,
2,AUS,HEALTHEXP,TOT,PC_GDP,A,1973,4.511,
3,AUS,HEALTHEXP,TOT,PC_GDP,A,1974,5.112,
4,AUS,HEALTHEXP,TOT,PC_GDP,A,1975,5.76,


In [26]:
h_spend['Country Name'].unique()

KeyError: 'Country Name'

In [None]:
h_spend.dtypes

In [None]:
h_spend.isnull().sum()

In [None]:
h_spend.drop(axis = 0, columns = ['FREQUENCY', 'Flag Codes'], inplace = True)

In [None]:
h_spend = h_spend.set_index('LOCATION')

In [None]:
h_spend.head()

In [None]:
h_spend.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/hspend_clean.csv', index = True)

**Notes**: Going to have to conver the index 3 letter to actual country name since that's what the other datasets are using. Will be easy once I do country selection and my subset of countries is relatively small. Decided to leave in subject, indicator, and measure for convenience stack. Also a wide range of data for this dataset, going back to 1971 WITH values.

### Healthcare Spending COV

In [None]:
#grouping by location and taking the COV of all values for grouped countries
spend_cov = (h_spend.groupby('LOCATION')['Value'].std()) / (h_spend.groupby('LOCATION')['Value'].mean())
spend_cov = pd.DataFrame(data=spend_cov)

#renaming for readability
spend_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
spend_order = spend_cov.sort_values(by ='COV' , ascending=False)

In [None]:
#highest 50 COV
spend_order.head(20)

In [None]:
#lowest 50 COV
spend_order.tail(20)

## Health Indicators

In [None]:
health.head()

In [None]:
health.isnull().sum()

In [None]:
len(health['Indicator Name'].unique())

In [None]:
len(health['Country Name'].unique())

In [None]:
health['Indicator Name'].unique()

**Notes**: Most of the data that I want from this dataset can be found in the infrastructure dataset above. Therefore I will likely not be using this dataset.

## Hospital Counts

In [None]:
hosp_count.head()

In [None]:
hosp_count.dtypes

In [None]:
hosp_count['Country'].unique()

In [None]:
hosp_count.isnull().sum()

In [None]:
hosp_count['Year'].unique()

In [None]:
hosp_count.drop(axis = 0, columns = ['VAR', 'UNIT', 'Measure', 'COU', 'Flag Codes', 'Flags'], inplace = True)

In [None]:
hosp_count = hosp_count.set_index('Country')

In [None]:
hosp_count.head()

In [None]:
hosp_count.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/hospcount_clean.csv', index = True)

**Notes**: Pretty straightforward dataset, just counts of hospitals in a country over time. Will likely have the lowest COV among other datasets. Data goes back to 2000.

### Hospital COV

In [None]:
#grouping by location and taking the COV of all values for grouped countries
hosp_cov = (hosp_count.groupby('Country')['Value'].std()) / (hosp_count.groupby('Country')['Value'].mean())
hosp_cov = pd.DataFrame(data=hosp_cov)

#renaming for readability
hosp_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
hosp_order = hosp_cov.sort_values(by ='COV' , ascending=False)

In [None]:
#highest 50 COV
hosp_order.head(100)

In [None]:
#lowest 50 COV
hosp_order.tail(20)

## Medical Worker Counts

In [None]:
h_workers.head()

In [None]:
h_workers.dtypes

In [None]:
h_workers['Country'].unique()

In [None]:
h_workers.isnull().sum()

In [None]:
h_workers['Year'].unique()

In [None]:
h_workers.drop(axis = 0, columns = ['VAR', 'UNIT', 'Measure', 'COU', 'Flag Codes', 'Flags'], inplace = True)

In [None]:
h_workers = h_workers.set_index('Country')

In [None]:
h_workers.head()

In [None]:
h_workers.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/workers_clean.csv', index = True)

**Notes**: Again a fairly straight forward dataset, just counts for each type of graduates. Unfortunately values start in 2005, will have to figure out how to find data for 2000-2005. Will be feasable once I select my subset of countries.

### Medical Workers COV

In [None]:
#grouping by location and taking the COV of all values for grouped countries
workers_cov = (h_workers.groupby('Country')['Value'].std()) / (h_workers.groupby('Country')['Value'].mean())
workers_cov = pd.DataFrame(data=workers_cov)

#renaming for readability
workers_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
workers_order = workers_cov.sort_values(by ='COV' , ascending=False)

In [None]:
workers_order.head(40)

In [None]:
workers_order.tail(40)

## Medical Technology Counts

In [None]:
h_tech.head()

In [None]:
h_tech.dtypes

In [None]:
h_tech['Country'].unique

In [None]:
h_tech.isnull().sum()

In [None]:
h_tech['Year'].unique()

In [None]:
h_tech.drop(axis = 0, columns = ['VAR', 'UNIT', 'Measure', 'COU', 'Flag Codes', 'Flags'], inplace = True)

In [None]:
h_tech = h_tech.set_index('Country')

In [None]:
h_tech.head()

In [None]:
h_tech.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/tech_clean.csv', index = True)

**Notes**: Another count dataset with data from 2000 to now. 

### Medical Technology COV

In [None]:
#grouping by location and taking the COV of all values for grouped countries
tech_cov = (h_tech.groupby('Country')['Value'].std()) / (h_tech.groupby('Country')['Value'].mean())
tech_cov = pd.DataFrame(data=tech_cov)

#renaming for readability
tech_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
tech_order = tech_cov.sort_values(by ='COV' , ascending=False)

In [None]:
#highest 50 COV
tech_order.head(20)

In [None]:
#lowest 50 COV
tech_order.tail(20)

# COV Analysis Findings