In [2]:
import scipy.stats as stats
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Table of Contents
- [Reading in Data](#read) 
- [Disease EDA & Cleaning](#eda)
    - [Cholera EDA & Cleaning](#cholera)
    - [Ebola EDA & Cleaning](#ebola)
    - [Malaria EDA & Cleaning](#malaria)
    - [Meningitis EDA & Cleaning](#meningitis)
    - [Tuberculosis EDA & Cleaning](#tuberculosis)
    - [Zika EDA & Cleaning](#zika)
    - [Tetanus EDA & Cleaning](#tetanus)
    - [Rubella EDA & Cleaning](#rubella)
    - [Pertussis EDA & Cleaning](#pert)
    - [Mumps EDA & Cleaning](#mumps)
    - [Measles EDA & Cleaning](#measles)
- [EDA Findings](#find)
- [Selecting Countries for Modeling](#select)
    - [Cholera COV](#chol_cov)
    - [Malaria COV](#malaria_cov)
    - [Meningitis COV](#men_cov)
    - [Tuberculosis COV](#tb_cov)
    - [Tetanus COV](#tet_cov)
    - [Rubella COV](#rub_cov)
    - [Pertussis COV](#pert_cov)
    - [Mumps COV](#mumps_cov)
    - [Measles COV](#measles_cov)
    
- [COV Findings and Country Selection](#select2)

# Infrastructure EDA

The goal of EDA here will be to examine which datasets are feasible for our problem at hand. We need a dataset that covers a wide range of countries and data for at least 20 years. Additionally we need to carefully examine the units for each indicator as we will most likely need to convert them to a standard unit. 

Once cleaned we will calculate the Coeffecient of Variation (COV) for our infrastructure metrics over time for each country. Countries with high COV may be politically unstable or have exhibited a large change in spending and have drastically changing budgets, whereas countries with low COV will have very stable governments. The ambiguity in this metric will mean that it will not be weighted heavily when it comes to selecting countries, but rather to guage the stability/volatility of a country relative to other countries.

There is also the issue of multicolinearity between healthcare spending and the healthcare inputs of each dataset. I will likely just include healthcare spending since that dataset has the most robust data compared to the inputs dataset. Ideally we would like to have data on specific inputs for every country however countries that are relatively unstable politically and infrastructure wise have only recently started reporting these statistics.

In [2]:
%autosave 120

Autosaving every 120 seconds


In [3]:
pd.set_option('display.max_rows', 500)

In [4]:
infra = pd.read_csv('../Data/Infrastructure/infrastructure_1.csv')

In [5]:
health = pd.read_csv('../Data/Infrastructure/Health_1.csv')

In [6]:
hosp_count = pd.read_csv('../Data/Infrastructure/healthcare_hospitalscount.csv')

In [7]:
doc_count = pd.read_csv('../Data/Infrastructure/docs_per1000.csv')

In [8]:
nurse_count = pd.read_csv('../Data/Infrastructure/nurses_per1000.csv')

In [9]:
pharm_spend = pd.read_csv('../Data/Infrastructure/pharm_uspercap.csv')

In [10]:
h_tech = pd.read_csv('../Data/Infrastructure/healthcare_tech.csv')

In [11]:
h_workers = pd.read_csv('../Data/Infrastructure/healthcare_workers.csv')

In [12]:
h_spend = pd.read_csv('../Data/Infrastructure/healthcare_expenditure.csv')

## Infrastructure 

#### Varying Units

In [13]:
infra.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,ICT goods exports (% of total goods exports),TX.VAL.ICTG.ZS.UN,,,,,,,...,0.404431,0.419583,0.713791,0.412533,0.93362,0.542872,1.038984,1.18568,,
1,Aruba,ABW,ICT goods imports (% total goods imports),TM.VAL.ICTG.ZS.UN,,,,,,,...,4.764945,4.627908,4.420786,5.268659,5.243941,5.734922,4.918148,4.85433,,
2,Aruba,ABW,Individuals using the Internet (% of population),IT.NET.USER.ZS,,,,,,,...,62.0,69.0,74.0,78.9,83.78,88.661227,93.542454,97.17,,
3,Aruba,ABW,Secure Internet servers (per 1 million people),IT.NET.SECR.P6,,,,,,,...,88.522559,127.393528,253.51014,426.526042,568.54318,757.132862,1172.858342,977.544939,1152.628844,
4,Aruba,ABW,Secure Internet servers,IT.NET.SECR,,,,,,,...,9.0,13.0,26.0,44.0,59.0,79.0,123.0,103.0,122.0,


In [14]:
#infra.dtypes

In [15]:
#counting nulls for each
#infra.isnull().sum()

In [4]:
#checked unique countries
#infra['Country Name'].unique()

In [3]:
#checked unique indicators
#infra['Indicator Name'].unique()

In [18]:
#dropping columns I don't need
infra.drop(axis = 0, columns = ['Country Code', 'Indicator Code'], inplace = True)

In [19]:
#Setting the index to Country Name
infra = infra.set_index('Country Name')

In [20]:
infra.head()

Unnamed: 0_level_0,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,ICT goods exports (% of total goods exports),,,,,,,,,,...,0.404431,0.419583,0.713791,0.412533,0.93362,0.542872,1.038984,1.18568,,
Aruba,ICT goods imports (% total goods imports),,,,,,,,,,...,4.764945,4.627908,4.420786,5.268659,5.243941,5.734922,4.918148,4.85433,,
Aruba,Individuals using the Internet (% of population),,,,,,,,,,...,62.0,69.0,74.0,78.9,83.78,88.661227,93.542454,97.17,,
Aruba,Secure Internet servers (per 1 million people),,,,,,,,,,...,88.522559,127.393528,253.51014,426.526042,568.54318,757.132862,1172.858342,977.544939,1152.628844,
Aruba,Secure Internet servers,,,,,,,,,,...,9.0,13.0,26.0,44.0,59.0,79.0,123.0,103.0,122.0,


In [21]:
infra.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/infra_clean.csv', index = True)

**Notes**: Lots of nulls for early data because they likely did not have the means to record such data in the 70s and 80s. Should also be noted there are a lot of indicators here, will likely have to handpick which ones I will use.

### Infrastructure COV

In [22]:
#finding total COV for a row, this does COV for each individual indicator
infra['COV'] = (infra.std(axis = 1))/(infra.mean(axis=1))

In [23]:
#we are grouping each country and summing all their COV for each indicator
infra_total_cov = infra.groupby('Country Name')['COV'].sum()

In [24]:
#converting to dataframe
infra_total_cov = pd.DataFrame(data=infra_total_cov)

In [25]:
infra_order = infra_total_cov.sort_values(by ='COV' , ascending=False)

In [26]:
infra_order

Unnamed: 0_level_0,COV
Country Name,Unnamed: 1_level_1
Cambodia,38.780142
Indonesia,35.355332
China,35.069553
Cameroon,33.977386
Vietnam,33.07464
Brazil,33.037385
India,32.921431
Sri Lanka,32.328129
Nigeria,32.291814
Mozambique,31.382685


In [27]:
infra_order.to_csv('../Data/Infrastructure/infrastructure_cov/infra_cov.csv', index = True)

## Healthcare Total Expenditure

#### Percent GDP

In [28]:
h_spend.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,HEALTHEXP,TOT,PC_GDP,A,1971,4.547,
1,AUS,HEALTHEXP,TOT,PC_GDP,A,1972,4.547,
2,AUS,HEALTHEXP,TOT,PC_GDP,A,1973,4.511,
3,AUS,HEALTHEXP,TOT,PC_GDP,A,1974,5.112,
4,AUS,HEALTHEXP,TOT,PC_GDP,A,1975,5.76,


In [29]:
h_spend['LOCATION'].unique()

array(['AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU',
       'GRC', 'HUN', 'ISL', 'IRL', 'ITA', 'JPN', 'KOR', 'LUX', 'MEX',
       'NLD', 'NZL', 'NOR', 'POL', 'PRT', 'SVK', 'ESP', 'SWE', 'CHE',
       'TUR', 'GBR', 'USA', 'BRA', 'CHL', 'CHN', 'EST', 'IND', 'IDN',
       'ISR', 'RUS', 'SVN', 'ZAF', 'COL', 'LVA', 'LTU', 'OAVG', 'CRI',
       'BGR', 'HRV', 'CYP', 'MLT', 'ROU'], dtype=object)

In [30]:
h_spend.dtypes

LOCATION       object
INDICATOR      object
SUBJECT        object
MEASURE        object
FREQUENCY      object
TIME            int64
Value         float64
Flag Codes     object
dtype: object

In [31]:
h_spend.isnull().sum()

LOCATION          0
INDICATOR         0
SUBJECT           0
MEASURE           0
FREQUENCY         0
TIME              0
Value             0
Flag Codes    13859
dtype: int64

In [32]:
h_spend.drop(axis = 0, columns = ['FREQUENCY', 'Flag Codes'], inplace = True)

In [33]:
h_spend = h_spend.set_index('LOCATION')

In [34]:
h_spend.head()

Unnamed: 0_level_0,INDICATOR,SUBJECT,MEASURE,TIME,Value
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AUS,HEALTHEXP,TOT,PC_GDP,1971,4.547
AUS,HEALTHEXP,TOT,PC_GDP,1972,4.547
AUS,HEALTHEXP,TOT,PC_GDP,1973,4.511
AUS,HEALTHEXP,TOT,PC_GDP,1974,5.112
AUS,HEALTHEXP,TOT,PC_GDP,1975,5.76


In [35]:
h_spend.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/hspend_clean.csv', index = True)

**Notes**: Going to have to conver the index 3 letter to actual country name since that's what the other datasets are using. Will be easy once I do country selection and my subset of countries is relatively small. Decided to leave in subject, indicator, and measure for convenience stack. Also a wide range of data for this dataset, going back to 1971 WITH values.

### Healthcare Spending COV

In [36]:
#grouping by location and taking the COV of all values for grouped countries
spend_cov = (h_spend.groupby('LOCATION')['Value'].std()) / (h_spend.groupby('LOCATION')['Value'].mean())
spend_cov = pd.DataFrame(data=spend_cov)

#renaming for readability
spend_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
spend_order = spend_cov.sort_values(by ='COV' , ascending=False)

In [37]:
spend_order.to_csv('../Data/Infrastructure/infrastructure_cov/spend_cov.csv', index = True)

In [38]:
#highest 50 COV
spend_order

Unnamed: 0_level_0,COV
LOCATION,Unnamed: 1_level_1
LUX,2.375354
IRL,2.2718
NOR,2.222062
GBR,2.213435
USA,2.208883
DNK,2.194189
KOR,2.179606
DEU,2.175834
JPN,2.147319
FIN,2.127337


# Pharmaceutical Spending

#### US Dollars / Capita

In [39]:
pharm_spend.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,PHARMAEXP,TOT,USD_CAP,A,1998,258.948,
1,AUS,PHARMAEXP,TOT,USD_CAP,A,1999,287.245,
2,AUS,PHARMAEXP,TOT,USD_CAP,A,2000,338.16,
3,AUS,PHARMAEXP,TOT,USD_CAP,A,2001,367.652,
4,AUS,PHARMAEXP,TOT,USD_CAP,A,2002,381.433,


In [40]:
pharm_spend.isnull().sum()

LOCATION        0
INDICATOR       0
SUBJECT         0
MEASURE         0
FREQUENCY       0
TIME            0
Value           0
Flag Codes    613
dtype: int64

In [41]:
pharm_spend['LOCATION'].unique()

array(['AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU',
       'GRC', 'HUN', 'ISL', 'IRL', 'ITA', 'JPN', 'KOR', 'LUX', 'MEX',
       'NLD', 'NZL', 'NOR', 'POL', 'PRT', 'SVK', 'ESP', 'SWE', 'CHE',
       'TUR', 'GBR', 'USA', 'EST', 'ISR', 'SVN', 'LVA', 'LTU', 'BGR',
       'HRV', 'CYP', 'ROU', 'RUS', 'CRI', 'MLT'], dtype=object)

In [42]:
pharm_spend['TIME'].unique()

array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

In [43]:
pharm_spend.drop(axis = 0, columns = ['SUBJECT', 'MEASURE', 'FREQUENCY', 'Flag Codes'], inplace = True)

In [44]:
pharm_spend.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/pharm_spend_clean.csv', index = True)

### Pharmaceutical Spending COV

In [45]:
#grouping by location and taking the COV of all values for grouped countries
pharm_cov = (pharm_spend.groupby('LOCATION')['Value'].std()) / (pharm_spend.groupby('LOCATION')['Value'].mean())
pharm_cov = pd.DataFrame(data=pharm_cov)

#renaming for readability
pharm_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
pharm_order = spend_cov.sort_values(by ='COV' , ascending=False)

In [46]:
pharm_order.to_csv('../Data/Infrastructure/infrastructure_cov/pharm_cov.csv', index = True)

In [47]:
pharm_order.head()

Unnamed: 0_level_0,COV
LOCATION,Unnamed: 1_level_1
LUX,2.375354
IRL,2.2718
NOR,2.222062
GBR,2.213435
USA,2.208883


## Health Indicators

#### Varying Units

In [48]:
health.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,Unmet need for contraception (% of married wom...,SP.UWT.TFRT,,,,,,,...,,,,,,,,,,
1,Aruba,ABW,Completeness of death registration with cause-...,SP.REG.DTHS.ZS,,,,,,,...,,,,,,,,,,
2,Aruba,ABW,Completeness of birth registration (%),SP.REG.BRTH.ZS,,,,,,,...,,,,,,,,,,
3,Aruba,ABW,"Completeness of birth registration, urban (%)",SP.REG.BRTH.UR.ZS,,,,,,,...,,,,,,,,,,
4,Aruba,ABW,"Completeness of birth registration, rural (%)",SP.REG.BRTH.RU.ZS,,,,,,,...,,,,,,,,,,


In [49]:
health.isnull().sum()

Country Name          0
Country Code          0
Indicator Name        0
Indicator Code        0
1960              47898
1961              48327
1962              47847
1963              48301
1964              48273
1965              47907
1966              48214
1967              47728
1968              48172
1969              48119
1970              47511
1971              48023
1972              47543
1973              47996
1974              47960
1975              47556
1976              47903
1977              47409
1978              47862
1979              47851
1980              47173
1981              47338
1982              46901
1983              47298
1984              47172
1985              46797
1986              46774
1987              46149
1988              46821
1989              46654
1990              41141
1991              43573
1992              42788
1993              43283
1994              43355
1995              42505
1996              42936
1997            

In [50]:
len(health['Indicator Name'].unique())

250

In [51]:
len(health['Country Name'].unique())

264

In [5]:
#checked unique indicators
#health['Indicator Name'].unique()

**Notes**: Most of the data that I want from this dataset can be found in the infrastructure dataset above. Therefore I will likely not be using this dataset.

## Hospital Counts

#### Raw Counts

In [53]:
hosp_count.head()

Unnamed: 0,VAR,Variable,UNIT,Measure,COU,Country,YEA,Year,Value,Flag Codes,Flags
0,HOSPTHOS,Hospitals,NOMBRENB,Number,AUS,Australia,2000,2000,1265.0,,
1,HOSPTHOS,Hospitals,NOMBRENB,Number,AUS,Australia,2001,2001,1283.0,,
2,HOSPTHOS,Hospitals,NOMBRENB,Number,AUS,Australia,2002,2002,1284.0,,
3,HOSPTHOS,Hospitals,NOMBRENB,Number,AUS,Australia,2003,2003,1286.0,,
4,HOSPTHOS,Hospitals,NOMBRENB,Number,AUS,Australia,2004,2004,1291.0,,


In [54]:
hosp_count.dtypes

VAR            object
Variable       object
UNIT           object
Measure        object
COU            object
Country        object
YEA             int64
Year            int64
Value         float64
Flag Codes     object
Flags          object
dtype: object

In [55]:
hosp_count['Country'].unique()

array(['Australia', 'Austria', 'Belgium', 'Canada', 'Czech Republic',
       'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland',
       'Ireland', 'Italy', 'Japan', 'Korea', 'Luxembourg', 'Mexico',
       'Netherlands', 'New Zealand', 'Norway', 'Poland', 'Portugal',
       'Slovak Republic', 'Spain', 'Sweden', 'Switzerland', 'Turkey',
       'United Kingdom', 'United States', 'Chile', 'Estonia', 'Israel',
       'Slovenia', 'Lithuania', 'Latvia'], dtype=object)

In [56]:
hosp_count.isnull().sum()

VAR              0
Variable         0
UNIT             0
Measure          0
COU              0
Country          0
YEA              0
Year             0
Value            0
Flag Codes    4306
Flags         4306
dtype: int64

In [57]:
hosp_count['Year'].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

In [58]:
hosp_count.drop(axis = 0, columns = ['VAR', 'UNIT', 'Measure', 'COU', 'YEA', 'Flag Codes', 'Flags'], inplace = True)

In [59]:
hosp_count = hosp_count.set_index('Country')

In [60]:
hosp_count.head()

Unnamed: 0_level_0,Variable,Year,Value
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,Hospitals,2000,1265.0
Australia,Hospitals,2001,1283.0
Australia,Hospitals,2002,1284.0
Australia,Hospitals,2003,1286.0
Australia,Hospitals,2004,1291.0


In [61]:
hosp_count.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/hospcount_clean.csv', index = True)

**Notes**: Pretty straightforward dataset, just counts of hospitals in a country over time. Will likely have the lowest COV among other datasets. Data goes back to 2000.

### Hospital COV

In [62]:
#grouping by location and taking the COV of all values for grouped countries
hosp_cov = (hosp_count.groupby('Country')['Value'].std()) / (hosp_count.groupby('Country')['Value'].mean())
hosp_cov = pd.DataFrame(data=hosp_cov)

#renaming for readability
hosp_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
hosp_order = hosp_cov.sort_values(by ='COV' , ascending=False)

In [63]:
hosp_order.to_csv('../Data/Infrastructure/infrastructure_cov/hosp_cov.csv', index = True)

In [64]:
#highest 50 COV
hosp_order

Unnamed: 0_level_0,COV
Country,Unnamed: 1_level_1
Netherlands,1.449967
Korea,1.440981
Mexico,1.361906
Turkey,1.339239
Japan,1.235737
Italy,1.230184
United States,1.222081
Germany,1.220496
Spain,1.206731
France,1.206625


## Medical Worker Counts

#### Raw Counts

In [65]:
h_workers.head()

Unnamed: 0,VAR,Variable,UNIT,Measure,COU,Country,YEA,Year,Value,Flag Codes,Flags
0,HEDUMEGR,Medical graduates,NOMBRENB,Number,AUS,Australia,2005,2005,1798.0,,
1,HEDUMEGR,Medical graduates,NOMBRENB,Number,AUS,Australia,2006,2006,1884.0,,
2,HEDUMEGR,Medical graduates,NOMBRENB,Number,AUS,Australia,2007,2007,2117.0,,
3,HEDUMEGR,Medical graduates,NOMBRENB,Number,AUS,Australia,2008,2008,2389.0,,
4,HEDUMEGR,Medical graduates,NOMBRENB,Number,AUS,Australia,2009,2009,2361.0,,


In [66]:
h_workers.dtypes

VAR            object
Variable       object
UNIT           object
Measure        object
COU            object
Country        object
YEA             int64
Year            int64
Value         float64
Flag Codes     object
Flags          object
dtype: object

In [67]:
h_workers['Country'].unique()

array(['Australia', 'Austria', 'Belgium', 'Canada', 'Czech Republic',
       'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Iceland', 'Ireland', 'Italy', 'Japan', 'Korea', 'Luxembourg',
       'Mexico', 'Netherlands', 'New Zealand', 'Norway', 'Poland',
       'Portugal', 'Slovak Republic', 'Spain', 'Sweden', 'Switzerland',
       'Turkey', 'United Kingdom', 'United States', 'Brazil', 'Chile',
       "China (People's Republic of)", 'Estonia', 'India', 'Indonesia',
       'Israel', 'Russia', 'Slovenia', 'South Africa', 'Lithuania',
       'Latvia', 'Colombia', 'Costa Rica'], dtype=object)

In [68]:
h_workers.isnull().sum()

VAR               0
Variable          0
UNIT              0
Measure           0
COU               0
Country           0
YEA               0
Year              0
Value             2
Flag Codes    79520
Flags         79520
dtype: int64

In [69]:
h_workers['Year'].unique()

array([2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018])

In [70]:
h_workers.drop(axis = 0, columns = ['VAR', 'UNIT', 'Measure', 'COU', 'Flag Codes', 'Flags'], inplace = True)

In [71]:
h_workers = h_workers.set_index('Country')

In [72]:
h_workers.head()

Unnamed: 0_level_0,Variable,YEA,Year,Value
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,Medical graduates,2005,2005,1798.0
Australia,Medical graduates,2006,2006,1884.0
Australia,Medical graduates,2007,2007,2117.0
Australia,Medical graduates,2008,2008,2389.0
Australia,Medical graduates,2009,2009,2361.0


In [73]:
h_workers.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/workers_clean.csv', index = True)

**Notes**: Again a fairly straight forward dataset, just counts for each type of graduates. Unfortunately values start in 2005, will have to figure out how to find data for 2000-2005. Will be feasable once I select my subset of countries.

### Medical Workers COV

In [74]:
#grouping by location and taking the COV of all values for grouped countries
workers_cov = (h_workers.groupby('Country')['Value'].std()) / (h_workers.groupby('Country')['Value'].mean())
workers_cov = pd.DataFrame(data=workers_cov)

#renaming for readability
workers_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
workers_order = workers_cov.sort_values(by ='COV' , ascending=False)

In [75]:
workers_order.to_csv('../Data/Infrastructure/infrastructure_cov/worker_cov.csv', index = True)

In [76]:
workers_order

Unnamed: 0_level_0,COV
Country,Unnamed: 1_level_1
Iceland,6.827375
Chile,6.824575
Hungary,5.387268
Denmark,5.193154
United Kingdom,4.898067
Sweden,4.842242
Switzerland,4.348566
United States,4.346516
France,4.336614
Norway,4.222294


# Doctor Counts

#### Per 1000

In [77]:
doc_count.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,MEDICALDOC,TOT,1000HAB,A,1999,2.45,
1,AUS,MEDICALDOC,TOT,1000HAB,A,2000,2.49,
2,AUS,MEDICALDOC,TOT,1000HAB,A,2001,2.56,
3,AUS,MEDICALDOC,TOT,1000HAB,A,2002,2.56,
4,AUS,MEDICALDOC,TOT,1000HAB,A,2003,2.63,


In [78]:
doc_count.isnull().sum()

LOCATION        0
INDICATOR       0
SUBJECT         0
MEASURE         0
FREQUENCY       0
TIME            0
Value           0
Flag Codes    571
dtype: int64

In [79]:
doc_count['LOCATION'].unique()

array(['AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU',
       'HUN', 'ISL', 'IRL', 'ITA', 'JPN', 'KOR', 'LUX', 'MEX', 'NLD',
       'NZL', 'NOR', 'POL', 'SVK', 'ESP', 'SWE', 'CHE', 'TUR', 'GBR',
       'USA', 'BRA', 'CHN', 'EST', 'IND', 'IDN', 'ISR', 'RUS', 'SVN',
       'ZAF', 'COL', 'LVA', 'LTU'], dtype=object)

In [80]:
doc_count['TIME'].unique()

array([1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2010, 2018])

In [81]:
doc_count.drop(axis = 0, columns = ['SUBJECT', 'MEASURE', 'FREQUENCY', 'Flag Codes'], inplace = True)

In [82]:
doc_count.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/doc_count_clean.csv', index = True)

### Doctor Count COV

In [83]:
#grouping by location and taking the COV of all values for grouped countries
doc_cov = (doc_count.groupby('LOCATION')['Value'].std()) / (doc_count.groupby('LOCATION')['Value'].mean())
doc_cov = pd.DataFrame(data=doc_cov)

#renaming for readability
doc_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
doc_order = spend_cov.sort_values(by ='COV' , ascending=False)

In [84]:
doc_order.to_csv('../Data/Infrastructure/infrastructure_cov/doc_cov.csv', index = True)

In [85]:
doc_order.head()

Unnamed: 0_level_0,COV
LOCATION,Unnamed: 1_level_1
LUX,2.375354
IRL,2.2718
NOR,2.222062
GBR,2.213435
USA,2.208883


# Nurse Counts

#### Per 1000

In [86]:
nurse_count.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,NURSE,TOT,1000HAB,A,1998,10.3,
1,AUS,NURSE,TOT,1000HAB,A,1999,10.17,
2,AUS,NURSE,TOT,1000HAB,A,2000,10.07,
3,AUS,NURSE,TOT,1000HAB,A,2001,9.95,
4,AUS,NURSE,TOT,1000HAB,A,2002,9.94,


In [87]:
nurse_count.isnull().sum()

LOCATION        0
INDICATOR       0
SUBJECT         0
MEASURE         0
FREQUENCY       0
TIME            0
Value           0
Flag Codes    603
dtype: int64

In [88]:
nurse_count['LOCATION'].unique()

array(['AUS', 'AUT', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU', 'GRC',
       'HUN', 'ISL', 'IRL', 'JPN', 'KOR', 'LUX', 'MEX', 'NLD', 'NZL',
       'NOR', 'POL', 'PRT', 'SVK', 'ESP', 'CHE', 'TUR', 'GBR', 'USA',
       'BRA', 'CHN', 'EST', 'IND', 'IDN', 'ISR', 'RUS', 'SVN', 'ZAF',
       'ITA', 'BEL', 'SWE', 'COL', 'LVA', 'LTU'], dtype=object)

In [89]:
nurse_count['TIME'].unique()

array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2006, 2010, 2018])

In [90]:
nurse_count.drop(axis = 0, columns = ['SUBJECT', 'MEASURE', 'FREQUENCY', 'Flag Codes'], inplace = True)

In [91]:
nurse_count.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/nurse_count_clean.csv', index = True)

### Nurse Count COV

In [92]:
#grouping by location and taking the COV of all values for grouped countries
nurse_cov = (nurse_count.groupby('LOCATION')['Value'].std()) / (nurse_count.groupby('LOCATION')['Value'].mean())
nurse_cov = pd.DataFrame(data=nurse_cov)

#renaming for readability
nurse_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
nurse_order = nurse_cov.sort_values(by ='COV' , ascending=False)

In [93]:
nurse_order.to_csv('../Data/Infrastructure/infrastructure_cov/nurse_cov.csv', index = True)

In [94]:
nurse_order.head()

Unnamed: 0_level_0,COV
LOCATION,Unnamed: 1_level_1
BRA,0.49402
CHN,0.395138
COL,0.342354
IND,0.283637
KOR,0.27446


## Medical Technology Counts

#### Raw Counts

In [95]:
h_tech.head()

Unnamed: 0,VAR,Variable,UNIT,Measure,COU,Country,YEA,Year,Value,Flag Codes,Flags
0,IPINSCAN,"Computed Tomography scanners, total",NOMBRENB,Number,AUS,Australia,2000,2000,500.0,E,Estimated value
1,IPINSCAN,"Computed Tomography scanners, total",NOMBRENB,Number,AUS,Australia,2001,2001,560.0,E,Estimated value
2,IPINSCAN,"Computed Tomography scanners, total",NOMBRENB,Number,AUS,Australia,2002,2002,670.0,E,Estimated value
3,IPINSCAN,"Computed Tomography scanners, total",NOMBRENB,Number,AUS,Australia,2003,2003,800.0,E,Estimated value
4,IPINSCAN,"Computed Tomography scanners, total",NOMBRENB,Number,AUS,Australia,2004,2004,910.0,E,Estimated value


In [96]:
h_tech.dtypes

VAR            object
Variable       object
UNIT           object
Measure        object
COU            object
Country        object
YEA             int64
Year            int64
Value         float64
Flag Codes     object
Flags          object
dtype: object

In [97]:
h_tech['Country'].unique()

array(['Australia', 'Austria', 'Belgium', 'Canada', 'Czech Republic',
       'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Iceland', 'Ireland', 'Italy', 'Japan', 'Korea', 'Luxembourg',
       'Mexico', 'Netherlands', 'New Zealand', 'Poland', 'Portugal',
       'Slovak Republic', 'Spain', 'Sweden', 'Switzerland', 'Turkey',
       'United Kingdom', 'United States', 'Brazil', 'Chile', 'Estonia',
       'Israel', 'Russia', 'Slovenia', 'Latvia', 'Lithuania', 'Colombia',
       'Costa Rica'], dtype=object)

In [98]:
h_tech.isnull().sum()

VAR               0
Variable          0
UNIT              0
Measure           0
COU               0
Country           0
YEA               0
Year              0
Value             0
Flag Codes    12466
Flags         12466
dtype: int64

In [99]:
h_tech['Year'].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2007, 2008])

In [100]:
h_tech.drop(axis = 0, columns = ['VAR', 'UNIT', 'Measure', 'COU', 'Flag Codes', 'YEA', 'Flags'], inplace = True)

In [101]:
h_tech = h_tech.set_index('Country')

In [102]:
h_tech.head()

Unnamed: 0_level_0,Variable,Year,Value
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,"Computed Tomography scanners, total",2000,500.0
Australia,"Computed Tomography scanners, total",2001,560.0
Australia,"Computed Tomography scanners, total",2002,670.0
Australia,"Computed Tomography scanners, total",2003,800.0
Australia,"Computed Tomography scanners, total",2004,910.0


In [103]:
h_tech.to_csv('../Data/Infrastructure/Cleaned_Infrastructure/tech_clean.csv', index = True)

**Notes**: Another count dataset with data from 2000 to now. 

### Medical Technology COV

In [104]:
#grouping by location and taking the COV of all values for grouped countries
tech_cov = (h_tech.groupby('Country')['Value'].std()) / (h_tech.groupby('Country')['Value'].mean())
tech_cov = pd.DataFrame(data=tech_cov)

#renaming for readability
tech_cov.rename(columns={"Value": "COV"}, inplace = True)

#creating a dataframe that is ordered by COV and not alphabetically.
tech_order = tech_cov.sort_values(by ='COV' , ascending=False)

In [105]:
tech_order.to_csv('../Data/Infrastructure/infrastructure_cov/tech_cov.csv', index = True)

In [106]:
#highest 50 COV
tech_order

Unnamed: 0_level_0,COV
Country,Unnamed: 1_level_1
Mexico,2.168663
Chile,1.755242
Korea,1.712309
Japan,1.681581
Canada,1.609451
France,1.598685
Poland,1.559945
Italy,1.537384
Australia,1.537054
Finland,1.511528


In [107]:
#lowest 50 COV
tech_order

Unnamed: 0_level_0,COV
Country,Unnamed: 1_level_1
Mexico,2.168663
Chile,1.755242
Korea,1.712309
Japan,1.681581
Canada,1.609451
France,1.598685
Poland,1.559945
Italy,1.537384
Australia,1.537054
Finland,1.511528


# Infrastructure Coeffecient of Variation Analysis Findings

From our initial findings in disease analysis we found that may volatile countries in regards to disease transmission are in the Latin America / Carribean region. We also found previously that many Latin American / Carribean countries have a lot of variation in the number of disease cases over time. For this reason we will aim to focus on a country that exhibits both of these characteristics for a particular or several diseases. For those reasons along with the robustness of data available I think either Mexico or Brazil would be good choices to study.