In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

import functools
from functools import reduce

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pd.set_option('display.max_rows', 140)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Pre Processing Table of Contents
- [Reading in Data](#read) 
- [Country & Disease Selection](#select)
- [Filtering Infrastructure Data for Mexico](#filter)
    - [Mexico Infrastructure Data](#infra)
    - [Mexico Healthcare Spending](#health)
    - [Mexico Pharmaceutical Spending](#pharm)
    - [Mexico Medical Technology Counts](#tech)
    - [Mexico Medical Workers Counts](#workers)
    - [Mexico Doctor Counts](#dr)
    - [Mexico Nurses Count](#nurses)
    - [Mexico Hospital Count](#hospital)
- [Filtering Disease Data for Mexico](#filter2)
    - [Mexico Cholera Data](#cholera)
    - [Mexico Measles Data](#measles)
    - [Mexico Mumps Data](#mumps)
    - [Mexico Pertussis Data](#pert)
    - [Mexico Rubella Data](#rubella)
- [Merging Disease Dataframe](#dd)
- [Merging Infrastructure Dataframes](#id)
- [Creating our Time Series Dataframe](#ts)
    

# Reading in Data<a id='read'></a> 

In [3]:
infra_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/infra_clean.csv')

In [4]:
spend_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/hspend_clean.csv')

In [5]:
mex_pharm = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/pharm_spend_clean.csv')

In [6]:
tech_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/tech_clean.csv')

In [7]:
workers_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/workers_clean.csv')

In [8]:
mex_dr = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/doc_count_clean.csv')

In [9]:
mex_nurses = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/nurse_count_clean.csv')

In [10]:
hosp_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/hospcount_clean.csv')

In [11]:
cholera_c = pd.read_csv('../Data/Diseases/cleaned_disease/cholera_clean.csv')

In [12]:
malaria_c = pd.read_csv('../Data/Diseases/cleaned_disease/malaria_clean.csv')

In [13]:
measles_c = pd.read_csv('../Data/Diseases/cleaned_disease/measles_clean.csv')

In [14]:
mumps_c = pd.read_csv('../Data/Diseases/cleaned_disease/mumps_clean.csv')

In [15]:
pert_c = pd.read_csv('../Data/Diseases/cleaned_disease/pert_clean.csv')

In [16]:
rubella_c = pd.read_csv('../Data/Diseases/cleaned_disease/rubella_clean.csv')

In [17]:
tb_c = pd.read_csv('../Data/Diseases/cleaned_disease/tb_clean.csv')

In [18]:
tet_c = pd.read_csv('../Data/Diseases/cleaned_disease/tet_clean.csv')

# Country & Disease Selection<a id='select'></a> 

Mexico was selected as the country of choice for this project because of it's high variation in both infrastructure metrics and disease cases while also having a significant amount of cases for those diseases. Additionally Mexico has more robust data relative to other countries in regards to infrastructure metrics as many developing and third world countries have only recently started reporting statistics.

We selected Mexico based on the following infrastructure metrics:

- Infrastructure COV of 26 which was relatively high considering the worldwide range of 2 to 33.
- Medical Technology COV which was the highest worlwide.
- Third highest hospital count COV worldwide.
- Relatively high COV for medical expenditure relative to other countries at 1.3 with the highest value being 2.3.

We are choosing to study transmission cases for the following diseases because Mexico showed a high COV relative to other countries as well as having a high number of cases.

- Cholera 
- Measles
- Mumps
- Pertussis
- Rubella

# Filtering Infrastructure Data for Mexico<a id='filter'></a> 

### Infrastructure Data for Mexico<a id='infra'></a> 

In [19]:
mex_infra = infra_c.loc[infra_c['Country Name'] == 'Mexico']

In [20]:
#seeing how many total indicators we have
len(mex_infra['Indicator Name'].unique())

47

In [21]:
#counting nulls
mex_infra.isnull().sum()

Country Name       0
Indicator Name     0
1999              22
2000              19
2001              17
2002              12
2003              19
2004              12
2005              14
2006              14
2007               4
2008              13
2009              15
2010              11
2011              11
2012               4
2013              11
2014               6
2015               7
2016              19
2017              19
2018              22
2019              47
COV                2
dtype: int64

In [22]:
#figuring out which rows/indicators we have to lose because of nans
#mex_infra

In [23]:
#dropping the previously made COV column
mex_infra.drop(axis = 0, columns = ['COV'], inplace = True)

#dropping rows with too many null values
mex_infra = mex_infra.drop([7147, 7148, 7157, 7158, 7159, 7163, 7173, 7175, 7176, 7177, 7178, 7179, 7180, 7181])

#dropping country name because we are just working with mexico now
mex_infra.drop(axis = 0, columns = ['Country Name'], inplace = True)

#renaming transposed columns
mex_infra.rename(columns={"Indicator Name": "Year"}, inplace = True)

#setting new index
mex_infra = mex_infra.set_index('Year')

#filling rows that did not have many Nans with the mean
mex_infra = mex_infra.apply(lambda row: row.fillna(row.mean()), axis=1)

#transposing so that our indicators become our x values.
mex_infra = mex_infra.T

#just creating a column from index for future manipulation, joning, concatting, etc.
mex_infra['Year'] = mex_infra.index

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [24]:
mex_infra

Year,ICT goods exports (% of total goods exports),ICT goods imports (% total goods imports),Individuals using the Internet (% of population),Fixed broadband subscriptions (per 100 people),Fixed broadband subscriptions,Fixed telephone subscriptions (per 100 people),Fixed telephone subscriptions,Mobile cellular subscriptions (per 100 people),Mobile cellular subscriptions,Container port traffic (TEU: 20 foot equivalent units),Liner shipping connectivity index (maximum value in 2004 = 100),"Air transport, passengers carried","Air transport, freight (million ton-km)","Air transport, registered carrier departures worldwide","Trademark applications, resident, by count","Trademark applications, nonresident, by count","Industrial design applications, resident, by count","Industrial design applications, nonresident, by count",Public private partnerships investment in water and sanitation (current US$),Public private partnerships investment in transport (current US$),Public private partnerships investment in energy (current US$),Investment in water and sanitation with private participation (current US$),Investment in transport with private participation (current US$),Investment in energy with private participation (current US$),Electric power consumption (kWh per capita),Electricity production from oil sources (% of total),Electricity production from nuclear sources (% of total),Electricity production from natural gas sources (% of total),Electric power transmission and distribution losses (% of output),Electricity production from hydroelectric sources (% of total),Electricity production from coal sources (% of total),"ICT service exports (% of service exports, BoP)","ICT service exports (BoP, current US$)",Year.1
1999,18.507953,17.130352,1.857436,6.913037,8203597.0,11.209318,10927385.0,7.931116,7731635.0,3545091.0,36.862667,20561100.0,317.0,328300.0,71826.6,31781.4,608.0,976.0,173600000.0,370200000.0,306300000.0,173600000.0,370200000.0,348500000.0,1689.185862,45.374132,5.222704,18.809984,14.288549,17.117644,9.685656,10.129306,1168485000.0,1999
2000,20.909537,17.900664,5.081384,0.015167,15000.0,12.468853,12331676.0,14.234481,14077880.0,1315701.0,36.862667,20894200.0,309.858,290412.0,71826.6,31781.4,668.0,1258.0,375763300.0,793600000.0,1469000000.0,375763300.0,793600000.0,2214300000.0,1800.467938,45.508205,3.997083,21.455695,13.848547,16.109396,9.234958,8.997126,1212828000.0,2000
2001,22.061841,18.906531,7.038023,0.049851,50000.0,13.7332,13774146.0,21.692881,21757559.0,1358136.0,36.862667,20172820.0,295.884,292208.0,71826.6,31781.4,779.0,973.0,72000000.0,223800000.0,311000000.0,72000000.0,223800000.0,311000000.0,1834.151423,42.002355,4.077799,26.020151,14.057798,13.319438,10.794063,6.32509,787198700.0,2001
2002,20.739979,17.298672,11.9,0.227651,231486.0,14.726971,14975085.0,25.498675,25928266.0,1564540.0,36.862667,19618660.0,341.598,278456.0,71826.6,31781.4,769.0,1208.0,375763300.0,71400000.0,2124000000.0,375763300.0,71400000.0,2124000000.0,1843.963674,34.384867,4.466573,34.166739,14.169122,11.433822,11.925983,4.442751,557379900.0,2002
2003,19.31121,17.392107,12.9,0.415567,428371.0,15.841972,16330066.0,29.198101,30097700.0,1693791.0,36.862667,20661210.0,350.089,287017.0,71826.6,31781.4,820.0,1162.0,375763300.0,74700000.0,1285000000.0,375763300.0,74700000.0,1285000000.0,2001.018228,30.348366,4.445912,40.143173,12.312408,8.41599,12.936834,3.397368,422526100.0,2003
2004,19.684402,18.409806,14.1,1.011609,1057282.0,17.292494,18073238.0,36.790088,38451135.0,1903345.0,25.29,21167880.0,394.925,330441.0,38314.0,20239.0,902.0,1555.0,421700000.0,485600000.0,695000000.0,421700000.0,485600000.0,695000000.0,1927.073923,28.888431,3.869806,42.792624,14.690024,10.609345,10.000716,3.082827,422529900.0,2004
2005,17.988816,17.142958,17.21,1.813451,1922352.0,18.406666,19512024.0,44.458899,47128746.0,2144345.0,25.49,21857660.0,390.425,331225.0,41680.0,22219.0,987.0,1790.0,30000000.0,1321300000.0,120000000.0,30000000.0,1321300000.0,120000000.0,1996.392573,27.303723,4.308763,40.13351,14.921362,11.049655,13.05669,3.482021,547927400.0,2005
2006,18.769373,17.540997,19.52,2.807731,3020000.0,18.465294,19861299.0,51.501843,55395461.0,2680374.0,29.78,21243010.0,456.958,317856.0,45161.0,24620.0,1041.0,1982.0,53100000.0,2744430000.0,801000000.0,53100000.0,2744430000.0,801000000.0,2020.952871,21.79696,4.214863,46.325087,15.360626,11.809839,12.328842,3.343793,535420100.0,2006
2007,17.713619,13.5052,20.81,4.126043,4504422.0,18.318046,19997903.0,60.968358,66559462.0,1661288.0,30.98,20952510.0,482.34,309633.0,54610.0,28608.0,943.0,1939.0,306500000.0,6146400000.0,120000000.0,306500000.0,6146400000.0,120000000.0,2047.531118,20.251289,3.928421,49.797943,15.268856,10.306402,11.931904,2.753846,482918900.0,2007
2008,19.534464,16.112432,21.71,6.797468,7532633.0,18.491522,20491430.0,67.954054,75303469.0,3312717.0,31.17,18825980.0,482.785,266244.0,56592.0,27695.0,1188.0,1993.0,38900000.0,905080000.0,562000000.0,38900000.0,905080000.0,562000000.0,2046.694449,18.821454,3.640347,52.096987,15.375675,14.553218,7.898186,2.260716,406308100.0,2008


In [25]:
#saving to csv
mex_infra.to_csv('../Data/Infrastructure/cleaned_Infrastructure/mex_infra.csv', index = True)

**Notes**: Around 15/47 of our indicators cannot be used for this project. The indicators that had less than 3 nulls I simply filled with the mean as I did not believe they would affect our outcome too much. Only did this for values that I knew had values such as water treatment, or electricity, those values were not 0.

### Healthcare Expenditure for Mexico<a id='health'></a> 

In [26]:
#filtering for only mexico healthcare spending
mex_spend = spend_c.loc[spend_c['LOCATION'] == 'MEX']

#filtering for total expenditure metric only
mex_spend = mex_spend.loc[mex_spend['SUBJECT'] == 'TOT']

#filtering for percent gdp as our metric
mex_spend = mex_spend.loc[mex_spend['MEASURE'] == 'PC_GDP']

#renaming columns
mex_spend.rename(columns={"Value": "Healthcare Expenditure Percent GDP", 'TIME':'Year'}, inplace = True)

#dropping columns we don't care about
mex_spend.drop(axis = 0, columns = ['SUBJECT', 'MEASURE','INDICATOR','LOCATION'], inplace = True)

In [27]:
#final mexico healthcare spending datframe
mex_spend

Unnamed: 0,Year,Healthcare Expenditure Percent GDP
1279,1999,4.395
1280,2000,4.449
1281,2001,4.82
1282,2002,5.072
1283,2003,5.815
1284,2004,5.954
1285,2005,5.836
1286,2006,5.655
1287,2007,5.766
1288,2008,5.699


**Notes**: Mexican healthcare spending as a portion of Mexico's GDP has increased and then decreased in recent years.

### Mexico Pharmaceutical Spending Data<a id='pharm'></a> 

In [28]:
#filtering pharmaceutical spending data for mexico
mex_pharm = mex_pharm.loc[mex_pharm['LOCATION'] == 'MEX']

#renaming columns
mex_pharm.rename(columns={"Value": "Mexico Pharmaceutical Spending US $ / Capita", 'TIME': 'Year'}, inplace = True)

#dropping irrelevant columns
mex_pharm.drop(axis = 0, columns = ['Unnamed: 0', 'LOCATION','INDICATOR'], inplace = True)

#setting index to year
mex_pharm = mex_pharm.set_index('Year')

#inputting null values
mex_pharm.loc['2018'] = 251.123

#creating a year column again just for future manipulation of dataframe
mex_pharm['Year'] = mex_pharm.index

In [29]:
#final mexico pharmaceutical spending data
mex_pharm

Unnamed: 0_level_0,Mexico Pharmaceutical Spending US $ / Capita,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1999,88.193,1999
2000,100.973,2000
2001,110.626,2001
2002,127.529,2002
2003,242.818,2003
2004,264.753,2004
2005,270.915,2005
2006,282.052,2006
2007,292.359,2007
2008,295.853,2008


**Comments**: Inputed null value based on previous year value, did not think it would affect distribution or results too much. Mexico's spending on healthcare per capita has increased significantly from 1999 to 2003 but has stayed pretty consistent. However it has decreased in recent years.

### Mexico Medical Technology Count Data<a id='pharm'></a> 

In [30]:
#filtering technology data for mexico
mex_tech = tech_c.loc[tech_c['Country'] == 'Mexico']

In [31]:
#checking to see which technology columns we care about
mex_tech['Variable'].unique()

array(['Computed Tomography scanners, total',
       'Computed Tomography scanners, in hospitals',
       'Computed Tomography scanners, in ambulatory sector',
       'Magnetic Resonance Imaging units, total',
       'Magnetic Resonance Imaging units, in hospitals',
       'Magnetic Resonance Imaging units, in ambulatory sector',
       'Positron Emission Tomography (PET) scanners, total',
       'Positron Emission Tomography (PET) scanners, in hospitals',
       'Positron Emission Tomography (PET) scanners, in ambulatory sector',
       'Gamma cameras, total', 'Gamma cameras, in hospitals',
       'Gamma cameras, in ambulatory sector', 'Mammographs, total',
       'Mammographs, in hospitals', 'Mammographs, in ambulatory sector',
       'Radiation therapy equipment, total',
       'Radiation therapy equipment, in hospitals',
       'Radiation therapy equipment, in ambulatory sector'], dtype=object)

In [32]:
#renaming columns
mex_tech.rename(columns={"Value": "Medical Device Count"}, inplace = True)

#creating a database that is filtering for total CT scans
mex_tech_1 = mex_tech.loc[mex_tech['Variable'] == 'Computed Tomography scanners, total']

#renaming columns to appropriate device count for future merging of dataframe
mex_tech_1.rename(columns={"Medical Device Count": "CT Scan Device Counts"}, inplace = True)

#I only want total counts not per x number of people rates, therefore that is the first 18 rows.
mex_tech_1 = mex_tech_1.iloc[0:17]

#removing variable column because labeled column is self eplanatory
mex_tech_1.drop(axis = 0, columns = ['Country', 'Variable'], inplace = True)

#setting index to year
mex_tech_1 = mex_tech_1.set_index('Year')

#inputting null values
mex_tech_1.loc['1999'] = ['200']
mex_tech_1.loc['2000'] = ['220']
mex_tech_1.loc['2018'] = ['730']

#adding back the year column for future manipulation
mex_tech_1['Year'] = mex_tech_1.index

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [33]:
mex_tech_1

Unnamed: 0_level_0,CT Scan Device Counts,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,250,2001
2002,296,2002
2003,309,2003
2004,289,2004
2005,325,2005
2006,344,2006
2007,403,2007
2008,446,2008
2009,459,2009
2010,535,2010


**Notes**: Filled empty values based on nearby year values. THe number of CT scan devices has increased substantiallyover the years, approximately 3.5 fold since 1999.

In [34]:
#making another tech dataframe that contains data on only PET scanner totals
mex_tech_2 = mex_tech.loc[mex_tech['Variable'] == 'Positron Emission Tomography (PET) scanners, total']

#renaming columns
mex_tech_2.rename(columns={"Medical Device Count": "PET Scan Device Counts"}, inplace = True)

#dropping variable column because our columns are self explanatory
mex_tech_2.drop(axis = 0, columns = ['Country', 'Variable'], inplace = True)

#only want the first rows for the same reason above, want total counts, not rates.
mex_tech_2 = mex_tech_2.iloc[0:14]

#setting index to year
mex_tech_2 = mex_tech_2.set_index('Year')

#imputting null values
mex_tech_2.loc['1999'] = [0]
mex_tech_2.loc['2000'] = [0]
mex_tech_2.loc['2001'] = [0]
mex_tech_2.loc['2002'] = [0]
mex_tech_2.loc['2003'] = [0]
mex_tech_2.loc['2018'] = [15]

#adding back yearly column for future manipulation
mex_tech_2['Year'] = mex_tech_2.index

In [35]:
mex_tech_2

Unnamed: 0_level_0,PET Scan Device Counts,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,0.0,2004
2005,0.0,2005
2006,0.0,2006
2007,0.0,2007
2008,0.0,2008
2009,0.0,2009
2010,0.0,2010
2011,5.0,2011
2012,7.0,2012
2013,7.0,2013


**Notes**: Again filled nans based on closest available data. PET scanners are used for detecting diseases and would be a critical part of containing a virus as it helps detection. Mexico did not have any of these devices until 2011.

### Mexico Medical Workers Count Data<a id='workers'></a> 

In [36]:
#filtering medical workers for mexico
mex_workers = workers_c.loc[workers_c['Country'] == 'Mexico']

#dropping YEA because that's the same column as YEAR.
mex_workers.drop(axis = 0, columns = ['YEA'], inplace = True)

In [37]:
mex_workers.head()

Unnamed: 0,Country,Variable,Year,Value
6468,Mexico,Medical graduates,2005,11973.0
6469,Mexico,Medical graduates,2006,10899.0
6470,Mexico,Medical graduates,2007,12255.0
6471,Mexico,Medical graduates,2008,13209.0
6472,Mexico,Medical graduates,2009,12926.0


**Notes**: Lots of missing data for Mexico, not feasible to use.

### Mexico Doctors per 1000 People Data<a id='dr'></a> 

In [38]:
#filtering for mexico
mex_dr = mex_dr.loc[mex_dr['LOCATION'] == 'MEX']

#renaming columns
mex_dr.rename(columns={"Value": "Doctors per 1000 People"}, inplace = True)

#dropping irrelevant columns
mex_dr.drop(axis = 0, columns = ['Unnamed: 0', 'LOCATION','INDICATOR'], inplace = True)

#setting index to year
mex_dr = mex_dr.set_index('TIME')

#imputting null values
mex_dr.loc['2018'] = [2.43]

#creating year column again for future manipulation
mex_dr['Year'] = mex_dr.index

In [39]:
#these are per 1000 people
mex_dr

Unnamed: 0_level_0,Doctors per 1000 People,Year
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1
1999,1.67,1999
2000,1.59,2000
2001,1.49,2001
2002,1.5,2002
2003,1.55,2003
2004,1.63,2004
2005,1.75,2005
2006,1.87,2006
2007,1.91,2007
2008,1.94,2008


**Notes**: Again imputed missing data on closest available data, which was only 2018. Mexico has increased the number of doctors per 1000 people over the years. 

### Mexico Nurses per 1000 People Data<a id='nurses'></a> 

In [40]:
#filtering for mexico
mex_nurses = mex_nurses.loc[mex_nurses['LOCATION'] == 'MEX']

#renaming columns
mex_nurses.rename(columns={"Value": "Nurses per 1000 People"}, inplace = True)

#dropping irrelevant columns
mex_nurses.drop(axis = 0, columns = ['Unnamed: 0', 'LOCATION','INDICATOR'], inplace = True)

#set index to year
mex_nurses = mex_nurses.set_index('TIME')

#imputting nearby values
mex_nurses.loc['2018'] = [2.90]

#creating year column for future manipulation
mex_nurses['Year'] = mex_nurses.index

In [41]:
#per 1000
mex_nurses

Unnamed: 0_level_0,Nurses per 1000 People,Year
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1
1998,2.09,1998
1999,2.14,1999
2000,2.19,2000
2001,2.19,2001
2002,2.19,2002
2003,2.1,2003
2004,2.08,2004
2005,2.18,2005
2006,2.2,2006
2007,2.27,2007


**Notes**: The number of nurses over the years has not increased as much as the number of doctors. This could be because accesibility in terms of programs/funding/certification of program is more feasible than becoming a doctor especially in developing countries.

### Mexico Hospital Count Data<a id='hospital'></a> 

In [42]:
#filtering for mexico data
mex_hosp = hosp_c.loc[hosp_c['Country'] == 'Mexico' ]

#filtering for hospitals counts
mex_hosp = mex_hosp.loc[mex_hosp['Variable'] == 'Hospitals']

#only want raw counts not the per capita rates
mex_hosp = mex_hosp.iloc[0:19]

#renaming columns
mex_hosp.rename(columns={"Value": "Hospital Count"}, inplace = True)

#dropping irrelevant columns
mex_hosp.drop(axis = 0, columns = ['Country', 'Variable'], inplace = True)

#setting index to Year
mex_hosp = mex_hosp.set_index('Year')

#imputting null values
mex_hosp.loc['1999'] = [3940]

#adding year column back for future manipulation
mex_hosp['Year'] = mex_hosp.index

In [43]:
mex_hosp

Unnamed: 0_level_0,Hospital Count,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,3952.0,2000
2001,3978.0,2001
2002,4088.0,2002
2003,4150.0,2003
2004,4110.0,2004
2005,4243.0,2005
2006,4245.0,2006
2007,4344.0,2007
2008,4379.0,2008
2009,4406.0,2009


**Notes**: Imputed null values on estimation based on closest available data. The number of hospitals in Mexico has a lot of variation as previously shown by our COV chart. 

# Filtering Disease Data for Mexico<a id='filter2'></a> 

### Mexico Cholera Data<a id='cholera'></a> 

In [44]:
#filtering for mexico data
mex_cholera = cholera_c.loc[cholera_c['Country'] == 'Mexico']

#sorting data by year
mex_cholera = mex_cholera.sort_values(by=['Year'], ascending = False)

#renaming columns
mex_cholera.rename(columns={"cholera_cases": "Cholera Cases"}, inplace = True)

#dropping country because we know it's just mexico data
mex_cholera.drop(axis = 0, columns = ['Country'], inplace = True)

#converting data type
mex_cholera['Cholera Cases'] = mex_cholera['Cholera Cases'].astype('int')

In [45]:
mex_cholera

Unnamed: 0,Year,Cholera Cases
1406,2015,1
1407,2014,14
1408,2013,187
1409,2012,2
1410,2011,1
1411,2010,1
1412,2008,1
1413,2001,1
1414,2000,5
1415,1998,71


**Comments**: Can't use too much missing data. There's a huge gap from 2001 to 2008.

### Mexico Measles Data<a id='measles'></a> 

In [46]:
#filtering for mexico
mex_measles = measles_c.loc[measles_c['Country'] == 'Mexico']

#transposing to get measles as our X
mex_measles = mex_measles.T

#only want these rows for 1999-2018
mex_measles = mex_measles.iloc[1:21]

#renaming columns
mex_measles.rename(columns={110: "Measles Cases"}, inplace = True)

#converting datatypes
mex_measles['Measles Cases'] = mex_measles['Measles Cases'].astype('int')

In [47]:
mex_measles

Unnamed: 0,Measles Cases
2018,5
2017,0
2016,0
2015,0
2014,3
2013,0
2012,0
2011,3
2010,0
2009,0


**Notes**: A case of high variation but very few number of cases, this disease is not prominent for Mexico and probably won't be used.

### Mexico Mumps Data<a id='mumps'></a> 

In [48]:
#filtering for mexico
mex_mumps = mumps_c.loc[mumps_c['Country'] == 'Mexico']

#transposing to get # of cases as X
mex_mumps = mex_mumps.T

#only want 1999-2018
mex_mumps = mex_mumps.iloc[1:21]

#renaming columns
mex_mumps.rename(columns={110: "Mumps Cases"}, inplace = True)

#imputting null values
mex_mumps.loc['1999'] = [31000]
mex_mumps.loc['2002'] = [27000]
mex_mumps.loc['2017'] = [3450]
mex_mumps.loc['2018'] = [7857]

#converting datatype
mex_mumps['Mumps Cases'] = mex_mumps['Mumps Cases'].astype('int')

In [49]:
mex_mumps

Unnamed: 0,Mumps Cases
2018,7857
2017,3450
2016,3646
2015,3399
2014,4143
2013,0
2012,5683
2011,2685
2010,5780
2009,0


**Comments**: Imputting null values here was the trickiest. I used Honduras as my proxy for imputting null values. It had a complete dataset available and consistently had 3% of the # of Mexico Mumps cases so I filled accordingly. This is a good dataset to use because there is high variation and a lot of cases in Mexico.

### Mexico Pertussis Data<a id='pert'></a> 

In [50]:
#filtered for Mexico data
mex_pert = pert_c.loc[pert_c['Country'] == 'Mexico']

#transposed to get # of cases as X
mex_pert = mex_pert.T

#only want 1999-2018
mex_pert = mex_pert.iloc[1:21]

#renaming columns
mex_pert.rename(columns={110: "Pertussis Cases"}, inplace = True)

#converting datatypes
mex_pert['Pertussis Cases'] = mex_pert['Pertussis Cases'].astype('int')

In [51]:
mex_pert

Unnamed: 0,Pertussis Cases
2018,783
2017,827
2016,1029
2015,1107
2014,955
2013,961
2012,978
2011,252
2010,371
2009,559


**Notes**: Also a good dataset to use, no nulls, 20 years and a lot of variation in the number of cases.

### Mexico Rubella Data<a id='rubella'></a> 

In [52]:
#filtering for Mexico
mex_rubella = rubella_c.loc[rubella_c['Country'] == 'Mexico']

#transposing for # of cases as X
mex_rubella = mex_rubella.T

#only want 1999-2018
mex_rubella = mex_rubella.iloc[1:21]

#renaming columns
mex_rubella.rename(columns={110: "Rubella Cases"}, inplace = True)

#converting datatypes
mex_rubella['Rubella Cases'] = mex_rubella['Rubella Cases'].astype('int')

In [53]:
mex_rubella

Unnamed: 0,Rubella Cases
2018,2
2017,0
2016,0
2015,0
2014,0
2013,0
2012,2
2011,0
2010,0
2009,0


**Notes**: This disease showed a lot of variation from 1999-2002 but has since been pretty much eradicated and we probably won't use it.

### Mexico Tuberculosis Data

In [54]:
#filtering for Mexico
mex_tb = tb_c.loc[tb_c['Country'] == 'Mexico']

#dropping irrelevant columns
mex_tb.drop(axis = 0, columns = ['Country', 'Unnamed: 0'], inplace = True)

#renaming columns
mex_tb.rename(columns={'tuberculosis_incidence': "TB Cases"}, inplace = True)

#setting index
mex_tb = mex_tb.set_index('Year')

#filling missing data
mex_tb.loc['1999'] = [22000]

#converting datatypes
mex_tb['TB Cases'] = mex_tb['TB Cases'].astype('int')

In [55]:
mex_tb

Unnamed: 0_level_0,TB Cases
Year,Unnamed: 1_level_1
2018,29000
2017,28000
2016,28000
2015,27000
2014,26000
2013,26000
2012,26000
2011,25000
2010,25000
2009,24000


**Notes**: Filled nulls on closest available data. This disease is pretty consistent year to year and does not show much variation, will probably not use this dataset.

# Selecting & Merging Disease Dataframes<a id='dd'></a>

**Note**: Because there is a significant number of cases and variation for Pertussis and Mumps, we will only be using those two datasets for our model.

In [56]:
merge1 = mex_pert.join(mex_mumps)

In [57]:
merge1

Unnamed: 0,Pertussis Cases,Mumps Cases
2018,783,7857
2017,827,3450
2016,1029,3646
2015,1107,3399
2014,955,4143
2013,961,0
2012,978,5683
2011,252,2685
2010,371,5780
2009,559,0


In [58]:
#we will be doing our model on the total number of these two diseases
merge1['Total Cases'] = merge1['Pertussis Cases'] + merge1['Mumps Cases']

In [59]:
merge1

Unnamed: 0,Pertussis Cases,Mumps Cases,Total Cases
2018,783,7857,8640
2017,827,3450,4277
2016,1029,3646,4675
2015,1107,3399,4506
2014,955,4143,5098
2013,961,0,961
2012,978,5683,6661
2011,252,2685,2937
2010,371,5780,6151
2009,559,0,559


In [60]:
#saving our cleaned and final disease dataframe
merge1.to_csv('../Data/Diseases/cleaned_disease/mexico_disease.csv', index = True)

## Creating Final Dataframe (Disease Counts & Infrastructure) <a id='id'></a>

In [61]:
mex_infra = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/mex_infra.csv')

In [62]:
mex_infra.head()

Unnamed: 0.1,Unnamed: 0,ICT goods exports (% of total goods exports),ICT goods imports (% total goods imports),Individuals using the Internet (% of population),Fixed broadband subscriptions (per 100 people),Fixed broadband subscriptions,Fixed telephone subscriptions (per 100 people),Fixed telephone subscriptions,Mobile cellular subscriptions (per 100 people),Mobile cellular subscriptions,Container port traffic (TEU: 20 foot equivalent units),Liner shipping connectivity index (maximum value in 2004 = 100),"Air transport, passengers carried","Air transport, freight (million ton-km)","Air transport, registered carrier departures worldwide","Trademark applications, resident, by count","Trademark applications, nonresident, by count","Industrial design applications, resident, by count","Industrial design applications, nonresident, by count",Public private partnerships investment in water and sanitation (current US$),Public private partnerships investment in transport (current US$),Public private partnerships investment in energy (current US$),Investment in water and sanitation with private participation (current US$),Investment in transport with private participation (current US$),Investment in energy with private participation (current US$),Electric power consumption (kWh per capita),Electricity production from oil sources (% of total),Electricity production from nuclear sources (% of total),Electricity production from natural gas sources (% of total),Electric power transmission and distribution losses (% of output),Electricity production from hydroelectric sources (% of total),Electricity production from coal sources (% of total),"ICT service exports (% of service exports, BoP)","ICT service exports (BoP, current US$)",Year
0,1999,18.507953,17.130352,1.857436,6.913037,8203597.0,11.209318,10927385.0,7.931116,7731635.0,3545091.0,36.862667,20561100.0,317.0,328300.0,71826.6,31781.4,608.0,976.0,173600000.0,370200000.0,306300000.0,173600000.0,370200000.0,348500000.0,1689.185862,45.374132,5.222704,18.809984,14.288549,17.117644,9.685656,10.129306,1168485000.0,1999
1,2000,20.909537,17.900664,5.081384,0.015167,15000.0,12.468853,12331676.0,14.234481,14077880.0,1315701.0,36.862667,20894205.0,309.858,290412.0,71826.6,31781.4,668.0,1258.0,375763300.0,793600000.0,1469000000.0,375763300.0,793600000.0,2214300000.0,1800.467938,45.508205,3.997083,21.455695,13.848547,16.109396,9.234958,8.997126,1212828000.0,2000
2,2001,22.061841,18.906531,7.038023,0.049851,50000.0,13.7332,13774146.0,21.692881,21757559.0,1358136.0,36.862667,20172824.0,295.884,292208.0,71826.6,31781.4,779.0,973.0,72000000.0,223800000.0,311000000.0,72000000.0,223800000.0,311000000.0,1834.151423,42.002355,4.077799,26.020151,14.057798,13.319438,10.794063,6.32509,787198700.0,2001
3,2002,20.739979,17.298672,11.9,0.227651,231486.0,14.726971,14975085.0,25.498675,25928266.0,1564540.0,36.862667,19618662.0,341.598,278456.0,71826.6,31781.4,769.0,1208.0,375763300.0,71400000.0,2124000000.0,375763300.0,71400000.0,2124000000.0,1843.963674,34.384867,4.466573,34.166739,14.169122,11.433822,11.925983,4.442751,557379900.0,2002
4,2003,19.31121,17.392107,12.9,0.415567,428371.0,15.841972,16330066.0,29.198101,30097700.0,1693791.0,36.862667,20661206.0,350.089,287017.0,71826.6,31781.4,820.0,1162.0,375763300.0,74700000.0,1285000000.0,375763300.0,74700000.0,1285000000.0,2001.018228,30.348366,4.445912,40.143173,12.312408,8.41599,12.936834,3.397368,422526100.0,2003


In [63]:
mex_infra.drop(axis = 0, columns = ['Unnamed: 0'], inplace = True)

In [64]:
mex_infra.head()

Unnamed: 0,ICT goods exports (% of total goods exports),ICT goods imports (% total goods imports),Individuals using the Internet (% of population),Fixed broadband subscriptions (per 100 people),Fixed broadband subscriptions,Fixed telephone subscriptions (per 100 people),Fixed telephone subscriptions,Mobile cellular subscriptions (per 100 people),Mobile cellular subscriptions,Container port traffic (TEU: 20 foot equivalent units),Liner shipping connectivity index (maximum value in 2004 = 100),"Air transport, passengers carried","Air transport, freight (million ton-km)","Air transport, registered carrier departures worldwide","Trademark applications, resident, by count","Trademark applications, nonresident, by count","Industrial design applications, resident, by count","Industrial design applications, nonresident, by count",Public private partnerships investment in water and sanitation (current US$),Public private partnerships investment in transport (current US$),Public private partnerships investment in energy (current US$),Investment in water and sanitation with private participation (current US$),Investment in transport with private participation (current US$),Investment in energy with private participation (current US$),Electric power consumption (kWh per capita),Electricity production from oil sources (% of total),Electricity production from nuclear sources (% of total),Electricity production from natural gas sources (% of total),Electric power transmission and distribution losses (% of output),Electricity production from hydroelectric sources (% of total),Electricity production from coal sources (% of total),"ICT service exports (% of service exports, BoP)","ICT service exports (BoP, current US$)",Year
0,18.507953,17.130352,1.857436,6.913037,8203597.0,11.209318,10927385.0,7.931116,7731635.0,3545091.0,36.862667,20561100.0,317.0,328300.0,71826.6,31781.4,608.0,976.0,173600000.0,370200000.0,306300000.0,173600000.0,370200000.0,348500000.0,1689.185862,45.374132,5.222704,18.809984,14.288549,17.117644,9.685656,10.129306,1168485000.0,1999
1,20.909537,17.900664,5.081384,0.015167,15000.0,12.468853,12331676.0,14.234481,14077880.0,1315701.0,36.862667,20894205.0,309.858,290412.0,71826.6,31781.4,668.0,1258.0,375763300.0,793600000.0,1469000000.0,375763300.0,793600000.0,2214300000.0,1800.467938,45.508205,3.997083,21.455695,13.848547,16.109396,9.234958,8.997126,1212828000.0,2000
2,22.061841,18.906531,7.038023,0.049851,50000.0,13.7332,13774146.0,21.692881,21757559.0,1358136.0,36.862667,20172824.0,295.884,292208.0,71826.6,31781.4,779.0,973.0,72000000.0,223800000.0,311000000.0,72000000.0,223800000.0,311000000.0,1834.151423,42.002355,4.077799,26.020151,14.057798,13.319438,10.794063,6.32509,787198700.0,2001
3,20.739979,17.298672,11.9,0.227651,231486.0,14.726971,14975085.0,25.498675,25928266.0,1564540.0,36.862667,19618662.0,341.598,278456.0,71826.6,31781.4,769.0,1208.0,375763300.0,71400000.0,2124000000.0,375763300.0,71400000.0,2124000000.0,1843.963674,34.384867,4.466573,34.166739,14.169122,11.433822,11.925983,4.442751,557379900.0,2002
4,19.31121,17.392107,12.9,0.415567,428371.0,15.841972,16330066.0,29.198101,30097700.0,1693791.0,36.862667,20661206.0,350.089,287017.0,71826.6,31781.4,820.0,1162.0,375763300.0,74700000.0,1285000000.0,375763300.0,74700000.0,1285000000.0,2001.018228,30.348366,4.445912,40.143173,12.312408,8.41599,12.936834,3.397368,422526100.0,2003


**Note**: I could not combine the healthcare infrastructure dataframes and the disease dataframes to the non-healthcare infrastructure dataframe despite having the same datatype, index, and shared Year column. Not sure why this happened, but decided to save time resolving the issue and impute these values manually in an excel csv and feed it back in because it was only 20 values. 

In [65]:
mex_final_df = pd.read_csv('../Data/Infrastructure/mex_pre_fs.csv')

In [66]:
mex_final_df

Unnamed: 0.1,Unnamed: 0,Year,ICT goods exports (% of total goods exports),ICT goods imports (% total goods imports),Individuals using the Internet (% of population),Fixed broadband subscriptions (per 100 people),Fixed broadband subscriptions,Fixed telephone subscriptions (per 100 people),Fixed telephone subscriptions,Mobile cellular subscriptions (per 100 people),Mobile cellular subscriptions,Container port traffic (TEU: 20 foot equivalent units),Liner shipping connectivity index (maximum value in 2004 = 100),"Air transport, passengers carried","Air transport, freight (million ton-km)","Air transport, registered carrier departures worldwide","Trademark applications, resident, by count","Trademark applications, nonresident, by count","Industrial design applications, resident, by count","Industrial design applications, nonresident, by count",Public private partnerships investment in water and sanitation (current US$),Public private partnerships investment in transport (current US$),Public private partnerships investment in energy (current US$),Investment in water and sanitation with private participation (current US$),Investment in transport with private participation (current US$),Investment in energy with private participation (current US$),Electric power consumption (kWh per capita),Electricity production from oil sources (% of total),Electricity production from nuclear sources (% of total),Electricity production from natural gas sources (% of total),Electric power transmission and distribution losses (% of output),Electricity production from hydroelectric sources (% of total),Electricity production from coal sources (% of total),"ICT service exports (% of service exports, BoP)","ICT service exports (BoP, current US$)",Healthcare Expenditure Percent GDP,Pharmaceutical Spending US $ / Capita,CT Scan Device Counts,PET Scan Device Counts,Doctors per 1000 People,Nurses per 1000 People,Hospital Count,Total Cases
0,1999,1999,18.507953,17.130352,1.857436,6.913037,8203597.0,11.209318,10927385,7.931116,7731635,3545091.283,36.862667,20561100.0,317.0,328300.0,71826.6,31781.4,608,976,173600000.0,370200000,306300000,173600000.0,370200000,348500000,1689.185862,45.374132,5.222704,18.809984,14.288549,17.117644,9.685656,10.129306,1168485000.0,4.395,88.193,200,0,1.67,2.14,3940,8645
1,2000,2000,20.909537,17.900664,5.081384,0.015167,15000.0,12.468853,12331676,14.234481,14077880,1315701.0,36.862667,20894205.0,309.858,290412.0,71826.6,31781.4,668,1258,375763300.0,793600000,1469000000,375763300.0,793600000,2214300000,1800.467938,45.508205,3.997083,21.455695,13.848547,16.109396,9.234958,8.997126,1212828000.0,4.449,100.973,220,0,1.59,2.19,3952,4277
2,2001,2001,22.061841,18.906531,7.038023,0.049851,50000.0,13.7332,13774146,21.692881,21757559,1358136.0,36.862667,20172824.0,295.884,292208.0,71826.6,31781.4,779,973,72000000.0,223800000,311000000,72000000.0,223800000,311000000,1834.151423,42.002355,4.077799,26.020151,14.057798,13.319438,10.794063,6.32509,787198700.0,4.82,110.626,250,0,1.49,2.19,3978,4675
3,2002,2002,20.739979,17.298672,11.9,0.227651,231486.0,14.726971,14975085,25.498675,25928266,1564540.0,36.862667,19618662.0,341.598,278456.0,71826.6,31781.4,769,1208,375763300.0,71400000,2124000000,375763300.0,71400000,2124000000,1843.963674,34.384867,4.466573,34.166739,14.169122,11.433822,11.925983,4.442751,557379900.0,5.072,127.529,296,0,1.5,2.19,4088,4506
4,2003,2003,19.31121,17.392107,12.9,0.415567,428371.0,15.841972,16330066,29.198101,30097700,1693791.0,36.862667,20661206.0,350.089,287017.0,71826.6,31781.4,820,1162,375763300.0,74700000,1285000000,375763300.0,74700000,1285000000,2001.018228,30.348366,4.445912,40.143173,12.312408,8.41599,12.936834,3.397368,422526100.0,5.815,242.818,309,0,1.55,2.1,4150,5101
5,2004,2004,19.684402,18.409806,14.1,1.011609,1057282.0,17.292494,18073238,36.790088,38451135,1903345.0,25.29,21167881.0,394.925,330441.0,38314.0,20239.0,902,1555,421700000.0,485600000,695000000,421700000.0,485600000,695000000,1927.073923,28.888431,3.869806,42.792624,14.690024,10.609345,10.000716,3.082827,422529900.0,5.954,264.753,289,0,1.63,2.08,4110,961
6,2005,2005,17.988816,17.142958,17.21,1.813451,1922352.0,18.406666,19512024,44.458899,47128746,2144345.0,25.49,21857658.0,390.425,331225.0,41680.0,22219.0,987,1790,30000000.0,1321300000,120000000,30000000.0,1321300000,120000000,1996.392573,27.303723,4.308763,40.13351,14.921362,11.049655,13.05669,3.482021,547927400.0,5.836,270.915,325,0,1.75,2.18,4243,6661
7,2006,2006,18.769373,17.540997,19.52,2.807731,3020000.0,18.465294,19861299,51.501843,55395461,2680374.0,29.78,21243010.0,456.958,317856.0,45161.0,24620.0,1041,1982,53100000.0,2744430000,801000000,53100000.0,2744430000,801000000,2020.952871,21.79696,4.214863,46.325087,15.360626,11.809839,12.328842,3.343793,535420100.0,5.655,282.052,344,0,1.87,2.2,4245,2940
8,2007,2007,17.713619,13.5052,20.81,4.126043,4504422.0,18.318046,19997903,60.968358,66559462,1661288.0,30.98,20952513.0,482.34,309633.0,54610.0,28608.0,943,1939,306500000.0,6146400000,120000000,306500000.0,6146400000,120000000,2047.531118,20.251289,3.928421,49.797943,15.268856,10.306402,11.931904,2.753846,482918900.0,5.766,292.359,403,0,1.91,2.27,4344,6151
9,2008,2008,19.534464,16.112432,21.71,6.797468,7532633.0,18.491522,20491430,67.954054,75303469,3312717.195,31.17,18825976.0,482.785,266244.0,56592.0,27695.0,1188,1993,38900000.0,905080000,562000000,38900000.0,905080000,562000000,2046.694449,18.821454,3.640347,52.096987,15.375675,14.553218,7.898186,2.260716,406308100.0,5.699,295.853,446,0,1.94,2.3,4379,559


# Feature Selection

### Highest Correlation with Cases

In [67]:
#finding the highest correlations with total amount of cases
mex_final_df[mex_final_df.columns[1:]].corr()['Total Cases'][:-1]

Year                                                                            0.762504
ICT goods exports (% of total goods exports)                                   -0.532857
ICT goods imports (% total goods imports)                                      -0.299513
Individuals using the Internet (% of population)                                0.854396
Fixed broadband subscriptions (per 100 people)                                  0.764278
Fixed broadband subscriptions                                                   0.795243
Fixed telephone subscriptions (per 100 people)                                  0.046732
Fixed telephone subscriptions                                                   0.407163
Mobile cellular subscriptions (per 100 people)                                  0.600547
Mobile cellular subscriptions                                                   0.658463
Container port traffic (TEU: 20 foot equivalent units)                          0.831464
Liner shipping connec

In [68]:
mex_final_df.corr().unstack().sort_values().drop_duplicates()

Electricity production from oil sources (% of total)            Electricity production from natural gas sources (% of total)    -0.991249
Electricity production from natural gas sources (% of total)    ICT service exports (% of service exports, BoP)                 -0.909215
Electricity production from oil sources (% of total)            Pharmaceutical Spending US $ / Capita                           -0.904571
Fixed telephone subscriptions                                   ICT service exports (% of service exports, BoP)                 -0.894759
Electricity production from natural gas sources (% of total)    ICT service exports (BoP, current US$)                          -0.879521
                                                                                                                                   ...   
ICT service exports (% of service exports, BoP)                 ICT service exports (BoP, current US$)                           0.988446
Public private partnerships invest

### Feature Selection:


Most of feature selection was selecting the highest correlated features of each category. Also only selecting the highest from each category to prevent multicollinearity. I.e., The energy metrics all have high correlation but are collinear because they are in the same sector so we are only selecting one energy metric. However, there are some features that are although in different categories still collinear. Multicollinearity isn't a huge problem in ARIMAX models since it is a forecasting model and not inference so I decided to leave these in because the goal of this model is interpretability AND forecasting. Yes it is true if healthcare spending overall increases you most likely will get increases in Medical Technology and Pharmaceuticals, but ultimately we want this model to be able to say "if we have this amount of healthcare spending but this amount of medical technology how does the # of cases change."

Also selecting metrics that I am interested in regardless of their correlation with Number of Cases. Again I want this model to be interpretable, and therefore have factors that can address a broad range of questions. For example, even though investment in sanitation has little correlation with Number of Cases I want to see the effect that it has if we invest heavily or reduce investment into it because that is an obvious area that is considered part of public health.

- ICT goods exports (% of total goods exports)
- Individuals using the Internet (% of populaiton)
- Fixed telephone subscriptions (per 100 people)
- Air transport, freight (million ton-km)
- Industrial design applications, nonresident, by count
- Public private partnerships investment in water and sanitation (current US dollars)
- Public private partnerships investment in transport (current US dollars
- Electricity production from natural gas sources (% of total)
- ICT service exports (% of service exports, BoP)
- Healthcare Expenditure Percent GDP                                             
- Pharmaceutical Spending US Dollars / Capita                                          
- CT Scan Device Counts                                                          
- PET Scan Device Counts                                                         
- Doctors per 1000 People                                                        
- Nurses per 1000 People                                                         
- Hospital Count

In [69]:
#creating a dataframe from the columns which I selected above, and our Y(Total Cases)
mex_final_df = mex_final_df[['Year', 'ICT goods exports (% of total goods exports)', 'Individuals using the Internet (% of population)',
                           'Fixed telephone subscriptions (per 100 people)', 'Air transport, freight (million ton-km)',
                           'Industrial design applications, nonresident, by count', 'Public private partnerships investment in water and sanitation (current US$)',
                           'Public private partnerships investment in transport (current US$)', 'Electricity production from natural gas sources (% of total)',
                           'ICT service exports (% of service exports, BoP)', 'Healthcare Expenditure Percent GDP', 'Pharmaceutical Spending US $ / Capita',
                           'CT Scan Device Counts', 'PET Scan Device Counts','Doctors per 1000 People', 'Nurses per 1000 People', 'Hospital Count',
                           'Total Cases']]

In [70]:
mex_final_df

Unnamed: 0,Year,ICT goods exports (% of total goods exports),Individuals using the Internet (% of population),Fixed telephone subscriptions (per 100 people),"Air transport, freight (million ton-km)","Industrial design applications, nonresident, by count",Public private partnerships investment in water and sanitation (current US$),Public private partnerships investment in transport (current US$),Electricity production from natural gas sources (% of total),"ICT service exports (% of service exports, BoP)",Healthcare Expenditure Percent GDP,Pharmaceutical Spending US $ / Capita,CT Scan Device Counts,PET Scan Device Counts,Doctors per 1000 People,Nurses per 1000 People,Hospital Count,Total Cases
0,1999,18.507953,1.857436,11.209318,317.0,976,173600000.0,370200000,18.809984,10.129306,4.395,88.193,200,0,1.67,2.14,3940,8645
1,2000,20.909537,5.081384,12.468853,309.858,1258,375763300.0,793600000,21.455695,8.997126,4.449,100.973,220,0,1.59,2.19,3952,4277
2,2001,22.061841,7.038023,13.7332,295.884,973,72000000.0,223800000,26.020151,6.32509,4.82,110.626,250,0,1.49,2.19,3978,4675
3,2002,20.739979,11.9,14.726971,341.598,1208,375763300.0,71400000,34.166739,4.442751,5.072,127.529,296,0,1.5,2.19,4088,4506
4,2003,19.31121,12.9,15.841972,350.089,1162,375763300.0,74700000,40.143173,3.397368,5.815,242.818,309,0,1.55,2.1,4150,5101
5,2004,19.684402,14.1,17.292494,394.925,1555,421700000.0,485600000,42.792624,3.082827,5.954,264.753,289,0,1.63,2.08,4110,961
6,2005,17.988816,17.21,18.406666,390.425,1790,30000000.0,1321300000,40.13351,3.482021,5.836,270.915,325,0,1.75,2.18,4243,6661
7,2006,18.769373,19.52,18.465294,456.958,1982,53100000.0,2744430000,46.325087,3.343793,5.655,282.052,344,0,1.87,2.2,4245,2940
8,2007,17.713619,20.81,18.318046,482.34,1939,306500000.0,6146400000,49.797943,2.753846,5.766,292.359,403,0,1.91,2.27,4344,6151
9,2008,19.534464,21.71,18.491522,482.785,1993,38900000.0,905080000,52.096987,2.260716,5.699,295.853,446,0,1.94,2.3,4379,559


In [71]:
mex_final_df.to_csv('../Data/Infrastructure/mex_final_df.csv', index = False)