In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

import matplotlib.pyplot as plt
%matplotlib inline

# Pre Processing Table of Contents
- [Reading in Data](#read) 
- [Country & Disease Selection](#select)
- [Filtering Infrastructure Data for Mexico](#filter)
    - [Mexico Infrastructure Data](#infra)
    - [Mexico Healthcare Spending](#health)
    - [Mexico Pharmaceutical Spending](#pharm)
    - [Mexico Medical Technology Counts](#tech)
    - [Mexico Medical Workers Counts](#workers)
    - [Mexico Doctor Counts](#dr)
    - [Mexico Nurses Count](#nurses)
    - [Mexico Hospital Count](#hospital)
- [Filtering Disease Data for Mexico](#filter2)
    - [Mexico Cholera Data](#cholera)
    - [Mexico Measles Data](#measles)
    - [Mexico Mumps Data](#mumps)
    - [Mexico Pertussis Data](#pert)
    - [Mexico Rubella Data](#rubella)
- [Creating our Time Series Dataframe](#df)
    

# Reading in Data<a id='read'></a> 

In [2]:
infra_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/infra_clean.csv')

In [3]:
spend_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/hspend_clean.csv')

In [4]:
mex_pharm = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/pharm_spend_clean.csv')

In [5]:
tech_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/tech_clean.csv')

In [6]:
workers_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/workers_clean.csv')

In [7]:
mex_dr = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/doc_count_clean.csv')

In [8]:
mex_nurses = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/nurse_count_clean.csv')

In [9]:
hosp_c = pd.read_csv('../Data/Infrastructure/Cleaned_Infrastructure/hospcount_clean.csv')

In [10]:
cholera_c = pd.read_csv('../Data/Diseases/cleaned_disease/cholera_clean.csv')

In [11]:
malaria_c = pd.read_csv('../Data/Diseases/cleaned_disease/malaria_clean.csv')

In [12]:
measles_c = pd.read_csv('../Data/Diseases/cleaned_disease/measles_clean.csv')

In [13]:
mumps_c = pd.read_csv('../Data/Diseases/cleaned_disease/mumps_clean.csv')

In [14]:
pert_c = pd.read_csv('../Data/Diseases/cleaned_disease/pert_clean.csv')

In [15]:
rubella_c = pd.read_csv('../Data/Diseases/cleaned_disease/rubella_clean.csv')

In [16]:
tb_c = pd.read_csv('../Data/Diseases/cleaned_disease/tb_clean.csv')

In [17]:
tet_c = pd.read_csv('../Data/Diseases/cleaned_disease/tet_clean.csv')

# Country & Disease Selection<a id='select'></a> 

Mexico was selected as the country of choice for this project because of it's high variation in both infrastructure metrics and disease cases while also having a significant amount of cases for those diseases. Additionally Mexico has more robust data relative to other countries in regards to infrastructure metrics as many developing and third world countries have only recently started reporting statistics.

We selected Mexico based on the following infrastructure metrics:

- Infrastructure COV of 26 which was relatively high considering the worldwide range of 2 to 33.
- Medical Technology COV which was the highest worlwide.
- Third highest hospital count COV worldwide.
- Relatively high COV for medical expenditure relative to other countries at 1.3 with the highest value being 2.3.

We are choosing to study transmission cases for the following diseases because Mexico showed a high COV relative to other countries as well as having a high number of cases.

- Cholera 
- Measles
- Mumps
- Pertussis
- Rubella

# Filtering Infrastructure Data for Mexico<a id='filter'></a> 

### Infrastructure Data for Mexico<a id='infra'></a> 

In [18]:
mex_infra = infra_c.loc[infra_c['Country Name'] == 'Mexico']

In [19]:
mex_infra.head()

Unnamed: 0,Country Name,Indicator Name,1999,2000,2001,2002,2003,2004,2005,2006,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
7144,Mexico,ICT goods exports (% of total goods exports),,20.909537,22.061841,20.739979,19.31121,19.684402,17.988816,18.769373,...,20.166782,16.995108,16.858755,16.266397,16.038272,16.189341,16.143444,16.108171,,
7145,Mexico,ICT goods imports (% total goods imports),,17.900664,18.906531,17.298672,17.392107,18.409806,17.142958,17.540997,...,19.225457,17.284306,16.507423,17.107871,16.338254,16.452883,16.508494,15.112958,,
7146,Mexico,Individuals using the Internet (% of population),1.857436,5.081384,7.038023,11.9,12.9,14.1,17.21,19.52,...,31.05,37.176295,39.75,43.46,44.39,57.431043,59.540446,63.852249,65.772634,
7147,Mexico,Secure Internet servers (per 1 million people),,,,,,,,,...,13.418882,17.295404,26.82603,31.608935,40.96211,57.886926,119.659418,185.48242,225.74548,
7148,Mexico,Secure Internet servers,,,,,,,,,...,1531.0,2001.0,3146.0,3756.0,4930.0,7054.0,14758.0,23144.0,28487.0,


In [20]:
len(mex_infra['Indicator Name'].unique())

47

In [21]:
mex_infra.isnull().sum()

Country Name       0
Indicator Name     0
1999              22
2000              19
2001              17
2002              12
2003              19
2004              12
2005              14
2006              14
2007               4
2008              13
2009              15
2010              11
2011              11
2012               4
2013              11
2014               6
2015               7
2016              19
2017              19
2018              22
2019              47
dtype: int64

In [22]:
#figuring out which rows/indicators we have to lose because of nans
#mex_infra

In [23]:
#dropping rows with too many null values
mex_infra = mex_infra.drop([7157, 7158, 7159, 7163, 7164, 7165, 7173, 7175, 7176, 7177, 7178, 7179, 7180, 7181])

In [24]:
mex_infra.drop(axis = 0, columns = ['Country Name'], inplace = True)

In [25]:
mex_infra = mex_infra.set_index('Indicator Name')

In [26]:
#checking to see if data is as expected
mex_infra.head()

Unnamed: 0_level_0,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Indicator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ICT goods exports (% of total goods exports),,20.909537,22.061841,20.739979,19.31121,19.684402,17.988816,18.769373,17.713619,19.534464,...,20.166782,16.995108,16.858755,16.266397,16.038272,16.189341,16.143444,16.108171,,
ICT goods imports (% total goods imports),,17.900664,18.906531,17.298672,17.392107,18.409806,17.142958,17.540997,13.5052,16.112432,...,19.225457,17.284306,16.507423,17.107871,16.338254,16.452883,16.508494,15.112958,,
Individuals using the Internet (% of population),1.857436,5.081384,7.038023,11.9,12.9,14.1,17.21,19.52,20.81,21.71,...,31.05,37.176295,39.75,43.46,44.39,57.431043,59.540446,63.852249,65.772634,
Secure Internet servers (per 1 million people),,,,,,,,,,,...,13.418882,17.295404,26.82603,31.608935,40.96211,57.886926,119.659418,185.48242,225.74548,
Secure Internet servers,,,,,,,,,,,...,1531.0,2001.0,3146.0,3756.0,4930.0,7054.0,14758.0,23144.0,28487.0,


In [27]:
#filling rows that did not have many Nans with the mean
mex_infra = mex_infra.apply(lambda row: row.fillna(row.mean()), axis=1)

**Notes**: Around 15/47 of our indicators cannot be used for this project. 

### Healthcare Expenditure for Mexico<a id='health'></a> 

In [28]:
mex_spend = spend_c.loc[spend_c['LOCATION'] == 'MEX']

In [29]:
#percent gdp
mex_spend.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,TIME,Value
1279,MEX,HEALTHEXP,TOT,PC_GDP,1999,4.395
1280,MEX,HEALTHEXP,TOT,PC_GDP,2000,4.449
1281,MEX,HEALTHEXP,TOT,PC_GDP,2001,4.82
1282,MEX,HEALTHEXP,TOT,PC_GDP,2002,5.072
1283,MEX,HEALTHEXP,TOT,PC_GDP,2003,5.815


### Mexico Pharmaceutical Spending Data<a id='pharm'></a> 

In [30]:
mex_pharm = mex_pharm.loc[mex_pharm['LOCATION'] == 'MEX']

In [31]:
mex_pharm.head()

Unnamed: 0.1,Unnamed: 0,LOCATION,INDICATOR,TIME,Value
333,333,MEX,PHARMAEXP,1999,88.193
334,334,MEX,PHARMAEXP,2000,100.973
335,335,MEX,PHARMAEXP,2001,110.626
336,336,MEX,PHARMAEXP,2002,127.529
337,337,MEX,PHARMAEXP,2003,242.818


### Mexico Medical Technology Count Data<a id='pharm'></a> 

In [32]:
mex_tech = tech_c.loc[tech_c['Country'] == 'Mexico']

In [33]:
mex_tech.head()

Unnamed: 0,Country,Variable,Year,Value
3330,Mexico,"Computed Tomography scanners, total",2001,250.0
3331,Mexico,"Computed Tomography scanners, total",2002,296.0
3332,Mexico,"Computed Tomography scanners, total",2003,309.0
3333,Mexico,"Computed Tomography scanners, total",2004,289.0
3334,Mexico,"Computed Tomography scanners, total",2005,325.0


### Mexico Medical Workers Count Data<a id='workers'></a> 

In [34]:
mex_workers = workers_c.loc[workers_c['Country'] == 'Mexico']

In [35]:
mex_workers.drop(axis = 0, columns = ['YEA'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [36]:
mex_workers.head()
#not enough data can't use. maybe fill in this data?

Unnamed: 0,Country,Variable,Year,Value
6468,Mexico,Medical graduates,2005,11973.0
6469,Mexico,Medical graduates,2006,10899.0
6470,Mexico,Medical graduates,2007,12255.0
6471,Mexico,Medical graduates,2008,13209.0
6472,Mexico,Medical graduates,2009,12926.0


### Mexico Doctors per 1000 People Data<a id='dr'></a> 

In [37]:
mex_dr = mex_dr.loc[mex_dr['LOCATION'] == 'MEX']

In [38]:
#these are per 1000 people
mex_dr.head()

Unnamed: 0.1,Unnamed: 0,LOCATION,INDICATOR,TIME,Value
277,277,MEX,MEDICALDOC,1999,1.67
278,278,MEX,MEDICALDOC,2000,1.59
279,279,MEX,MEDICALDOC,2001,1.49
280,280,MEX,MEDICALDOC,2002,1.5
281,281,MEX,MEDICALDOC,2003,1.55


### Mexico Nurses per 1000 People Data<a id='nurses'></a> 

In [39]:
mex_nurses = mex_nurses.loc[mex_nurses['LOCATION'] == 'MEX']

In [40]:
#per 1000
mex_nurses.head()

Unnamed: 0.1,Unnamed: 0,LOCATION,INDICATOR,TIME,Value
263,263,MEX,NURSE,1998,2.09
264,264,MEX,NURSE,1999,2.14
265,265,MEX,NURSE,2000,2.19
266,266,MEX,NURSE,2001,2.19
267,267,MEX,NURSE,2002,2.19


### Mexico Hospital Count Data<a id='hospital'></a> 

In [41]:
mex_hosp =hosp_c.loc[hosp_c['Country'] == 'Mexico']

In [42]:
mex_hosp.head()

Unnamed: 0,Country,Variable,Year,Value
1097,Mexico,Hospitals,2000,3952.0
1098,Mexico,Hospitals,2001,3978.0
1099,Mexico,Hospitals,2002,4088.0
1100,Mexico,Hospitals,2003,4150.0
1101,Mexico,Hospitals,2004,4110.0


# Filtering Disease Data for Mexico<a id='filter2'></a> 

### Mexico Cholera Data<a id='cholera'></a> 

In [43]:
mex_cholera = cholera_c.loc[cholera_c['Country'] == 'Mexico']

In [44]:
mex_cholera

Unnamed: 0,Country,Year,cholera_cases
1406,Mexico,2015,1.0
1407,Mexico,2014,14.0
1408,Mexico,2013,187.0
1409,Mexico,2012,2.0
1410,Mexico,2011,1.0
1411,Mexico,2010,1.0
1412,Mexico,2008,1.0
1413,Mexico,2001,1.0
1414,Mexico,2000,5.0
1415,Mexico,1998,71.0


### Mexico Measles Data<a id='measles'></a> 

In [45]:
mex_measles = measles_c.loc[measles_c['Country'] == 'Mexico']

In [46]:
mex_measles

Unnamed: 0,Country,2018,2017,2016,2015,2014,2013,2012,2011,2010,...,1989,1988,1987,1986,1985,1984,1983,1982,1981,1980
110,Mexico,5.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,...,20381.0,3915.0,2776.0,9824.0,23826.0,5158.0,3368.0,6364.0,11136.0,29730.0


### Mexico Mumps Data<a id='mumps'></a> 

In [47]:
mex_mumps = mumps_c.loc[mumps_c['Country'] == 'Mexico']

In [48]:
mex_mumps

Unnamed: 0,Country,2018,2017,2016,2015,2014,2013,2012,2011,2010,...,2007,2006,2005,2004,2003,2002,2001,2000,1999,1998
110,Mexico,,,3646.0,3399.0,4143.0,0.0,5683.0,2685.0,5780.0,...,7880.0,8322.0,8651.0,8425.0,10825.0,,19696.0,27403.0,,


### Mexico Pertussis Data<a id='pert'></a> 

In [49]:
mex_pert = pert_c.loc[pert_c['Country'] == 'Mexico']

In [50]:
mex_pert

Unnamed: 0,Country,2018,2017,2016,2015,2014,2013,2012,2011,2010,...,1989,1988,1987,1986,1985,1984,1983,1982,1981,1980
110,Mexico,783.0,827.0,1029.0,1107.0,955.0,961.0,978.0,252.0,371.0,...,1978.0,693.0,763.0,1268.0,2608.0,1753.0,1227.0,2000.0,3977.0,5539.0


### Mexico Rubella Data<a id='rubella'></a> 

In [51]:
mex_rubella = rubella_c.loc[rubella_c['Country'] == 'Mexico']

In [52]:
mex_rubella

Unnamed: 0,Country,2018,2017,2016,2015,2014,2013,2012,2011,2010,...,2007,2006,2005,2004,2003,2002,2001,2000,1999,1998
110,Mexico,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,102.0,74.0,38.0,699.0,35.0,3685.0,4843.0,11751.0,17180.0,51846.0


# Creating a Time Series Dataframe