In [44]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#import geopandas as gpd # geodata processing
# Get geolocation using geocoder
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent='covid19co', timeout=None)
# Https requests
import requests
import unidecode
# Dates
from datetime import date
from calendar import day_name, month_name

# Short ID
import subprocess
import sys
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
try:
    from shortid import ShortId
except Exception:
    install('shortid')
    from shortid import ShortId

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# Colombia Covid 19 Pipeline
Dataset is obtained from [Instituto Nacional de Salud](https://www.ins.gov.co/Noticias/Paginas/Coronavirus.aspx) daily report Coronavirus 2019 from Colombia.

You can get the official dataset here: 
[INS - Official Report](https://e.infogram.com/api/live/flex/bc384047-e71c-47d9-b606-1eb6a29962e3/664bc407-2569-4ab8-b7fb-9deb668ddb7a)

The number of new cases are increasing day by day around the world.
This dataset has information about reported cases from 32 Colombia departments.

You can view and collaborate to the analysis here:
[colombia_covid_19_analysis](https://www.kaggle.com/sebaxtian/colombia-covid-19-analysis) Kaggle Notebook Kernel.

---

In [45]:
# Any results you write to the current directory are saved as output.
OUTPUT_DIR = '../output'
# URL original dataset
URL_DATASET = 'https://e.infogram.com/api/live/flex/bc384047-e71c-47d9-b606-1eb6a29962e3/664bc407-2569-4ab8-b7fb-9deb668ddb7a'

In [46]:
# Reading the json as a dict
with requests.get(URL_DATASET) as original_dataset:
    data = original_dataset.json()
#print(data)

# Get attributes and data
attrs = data['data'][0][0]
del data['data'][0][0]
data = data['data'][0]

# Build dataframe
covid_df = pd.DataFrame(data=data, columns=attrs)

# Size dataframe
covid_df.shape

(1780, 9)

In [47]:
# Show dataframe
covid_df.tail()

Unnamed: 0,ID de caso,Fecha de diagnóstico,Ciudad de ubicación,Departamento o Distrito,Atención**,Edad,Sexo,Tipo*,País de procedencia
1775,1776,07/04/2020,Tuluá,Valle del Cauca,Casa,25,M,Importado,Panamá
1776,1777,07/04/2020,Tuluá,Valle del Cauca,Casa,29,M,Importado,Panamá
1777,1778,07/04/2020,Palmira,Valle del Cauca,Casa,12,M,En estudio,Colombia
1778,1779,07/04/2020,Palmira,Valle del Cauca,Casa,36,M,Importado,Ecuador
1779,1780,07/04/2020,Medellin,Antioquia,Casa,15,F,Relacionado,0


In [48]:
# Rename columns
covid_df.rename(columns={
    "ID de caso": "id_case",
    "Fecha de diagnóstico": "date",
    "Ciudad de ubicación": "city",
    "Departamento o Distrito": "dept_dist",
    "Atención**": "care",
    "Edad": "age",
    "Sexo": "sex",
    "Tipo*": "kind",
    "País de procedencia": "country_origin"}, inplace=True)
# Show dataframe
covid_df.head()

Unnamed: 0,id_case,date,city,dept_dist,care,age,sex,kind,country_origin
0,1,06/03/2020,Bogotá,Bogotá D.C.,Recuperado,19,F,Importado,Italia
1,2,09/03/2020,Buga,Valle del Cauca,Recuperado,34,M,Importado,España
2,3,09/03/2020,Medellín,Antioquia,Recuperado,50,F,Importado,España
3,4,11/03/2020,Medellín,Antioquia,Recuperado,55,M,Relacionado,Colombia
4,5,11/03/2020,Medellín,Antioquia,Recuperado,25,M,Relacionado,Colombia


In [49]:
# Clean empty rows
covid_df = covid_df[(covid_df['id_case'] != '') | (covid_df['date'] != '')]
# Show dataframe
covid_df.tail()

Unnamed: 0,id_case,date,city,dept_dist,care,age,sex,kind,country_origin
1775,1776,07/04/2020,Tuluá,Valle del Cauca,Casa,25,M,Importado,Panamá
1776,1777,07/04/2020,Tuluá,Valle del Cauca,Casa,29,M,Importado,Panamá
1777,1778,07/04/2020,Palmira,Valle del Cauca,Casa,12,M,En estudio,Colombia
1778,1779,07/04/2020,Palmira,Valle del Cauca,Casa,36,M,Importado,Ecuador
1779,1780,07/04/2020,Medellin,Antioquia,Casa,15,F,Relacionado,0


In [50]:
# Remove accents
covid_df['city'] = covid_df['city'].transform(lambda value: unidecode.unidecode(value))
covid_df['dept_dist'] = covid_df['dept_dist'].transform(lambda value: unidecode.unidecode(value))
# Show dataframe
covid_df.head()

Unnamed: 0,id_case,date,city,dept_dist,care,age,sex,kind,country_origin
0,1,06/03/2020,Bogota,Bogota D.C.,Recuperado,19,F,Importado,Italia
1,2,09/03/2020,Buga,Valle del Cauca,Recuperado,34,M,Importado,España
2,3,09/03/2020,Medellin,Antioquia,Recuperado,50,F,Importado,España
3,4,11/03/2020,Medellin,Antioquia,Recuperado,55,M,Relacionado,Colombia
4,5,11/03/2020,Medellin,Antioquia,Recuperado,25,M,Relacionado,Colombia


In [51]:
# Add Day, Month, Year, Month Name and Day Name
covid_df['day'] = covid_df['date'].transform(lambda value: value.split('/')[0])
covid_df['month'] = covid_df['date'].transform(lambda value: value.split('/')[1])
covid_df['year'] = covid_df['date'].transform(lambda value: value.split('/')[2])
# English
#covid_df['month_name'] = covid_df['month'].transform(lambda value: month_name[int(value)])
#covid_df['day_name'] = covid_df['date'].transform(lambda value: day_name[date(int(value.split('/')[2]), int(value.split('/')[1]), int(value.split('/')[0])).weekday()])
# Spanish
nombre_mes = ['Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio', 'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre']
nombre_dia = ['Lunes', 'Martes', 'Miercoles', 'Jueves', 'Viernes', 'Sabado', 'Domingo']
covid_df['month_name'] = covid_df['month'].transform(lambda value: nombre_mes[int(value) - 1])
covid_df['day_name'] = covid_df['date'].transform(lambda value: nombre_dia[date(int(value.split('/')[2]), int(value.split('/')[1]), int(value.split('/')[0])).weekday()])
# Show dataframe
covid_df.head()

Unnamed: 0,id_case,date,city,dept_dist,care,age,sex,kind,country_origin,day,month,year,month_name,day_name
0,1,06/03/2020,Bogota,Bogota D.C.,Recuperado,19,F,Importado,Italia,6,3,2020,Marzo,Viernes
1,2,09/03/2020,Buga,Valle del Cauca,Recuperado,34,M,Importado,España,9,3,2020,Marzo,Lunes
2,3,09/03/2020,Medellin,Antioquia,Recuperado,50,F,Importado,España,9,3,2020,Marzo,Lunes
3,4,11/03/2020,Medellin,Antioquia,Recuperado,55,M,Relacionado,Colombia,11,3,2020,Marzo,Miercoles
4,5,11/03/2020,Medellin,Antioquia,Recuperado,25,M,Relacionado,Colombia,11,3,2020,Marzo,Miercoles


In [52]:
# Update Case ID
covid_df['id_case'] = covid_df['id_case'].transform(lambda value: ShortId().generate())
covid_df['id_case'] = covid_df['sex'] + covid_df['id_case'] + covid_df['age']
covid_df.head()

Unnamed: 0,id_case,date,city,dept_dist,care,age,sex,kind,country_origin,day,month,year,month_name,day_name
0,FXq1eC4YH19,06/03/2020,Bogota,Bogota D.C.,Recuperado,19,F,Importado,Italia,6,3,2020,Marzo,Viernes
1,MGMr_TuXK34,09/03/2020,Buga,Valle del Cauca,Recuperado,34,M,Importado,España,9,3,2020,Marzo,Lunes
2,FjWolGuPV50,09/03/2020,Medellin,Antioquia,Recuperado,50,F,Importado,España,9,3,2020,Marzo,Lunes
3,M1Fr0vQls55,11/03/2020,Medellin,Antioquia,Recuperado,55,M,Relacionado,Colombia,11,3,2020,Marzo,Miercoles
4,MDGO90OKb25,11/03/2020,Medellin,Antioquia,Recuperado,25,M,Relacionado,Colombia,11,3,2020,Marzo,Miercoles


In [53]:
# Sort columns
covid_df = covid_df[['id_case', 'date', 'day', 'month', 'year', 'month_name', 'day_name', 'city', 'dept_dist', 'age', 'sex', 'kind', 'country_origin', 'care']]
covid_df.head()

Unnamed: 0,id_case,date,day,month,year,month_name,day_name,city,dept_dist,age,sex,kind,country_origin,care
0,FXq1eC4YH19,06/03/2020,6,3,2020,Marzo,Viernes,Bogota,Bogota D.C.,19,F,Importado,Italia,Recuperado
1,MGMr_TuXK34,09/03/2020,9,3,2020,Marzo,Lunes,Buga,Valle del Cauca,34,M,Importado,España,Recuperado
2,FjWolGuPV50,09/03/2020,9,3,2020,Marzo,Lunes,Medellin,Antioquia,50,F,Importado,España,Recuperado
3,M1Fr0vQls55,11/03/2020,11,3,2020,Marzo,Miercoles,Medellin,Antioquia,55,M,Relacionado,Colombia,Recuperado
4,MDGO90OKb25,11/03/2020,11,3,2020,Marzo,Miercoles,Medellin,Antioquia,25,M,Relacionado,Colombia,Recuperado


## Covid 19 Dataset
> ***Output file***: covid19_co.csv

In [54]:
# Save dataframe
covid_df.to_csv(os.path.join(OUTPUT_DIR, 'covid19_co.csv'), index=False)

---

In [55]:
# Cases by Date
covid_df_by_date = covid_df.groupby('date')['date'].count()
covid_df_by_date = pd.DataFrame(data={'date': covid_df_by_date.index, 'total': covid_df_by_date.values}, columns=['date', 'total'])
covid_df_by_date['date_iso'] = pd.to_datetime(covid_df_by_date['date'], format='%d/%m/%Y')
covid_df_by_date = covid_df_by_date.sort_values(by=['date_iso'], ascending=True)
covid_df_by_date['cumsum'] = covid_df_by_date['total'].cumsum()
covid_df_by_date = covid_df_by_date.drop(columns=['date_iso'])
covid_df_by_date.reset_index(inplace=True, drop=True)
# Show dataframe
covid_df_by_date.tail()

Unnamed: 0,date,total,cumsum
25,03/04/2020,106,1267
26,04/04/2020,139,1406
27,05/04/2020,79,1485
28,06/04/2020,94,1579
29,07/04/2020,201,1780


## Cases by Date
> ***Output file***: covid_19_by_date.csv

In [56]:
# Save dataframe
covid_df_by_date.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_date.csv'), index=False)

---

In [57]:
# Cases by Care
covid_df_by_care = covid_df.groupby('care')['care'].count().sort_values(ascending=False)
covid_df_by_care = pd.DataFrame(data={'care': covid_df_by_care.index, 'total': covid_df_by_care.values}, columns=['care', 'total'])
# Show dataframe
covid_df_by_care.head()

Unnamed: 0,care,total
0,Casa,1383
1,Hospital,171
2,Recuperado,100
3,Hospital UCI,76
4,Fallecido,50


## Cases by Care
> ***Output file***: covid_19_by_care.csv

In [58]:
# Save dataframe
covid_df_by_care.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_care.csv'), index=False)

---

In [59]:
# Cases by Sex
covid_df_by_sex = covid_df.groupby('sex')['sex'].count().sort_values(ascending=False)
covid_df_by_sex = pd.DataFrame(data={'sex': covid_df_by_sex.index, 'total': covid_df_by_sex.values}, columns=['sex', 'total'])
# Show dataframe
covid_df_by_sex.head()

Unnamed: 0,sex,total
0,M,905
1,F,875


## Cases by Sex
> ***Output file***: covid_19_by_sex.csv

In [60]:
# Save dataframe
covid_df_by_sex.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_sex.csv'), index=False)

---

In [61]:
# Cases by Age
covid_df_by_age = covid_df.groupby('age')['age'].count().sort_values(ascending=False)
covid_df_by_age = pd.DataFrame(data={'age': covid_df_by_age.index, 'total': covid_df_by_age.values}, columns=['age', 'total'])
# Show dataframe
covid_df_by_age.head()

Unnamed: 0,age,total
0,29,55
1,32,51
2,28,50
3,30,48
4,26,47


## Cases by Age
> ***Output file***: covid_19_by_age.csv

In [62]:
# Save dataframe
covid_df_by_age.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_age.csv'), index=False)

---

In [63]:
# Cases by Age and Sex
covid_df_by_age_sex = covid_df.groupby(['age', 'sex'])['id_case'].count().sort_values(ascending=False)
covid_df_by_age_sex = pd.DataFrame(data={'age': covid_df_by_age_sex.index.get_level_values('age'), 'sex': covid_df_by_age_sex.index.get_level_values('sex'), 'total': covid_df_by_age_sex.values}, columns=['age', 'sex', 'total'])
# Show dataframe
covid_df_by_age_sex.head()

Unnamed: 0,age,sex,total
0,35,M,29
1,30,F,29
2,29,M,29
3,28,M,28
4,32,M,26


## Cases by Age and Sex
> ***Output file***: covid_19_by_age_sex.csv

In [64]:
# Save dataframe
covid_df_by_age_sex.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_age_sex.csv'), index=False)

---

In [65]:
# Build dataframe by Age and Sex using intervals
def age_sex_intervals(dataframe):
    intervals = []
    i = 0
    while i < 100:
        interval_i = dataframe[(dataframe['age'] >= i) & (dataframe['age'] < i+10)]
        interval_i = interval_i.groupby('sex')['total'].sum().sort_values(ascending=False)
        if len(interval_i.values) > 0:
            interval_i = pd.DataFrame(data={'age': [ str(i) + '-' + str(i+9), str(i) + '-' + str(i+9)], 'sex': interval_i.index, 'total': interval_i.values}, columns=['age', 'sex', 'total'])
            intervals.append(interval_i)
        i = i + 10
    return pd.concat(intervals).reset_index(drop=True)
# Cases by Age and Sex using intervals
covid_df_by_age_sex_interval = covid_df_by_age_sex
covid_df_by_age_sex_interval['age'] = pd.to_numeric(covid_df_by_age_sex_interval['age'])
covid_df_by_age_sex_interval = age_sex_intervals(covid_df_by_age_sex_interval)
# Show dataframe
covid_df_by_age_sex_interval.head()

Unnamed: 0,age,sex,total
0,0-9,F,20
1,0-9,M,17
2,10-19,M,31
3,10-19,F,29
4,20-29,M,175


## Cases by Age and Sex Interval
> ***Output file***: covid_19_by_age_sex_interval.csv

In [66]:
# Save dataframe
covid_df_by_age_sex_interval.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_age_sex_interval.csv'), index=False)

---

In [67]:
# Cases by City
covid_df_by_city = covid_df.groupby('city')['city'].count().sort_values(ascending=False)
covid_df_by_city = pd.DataFrame(data={'city': covid_df_by_city.index, 'total': covid_df_by_city.values}, columns=['city', 'total'])
# Show dataframe
covid_df_by_city.head()

Unnamed: 0,city,total
0,Bogota,861
1,Cali,188
2,Medellin,122
3,Cartagena,60
4,Barranquilla,48


In [68]:
# Find city geolocation
def findgeopoint(city):
    geo = geolocator.geocode(city + ', Colombia')
    if geo:
        return geo.point
    else:
        return geolocator.geocode('Colombia').point

In [69]:
# Add city geolocation
covid_df_by_city['geo'] = covid_df_by_city['city'].transform(lambda value: findgeopoint(value))
# Add city latitude and longitude
covid_df_by_city['lat'] = covid_df_by_city['geo'].transform(lambda value: value.latitude)
covid_df_by_city['lng'] = covid_df_by_city['geo'].transform(lambda value: value.longitude)
covid_df_by_city = covid_df_by_city.drop(columns=['geo'])
# Show dataframe
covid_df_by_city.head()

Unnamed: 0,city,total,lat,lng
0,Bogota,861,4.59808,-74.076044
1,Cali,188,3.451792,-76.532494
2,Medellin,122,6.244338,-75.573553
3,Cartagena,60,10.419584,-75.527122
4,Barranquilla,48,10.979967,-74.801309


## Cases by City
> ***Output file***: covid_19_by_city.csv

In [70]:
# Save dataframe
covid_df_by_city.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_city.csv'), index=False)

---

In [71]:
# Cases by Department or District
covid_df_by_dept_dist = covid_df.groupby('dept_dist')['dept_dist'].count().sort_values(ascending=False)
covid_df_by_dept_dist = pd.DataFrame(data={'dept_dist': covid_df_by_dept_dist.index, 'total': covid_df_by_dept_dist.values}, columns=['dept_dist', 'total'])
# Show dataframe
covid_df_by_dept_dist.head()

Unnamed: 0,dept_dist,total
0,Bogota D.C.,861
1,Valle del Cauca,250
2,Antioquia,209
3,Cundinamarca,60
4,Cartagena D.T. y C,60


In [72]:
# Add dept_dist geolocation
covid_df_by_dept_dist['geo'] = covid_df_by_dept_dist['dept_dist'].transform(lambda value: findgeopoint(value))
# Add city latitude and longitude
covid_df_by_dept_dist['lat'] = covid_df_by_dept_dist['geo'].transform(lambda value: value.latitude)
covid_df_by_dept_dist['lng'] = covid_df_by_dept_dist['geo'].transform(lambda value: value.longitude)
covid_df_by_dept_dist = covid_df_by_dept_dist.drop(columns=['geo'])
# Show dataframe
covid_df_by_dept_dist.head()

Unnamed: 0,dept_dist,total,lat,lng
0,Bogota D.C.,861,4.59808,-74.076044
1,Valle del Cauca,250,4.063957,-76.123377
2,Antioquia,209,7.153843,-75.44404
3,Cundinamarca,60,5.000009,-74.166676
4,Cartagena D.T. y C,60,2.889443,-73.783892


## Cases by Department or District
> ***Output file***: covid_19_by_dept_dist.csv

In [73]:
# Save dataframe
covid_df_by_dept_dist.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_dept_dist.csv'), index=False)

---

In [74]:
# Cases by Care by Date
list_care = list(set(covid_df['care'].values))
print('list_care', list_care)
cases_by_care_by_date = []
# Each Care
for care in list_care:
    covid_df_care_by_date = covid_df[covid_df['care'] == care]
    covid_df_care_by_date = covid_df_care_by_date.groupby('date')['date'].count()
    covid_df_care_by_date = pd.DataFrame(data={'date': covid_df_care_by_date.index, 'care': ([care] * len(covid_df_care_by_date.index)), 'total': covid_df_care_by_date.values}, columns=['date', 'care', 'total'])
    covid_df_care_by_date['date_iso'] = pd.to_datetime(covid_df_care_by_date['date'], format='%d/%m/%Y')
    covid_df_care_by_date = covid_df_care_by_date.sort_values(by=['date_iso'], ascending=True)
    covid_df_care_by_date['cumsum'] = covid_df_care_by_date['total'].cumsum()
    covid_df_care_by_date = covid_df_care_by_date.drop(columns=['date_iso'])
    covid_df_care_by_date.reset_index(inplace=True, drop=True)
    cases_by_care_by_date.append(covid_df_care_by_date)
# Show dataframe
for i, care in enumerate(list_care):
    print(care, '\n', cases_by_care_by_date[i].tail())

list_care ['Hospital', 'Fallecido', 'Casa', 'Hospital UCI', 'Recuperado']
Hospital 
           date      care  total  cumsum
13  03/04/2020  Hospital     21      80
14  04/04/2020  Hospital     13      93
15  05/04/2020  Hospital     22     115
16  06/04/2020  Hospital     17     132
17  07/04/2020  Hospital     39     171
Fallecido 
           date       care  total  cumsum
12  02/04/2020  Fallecido      2      34
13  03/04/2020  Fallecido      7      41
14  04/04/2020  Fallecido      5      46
15  05/04/2020  Fallecido      2      48
16  06/04/2020  Fallecido      2      50
Casa 
           date  care  total  cumsum
22  03/04/2020  Casa     75     991
23  04/04/2020  Casa    116    1107
24  05/04/2020  Casa     52    1159
25  06/04/2020  Casa     67    1226
26  07/04/2020  Casa    157    1383
Hospital UCI 
           date          care  total  cumsum
14  03/04/2020  Hospital UCI      3      55
15  04/04/2020  Hospital UCI      5      60
16  05/04/2020  Hospital UCI      3      63
17 

## Cases by Care by Date
> ***Output files***: cases_by_care_by_date_(int).csv

In [75]:
# Save dataframe
for i, care in enumerate(list_care):
    cases_by_care_by_date[i].to_csv(os.path.join(OUTPUT_DIR, 'cases_by_care_by_date_' + str(i) + '.csv'), index=False)

---

In [76]:
# Cases by Country Origin
covid_df_by_country_origin = covid_df.groupby('country_origin')['country_origin'].count().sort_values(ascending=False)
covid_df_by_country_origin = pd.DataFrame(data={'country_origin': covid_df_by_country_origin.index, 'total': covid_df_by_country_origin.values}, columns=['country_origin', 'total'])
# Show dataframe
covid_df_by_country_origin.head()

Unnamed: 0,country_origin,total
0,Colombia,1106
1,España,222
2,Estados Unidos,154
3,Ecuador,37
4,colombia,32


## Cases by Country Origin
> ***Output file***: covid_19_by_country_origin.csv

In [77]:
# Save dataframe
covid_df_by_country_origin.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_country_origin.csv'), index=False)

---

In [78]:
# Cases by Kind
covid_df_by_kind = covid_df.groupby('kind')['kind'].count().sort_values(ascending=False)
covid_df_by_kind = pd.DataFrame(data={'kind': covid_df_by_kind.index, 'total': covid_df_by_kind.values}, columns=['kind', 'total'])
# Show dataframe
covid_df_by_kind.head()

Unnamed: 0,kind,total
0,Importado,635
1,Relacionado,578
2,En estudio,567


## Cases by Kind
> ***Output file***: covid_19_by_kind.csv

In [79]:
# Save dataframe
covid_df_by_kind.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_by_kind.csv'), index=False)

---

In [80]:
# Descarted Cases
# Reading the json as a dict
with requests.get('https://infogram.com/api/live/flex/5eb73bf0-6714-4bac-87cc-9ef0613bf697/c9a25571-e7c5-43c6-a7ac-d834a3b5e872?') as original_dataset:
    data = original_dataset.json()
#print(data['data'][0][3][0])

# Get attributes and data
attrs = data['data'][0][3][0]
del data
#print(attrs)
descarted_cases = attrs.split('<b>')[1].split('</b>')[0].replace('.', '')
print(descarted_cases)

28665


---

In [81]:
# Samples Processed
# Reading the json as a dict
with requests.get('https://infogram.com/api/live/flex/bc384047-e71c-47d9-b606-1eb6a29962e3/523ca417-2781-47f0-87e8-1ccc2d5c2839?') as original_dataset:
    data = original_dataset.json()
#print(data['data'])
#print(data['data'][2])

# Get attributes and data
attrs = data['data'][2][0]
attrs[0] = 'Periodo'
del data['data'][2][0]
#print(attrs)
data = data['data'][2]
#print(data)

# Build dataframe
covid_df_samples_processed = pd.DataFrame(data=data, columns=attrs)

# Size dataframe
covid_df_samples_processed.head()

Unnamed: 0,Periodo,Muestras procesadas,Acumulado procesadas
0,01/02/20-29/02/20,599,599
1,02/03/20-08/03/20,160,759
2,09/03/20-15/03/20,1584,2343
3,16/03/20-22/03/20,4818,7161
4,23/03/20-29/03/20,8903,16064


In [82]:
# Rename columns
covid_df_samples_processed.rename(columns={
    "Periodo": "period",
    "Muestras procesadas": "total_samples",
    "Acumulado procesadas": "accum_samples"}, inplace=True)
# Show dataframe
covid_df_samples_processed.head()

Unnamed: 0,period,total_samples,accum_samples
0,01/02/20-29/02/20,599,599
1,02/03/20-08/03/20,160,759
2,09/03/20-15/03/20,1584,2343
3,16/03/20-22/03/20,4818,7161
4,23/03/20-29/03/20,8903,16064


In [83]:
# Update date format
def update_date_format(period):
    date1 = period.split('-')[0]
    date2 = period.split('-')[1]
    if date1.split('/')[-1] == '20':
        date1 = '/'.join(date1.split('/')[0:-1]) + '/2020'
    if date2.split('/')[-1] == '20':
        date2 = '/'.join(date2.split('/')[0:-1]) + '/2020'
    return date1 + '-' + date2
# Example
#update_date_format('02/03/20-08/03/20')
# Update date format
covid_df_samples_processed['period'] = covid_df_samples_processed['period'].transform(lambda value: update_date_format(value))
# Show dataframe
covid_df_samples_processed.tail()

Unnamed: 0,period,total_samples,accum_samples
1,02/03/2020-08/03/2020,160,759
2,09/03/2020-15/03/2020,1584,2343
3,16/03/2020-22/03/2020,4818,7161
4,23/03/2020-29/03/2020,8903,16064
5,30/03/2020-05/04/2020,11621,27685


## Samples Processed
> ***Output file***: covid_19_samples_processed.csv

In [84]:
# Save dataframe
covid_df_samples_processed.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_samples_processed.csv'), index=False)

---

In [85]:
# Resume
data = []

# Resume Attributes
data.append(['Confirmados', covid_df_by_date.values[-1][-1]])
data.append(['Recuperados', cases_by_care_by_date[4].values[-1][-1]])
data.append(['Muertes', cases_by_care_by_date[2].values[-1][-1]])
data.append(['Casos descartados', descarted_cases])
data.append(['Importado', covid_df_by_kind[covid_df_by_kind['kind'] == 'Importado'].values[0][-1]])
data.append(['Relacionado', covid_df_by_kind[covid_df_by_kind['kind'] == 'Relacionado'].values[0][-1]])
data.append(['En estudio', covid_df_by_kind[covid_df_by_kind['kind'] == 'En estudio'].values[0][-1]])
data.append(['Muestras procesadas', covid_df_samples_processed.values[-1][-1]])

# Resume Dataframe
covid_df_resume = pd.DataFrame(data=data, columns=['title', 'total'])
# Show dataframe
covid_df_resume.head(10)

Unnamed: 0,title,total
0,Confirmados,1780
1,Recuperados,100
2,Muertes,1383
3,Casos descartados,28665
4,Importado,635
5,Relacionado,578
6,En estudio,567
7,Muestras procesadas,27685


## Resume
> ***Output file***: covid_19_resume.csv

In [86]:
# Save dataframe
covid_df_resume.to_csv(os.path.join(OUTPUT_DIR, 'covid_19_resume.csv'), index=False)

---