# Preparación de las series de datos para el nuevo artículo

In [None]:
from ipywidgets import IntProgress
from IPython.display import display
import time

from covid19_vulnerabilidad_mex.datos import *
from covid19_vulnerabilidad_mex.vulnerabilidad import *

import pandas as pd
import geopandas as gpd

import glob
import os
import matplotlib.pyplot as plt
import numpy as np

from datetime import timedelta
import datetime
from datetime import timedelta, date, datetime
import pickle

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.cross_decomposition import PLSRegression

import seaborn as sns
import imageio
from pygifsicle import optimize

# Apply matplotlib's built-in 'ggplot' style to every figure created from here on
plt.style.use('ggplot')

In [None]:
# Load the IPython autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to the project package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Nuevos datos base

A partir de los datos preparados en `08_Actualizacion_Datos` vamos a preparar dos dataframes, uno con la selección de variables básica derivada de Acharya y otro con una selección extendida.

El diccionario `datos/municipios/actualizacion_2020/diccionario_dysovi.csv` indica qué variables se usan en cada selección a través de las columnas `Base Index` y `Extended`, mientras que la columna `datos` contiene el nombre de la columna correspondiente en la base actualizada.

In [None]:
# Identification columns: kept alongside the indicators but not meant to be
# used as model inputs.
id_vars = [
    "entidad_cvegeo",
    "State",
    "municipio_cvegeo",
    "Municipality",
]
# Variable dictionary describing which columns belong to each selection
diccionario = pd.read_csv("datos/municipios/actualizacion_2020/diccionario_dysovi.csv")
diccionario.head()

Unnamed: 0,id,datos,Base Index,Extended
0,0,entidad_cvegeo,,
1,1,State,,
2,2,municipio_cvegeo,,
3,3,Municipality,,
4,4,Population,,


Obtenemos las variables de los índices base y extendido

In [None]:
# Names (taken from the `datos` column) of the variables flagged with a
# non-null value for each selection in the dictionary.
base_index_vars = diccionario.loc[diccionario['Base Index'].notnull(), 'datos'].values
extended_index_vars = diccionario.loc[diccionario['Extended'].notnull(), 'datos'].values

# Read the municipal indicators; the geographic keys are forced to `str`
# (the rendered output shows values such as "01001" with their leading zero).
indicadores_municipales = pd.read_csv(
    "datos/municipios/actualizacion_2020/indicadores_finales.csv",
    dtype={'entidad_cvegeo': str, 'municipio_cvegeo': str},
)

# One frame per selection: id columns + selected variables + population.
indicadores_base = indicadores_municipales.loc[
    :, [*id_vars, *base_index_vars, 'Population']
]
indicadores_extended = indicadores_municipales.loc[
    :, [*id_vars, *extended_index_vars, 'Population']
]
indicadores_extended

Unnamed: 0,entidad_cvegeo,State,municipio_cvegeo,Municipality,Male,Female,60 and older,60 and older males,60 and older females,65 and older,...,People under minnimum wellness line,Beds in public hospitals per (state) habitant,Beds in private hospitals per (state) habitant,Number of public hospitals per (state) habitant,People with hypertension,People with diabetes,People with obesity,Population density,Urbanization,Population
0,01,Aguascalientes,01001,Aguascalientes,0.486910,0.513090,0.108523,0.048750,0.059773,0.071593,...,10.0,0.959461,0.993091,0.981481,14.942242,7.495861,31.486541,0.000808,0.949900,948990.0
1,01,Aguascalientes,01002,Asientos,0.490162,0.509838,0.091140,0.044260,0.046880,0.064634,...,24.4,0.959461,0.993091,0.981481,15.320425,7.953634,32.282284,0.000094,0.344613,51536.0
2,01,Aguascalientes,01003,Calvillo,0.490352,0.509648,0.134403,0.064961,0.069442,0.096841,...,26.8,0.959461,0.993091,0.981481,13.751906,9.172624,40.004293,0.000063,0.486987,58250.0
3,01,Aguascalientes,01004,Cosío,0.487765,0.512235,0.094882,0.045706,0.049176,0.065765,...,16.6,0.959461,0.993091,0.981481,16.431493,7.383116,32.596450,0.000135,0.498647,17000.0
4,01,Aguascalientes,01005,Jesús María,0.494262,0.505738,0.075541,0.036274,0.039268,0.050320,...,7.9,0.959461,0.993091,0.981481,12.356755,6.745819,34.731715,0.000256,0.750595,129929.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1645,32,Zacatecas,32053,Villa González Ortega,0.475091,0.524909,0.123107,0.059737,0.063371,0.088431,...,38.8,0.991437,1.000000,0.824074,18.491491,10.269404,28.455299,0.000031,0.697229,13208.0
1646,32,Zacatecas,32054,Villa Hidalgo,0.488738,0.511262,0.109328,0.054510,0.054818,0.077754,...,38.5,0.991437,1.000000,0.824074,16.920279,9.895491,31.075955,0.000053,0.249460,19446.0
1647,32,Zacatecas,32055,Villanueva,0.494011,0.505989,0.178655,0.086729,0.091926,0.135655,...,28.7,0.991437,1.000000,0.824074,21.746589,12.050064,30.526273,0.000015,0.556151,31558.0
1648,32,Zacatecas,32056,Zacatecas,0.481074,0.518926,0.125977,0.056261,0.069716,0.082677,...,8.5,0.991437,1.000000,0.824074,20.010453,11.814172,36.822130,0.000339,0.943786,149607.0


In [None]:
# Show the base-selection frame (the rich display elides middle rows/columns)
indicadores_base

Unnamed: 0,entidad_cvegeo,State,municipio_cvegeo,Municipality,60 and older,Disabled,Dwellers per room,Índice de rezago social,Poverty,Educational lag,...,Basic services lag,Beds in public hospitals per (state) habitant,Beds in private hospitals per (state) habitant,Number of public hospitals per (state) habitant,People with hypertension,People with diabetes,People with obesity,Population density,Urbanization,Population
0,01,Aguascalientes,01001,Aguascalientes,0.108523,0.050080,0.84,-1.315320,26.1,11.7,...,3.0,0.959461,0.993091,0.981481,14.942242,7.495861,31.486541,0.000808,0.949900,948990.0
1,01,Aguascalientes,01002,Asientos,0.091140,0.056000,1.07,-0.857301,54.0,19.8,...,16.2,0.959461,0.993091,0.981481,15.320425,7.953634,32.282284,0.000094,0.344613,51536.0
2,01,Aguascalientes,01003,Calvillo,0.134403,0.070180,0.97,-0.918554,56.8,23.2,...,7.7,0.959461,0.993091,0.981481,13.751906,9.172624,40.004293,0.000063,0.486987,58250.0
3,01,Aguascalientes,01004,Cosío,0.094882,0.047353,1.14,-1.004023,43.2,14.7,...,7.6,0.959461,0.993091,0.981481,16.431493,7.383116,32.596450,0.000135,0.498647,17000.0
4,01,Aguascalientes,01005,Jesús María,0.075541,0.038760,0.95,-1.173361,25.0,17.3,...,6.3,0.959461,0.993091,0.981481,12.356755,6.745819,34.731715,0.000256,0.750595,129929.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1645,32,Zacatecas,32053,Villa González Ortega,0.123107,0.069428,0.96,-0.787429,69.1,23.6,...,14.7,0.991437,1.000000,0.824074,18.491491,10.269404,28.455299,0.000031,0.697229,13208.0
1646,32,Zacatecas,32054,Villa Hidalgo,0.109328,0.072200,1.08,-0.692240,71.8,20.8,...,20.2,0.991437,1.000000,0.824074,16.920279,9.895491,31.075955,0.000053,0.249460,19446.0
1647,32,Zacatecas,32055,Villanueva,0.178655,0.067463,0.84,-0.884952,61.0,26.1,...,16.0,0.991437,1.000000,0.824074,21.746589,12.050064,30.526273,0.000015,0.556151,31558.0
1648,32,Zacatecas,32056,Zacatecas,0.125977,0.056515,0.75,-1.336926,25.9,10.4,...,4.6,0.991437,1.000000,0.824074,20.010453,11.814172,36.822130,0.000339,0.943786,149607.0


## Preparación de las series de tiempo

### Parámetros generales del análisis

In [None]:
# --- Reference dates -------------------------------------------------------
# `antier`: the day before yesterday, relative to the day the notebook runs.
antier = date.today() - timedelta(days=2)
# Analysis date: two weeks before `antier`, stored as a pandas Timestamp.
fecha_analisis = pd.to_datetime(antier - timedelta(weeks=2))
# Start of the period, parsed year-first (presumably 2020-05-01 — confirm).
inicio = pd.to_datetime('20-05-01', yearfirst=True)
# End of the period: one week before `antier` (a plain `date`, not a Timestamp).
fin = antier - timedelta(days=7)

# --- Analysis settings -----------------------------------------------------
metodo = 'PLS'
ventana_dias = 28  # window length in days, passed as `dias=` when building the series

# Make sure the data for `antier` is available before building the series.
actualizar_datos_salud(fecha_inicio=antier.strftime('%d-%m-%Y'))

### Serie base

In [None]:
# Names of the windowed count columns; the suffix follows `ventana_dias`.
columnas_ventana = [f'conteo_{ventana_dias}dias', f'defunciones_{ventana_dias}dias']

# Build the non-cumulative municipal series for the base indicator selection,
# up to `antier` (date passed as yymmdd).
serie_municipal_base = serie_covid_indicadores_municipales_from_df(
    indicadores_base,
    antier.strftime('%y%m%d'),
    acumulativa=False,
    dias=ventana_dias,
)

# Copy the windowed values into `conteo` / `defunciones` ...
serie_municipal_base[['conteo', 'defunciones']] = serie_municipal_base[columnas_ventana]

# ... drop the suffixed originals and add the rate columns computed against
# the `Population` column.
serie_municipal_base = agregar_tasas_municipales(
    serie_municipal_base.drop(columns=columnas_ventana),
    pop_column='Population',
)
serie_municipal_base.head()

  df['SEMANA_AÑO_INGRESO'] = df.index.week
  return (left.assign(key=1).merge(right.assign(key=1), on='key').drop('key', 1))


Unnamed: 0,FECHA_INGRESO,ENTIDAD_RES,CLAVE_ENTIDAD_RES,MUNICIPIO_RES,CLAVE_MUNICIPIO_RES,RESULTADO,conteo,defunciones,geometry,60 and older,...,Number of public hospitals per (state) habitant,People with hypertension,People with diabetes,People with obesity,Population density,Urbanization,Population,covid_confirmados_100k,covid_defun_100k,tasa_covid_letal
4,2020-02-27,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.108523,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
5,2020-02-28,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.108523,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
12,2020-02-29,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.108523,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
15,2020-03-01,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.108523,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
24,2020-03-02,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.108523,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,


### Serie extendida

In [None]:
# Names of the windowed count columns; the suffix follows `ventana_dias`.
columnas_ventana = [f'conteo_{ventana_dias}dias', f'defunciones_{ventana_dias}dias']

# Build the non-cumulative municipal series for the extended indicator
# selection, up to `antier` (date passed as yymmdd).
serie_municipal_extended = serie_covid_indicadores_municipales_from_df(
    indicadores_extended,
    antier.strftime('%y%m%d'),
    acumulativa=False,
    dias=ventana_dias,
)

# Copy the windowed values into `conteo` / `defunciones` ...
serie_municipal_extended[['conteo', 'defunciones']] = serie_municipal_extended[columnas_ventana]

# ... drop the suffixed originals and add the rate columns computed against
# the `Population` column.
serie_municipal_extended = agregar_tasas_municipales(
    serie_municipal_extended.drop(columns=columnas_ventana),
    pop_column='Population',
)
serie_municipal_extended.head()

  df['SEMANA_AÑO_INGRESO'] = df.index.week
  return (left.assign(key=1).merge(right.assign(key=1), on='key').drop('key', 1))


Unnamed: 0,FECHA_INGRESO,ENTIDAD_RES,CLAVE_ENTIDAD_RES,MUNICIPIO_RES,CLAVE_MUNICIPIO_RES,RESULTADO,conteo,defunciones,geometry,Male,...,Number of public hospitals per (state) habitant,People with hypertension,People with diabetes,People with obesity,Population density,Urbanization,Population,covid_confirmados_100k,covid_defun_100k,tasa_covid_letal
4,2020-02-27,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.48691,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
5,2020-02-28,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.48691,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
12,2020-02-29,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.48691,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
15,2020-03-01,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.48691,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,
24,2020-03-02,AGUASCALIENTES,1,AGUASCALIENTES,1001,Positivo SARS-CoV-2,0.0,0.0,"POLYGON ((-102.09775 22.02325, -102.11598 22.0...",0.48691,...,0.981481,14.942242,7.495861,31.486541,0.000808,0.9499,948990.0,0.0,0.0,


Guardamos las dos series como pickles

In [None]:
# Persist both series as pickles. `to_pickle` does not create missing parent
# directories, so make sure the output folder exists before writing
# (no-op when it is already there).
os.makedirs("datos/nuevo_articulo", exist_ok=True)
serie_municipal_extended.to_pickle("datos/nuevo_articulo/serie_municipal_extended.pkl")
serie_municipal_base.to_pickle("datos/nuevo_articulo/serie_municipal_base.pkl")