# Dataset Caso de uso Mexico

In [0]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd 
import chardet
import multiprocessing
import random

from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
import timeit

# Dataset

## Read dataset

Let's read the dataset to use.

In [0]:
def find_encoding(fname):
    r_file = open(fname, 'rb').read()
    result = chardet.detect(r_file)
    charenc = result['encoding']
    return charenc


my_encoding = find_encoding('/200502COVID19MEXICO.csv')

In [0]:
df = pd.read_csv('/200502COVID19MEXICO.csv', encoding=my_encoding)

In [8]:
my_encoding

'ISO-8859-1'

## Data cleaning

In [9]:
df.shape

(93791, 35)

The dataset consists of:

- 93791 **rows** or instances
- 35 **columns** or variables.

Let's see their contents.

In [10]:
df.head()

Unnamed: 0,FECHA_ACTUALIZACION,ID_REGISTRO,ORIGEN,SECTOR,ENTIDAD_UM,SEXO,ENTIDAD_NAC,ENTIDAD_RES,MUNICIPIO_RES,TIPO_PACIENTE,FECHA_INGRESO,FECHA_SINTOMAS,FECHA_DEF,INTUBADO,NEUMONIA,EDAD,NACIONALIDAD,EMBARAZO,HABLA_LENGUA_INDIG,DIABETES,EPOC,ASMA,INMUSUPR,HIPERTENSION,OTRA_COM,CARDIOVASCULAR,OBESIDAD,RENAL_CRONICA,TABAQUISMO,OTRO_CASO,RESULTADO,MIGRANTE,PAIS_NACIONALIDAD,PAIS_ORIGEN,UCI
0,2020-05-02,0ceb0f,1,4,1,1,1,1,1,1,2020-01-14,2020-01-14,9999-99-99,97,2,28,1,2,2,2,2,2,2,2,2,2,2,2,2,99,1,99,MÃ©xico,99,97
1,2020-05-02,1d9580,1,12,2,2,15,2,2,1,2020-04-19,2020-04-17,9999-99-99,97,2,23,1,97,2,2,2,2,2,2,2,2,2,2,2,1,1,99,MÃ©xico,99,97
2,2020-05-02,119b76,1,4,21,2,21,21,114,1,2020-03-23,2020-03-23,9999-99-99,97,2,49,1,97,2,1,2,2,2,1,2,2,2,2,2,99,1,99,MÃ©xico,99,97
3,2020-05-02,0e6c05,1,4,2,2,2,2,2,2,2020-03-24,2020-03-16,9999-99-99,2,1,28,1,97,2,2,2,2,2,2,2,2,1,2,2,99,1,99,MÃ©xico,99,2
4,2020-05-02,140d7f,1,12,29,1,29,29,21,1,2020-03-28,2020-03-23,9999-99-99,97,2,31,1,2,99,2,2,2,2,2,2,2,2,2,2,2,2,99,MÃ©xico,99,97


In [11]:
df.columns

Index(['FECHA_ACTUALIZACION', 'ID_REGISTRO', 'ORIGEN', 'SECTOR', 'ENTIDAD_UM',
       'SEXO', 'ENTIDAD_NAC', 'ENTIDAD_RES', 'MUNICIPIO_RES', 'TIPO_PACIENTE',
       'FECHA_INGRESO', 'FECHA_SINTOMAS', 'FECHA_DEF', 'INTUBADO', 'NEUMONIA',
       'EDAD', 'NACIONALIDAD', 'EMBARAZO', 'HABLA_LENGUA_INDIG', 'DIABETES',
       'EPOC', 'ASMA', 'INMUSUPR', 'HIPERTENSION', 'OTRA_COM',
       'CARDIOVASCULAR', 'OBESIDAD', 'RENAL_CRONICA', 'TABAQUISMO',
       'OTRO_CASO', 'RESULTADO', 'MIGRANTE', 'PAIS_NACIONALIDAD',
       'PAIS_ORIGEN', 'UCI'],
      dtype='object')

FECHA_ACTUALIZACION

ID_REGISTRO = ID_USER (TEXTO)

ENTIDAD_RES = CCAA -> RIESGO_DE_ZONA (01-32,36,97-does not apply,98-is ignored,99-not specified)

FECHA_SINTOMAS = beginning of symptoms (date recorded by app) there is not 9999-99-99

EDAD = NUMERIC

ENFERMEDADES = INTUBADO, NEUMONIA, DIABETES, EPOC, ASMA, INMUSUPR, HIPERTENSION, CARDIOVASCULAR, OBESIDAD, RENAL_CRONICA, OTRA_COM(other deseases)
-> YES(1), NO(2), NOT APPLIES(97), IGNORED(98), UNSPECIFIED(99)

FECHA_DEF = AAAA-MM-DD (9999-99-99-> Not dead=90.492)

TABAQUISMO = SMOKER: SI(1), NO(2), DOES NOT APPLY(97), IS IGNORED(98), UNSPECIFIED(99)

OTRO_CASO = DIRECT CONTACT: YES(1), NO(2), DOES NOT APPLY(97), IS IGNORED(98), UNSPECIFIED(99)

RESULTADO = POSITIVE(1), NEGATIVE(2), PENDENT(3)

UCI = TARGET SEVERE SEVERITY: YES(1), NO(2), DOES NOT APPLY(97), IS IGNORED(98), UNSPECIFIED(99)

*PREGNANCY?¿?*

STEPS
1. Remove the columns we don't need
2. Check that the id_registry's are unique
3. Rename columns
4. Decode the options for each variable
5. Replace "Not applicable"/"Ignored"/"Not specified"/9999-99-99 with NaNs
6. The target column is categorical(uci=!1:mild, uci=1:severe,date_def=!NaN: very severe)-> Group and transform ICU and DEF_DATE columns
7. Make OHE-dummies if required to have only ones and zeros
8. Define predictive model
9. Creating and applying an ML model


We check that the update date is the current or most recent, to ensure that the data we are processing is the latest real data, which will allow us to make the algorithm as accurate/precise as possible.

In [12]:
df.FECHA_ACTUALIZACION.unique()

array(['2020-05-02'], dtype=object)

### 1. Remove the columns we don't need

In [0]:
list_to_drop=['FECHA_ACTUALIZACION', 'ORIGEN','SECTOR', 'ENTIDAD_UM',
       'SEXO', 'ENTIDAD_NAC', 'MUNICIPIO_RES', 'TIPO_PACIENTE',
       'FECHA_INGRESO', 'NACIONALIDAD', 'HABLA_LENGUA_INDIG', 
       'RESULTADO', 'MIGRANTE', 'PAIS_NACIONALIDAD',
       'PAIS_ORIGEN']

In [14]:
df=df.drop(columns=list_to_drop)
df

Unnamed: 0,ID_REGISTRO,ENTIDAD_RES,FECHA_SINTOMAS,FECHA_DEF,INTUBADO,NEUMONIA,EDAD,EMBARAZO,DIABETES,EPOC,ASMA,INMUSUPR,HIPERTENSION,OTRA_COM,CARDIOVASCULAR,OBESIDAD,RENAL_CRONICA,TABAQUISMO,OTRO_CASO,UCI
0,0ceb0f,1,2020-01-14,9999-99-99,97,2,28,2,2,2,2,2,2,2,2,2,2,2,99,97
1,1d9580,2,2020-04-17,9999-99-99,97,2,23,97,2,2,2,2,2,2,2,2,2,2,1,97
2,119b76,21,2020-03-23,9999-99-99,97,2,49,97,1,2,2,2,1,2,2,2,2,2,99,97
3,0e6c05,2,2020-03-16,9999-99-99,2,1,28,97,2,2,2,2,2,2,2,1,2,2,99,2
4,140d7f,29,2020-03-23,9999-99-99,97,2,31,2,2,2,2,2,2,2,2,2,2,2,2,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,1c0d84,9,2020-04-28,9999-99-99,2,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2
93787,10a607,29,2020-05-02,9999-99-99,97,2,39,2,2,2,2,2,2,2,2,2,2,2,2,97
93788,032660,4,2020-04-22,9999-99-99,97,1,42,97,2,2,2,2,2,2,2,1,2,2,1,97
93789,023eb2,15,2020-04-30,9999-99-99,97,2,33,97,2,2,2,2,2,2,2,2,2,2,1,97


### 2. Check that the id_registration's are unique

In [15]:
df.ID_REGISTRO.nunique()

93791

There are 93,791 unique records that match the number of rows, so we validate that there are no duplicates in the report.

### 3. Rename columns

We rename the columns so as not to confuse terms and to equate the names to the terminology applied in the previous codes.

In [16]:
df.columns

Index(['ID_REGISTRO', 'ENTIDAD_RES', 'FECHA_SINTOMAS', 'FECHA_DEF', 'INTUBADO',
       'NEUMONIA', 'EDAD', 'EMBARAZO', 'DIABETES', 'EPOC', 'ASMA', 'INMUSUPR',
       'HIPERTENSION', 'OTRA_COM', 'CARDIOVASCULAR', 'OBESIDAD',
       'RENAL_CRONICA', 'TABAQUISMO', 'OTRO_CASO', 'UCI'],
      dtype='object')

In [17]:
df=df.rename(columns={'ID_REGISTRO':'USER_ID', 'ENTIDAD_RES':'REGION', 'FECHA_SINTOMAS':'SYMPTOMS_DATE', 'FECHA_DEF':'DEAD', 'INTUBADO':'INTUBATED',
       'NEUMONIA':'PNEUMONIA', 'EDAD':'AGE', 'EMBARAZO':'PREGNANT', 'DIABETES':'DIABETES', 'EPOC':'EPOC', 'ASMA':'ASTHMA', 'INMUSUPR':'IMMUNOSUPPRESSION',
       'HIPERTENSION':'HYPERTENSION', 'OTRA_COM':'OTHERS_DISEASES', 'CARDIOVASCULAR':'CARDIOVASCULAR', 'OBESIDAD':'OBESITY',
       'RENAL_CRONICA':'CHRONIC_RENAL_FAILURE', 'TABAQUISMO':'SMOKER', 'UCI':'ICU'})
df

Unnamed: 0,USER_ID,REGION,SYMPTOMS_DATE,DEAD,INTUBATED,PNEUMONIA,AGE,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU
0,0ceb0f,1,2020-01-14,9999-99-99,97,2,28,2,2,2,2,2,2,2,2,2,2,2,99,97
1,1d9580,2,2020-04-17,9999-99-99,97,2,23,97,2,2,2,2,2,2,2,2,2,2,1,97
2,119b76,21,2020-03-23,9999-99-99,97,2,49,97,1,2,2,2,1,2,2,2,2,2,99,97
3,0e6c05,2,2020-03-16,9999-99-99,2,1,28,97,2,2,2,2,2,2,2,1,2,2,99,2
4,140d7f,29,2020-03-23,9999-99-99,97,2,31,2,2,2,2,2,2,2,2,2,2,2,2,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,1c0d84,9,2020-04-28,9999-99-99,2,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2
93787,10a607,29,2020-05-02,9999-99-99,97,2,39,2,2,2,2,2,2,2,2,2,2,2,2,97
93788,032660,4,2020-04-22,9999-99-99,97,1,42,97,2,2,2,2,2,2,2,1,2,2,1,97
93789,023eb2,15,2020-04-30,9999-99-99,97,2,33,97,2,2,2,2,2,2,2,2,2,2,1,97


### 4. Replace the values "No aplica"/"Se ignora"/"No especificado"/9999-99-99 to NaNs

In order to implement the Regex function to replace several strings with NaN, we first have to convert all integer variables to objects. To do this, we look at which variables are objects and which integers.

In [18]:
df.dtypes

USER_ID                  object
REGION                    int64
SYMPTOMS_DATE            object
DEAD                     object
INTUBATED                 int64
PNEUMONIA                 int64
AGE                       int64
PREGNANT                  int64
DIABETES                  int64
EPOC                      int64
ASTHMA                    int64
IMMUNOSUPPRESSION         int64
HYPERTENSION              int64
OTHERS_DISEASES           int64
CARDIOVASCULAR            int64
OBESITY                   int64
CHRONIC_RENAL_FAILURE     int64
SMOKER                    int64
OTRO_CASO                 int64
ICU                       int64
dtype: object

The REGION and AGE columns are stored in independent series so that they can be retrieved and added to the dataframe at any time, thus avoiding unnecessary conversion of variables to objects.

In [0]:
df_REGION=df['REGION']
df_AGE=df['AGE']
df=df.drop(columns='AGE')

In [20]:
df

Unnamed: 0,USER_ID,REGION,SYMPTOMS_DATE,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU
0,0ceb0f,1,2020-01-14,9999-99-99,97,2,2,2,2,2,2,2,2,2,2,2,2,99,97
1,1d9580,2,2020-04-17,9999-99-99,97,2,97,2,2,2,2,2,2,2,2,2,2,1,97
2,119b76,21,2020-03-23,9999-99-99,97,2,97,1,2,2,2,1,2,2,2,2,2,99,97
3,0e6c05,2,2020-03-16,9999-99-99,2,1,97,2,2,2,2,2,2,2,1,2,2,99,2
4,140d7f,29,2020-03-23,9999-99-99,97,2,2,2,2,2,2,2,2,2,2,2,2,2,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,1c0d84,9,2020-04-28,9999-99-99,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2
93787,10a607,29,2020-05-02,9999-99-99,97,2,2,2,2,2,2,2,2,2,2,2,2,2,97
93788,032660,4,2020-04-22,9999-99-99,97,1,97,2,2,2,2,2,2,2,1,2,2,1,97
93789,023eb2,15,2020-04-30,9999-99-99,97,2,97,2,2,2,2,2,2,2,2,2,2,1,97


We transform the remaining variables into objects.

In [21]:
df = df.applymap(str)
df

Unnamed: 0,USER_ID,REGION,SYMPTOMS_DATE,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU
0,0ceb0f,1,2020-01-14,9999-99-99,97,2,2,2,2,2,2,2,2,2,2,2,2,99,97
1,1d9580,2,2020-04-17,9999-99-99,97,2,97,2,2,2,2,2,2,2,2,2,2,1,97
2,119b76,21,2020-03-23,9999-99-99,97,2,97,1,2,2,2,1,2,2,2,2,2,99,97
3,0e6c05,2,2020-03-16,9999-99-99,2,1,97,2,2,2,2,2,2,2,1,2,2,99,2
4,140d7f,29,2020-03-23,9999-99-99,97,2,2,2,2,2,2,2,2,2,2,2,2,2,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,1c0d84,9,2020-04-28,9999-99-99,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2
93787,10a607,29,2020-05-02,9999-99-99,97,2,2,2,2,2,2,2,2,2,2,2,2,2,97
93788,032660,4,2020-04-22,9999-99-99,97,1,97,2,2,2,2,2,2,2,1,2,2,1,97
93789,023eb2,15,2020-04-30,9999-99-99,97,2,97,2,2,2,2,2,2,2,2,2,2,1,97


It is checked that all the variables have been transformed into objects.

In [22]:
df.dtypes

USER_ID                  object
REGION                   object
SYMPTOMS_DATE            object
DEAD                     object
INTUBATED                object
PNEUMONIA                object
PREGNANT                 object
DIABETES                 object
EPOC                     object
ASTHMA                   object
IMMUNOSUPPRESSION        object
HYPERTENSION             object
OTHERS_DISEASES          object
CARDIOVASCULAR           object
OBESITY                  object
CHRONIC_RENAL_FAILURE    object
SMOKER                   object
OTRO_CASO                object
ICU                      object
dtype: object

The values defined as "no aplica", "se ignora", "no especificado" are replaced by NaNs.

In [23]:
df=df.replace({'9999-99-99|97|98|99': 'NaN'}, regex=True)
df

Unnamed: 0,USER_ID,REGION,SYMPTOMS_DATE,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU
0,0ceb0f,1,2020-01-14,,,2,2,2,2,2,2,2,2,2,2,2,2,,
1,1d9580,2,2020-04-17,,,2,,2,2,2,2,2,2,2,2,2,2,1,
2,119b76,21,2020-03-23,,,2,,1,2,2,2,1,2,2,2,2,2,,
3,0e6c05,2,2020-03-16,,2,1,,2,2,2,2,2,2,2,1,2,2,,2
4,140d7f,29,2020-03-23,,,2,2,2,2,2,2,2,2,2,2,2,2,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,1c0d84,9,2020-04-28,,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2
93787,10a607,29,2020-05-02,,,2,2,2,2,2,2,2,2,2,2,2,2,2,
93788,032660,4,2020-04-22,,,1,,2,2,2,2,2,2,2,1,2,2,1,
93789,023eb2,15,2020-04-30,,,2,,2,2,2,2,2,2,2,2,2,2,1,


### 5. Decode the options for each variable.

ID_REGISTRO = ID_USER (TEXTO)

ENTIDAD_RES = CCAA -> RIESGO_DE_ZONA (01-32,36,97-no aplica,98-se ignora,99-no especificado)

FECHA_SINTOMAS = inicio de los síntomas (fecha registrada por la app)no hay 9999-99-99

EDAD = NUMÉRICA

ENFERMEDADES = INTUBADO, NEUMONIA, DIABETES, EPOC, ASMA, INMUSUPR, HIPERTENSION, CARDIOVASCULAR, OBESIDAD, RENAL_CRONICA, OTRA_COM(otras enfermedades)
-> SI(1), NO(2), NO APLICA(97), SE IGNORA(98), NO ESPECIFICADO(99)

FECHA_DEF = AAAA-MM-DD (9999-99-99-> No ha  fallecido=90.492)

TABAQUISMO = FUMADOR: SI(1), NO(2), NO APLICA(97), SE IGNORA(98), NO ESPECIFICADO(99)

OTRO_CASO = CONTACTO DIRECTO: SI(1), NO(2), NO APLICA(97), SE IGNORA(98), NO ESPECIFICADO(99)

RESULTADO = POSITIVO(1), NEGATIVO(2), PENDIENTE (3)

UCI = TARGET GRAVEDAD GRAVE: SI(1), NO(2), NO APLICA(97), SE IGNORA(98), NO ESPECIFICADO(99)

*EMBARAZO?¿?*
*OTRAS ENFERMEDADES¿?¿*

The variable REGION is eliminated because it contains numerical values that we do not want to transform since it is an identifier.

In [24]:
df=df.drop(columns={'REGION','USER_ID','SYMPTOMS_DATE'})
df

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU
0,,,2,2,2,2,2,2,2,2,2,2,2,2,,
1,,,2,,2,2,2,2,2,2,2,2,2,2,1,
2,,,2,,1,2,2,2,1,2,2,2,2,2,,
3,,2,1,,2,2,2,2,2,2,2,1,2,2,,2
4,,,2,2,2,2,2,2,2,2,2,2,2,2,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2
93787,,,2,2,2,2,2,2,2,2,2,2,2,2,2,
93788,,,1,,2,2,2,2,2,2,2,1,2,2,1,
93789,,,2,,2,2,2,2,2,2,2,2,2,2,1,


All those results that do not meet a condition are shown as a negative dummy(0). Therefore, the values "9999-99-99" are transformed into the deceased category, and "2" into the other categories.

In [0]:
df['DEAD']=df['DEAD'].replace({'NaN':'0'}, regex=True)

In [26]:
df=df.replace({'2':'0'}, regex=True)
df

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,,
1,0,,0,,0,0,0,0,0,0,0,0,0,0,1,
2,0,,0,,1,0,0,0,1,0,0,0,0,0,,
3,0,0,1,,0,0,0,0,0,0,0,1,0,0,,0
4,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
93787,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
93788,0,,1,,0,0,0,0,0,0,0,1,0,0,1,
93789,0,,0,,0,0,0,0,0,0,0,0,0,0,1,


We regroup the generic dataframe in the EDAD column.

In [27]:
df1=pd.concat([df,df_AGE],axis=1)
df1

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU,AGE
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,28
1,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,23
2,0,,0,,1,0,0,0,1,0,0,0,0,0,,,49
3,0,0,1,,0,0,0,0,0,0,0,1,0,0,,0,28
4,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
93787,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,39
93788,0,,1,,0,0,0,0,0,0,0,1,0,0,1,,42
93789,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,33


It is checked that the EDAD variable does not contain NaNs or inconsistencies.

In [28]:
df1.AGE.unique()

array([ 28,  23,  49,  31,  32,  41,   1,  34,  64,  37,  84,  67,  54,
        38,  22,   3,  59,  50,  30,  58,  52,  47,  16,  45,  27,   7,
        83,  42,  14,  68,  44,  89,  61,  53,  48,  55,  26,  57,  43,
        93,  29,  69,  62,  92,  39,  56,  78,  60,  36,  15,  66,  21,
        46,  19,  90,  35,   5,  79,  13,  33,  40,   2,  25,  65,   0,
        76,  73,  75,  17,  87,  51,  86,  10,  20,  70,  24,  63,  88,
        12,  71,  80,  81,  85,  18,  72,  77,  11,  82,   8,  74,   4,
         6,   9,  99,  95,  91,  96,  97,  98,  94, 102, 100, 113, 101])

The death dates are converted into a positive dummy(1), replacing all those strings that have date format(xxxx-xx-xx) by another string(1).

In [0]:
df1=df1.replace({'DEAD': r'^....-..-..$'}, {'DEAD': '1'}, regex=True)

We check that we only have the two values we need (0 and 1).

In [30]:
df1.DEAD.unique()

array(['0', '1'], dtype=object)

In [31]:
df1

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU,AGE
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,28
1,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,23
2,0,,0,,1,0,0,0,1,0,0,0,0,0,,,49
3,0,0,1,,0,0,0,0,0,0,0,1,0,0,,0,28
4,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
93787,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,39
93788,0,,1,,0,0,0,0,0,0,0,1,0,0,1,,42
93789,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,33


### 6. Grouping ages in ranges

In [0]:
df1['AGE']=pd.cut(df1['AGE'], bins=[0,10,20,30,40,50,60,70,80,90, np.inf])

In [33]:
df1

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU,AGE
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,"(20.0, 30.0]"
1,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,"(20.0, 30.0]"
2,0,,0,,1,0,0,0,1,0,0,0,0,0,,,"(40.0, 50.0]"
3,0,0,1,,0,0,0,0,0,0,0,1,0,0,,0,"(20.0, 30.0]"
4,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(30.0, 40.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"(0.0, 10.0]"
93787,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(30.0, 40.0]"
93788,0,,1,,0,0,0,0,0,0,0,1,0,0,1,,"(40.0, 50.0]"
93789,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,"(30.0, 40.0]"


Dummies of the age variable are made to have the ranges in columns, but before we semaparted the variable of the datset to facilitate its treatment.

In [34]:
df_AGE=pd.DataFrame(df1['AGE'])
df_AGE

Unnamed: 0,AGE
0,"(20.0, 30.0]"
1,"(20.0, 30.0]"
2,"(40.0, 50.0]"
3,"(20.0, 30.0]"
4,"(30.0, 40.0]"
...,...
93786,"(0.0, 10.0]"
93787,"(30.0, 40.0]"
93788,"(40.0, 50.0]"
93789,"(30.0, 40.0]"


In [35]:
dummies = pd.get_dummies(df_AGE)
df_AGE = dummies
df_AGE.head()

Unnamed: 0,"AGE_(0.0, 10.0]","AGE_(10.0, 20.0]","AGE_(20.0, 30.0]","AGE_(30.0, 40.0]","AGE_(40.0, 50.0]","AGE_(50.0, 60.0]","AGE_(60.0, 70.0]","AGE_(70.0, 80.0]","AGE_(80.0, 90.0]","AGE_(90.0, inf]"
0,0,0,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0


We renamed the variables to have the same terminology as the other scripts.

In [0]:
df_AGE=df_AGE.rename(columns={'AGE_(0.0, 10.0]':'AGE_0-9', 'AGE_(10.0, 20.0]':'AGE_10-19', 'AGE_(20.0, 30.0]':'AGE_20-29', 'AGE_(30.0, 40.0]':'AGE_30-39',
       'AGE_(40.0, 50.0]':'AGE_40-49', 'AGE_(50.0, 60.0]':'AGE_50-59', 'AGE_(60.0, 70.0]':'AGE_60-69', 'AGE_(70.0, 80.0]':'AGE_70-79', 'AGE_(80.0, 90.0]':'AGE_80-89', 'AGE_(90.0, inf]':'AGE_>90'})

We unified the old dummies with the general DataFrame.

In [0]:
df2=pd.concat([df1,df_AGE],axis=1)

The AGE column is removed so as not to have the data repeated.

In [38]:
df2=df2.drop(columns='AGE')
df2

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU,AGE_0-9,AGE_10-19,AGE_20-29,AGE_30-39,AGE_40-49,AGE_50-59,AGE_60-69,AGE_70-79,AGE_80-89,AGE_>90
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,1,0,0,0,0,0,0,0
1,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,0,0,1,0,0,0,0,0,0,0
2,0,,0,,1,0,0,0,1,0,0,0,0,0,,,0,0,0,0,1,0,0,0,0,0
3,0,0,1,,0,0,0,0,0,0,0,1,0,0,,0,0,0,1,0,0,0,0,0,0,0
4,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
93787,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,1,0,0,0,0,0,0
93788,0,,1,,0,0,0,0,0,0,0,1,0,0,1,,0,0,0,0,1,0,0,0,0,0
93789,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,0,0,0,1,0,0,0,0,0,0


### 7. The target column is categorical(uci=!1:leve, uci=1:grave,fecha_def=!NaN: muy grave)-> Group and transform UCI and FECHA_DEF columns.

In [39]:
df2

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU,AGE_0-9,AGE_10-19,AGE_20-29,AGE_30-39,AGE_40-49,AGE_50-59,AGE_60-69,AGE_70-79,AGE_80-89,AGE_>90
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,1,0,0,0,0,0,0,0
1,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,0,0,1,0,0,0,0,0,0,0
2,0,,0,,1,0,0,0,1,0,0,0,0,0,,,0,0,0,0,1,0,0,0,0,0
3,0,0,1,,0,0,0,0,0,0,0,1,0,0,,0,0,0,1,0,0,0,0,0,0,0
4,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
93787,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,1,0,0,0,0,0,0
93788,0,,1,,0,0,0,0,0,0,0,1,0,0,1,,0,0,0,0,1,0,0,0,0,0
93789,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,0,0,0,1,0,0,0,0,0,0


The function for classifying a patient's condition (mild, severe and very severe) from UCI and DEFUNCIÓN data is defined.

In [0]:
def gravity (a,b): 

    # If you have a date of death, the virus is considered to have affected you very seriously
    if (b == '1'):         
      x = 'muy grave'

    else:

     # If you enter the UCI, it is considered serious
      if (a == '1'):

        x = 'grave' 
     
     # If you do not enter the UCI, it is considered minor
      else:
        x= 'leve'
               
    return (x)

def gravity_state(): 
      
    df2['State'] = df2.apply(lambda row : gravity(row['ICU'],row['DEAD']), axis = 1) 
   
    return df2

In [41]:
df3 = gravity_state()
df3

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU,AGE_0-9,AGE_10-19,AGE_20-29,AGE_30-39,AGE_40-49,AGE_50-59,AGE_60-69,AGE_70-79,AGE_80-89,AGE_>90,State
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,1,0,0,0,0,0,0,0,leve
1,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,0,0,1,0,0,0,0,0,0,0,leve
2,0,,0,,1,0,0,0,1,0,0,0,0,0,,,0,0,0,0,1,0,0,0,0,0,leve
3,0,0,1,,0,0,0,0,0,0,0,1,0,0,,0,0,0,1,0,0,0,0,0,0,0,leve
4,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,1,0,0,0,0,0,0,leve
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,leve
93787,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,1,0,0,0,0,0,0,leve
93788,0,,1,,0,0,0,0,0,0,0,1,0,0,1,,0,0,0,0,1,0,0,0,0,0,leve
93789,0,,0,,0,0,0,0,0,0,0,0,0,0,1,,0,0,0,1,0,0,0,0,0,0,leve


In [42]:
df3['State'].value_counts()

leve         88907
muy grave     3299
grave         1585
Name: State, dtype: int64

In [43]:
df3.nunique()

DEAD                     2
INTUBATED                3
PNEUMONIA                3
PREGNANT                 3
DIABETES                 3
EPOC                     3
ASTHMA                   3
IMMUNOSUPPRESSION        3
HYPERTENSION             3
OTHERS_DISEASES          3
CARDIOVASCULAR           3
OBESITY                  3
CHRONIC_RENAL_FAILURE    3
SMOKER                   3
OTRO_CASO                3
ICU                      3
AGE_0-9                  2
AGE_10-19                2
AGE_20-29                2
AGE_30-39                2
AGE_40-49                2
AGE_50-59                2
AGE_60-69                2
AGE_70-79                2
AGE_80-89                2
AGE_>90                  2
State                    3
dtype: int64

In [44]:
df4=df3.replace({'NaN': '2'}, regex=True)
df4

Unnamed: 0,DEAD,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,ICU,AGE_0-9,AGE_10-19,AGE_20-29,AGE_30-39,AGE_40-49,AGE_50-59,AGE_60-69,AGE_70-79,AGE_80-89,AGE_>90,State
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,1,0,0,0,0,0,0,0,leve
1,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,2,0,0,1,0,0,0,0,0,0,0,leve
2,0,2,0,2,1,0,0,0,1,0,0,0,0,0,2,2,0,0,0,0,1,0,0,0,0,0,leve
3,0,0,1,2,0,0,0,0,0,0,0,1,0,0,2,0,0,0,1,0,0,0,0,0,0,0,leve
4,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,leve
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,leve
93787,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,leve
93788,0,2,1,2,0,0,0,0,0,0,0,1,0,0,1,2,0,0,0,0,1,0,0,0,0,0,leve
93789,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,1,0,0,0,0,0,0,leve


# Split in Train/Validación/Test

We will use the usually recommended ratios as an example:

Train: 70%.

Validation: 15%.

Test: 15%.

In [0]:
X=pd.DataFrame()

In [0]:
y = df4['State']
X = df4.loc[:, df4.columns != 'State']
X = X.loc[:, X.columns != 'DEAD']
X = X.loc[:, X.columns != 'ICU']


In [47]:
X = X.apply(pd.to_numeric) 
X

Unnamed: 0,INTUBATED,PNEUMONIA,PREGNANT,DIABETES,EPOC,ASTHMA,IMMUNOSUPPRESSION,HYPERTENSION,OTHERS_DISEASES,CARDIOVASCULAR,OBESITY,CHRONIC_RENAL_FAILURE,SMOKER,OTRO_CASO,AGE_0-9,AGE_10-19,AGE_20-29,AGE_30-39,AGE_40-49,AGE_50-59,AGE_60-69,AGE_70-79,AGE_80-89,AGE_>90
0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0
1,2,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
2,2,0,2,1,0,0,0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0
3,0,1,2,0,0,0,0,0,0,0,1,0,0,2,0,0,1,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93786,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
93787,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
93788,2,1,2,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0
93789,2,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0


In [0]:
perc_values = [0.7, 0.15, 0.15];

We create the train, validation and test sets with the selected size but respecting the time axis.

In [49]:
y=y.replace({'leve':0,'grave':1,'muy grave':2})
y.value_counts()

0    88907
2     3299
1     1585
Name: State, dtype: int64

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train_rand, X_valtest_rand, y_train_rand, y_valtest_rand = train_test_split(X, y, test_size=perc_values[1] + perc_values[2], random_state=1);

X_val_rand, X_test_rand, y_val_rand, y_test_rand = train_test_split(X_valtest_rand, y_valtest_rand, test_size= perc_values[2] / (perc_values[1] + perc_values[2]), random_state=1)

In [52]:
print('Train data size = ' + str(X_train_rand.shape))
print('Train target size = ' + str(y_train_rand.shape))
print('Validation data size = ' + str(X_val_rand.shape))
print('Validation target size = ' + str(y_val_rand.shape))
print('Test data size = ' + str(X_test_rand.shape))
print('Test target size = ' + str(y_test_rand.shape))

Train data size = (65653, 24)
Train target size = (65653,)
Validation data size = (14069, 24)
Validation target size = (14069,)
Test data size = (14069, 24)
Test target size = (14069,)


# Random Forest

1) Import model.

In this case we have to go outside of scikit-learn to apply Random Forest Classifier model.

In [0]:
from sklearn.ensemble import RandomForestClassifier

2) Import metric

In [0]:
from sklearn.metrics import roc_auc_score as auc;
from sklearn.metrics import accuracy_score as acc;

3) Define the method

In [55]:
model = RandomForestClassifier(n_estimators = 15,  random_state = 1)

model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=15,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

4) Call the fit method to train the model

In [56]:
start = timeit.default_timer()

model.fit(X_train_rand, np.array(y_train_rand))

stop = timeit.default_timer()
print('Time: ', stop - start) 

Time:  0.5140937559999657


5) Call the predict method to generate the predictions.

In [57]:
start = timeit.default_timer()

pred_train = model.predict(X_train_rand)
pred_val = model.predict(X_val_rand)
pred_test = model.predict(X_test_rand)

stop = timeit.default_timer()
print('Time: ', stop - start) 

Time:  0.24722334499995213


In [0]:


# Label test

lb = preprocessing.LabelBinarizer()
lb.fit(y_test_rand)

y_test_lb = lb.transform(y_test_rand)
val_lb = lb.transform(pred_test)

#Label train

lb.fit(y_train_rand)

y_train_lb = lb.transform(y_train_rand)
val_train_lb = lb.transform(pred_train)

#Label validation

lb.fit(y_val_rand)

y_val_lb = lb.transform(y_val_rand)
val_val_lb = lb.transform(pred_val)


6) Calculate metrics using the predictions obtained in the previous step.

In [0]:
acc_train = acc(y_train_lb, val_train_lb);
acc_val = acc(y_val_lb, val_val_lb);
acc_test = acc(y_test_lb, val_lb);
results = pd.DataFrame()
results = results.append(pd.DataFrame(data={'model':['Random Forest (Default)'],'acc_train':[acc_train],'acc_val':[acc_val],'acc_test':[acc_test]}, columns=['model',  'acc_train','acc_val', 'acc_test']), ignore_index=True)

In [64]:
results

Unnamed: 0,model,acc_train,acc_val,acc_test
0,Random Forest (Default),0.964556,0.942498,0.946407


IMPORTANCIA DE LAS VARIABLES

In [0]:
importances = list(model.feature_importances_)

In [0]:
# tuplas de importancia y variable
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(X, importances)]
# Ordenamos las variables por importancia
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)


In [67]:
[print(pair) for pair in feature_importances];

('INTUBATED', 0.272)
('PNEUMONIA', 0.098)
('OTRO_CASO', 0.071)
('PREGNANT', 0.057)
('DIABETES', 0.05)
('SMOKER', 0.048)
('OBESITY', 0.046)
('HYPERTENSION', 0.042)
('OTHERS_DISEASES', 0.041)
('EPOC', 0.033)
('IMMUNOSUPPRESSION', 0.031)
('CARDIOVASCULAR', 0.031)
('CHRONIC_RENAL_FAILURE', 0.031)
('ASTHMA', 0.022)
('AGE_50-59', 0.019)
('AGE_70-79', 0.019)
('AGE_60-69', 0.018)
('AGE_40-49', 0.016)
('AGE_80-89', 0.014)
('AGE_30-39', 0.013)
('AGE_20-29', 0.01)
('AGE_>90', 0.006)
('AGE_0-9', 0.005)
('AGE_10-19', 0.005)


Grid Search


In [68]:
param_grid = [{'bootstrap': [True, False],
  'n_estimators': [20, 45, 55], 'max_features': [None, 0.01],'min_samples_leaf': [1, 2, 4],
  'max_depth': [10,  15, None]}
 ]

param_grid

[{'bootstrap': [True, False],
  'max_depth': [10, 15, None],
  'max_features': [None, 0.01],
  'min_samples_leaf': [1, 2, 4],
  'n_estimators': [20, 45, 55]}]

In [69]:
params_values = param_grid[0]
params_values

{'bootstrap': [True, False],
 'max_depth': [10, 15, None],
 'max_features': [None, 0.01],
 'min_samples_leaf': [1, 2, 4],
 'n_estimators': [20, 45, 55]}

In [70]:
num_iteraciones = len(params_values['bootstrap'])*len(params_values['n_estimators'])*len(params_values['max_features'])*len(params_values['min_samples_leaf'])*len(params_values['max_depth'])

print('El numero de iteraciones es', num_iteraciones)

El numero de iteraciones es 108


In [0]:
metric = acc
accuracy = []
accuracy= pd.DataFrame(columns=('Accuracy', 'bootstrap', 'n_estimators', 'max_features', 'min_samples_leaf', 'max_depth'))
num_iter = 0

In [72]:
start = timeit.default_timer()

for a in range(0,len(params_values['bootstrap'])):
    for b in range(0,len(params_values['n_estimators'])):
        for c in range(0,len(params_values['max_features'])):
            for d in range(0,len(params_values['min_samples_leaf'])):
                for e in range(0,len(params_values['max_depth'])):
                    # print control iteracion modelo
                    print('Numero de iteración = ' + str(num_iter) +
                          ', parametro boostrap = ' + str(params_values['bootstrap'][a]) + 
                          ', parametro number of trees = '  + str(params_values['n_estimators'][b]) +
                          ', parametro maximo de variables = '  + str(params_values['max_features'][c]) +
                          ', parametro minumun sample = '  + str(params_values['min_samples_leaf'][d]) +
                          ', parametro profundidad maxima = '  + str(params_values['max_depth'][e]))

                    # definicion del modelo con sus parametros
                    model = RandomForestClassifier(bootstrap = params_values['bootstrap'][a], 
                                                  n_estimators = params_values['n_estimators'][b],
                                                  max_features = params_values['max_features'][c],
                                                  min_samples_leaf = params_values['min_samples_leaf'][d],
                                                  max_depth = params_values['max_depth'][e], 
                                                  random_state = 1)

                    # entrenamiento del modelo
                    model.fit(X_train_rand, np.array(y_train_rand))

                    # prediccion del conjunto de validacion
                    pred_val = model.predict(X_val_rand)

                    lb.fit(y_val_rand)

                    y_val_lb = lb.transform(y_val_rand)
                    val_val_lb = lb.transform(pred_val)

                    # Calculo de la metrica de error
                    accuracy_iter = metric(y_val_lb, val_val_lb);

                    # print error
                    print('Accuracy validacion = ' + str( accuracy_iter))

                    # guarda el error
                    accuracy.loc[num_iter]=[accuracy_iter,
                                            params_values['bootstrap'][a], 
                                            params_values['n_estimators'][b],
                                            params_values['max_features'][c],
                                            params_values['min_samples_leaf'][d],
                                            params_values['max_depth'][e]] 
                    num_iter += 1

stop = timeit.default_timer()
print('Time: ', stop - start) 

Numero de iteración = 0, parametro boostrap = True, parametro number of trees = 20, parametro maximo de variables = None, parametro minumun sample = 1, parametro profundidad maxima = 10
Accuracy validacion = 0.9464069941005047
Numero de iteración = 1, parametro boostrap = True, parametro number of trees = 20, parametro maximo de variables = None, parametro minumun sample = 1, parametro profundidad maxima = 15
Accuracy validacion = 0.9443457246428317
Numero de iteración = 2, parametro boostrap = True, parametro number of trees = 20, parametro maximo de variables = None, parametro minumun sample = 1, parametro profundidad maxima = None
Accuracy validacion = 0.9422844551851589
Numero de iteración = 3, parametro boostrap = True, parametro number of trees = 20, parametro maximo de variables = None, parametro minumun sample = 2, parametro profundidad maxima = 10
Accuracy validacion = 0.9469045419006326
Numero de iteración = 4, parametro boostrap = True, parametro number of trees = 20, parame

In [73]:
accuracy

Unnamed: 0,Accuracy,bootstrap,n_estimators,max_features,min_samples_leaf,max_depth
0,0.946407,True,20,,1,10
1,0.944346,True,20,,1,15
2,0.942284,True,20,,1,
3,0.946905,True,20,,2,10
4,0.945838,True,20,,2,15
...,...,...,...,...,...,...
103,0.945554,False,55,0.01,2,15
104,0.945838,False,55,0.01,2,
105,0.945554,False,55,0.01,4,10
106,0.945554,False,55,0.01,4,15


In [74]:
ind_max = accuracy['Accuracy'].idxmax()
best_parameters = accuracy.iloc[ind_max]
params_values['max_depth']

[10, 15, None]

In [75]:
import math
if math.isnan(best_parameters['max_features']):
    best_parameters['max_features'] = None
model = RandomForestClassifier(bootstrap = best_parameters['bootstrap'], 
                                                  n_estimators = best_parameters['n_estimators'],
                                                  max_features = best_parameters['max_features'],
                                                  min_samples_leaf = best_parameters['min_samples_leaf'],
                                                  max_depth = best_parameters['max_depth'],
                                                  random_state = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [76]:
## Ponemos un contador para ver cuanto tarda cada modelo
start = timeit.default_timer()

model.fit(X_train_rand, np.array(y_train_rand))

stop = timeit.default_timer()
print('Time: ', stop - start) 

Time:  1.6540911179999966


In [77]:
start = timeit.default_timer()

pred_train = model.predict(X_train_rand)
pred_val = model.predict(X_val_rand)
pred_test = model.predict(X_test_rand)

stop = timeit.default_timer()
print('Time: ', stop - start) 

Time:  0.2835633309999821


In [0]:
# Label test

lb = preprocessing.LabelBinarizer()
lb.fit(y_test_rand)

y_test_lb = lb.transform(y_test_rand)
val_lb = lb.transform(pred_test)

#Label train

lb.fit(y_train_rand)

y_train_lb = lb.transform(y_train_rand)
val_train_lb = lb.transform(pred_train)

#Label validation

lb.fit(y_val_rand)

y_val_lb = lb.transform(y_val_rand)
val_val_lb = lb.transform(pred_val)

In [0]:
acc_train = acc(y_train_lb, val_train_lb);
acc_val = acc(y_val_lb, val_val_lb);
acc_test = acc(y_test_lb, val_lb);
results = pd.DataFrame()
results = results.append(pd.DataFrame(data={'model':['Random Forest (Default)'],'acc_train':[acc_train],'acc_val':[acc_val],'acc_test':[acc_test]}, columns=['model',  'acc_train','acc_val', 'acc_test']), ignore_index=True)

In [82]:
results

Unnamed: 0,model,acc_train,acc_val,acc_test
0,Random Forest (Default),0.955067,0.947473,0.952306
