## Análise de Séries Temporais sobre a COVID 19 no Brasil

#### Importar Bibliotecas

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
import re
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt 

#### Importar Base de Dados

In [2]:
dados = pd.read_csv("./covid_19_database.csv", parse_dates=["ObservationDate", "Last Update"])
dados.head(10)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,6,2020-01-22,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,7,2020-01-22,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,8,2020-01-22,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,9,2020-01-22,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,10,2020-01-22,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [3]:
# Analisando os Tipos de Dados
dados.dtypes

SNo                         int64
ObservationDate    datetime64[ns]
Province/State             object
Country/Region             object
Last Update                object
Confirmed                 float64
Deaths                    float64
Recovered                 float64
dtype: object

#### Tratamento dos Dados


In [4]:
# Corrigindo os nomes das colunas 
def corrige_colunas(col_name):
    return re.sub(r"[/| ]", "", col_name).lower()

dados.columns = [corrige_colunas(col) for col in dados.columns]

dados.head(10)

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,6,2020-01-22,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,7,2020-01-22,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,8,2020-01-22,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,9,2020-01-22,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,10,2020-01-22,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [5]:
# Corrigindo o tipo de dados da coluna "lastupdate"
dados['lastupdate'] = pd.to_datetime(dados['lastupdate'],format='mixed')
dados.dtypes

sno                         int64
observationdate    datetime64[ns]
provincestate              object
countryregion              object
lastupdate         datetime64[ns]
confirmed                 float64
deaths                    float64
recovered                 float64
dtype: object

In [7]:
# Trabalhando somente com dados do Brasil
brasil = dados.loc[(dados.countryregion == 'Brazil') & (dados.confirmed >0)]
brasil.head(10)

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
2455,2456,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2559,2560,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2668,2669,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2776,2777,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
2903,2904,2020-03-01,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
3032,3033,2020-03-02,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
3173,3174,2020-03-03,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
3322,3323,2020-03-04,,Brazil,2020-03-04 20:33:02,4.0,0.0,0.0
3486,3487,2020-03-05,,Brazil,2020-03-04 20:33:02,4.0,0.0,0.0
3647,3648,2020-03-06,,Brazil,2020-03-06 20:33:03,13.0,0.0,0.0


### Casos Confirmados

In [9]:
 # Visualizando os casos confirmados em um gráfico
 fig = px.line(brasil, 'observationdate', 'confirmed', 
    width= 920,
    labels={'observationdate':'Período', 'confirmed': 'Nº Casos Confirmados'},
    title= 'Casos Confirmados no Brasil'
)

fig.update_layout(template = 'plotly_dark')
fig.show()


###  Novos Casos de COVID por Dia

In [12]:
# Vamos implementar uma função para fazer a contagem de novos cados de COVID 19
brasil['novoscasos'] = list(map(
    lambda x: 0 if (x==0) else brasil['confirmed'].iloc[x] - brasil['confirmed'].iloc[x-1],
    np.arange(brasil.shape[0] )
))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [14]:
fig = px.line(brasil, 'observationdate', 'novoscasos', title='Novos Casos por Dia',width=920,
    labels={'observationdate': 'Período', 'novoscasos': 'Novos Casos'}
    )
fig.update_layout(template='plotly_dark')
fig.show()

### Número de Mortes

In [15]:
fig = go.Figure()

fig.add_trace(
    go.Scatter( x=brasil.observationdate, y=brasil.deaths,  name='Número de Mortes', mode='lines+markers',
        line=dict(color='red')
        )
)
fig.update_layout(title='Mortes por COVID 19 no Brasil', 
    xaxis_title='Período', 
    yaxis_title='Número de Mortes', 
    template='plotly_dark',
    width=920)
fig.show()

### Taxa de Crescimento

taxa_crescimento = (presente/passado)**(1/n)-1

In [16]:
# Vamos calcular a taxa de crescimento do COVID 19 desde o primeiro dia
def taxa_crescimento(data, variable, data_inicio=None, data_fim=None):
    """
    Calcula a taxa de crescimento percentual de uma variável ao longo de um período de tempo.

    Parâmetros:
    - data (DataFrame): O conjunto de dados contendo a variável e a data de observação.
    - variable (str): O nome da variável para a qual a taxa de crescimento será calculada.
    - data_inicio (str ou None): A data de início do período. Se for None, usa a primeira data disponível no dataset.
    - data_fim (str ou None): A data de término do período. Se for None, usa a última data disponível no dataset.

    Retorna:
    - taxa (float): A taxa de crescimento percentual da variável ao longo do período especificado.
    """
    
    # Se a data_inicio for None, define como a primeira data disponível no dataset
    if data_inicio == None:
        data_inicio = data.observationdate.loc[data.confirmed > 0].min()
    else:
        data_inicio = pd.to_datetime(data_inicio)
    
    # Se a data_fim for None, define como a última data disponível no dataset
    if data_fim == None:
        data_fim = data.observationdate.iloc[-1]
    else:
        data_fim = pd.to_datetime(data_fim)

    # Define os valores de presente e passado
    passado = data.loc[data.observationdate == data_inicio, variable].values[0]
    presente = data.loc[data.observationdate == data_fim, variable].values[0]

    # Define o número de pontos no tempo que iremos avaliar
    n = (data_fim - data_inicio).days

    # Calcular a taxa
    taxa = (presente/passado)**(1/n) - 1

    return taxa

In [17]:
cresc_medio = taxa_crescimento(brasil,'confirmed')
print(f'O crescimento médio do COVID 19 no Brasil no período avaliado foi de {cresc_medio:.2%}.')

O crescimento médio do COVID 19 no Brasil no período avaliado foi de 16.27%.


#### Crescimento Diário

In [18]:
# Vamos observar o comportamento da taxa de crescimento diária
def taxa_crescimento_diária(data, variable, data_inicio=None):
    if data_inicio == None:
        data_inicio = data.observationdate.loc[data[variable] > 0].min()
    else:
        data_inicio = pd.to_datetime(data_inicio)

    data_fim = data.observationdate.max()

    n = (data_fim - data_inicio).days

    taxas =     list(map(
        lambda x: (data[variable].iloc[x] - data[variable].iloc[x-1]) / data[variable].iloc[x-1],
        range(1, n+1)    
    ))
    return np.array(taxas) * 100

In [19]:
taxa_dia = taxa_crescimento_diária(brasil, 'confirmed')
taxa_dia

array([  0.        ,   0.        , 100.        ,   0.        ,
         0.        ,   0.        , 100.        ,   0.        ,
       225.        ,   0.        ,  53.84615385,  25.        ,
        24.        ,  22.58064516,  36.84210526, 190.38461538,
         0.        ,   7.28476821,  23.45679012,  60.5       ,
        15.88785047,  66.93548387,  27.69726248,  28.75157629,
        51.4201763 ,  24.45019405,  16.78794179,  13.66266133,
        16.87548943,  14.47236181,  14.25226807,   9.01639344,
         7.58928571,  24.8525879 ,  19.57320273,  17.67115272,
        12.58080557,  14.39929329,   7.43243243,   9.26325247,
        15.40169394,  15.22017956,  11.88620903,   8.54521335,
         5.54537122,   7.06807546,   5.57858688,   7.81903542,
        12.10513815,   7.4329096 ,  10.70501233,   8.83557983,
         5.44492335,   5.4043566 ,   5.73350023,   6.21648599,
         9.35157462,   8.00823407,   9.77184834,   6.36504619,
         6.88748019,   8.58316283,   8.80726429,   9.41

In [21]:
primeiro_dia = brasil.observationdate.loc[brasil.confirmed > 0].min()
primeiro_dia

Timestamp('2020-02-26 00:00:00')

In [22]:
fig = px.line(x= pd.date_range(primeiro_dia, brasil.observationdate.max())[1:], y= taxa_dia,
    title='Taxa diária de casos confirmados no Brasil',width=920,
    labels={'y': 'Taxa de Crescimento', 'x': 'Período'}
    )
fig.update_layout(template='plotly_dark')
fig.show()

### Predições

In [26]:
# Vamos construir um modelo de séries temporais para prever os novos casos.
novos_casos = brasil.novoscasos
novos_casos.index = brasil.observationdate
novos_casos

observationdate
2020-02-26        0.0
2020-02-27        0.0
2020-02-28        0.0
2020-02-29        1.0
2020-03-01        0.0
               ...   
2020-05-15    17126.0
2020-05-16    13220.0
2020-05-17     7569.0
2020-05-18    14288.0
2020-05-19    16517.0
Name: novoscasos, Length: 84, dtype: float64