### Importações MySQL

In [15]:
import pandas as pd
import mysql.connector
from mysql.connector import errorcode
from dotenv import load_dotenv
import os

### Criando conexão e dataframe inicial com o MySQL

In [16]:
# Load the DADOS_COVID table from MySQL into `df_covid`, repair the encoding
# of the `city` column and export the result to 'covid_data.csv'.


def fix_encoding(text):
    """Repair a string whose UTF-8 bytes were mis-decoded as latin-1.

    Args:
        text: A value taken from the `city` column.

    Returns:
        The re-decoded string. Non-string values (e.g. None) and strings that
        cannot be round-tripped through latin-1 -> UTF-8 are returned
        unchanged instead of raising.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not mojibake (already correct, or not representable in latin-1).
        return text


# Read a local .env file (if one exists) so the os.getenv() lookups below do
# not depend on variables exported outside the notebook. Variables that are
# already set in the environment are not overridden.
load_dotenv()

cnx = None
try:
    # Open the connection using credentials taken from the environment.
    cnx = mysql.connector.connect(
        user=os.getenv('user_mysql'),
        password=os.getenv('password_mysql'),
        host=os.getenv('host_mysql'),
        database=os.getenv('database_mysql'))
    cursor = cnx.cursor()
    print('Connection estabilished with Mysql!')

    # Initial query used to build the dataframe; rows whose city_ibge_code
    # is 12 or 27 are excluded.
    query = ("SELECT * FROM DADOS_COVID \
                WHERE city_ibge_code NOT IN (12, 27)")
    cursor.execute(query)

    # Fetch every row returned by the query.
    results = cursor.fetchall()

    # Column names come from the cursor metadata.
    column_names = [desc[0] for desc in cursor.description]

    # Build the dataframe with pandas.
    df_covid = pd.DataFrame(results, columns=column_names)
    print('Dataframe has been created!!')

    # Repair the text encoding of the `city` column.
    df_covid['city'] = df_covid['city'].apply(fix_encoding)

    # Persist the raw extract.
    df_covid.to_csv('covid_data.csv', index=False)

# On a connector error, report the reason so it can be handled.
except mysql.connector.Error as err:
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exist")
    else:
        print(err)
finally:
    # Always release the connection, including when the query or the
    # dataframe processing fails part-way through.
    if cnx is not None:
        cnx.close()
        print('Connection closed!')

Connection estabilished with Mysql!
Dataframe has been created!!
Connection closed!


### Transformações Covid

##### Acumulado de casos confirmados


In [5]:
###### Day-by-day running total with the difference between periods

# Convert the 'date' column to a datetime dtype.
df_covid['date'] = pd.to_datetime(df_covid['date'])

# Order chronologically. A stable sort (mergesort) keeps rows that share the
# same date in their previous relative order, so the per-row running total
# below is reproducible between runs; the default quicksort gives no such
# guarantee for ties.
df_covid = df_covid.sort_values('date', kind='mergesort')

# Running total of confirmed cases over every row of the frame (no grouping).
df_covid['cumulative_confirmed'] = df_covid['new_confirmed'].cumsum().astype(int)
# Running total of the previous row; the first row has none, so it becomes 0.
df_covid['previous_period'] = df_covid['cumulative_confirmed'].shift().fillna(0).astype(int)
# Increase of the running total relative to the previous row.
df_covid['diff_by_period'] = df_covid['cumulative_confirmed'] - df_covid['previous_period']

# Explicit copy of the selected columns so that later changes to
# df_cumulative cannot trigger SettingWithCopyWarning.
df_cumulative = df_covid[['date', 'city_ibge_code', 'epidemiological_week',
                          'cumulative_confirmed', 'previous_period', 'diff_by_period']].copy()
df_cumulative.to_csv('covid_cumulative.csv', index=False)

display(df_cumulative)

Unnamed: 0,date,city_ibge_code,epidemiological_week,cumulative_confirmed,previous_period,diff_by_period
12614,2020-03-08,2704302,202011,1,0,1
12615,2020-03-09,2704302,202011,1,1,0
12616,2020-03-10,2704302,202011,1,1,0
12617,2020-03-11,2704302,202011,1,1,0
12618,2020-03-12,2704302,202011,1,1,0
...,...,...,...,...,...,...
12605,2021-11-22,1200807,202147,88075,88075,0
12603,2021-11-22,1200336,202147,88075,88075,0
12592,2021-11-22,1200013,202147,88075,88075,0
12613,2021-11-22,1200708,202147,88075,88075,0


##### Acumulado de casos confirmados por semana epidemiológica

In [18]:
##### Group by epidemiological_week and export

# One row per epidemiological week:
#   cumulative_confirmed - highest running total seen within the week
#   previous_period      - value of the preceding week (0 for the first week)
#   diff_by_period       - increase relative to the preceding week
#   year / week          - first four / last two characters of the week code
df_grouped = (
    df_cumulative
    .groupby('epidemiological_week')['cumulative_confirmed']
    .max()
    .reset_index()
    .assign(
        previous_period=lambda d: d['cumulative_confirmed'].shift(1).fillna(0).astype(int),
        diff_by_period=lambda d: d['cumulative_confirmed'] - d['previous_period'],
        year=lambda d: d['epidemiological_week'].astype(str).str[:4],
        week=lambda d: d['epidemiological_week'].astype(str).str[-2:],
    )
)

df_grouped.to_csv('covid_cumulative_grouped.csv', index=False)
display(df_grouped)

Unnamed: 0,epidemiological_week,cumulative_confirmed,previous_period,diff_by_period,year,week
0,202011,1,0,1,2020,11
1,202012,18,1,17,2020,12
2,202013,39,18,21,2020,13
3,202014,60,39,21,2020,14
4,202015,86,60,26,2020,15
...,...,...,...,...,...,...
85,202143,88065,88038,27,2021,43
86,202144,88075,88065,10,2021,44
87,202145,88075,88075,0,2021,45
88,202146,88075,88075,0,2021,46
