# COVID-19 in Brazil
---
In this Jupyter Notebook you'll find a statistical analysis of COVID-19 in Brazil.

## Project Setup

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import csv
import gzip
import io
import os
import os.path 
import time
from datetime import datetime
from urllib.request import Request, urlopen

## Initial Settings

#### List of Regions

In [2]:
# Names of the five Brazilian macro-regions, in the order used throughout the notebook
region_list = "Norte Nordeste Centro-Oeste Sudeste Sul".split()

#### Dictionary of Regions:States

In [3]:
# Maps every region name to the list of state abbreviations that belong to it
regions = {
    "Norte": "AC AP AM TO PA RR RO".split(),
    "Nordeste": "AL BA PB PE SE PI CE MA RN".split(),
    "Centro-Oeste": "MT GO MS DF".split(),
    "Sudeste": "SP ES RJ MG".split(),
    "Sul": "SC RS PR".split(),
}

#### List of States

In [4]:
# All 27 state abbreviations, grouped one region per line
state_list = (
    "AC AP AM TO PA RR RO "         # Norte
    "AL BA PB PE SE PI CE MA RN "   # Nordeste
    "MT GO MS DF "                  # Centro-Oeste
    "SP ES RJ MG "                  # Sudeste
    "SC RS PR"                      # Sul
).split()

#### Setting Plotting Theme

In [5]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases no longer ship the old names, so pick whichever name this installation has.
# Falls back to the original name when neither is listed (same behavior as before).
_whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") if name in plt.style.available),
    "seaborn-whitegrid",
)
plt.style.use([_whitegrid_style, "./util/styles/custom_style.mplstyle"])

## Auxiliary Functions

#### Save a Dataset to a specific path

In [6]:
def save_dataset(dataset, path):
    """Write `dataset` to `path` as CSV, leaving the index column out.

    Returns whatever `dataset.to_csv` returns for the given `path`.
    """
    csv_options = {"index": False}
    return dataset.to_csv(path, **csv_options)

#### Return a region based on a state

In [7]:
def get_region(state, regions):
    """Find the region whose list of states contains `state`.

    `regions` maps a region name to a collection of states. The first region
    (in the mapping's iteration order) that contains `state` is returned;
    `None` is returned when no region contains it.
    """
    matching_regions = (name for name, members in regions.items() if state in members)
    return next(matching_regions, None)

#### Generate a fancy report

In [8]:
def generate_report(dataset, last_available_date):
    """Write the plain-text daily report for the last row of `dataset`.

    Parameters:
        dataset: frame providing the columns "accumulated_num_cases",
            "accumulated_num_deaths", "new_num_cases" and "new_num_deaths";
            only the last value of each column is reported.
        last_available_date: date string laid out as YYYY-MM-DD (the year,
            month and day are taken by slicing it).

    The report goes to reports/<MM>-<YYYY>/report_<DD><MM><YYYY>.txt. The
    folder is created when it does not exist yet, and the file is always
    closed, even if writing fails part-way.
    """
    year = last_available_date[0:4]
    month = last_available_date[5:7]
    day = last_available_date[-2:]

    # Make sure the monthly folder exists before opening the report inside it
    report_dir = f"reports/{month}-{year}"
    os.makedirs(report_dir, exist_ok=True)

    def latest(template, column):
        # Format the newest value with "," as thousands separator, then swap it for "."
        return template.format(dataset[column].iat[-1]).replace(',', '.')

    with open(f"{report_dir}/report_{day}{month}{year}.txt", "w", encoding='utf8') as file:
        file.write(f"Atualização COVID-19 no Brasil [{day}/{month}/{year}]\n\n")

        file.write(latest("| Casos Acumulados: {0:,}\n", "accumulated_num_cases"))
        file.write(latest("| Óbitos Acumulados: {0:,}\n\n", "accumulated_num_deaths"))

        file.write(latest("| Qntd. de NOVOS Casos: {0:,}\n", "new_num_cases"))
        file.write(latest("| Qntd. de NOVOS Óbitos: {0:,}\n\n", "new_num_deaths"))

        file.write("Fonte dos dados: http://brasil.io / Secretarias de Saúde\n")
        file.write("#ficaemcasa #DataScience #COVID19")

        file.write("\n\n=================================\n\n")

        file.write("Visualização por Semanas Epidemiológicas e visualizações Regionais:")

        file.write("\n\n=================================\n\n")

        file.write("Repositório do GitHub:\n")
        file.write("https://github.com/evnrodr/covid19-brazil")

## Data Preprocessing

### Downloading the Data

In [9]:
def download_dataset():
    """Download the brasil.io COVID-19 dataset, store it as CSV and return it as a DataFrame.

    The gzip-compressed CSV is fetched over HTTP, decompressed in memory,
    saved under data/processed/ and then read back from that file.

    Returns:
        The DataFrame obtained by reading the saved CSV with `pd.read_csv`.
    """
    source_url = "https://data.brasil.io/dataset/covid19/caso_full.csv.gz"
    local_copy = "data/processed/covid19-dataset-brasilio-original.csv"

    # Fetch the compressed payload; the `with` block closes the HTTP response once it is read
    request = Request(source_url, headers={"User-Agent": "python-urllib"})
    with urlopen(request) as response:
        compressed = response.read()

    # Decompress and parse the CSV text into a DataFrame
    csv_text = gzip.decompress(compressed).decode("utf-8")
    dataset = pd.DataFrame.from_dict(csv.DictReader(io.StringIO(csv_text)))

    # Save the dataset
    save_dataset(dataset, local_copy)

    # csv.DictReader yields every value as a string, so the saved file is read
    # back with pandas to let it infer proper column types
    return pd.read_csv(local_copy)

### Data Cleaning

In [13]:
def data_cleaning(dataset, regions):
    """Reduce the raw dataset to its non-city rows and tidy the columns.

    City-level rows are discarded, unused columns are dropped, a "region"
    column is derived from each row's state, and the remaining columns are
    reordered and renamed. The cleaned frame is also saved to
    data/processed/covid19-dataset-brasilio_cleaned.csv.

    Returns:
        A tuple (cleaned dataset, largest value of its last_available_date column).
    """
    # Columns that are not carried over into the cleaned dataset
    unwanted_columns = [
        "city",
        "city_ibge_code",
        "estimated_population_2019",
        "is_repeated",
        "last_available_confirmed_per_100k_inhabitants",
        "last_available_death_rate",
        "order_for_place",
        "place_type",
    ]

    # Discard the city rows, drop the unwanted columns and renumber the rows from 0
    is_city_row = dataset["place_type"].isin(["city"])
    dataset = dataset[~is_city_row].drop(columns=unwanted_columns).reset_index(drop=True)

    # Add the "region" column (see the get_region utility function)
    dataset["region"] = [get_region(state, regions) for state in dataset.state.tolist()]

    # Final column order, followed by the new names for the count columns
    column_order = [
        "date", "last_available_date", "is_last", "region", "state", "epidemiological_week",
        "last_available_confirmed", "last_available_deaths", "new_confirmed", "new_deaths",
    ]
    new_names = {
        "last_available_confirmed": "accumulated_cases",
        "new_confirmed": "new_cases",
        "last_available_deaths": "accumulated_deaths",
    }
    dataset = dataset[column_order].rename(columns=new_names)

    # Save the cleaned dataset
    save_dataset(dataset, "data/processed/covid19-dataset-brasilio_cleaned.csv")

    latest_date = dataset.last_available_date.max()
    return dataset, latest_date

## Processing National Data

In [67]:
def process_national_data_alt(dataset, date_list, last_updated):
    """Build one row of national totals per date by summing that day's rows.

    The last entry of `date_list` is skipped and `last_updated` is accepted
    but not used. Each date contributes a single-row frame holding the date,
    its epidemiological week and the sums of the columns from
    'accumulated_cases' through 'new_deaths'.

    Returns:
        The single-row frames concatenated in date order; every row keeps the
        row label 0.
    """
    def totals_for(day):
        # All rows recorded for this calendar day
        day_rows = dataset[dataset['date'] == day]
        totals = pd.DataFrame(day_rows.loc[:, 'accumulated_cases':'new_deaths'].sum()).T
        totals.insert(0, "date", day, True)
        totals.insert(1, "epidemiological_week", day_rows['epidemiological_week'].values[0], True)
        return totals

    return pd.concat([totals_for(date.strftime("%Y-%m-%d")) for date in date_list[:-1]])

In [70]:
# Preview of the per-day national totals from the alternative implementation.
# NOTE(review): covid_dataset_clean, date_list and last_updated are only assigned in the
# "Execution" section further down, so this cell fails on a fresh top-to-bottom run.
process_national_data_alt(covid_dataset_clean, date_list, last_updated)

Unnamed: 0,date,epidemiological_week,accumulated_cases,accumulated_deaths,new_cases,new_deaths
0,2020-02-25,202009,1,0,1,0
0,2020-02-26,202009,1,0,0,0
0,2020-02-27,202009,1,0,0,0
0,2020-02-28,202009,2,0,1,0
0,2020-02-29,202009,2,0,0,0
...,...,...,...,...,...,...
0,2021-04-21,202116,14134150,381971,73988,3197
0,2021-04-22,202116,14182258,384022,48108,2051
0,2021-04-23,202116,14251539,387067,69281,3045
0,2021-04-24,202116,14315320,389830,63781,2763


In [22]:
def process_national_data(dataset, date_list, last_updated):
    """Aggregate the per-state rows into one national row per day and save the result.

    Parameters:
        dataset: cleaned frame with the columns "date", "is_last",
            "epidemiological_week", "accumulated_cases", "accumulated_deaths",
            "new_cases" and "new_deaths".
        date_list: sequence of dates (objects with `strftime`); every entry
            except the last is aggregated from the rows carrying that date.
        last_updated: date string of the newest data; it closes the date range
            of the result and selects the epidemiological week of the final row.

    The final row is built from the rows flagged `is_last` rather than from a
    date filter. The count columns are selected by name, so the result does
    not depend on the position of the columns in `dataset`.

    Returns:
        The national DataFrame, which is also written to
        data/national/covid19-dataset-brasil.csv.
    """
    # Only these columns are summed; selecting them by label avoids positional
    # indexing into the summed Series and skips the text columns entirely
    count_columns = ["accumulated_cases", "accumulated_deaths", "new_cases", "new_deaths"]

    epi_week = []
    daily_totals = []

    # Every date but the last: sum the rows recorded for that day
    for date in date_list[:-1]:
        day_rows = dataset[dataset['date'] == date.strftime("%Y-%m-%d")]
        epi_week.append(day_rows['epidemiological_week'].values[0])
        daily_totals.append(day_rows[count_columns].sum())

    # Most recent data: sum the rows flagged as the last available entry
    latest_rows = dataset[dataset["is_last"] == True]
    epi_week.append(dataset[dataset['date'] == last_updated].epidemiological_week.iat[0])
    daily_totals.append(latest_rows[count_columns].sum())

    # Creating a new DataFrame of the processed data
    national_dataframe = pd.DataFrame({"date": pd.date_range(start = "2020-02-25", end = last_updated),
                                       "country": "Brazil",
                                       "epidemiological_week": epi_week,
                                       "accumulated_num_cases": [totals["accumulated_cases"] for totals in daily_totals],
                                       "accumulated_num_deaths": [totals["accumulated_deaths"] for totals in daily_totals],
                                       "new_num_cases": [totals["new_cases"] for totals in daily_totals],
                                       "new_num_deaths": [totals["new_deaths"] for totals in daily_totals]
                                      })

    # Saving the dataset
    save_dataset(national_dataframe, "data/national/covid19-dataset-brasil.csv")

    # Returning generated DataFrame
    return national_dataframe

## Execution

### Data Preprocessing

In [11]:
# Download the dataset when data/processed holds exactly one entry; otherwise
# reuse the first file listed in that folder.
processed_files = os.listdir("data/processed")
if len(processed_files) == 1:
    covid_dataset = download_dataset()
else:
    to_read = processed_files[0]
    # Read the file picked above (the original referenced an undefined name here)
    covid_dataset = pd.read_csv(f"data/processed/{to_read}")

In [16]:
# Clean the raw dataset; also keeps the most recent last_available_date found in it
covid_dataset_clean, last_updated = data_cleaning(covid_dataset, regions)

In [18]:
# One timestamp per calendar day from 2020-02-25 through last_updated
date_list = pd.date_range(start = "2020-02-25", end = last_updated)

### Getting National Data

In [48]:
# Spot check: totals of the four count columns over every row dated 2021-03-20
pd.DataFrame(covid_dataset_clean[covid_dataset_clean['date'] == "2021-03-20"].loc[:, 'accumulated_cases':'new_deaths'].sum()).T

Unnamed: 0,accumulated_cases,accumulated_deaths,new_cases,new_deaths
0,11958814,293076,73154,2368


In [69]:
# Build the national per-day dataset (the function also writes it to data/national/)
national_dataset = process_national_data(covid_dataset_clean, date_list, last_updated)

In [15]:
# Delete the first entry listed in data/processed.
# NOTE(review): os.listdir returns entries in arbitrary order, so which file gets
# removed is not guaranteed — confirm this targets the intended file.
to_read = os.listdir("data/processed")[0]
os.remove(f"data/processed/{to_read}")