## Hello,
This notebook aims to serve as a project to consolidate all the knowledge that I have acquired so far about data science.
I am still a beginner in this area; however, I see the need to put what I have learned into practice in order to learn new things.


Comments explaining the lines of code will be made in Portuguese (pt-br) which is my native language.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import the libraries that will be used for the exploratory analysis of the dataset.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the country-level vaccination progress CSV from the Kaggle input directory.
df = pd.read_csv('../input/covid-world-vaccination-progress/country_vaccinations.csv')

# Data cleaning

In [None]:
# Preview the first rows of the dataset as it was loaded.
df.head()

In [None]:
# List the available columns in order to choose the ones used in the analysis.
df.columns

In [None]:
# Keep only the columns used in the analysis. The explicit .copy() makes the
# result an independent DataFrame, so assigning to its columns or dropping
# rows from it later cannot trigger pandas' SettingWithCopyWarning.
df = df[['country', 'date', 'daily_vaccinations', 'daily_vaccinations_per_million', 'vaccines']].copy()

In [None]:
# Inspect the dtype and non-null count of each variable in the dataset.
df.info()

In [None]:
# Convert the date column to datetime values; the other columns did not need
# to be converted.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Among the columns with missing data, daily_vaccinations has the fewest NaN
# values (see the report printed below), so rows are removed based on that
# column alone.
# df.info() prints its report itself and returns None, so it is not wrapped
# in print() (which would emit a stray "None" line).
df.info()
# dropna(subset=...) returns a new frame instead of mutating df in place.
df = df.dropna(subset=['daily_vaccinations'])

In [None]:
# Everything looks right now; re-check the summary after the cleaning step.
df.info()

# Countries with the most vaccinations so far

In [None]:
# Group the data by country and sum the daily vaccinations to get the total
# number of vaccinations recorded so far, sorted from highest to lowest.
# The native groupby .sum() is used instead of agg(sum): passing the Python
# builtin to agg is deprecated in recent pandas releases.
df_country = (
    df.groupby('country')['daily_vaccinations']
    .sum()
    .rename('total_vaccinations')
    .to_frame()
    .sort_values(by='total_vaccinations', ascending=False)
)
df_country

In [None]:
# Bar chart of the twenty countries that have vaccinated the most so far.
top_20 = df_country.head(20)
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_20, x=top_20.index, y='total_vaccinations', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Top 20 country vaccinations')

In [None]:
# Bar chart of the twenty countries that have vaccinated the least so far.
# Keep in mind that a low number of vaccinations is not necessarily a sign of
# inefficiency: the infection rate in that country may also have been low.
bottom_20 = df_country.tail(20)
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=bottom_20, x=bottom_20.index, y='total_vaccinations', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Top 20 country with less vaccinations')

In [None]:
# According to the chart above, Colombia apparently had no vaccinations,
# so display its total number of vaccinations directly.
is_colombia = df_country.index == 'Colombia'
df_country.loc[is_colombia]

# Relative growth in the number of daily vaccinations

In [None]:
# Build one subset per country to compare how vaccination evolved in Brazil
# against some of the countries that vaccinated the most.
# Each subset is an explicit .copy() so that columns can be added to it later
# without pandas raising SettingWithCopyWarning.
df_bra = df[df.country == 'Brazil'].copy()
df_in = df[df.country == 'India'].copy()
df_chi = df[df.country == 'China'].copy()
df_is = df[df.country == 'Israel'].copy()
df_en = df[df.country == 'England'].copy()


In [None]:
# Plot the distribution (KDE) of the number of daily vaccinations per country.
# `fill=True` replaces the deprecated `shade=True`, and each curve carries its
# own label so the legend cannot fall out of sync with the plotting order
# (e.g. if one of the subsets is empty and draws nothing).
plt.figure(figsize=(14, 8))
for frame, label in [
    (df_bra, 'Brazil'),
    (df_in, 'India'),
    (df_chi, 'China'),
    (df_is, 'Israel'),
    (df_en, 'England'),
]:
    sns.kdeplot(data=frame.daily_vaccinations, fill=True, label=label)
plt.legend()
plt.title('Distribution of the number of vaccines per day')

In [None]:
def _with_relative_growth(frame):
    """Return a copy of ``frame`` with a ``relative`` column added.

    ``relative`` is ``daily_vaccinations`` divided by the value in the first
    row of ``frame``, i.e. growth relative to that first record.
    NOTE(review): this assumes the rows are already in chronological order and
    that the first value is non-zero -- confirm against the dataset.

    Raises:
        IndexError: if ``frame`` has no rows.
    """
    baseline = frame['daily_vaccinations'].iloc[0]
    # assign() returns a new frame, so no column is written onto a slice of df
    # (which is what triggers SettingWithCopyWarning).
    return frame.assign(relative=frame['daily_vaccinations'] / baseline)


# Add the variable that records the relative growth of daily vaccinations.
df_bra = _with_relative_growth(df_bra)
df_in = _with_relative_growth(df_in)
df_chi = _with_relative_growth(df_chi)
df_is = _with_relative_growth(df_is)
df_en = _with_relative_growth(df_en)

In [None]:
# Concatenate the per-country subsets created earlier, leaving Brazil out.
df_comparative = pd.concat([df_in,df_chi,df_is,df_en])

In [None]:
# Plot the relative growth of four countries (left panel) next to Brazil
# (right panel).
figure, axes = plt.subplots(1, 2, figsize=(14, 7), gridspec_kw={'wspace': 0.2})
figure.suptitle('Relative growth in the number of daily vaccines')
sns.lineplot(data=df_comparative, x='date', y='relative', hue='country', ax=axes[0])
sns.lineplot(data=df_bra, x='date', y='relative', ax=axes[1], color='purple')
# plt.xticks / plt.legend only act on the "current" axes (the last subplot
# created), so each panel is configured through its own Axes object instead;
# otherwise the date labels of the left panel are never rotated.
for panel in axes:
    panel.tick_params(axis='x', labelrotation=45)
axes[1].legend(['Brasil'])


# Most used vaccines across countries

In [None]:
# Build a dataset with the vaccines used by each country, one row per country:
# keep the first row seen for each country, then discard rows that are missing
# either the country or the vaccines value.
df_vaccines = (
    df[['country', 'vaccines']]
    .drop_duplicates(subset='country')
    .dropna()
)

In [None]:
# Show how many times each distinct value of the vaccines column appears.
df_vaccines.vaccines.value_counts()

In [None]:
# Prepare a one-column DataFrame of vaccine usage counts for plotting.
# pandas >= 2.0 names the value_counts() result 'count' and names its index
# after the source column, so both names are set explicitly here to get a
# 'vaccines' count column with an unnamed index on every pandas version.
df_vac = (
    df_vaccines.vaccines.value_counts()
    .rename('vaccines')
    .rename_axis(None)
    .to_frame()
)

In [None]:
# Horizontal bar chart of the counts stored in df_vac (one bar per index entry).
# The single count column is taken by position because value_counts() names
# its result differently across pandas versions.
counts = df_vac.iloc[:, 0]
fig, ax = plt.subplots(figsize=(6, 8))
sns.barplot(x=counts.to_numpy(), y=df_vac.index.to_numpy(), orient='h', ax=ax)
# Labels and title are applied after plotting so that seaborn's automatic
# axis labels cannot override them.
ax.set(xlabel='Count', ylabel='Vaccines', title='Most used vaccines')