In [None]:
#Import necessary packages

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date



In [None]:
# Load the raw Covid-19 dataset and preview its final rows
csv_path = '../input/novel-covid-data-till-august-2021/covid-data.csv'
df = pd.read_csv(csv_path)
df.tail()

The last entry is dated 28 August 2021.

In [None]:
# Inspect the dtype pandas inferred for every column
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Convert the identifier columns to categoricals and parse the date column
category_columns = ('iso_code', 'continent', 'location')
df = df.astype({name: 'category' for name in category_columns})
df['date'] = pd.to_datetime(df['date'])
print(df.dtypes)

In [None]:
# Rank the countries by total cases on the last entry date
last_entry_cond = df['date'] == '2021-08-28'  # boolean mask for the last entry; reused by later cells
top_total_cases = (
    df[last_entry_cond]
    # Rows without a continent are dropped before ranking
    # (presumably aggregate rows rather than countries -- confirm against the dataset)
    .dropna(subset=['continent'])
    # observed=True: when iso_code is categorical, do not emit all-zero rows for
    # categories that have no data left after the filtering above
    .groupby('iso_code', observed=True)
    # The first positional argument of sum() is numeric_only, not a column name,
    # so request the numeric-only aggregation explicitly
    .sum(numeric_only=True)
    .sort_values('total_cases', ascending=False)
)
top_total_cases.head(10)



In [None]:
# Find how many of the top-ranked countries together account for at least half of all cases.
# Each country's share of the summed total cases (rows are expected to be sorted descending already)
top_total_cases['percentage'] = top_total_cases['total_cases'] / top_total_cases['total_cases'].sum()
# Running share down the ranking: the number of entries still below 50%, plus one, is the
# first position at which the cumulative share reaches 50%
cumulative_share = top_total_cases['percentage'].cumsum()
# Clamp to the table length so an empty or degenerate table cannot report more rows than exist
i = min(int((cumulative_share < 0.5).sum()) + 1, len(top_total_cases))
print('50% of total cases in the world came from the largest', i, 'countries')

Hence, the top 6 countries by total cases will be used to represent the Covid-19 situation for the whole world.

In [None]:
# How has Covid-19 been spreading over time?
# Plot the monthly mean of daily new cases for each selected top country on one shared axis.
fig, ax = plt.subplots()
# Label slice from 'USA' through 'GBR' (inclusive) in the ranking's row order
for country in top_total_cases.loc['USA':'GBR'].index:
    country_rows = df[df['iso_code'] == country]
    monthly_mean = country_rows.resample('M', on='date')['new_cases'].mean()
    monthly_mean.plot(ax=ax, label=country)
ax.set_xlim(right='2021-08-28')  # end the x-axis at the last entry date
ax.set_title('Monthly mean of daily new cases')
ax.set_ylabel('Daily Cases')
ax.legend()
plt.show()

In [None]:
# How about the deaths caused by Covid-19?
# Plot the monthly mean of daily new deaths for each selected top country on one shared axis.
fig, ax = plt.subplots()
# Label slice from 'USA' through 'GBR' (inclusive) in the ranking's row order
for country in top_total_cases.loc['USA':'GBR'].index:
    country_rows = df[df['iso_code'] == country]
    monthly_mean = country_rows.resample('M', on='date')['new_deaths'].mean()
    monthly_mean.plot(ax=ax, label=country)
ax.set_xlim(right='2021-08-28')  # end the x-axis at the last entry date
ax.set_title('Monthly mean of daily new deaths')
ax.set_ylabel('Daily Deaths')
ax.legend()
plt.show()

The increase in cases tends to be consistent with the increase in deaths. It also looks like all of the top 6 countries by total cases have suffered a second wave, and the USA is currently going through its third wave.

In [None]:
# Does the beds-per-people ratio affect the number of deaths?
# Death ratio: total deaths divided by total cases, i.e. deaths among people who got infected
df['death_ratio'] = df['total_deaths'].div(df['total_cases'])

latest_rows = df[last_entry_cond]
sns.scatterplot(data=latest_rows, x='hospital_beds_per_thousand', y='death_ratio', hue='continent')
plt.show()

Based on the data, the number of hospital beds per thousand people is only weakly correlated with the death ratio among people already infected with the virus. This could be due to inaccurate data collection, especially in poorer countries where many deaths go unreported.

In [None]:
# Does the number of deaths per million people differ between continents?
# hue repeats the x variable only to colour the boxes; dodge=False keeps every box
# centred on its own tick instead of being shifted into a per-hue slot.
sns.boxplot(x='continent', y='total_deaths_per_million', data=df[last_entry_cond],
            hue='continent', dodge=False)
plt.show()

It seems that many countries in Europe and South America have higher deaths per million than countries in any other continent.

In [None]:
# How about the number of cases per million people in each continent?
# hue repeats the x variable only to colour the boxes; dodge=False keeps every box
# centred on its own tick instead of being shifted into a per-hue slot.
sns.boxplot(x='continent', y='total_cases_per_million', data=df[last_entry_cond],
            hue='continent', dodge=False)
plt.show()

The boxplots of total cases per million for Europe, North America, and South America are consistent with their boxplots of total deaths per million. However, the majority of Asian countries record more cases per million than those in North America, but fewer deaths. This is probably because Asia has a younger population, which makes people there less likely to die when infected by the virus than in North America. Alternatively, it could result from inaccurate data in some Asian countries.

In [None]:
# Do countries with a high cardiovascular death rate also suffer a high Covid-19 death ratio?
ax = sns.scatterplot(data=df[last_entry_cond], x='cardiovasc_death_rate', y='death_ratio', hue='continent')
ax.set_xscale('log')
plt.show()

Interestingly, there seems to be minimal correlation between the cardiovascular death rate and the Covid-19 death ratio (deaths per confirmed case).

In [None]:
# How about diabetes prevalence?
ax = sns.scatterplot(data=df[last_entry_cond], x='diabetes_prevalence', y='death_ratio', hue='continent')
ax.set_yscale('log')
ax.set_xscale('log')
plt.show()

The same holds for diabetes: there seems to be minimal correlation between diabetes prevalence and the Covid-19 death ratio.

The results of this data analysis did not help us much in getting a better understanding of Covid-19. Various causes are to blame, such as inaccurate data and a lack of data. However, how little is known about the virus should alarm us all: since not much about the virus is known, anything is possible, and thus we should be more careful.