This is a very simple exploratory data analysis (EDA) of COVID-19 confirmed-case data from Johns Hopkins University.

In [None]:
# Let's import the modules first
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Load the global confirmed-cases CSV into a DataFrame
csv_path = "../input/covid19-data-from-john-hopkins-university/RAW_global_confirmed_cases.csv"
covid_df = pd.read_csv(csv_path)
# Preview the first 10 rows to confirm the file was loaded
covid_df.head(10)

In [None]:
# Check the shape of the dataframe: a (number of rows, number of columns) tuple
covid_df.shape

In [None]:
# Take a look at the column labels.
# NOTE: `columns` is captured here, before any column is dropped, and is
# reused further down (`columns[-1]`) to pick the last column for sorting.
columns = covid_df.columns
columns

In [None]:
# Latitude and longitude are not used anywhere in this analysis, so drop them.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError once the columns
# are already gone. `columns=` already selects the column axis, so the
# redundant `axis=1` argument is omitted.
covid_df = covid_df.drop(columns=['Lat', 'Long'], errors='ignore')

In [None]:
# Preview the first rows to check that 'Lat' and 'Long' are no longer present
covid_df.head()

In [None]:
# Aggregate the dataset by country: all rows sharing the same "Country/Region"
# value are summed into a single row, and the country becomes the index.
# numeric_only=True restricts the sum to numeric columns. Without it, pandas
# 2.x also tries to "sum" any text column (presumably "Province/State" in this
# CSV -- confirm against the file), which leaves non-numeric values in each
# country row and breaks the later .diff() / .plot() calls.
covid_df_aggregated = covid_df.groupby("Country/Region").sum(numeric_only=True)
covid_df_aggregated.head()

In [None]:
# Rank countries by the value in the last column of the originally loaded
# frame (`columns[-1]`, presumably the most recent date) and keep the top ten
most_affected = (
    covid_df_aggregated
    .sort_values(by=columns[-1], ascending=False)
    .head(10)
)

In [None]:
# Horizontal bar chart of the top 10 countries in confirmed cases:
# bar length is the value in the last column, bar labels are the country names
ax = sns.barplot(
    x=most_affected.iloc[:, -1],
    y=most_affected.index.values,
)
ax.set_title("Total confirmed cases so far")

In [None]:
# After aggregation, the column we grouped by ("Country/Region") is the index
# of the dataframe, so it is no longer counted among the columns in the shape
covid_df_aggregated.shape

In [None]:
# Look at the data of a single country, for example India:
# .loc with the index label returns that country's row as a Series
covid_df_aggregated.loc["India"]

In [None]:
# Plot the confirmed cases in India: the country's row is drawn as a line,
# with one point per column of the aggregated dataframe
covid_df_aggregated.loc['India'].plot()


In [None]:
# Plot the confirmed cases in China: the country's row is drawn as a line,
# with one point per column of the aggregated dataframe
covid_df_aggregated.loc['China'].plot()

In [None]:
# Plot the confirmed cases in the US: the country's row is drawn as a line,
# with one point per column of the aggregated dataframe
covid_df_aggregated.loc['US'].plot()

In [None]:
# Plot the confirmed cases in Italy: the country's row is drawn as a line,
# with one point per column of the aggregated dataframe
covid_df_aggregated.loc['Italy'].plot()

In [None]:
# Compare several countries by drawing their curves on the same axes
for country in ('India', 'China', 'US', 'Italy'):
    covid_df_aggregated.loc[country].plot()
# Each plotted row carries its country name, which the legend uses as label
plt.legend()

In [None]:
# Spread of the virus in India for the first 30 dates only:
# [:30] keeps the first 30 entries of India's row before plotting
covid_df_aggregated.loc['India'][:30].plot()

In [None]:
# Calculate the first derivative of the curve: .diff() gives the change
# between each entry and the previous one (presumably new cases per day,
# assuming the columns are consecutive daily cumulative counts -- confirm)
covid_df_aggregated.loc["India"].diff().plot()

In [None]:
# Find the maximum infection rate for India, i.e. the largest increase
# between two consecutive entries of India's row
covid_df_aggregated.loc["India"].diff().max()


In [None]:
# Find the maximum infection rate for the US, i.e. the largest increase
# between two consecutive entries of the US row
covid_df_aggregated.loc["US"].diff().max()


In [None]:
# Find the maximum infection rate (largest increase between two consecutive
# columns) for all of the countries.
# Row labels of the aggregated frame; the results below are in the same order.
countries = list(covid_df_aggregated.index)
# Leave out the derived column if it has already been added, so that
# re-running this cell does not mix it into the differences.
case_counts = covid_df_aggregated.drop(columns="max_infection_rates", errors="ignore")
# Row-wise diff and max for every country at once, instead of one Python-level
# .loc lookup and diff per country.
max_infection_rates = case_counts.diff(axis=1).max(axis=1).tolist()
max_infection_rates

In [None]:
# Add the findings to the data as a new column; the list was built in index
# order, so each value lines up with its country's row
covid_df_aggregated["max_infection_rates"] = max_infection_rates

In [None]:
# Preview the first rows to check that the "max_infection_rates" column was added
covid_df_aggregated.head()

In [None]:
# Keep only the column of interest, as a one-column dataframe indexed by country
covid_data = covid_df_aggregated["max_infection_rates"].to_frame()

In [None]:
# Preview the first rows of the extracted one-column dataframe
covid_data.head()

In [None]:
# Strip leading/trailing whitespace from the column labels. The only column
# here ("max_infection_rates") was named in this notebook without surrounding
# whitespace, so this acts purely as a safeguard.
covid_data.columns=covid_data.columns.str.strip()

In [None]:
# Order the countries from the highest to the lowest maximum infection rate
covid_data = covid_data.sort_values(by='max_infection_rates', ascending=False)

In [None]:
# Keep the first 10 rows of the sorted data (nothing is displayed here;
# they are plotted in the next cell)
top_10 = covid_data.iloc[:10]

In [None]:
# Horizontal bar chart of the 10 selected countries:
# bar length is the maximum infection rate, bar labels are the country names
ax = sns.barplot(
    x=top_10.loc[:, 'max_infection_rates'],
    y=top_10.index.values,
)
ax.set_title("Countries with biggest infection rates")

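Here we used the maximum infection rate (the largest increase in confirmed cases between two consecutive dates) because it can capture the spread of the virus better than the total number of confirmed cases, considering the variation in population. Note that it is still an absolute count, not a per-capita figure.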