### 1. Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
sns.set()

### 2. Import Data

In [None]:
# Load the raw observations into a DataFrame; the relative path assumes the
# dataset is available under ../input.
data_raw = pd.read_csv("../input/novel-corona-virus-2019-dataset/covid_19_data.csv")

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
data_raw.describe(include="all")

In [None]:
# Preview the first 10 rows.
data_raw.head(10)

In [None]:
# Same all-column summary as the earlier describe() cell.
data_raw.describe(include="all")

In [None]:
# Column dtypes as inferred when the CSV was read.
data_raw.dtypes

### 3. Check Duplicate

In [None]:
# Show rows that repeat an earlier row exactly (the first occurrence is not flagged).
data_raw.loc[data_raw.duplicated(keep="first")]

#### There are no duplicate rows, so we don't have to remove any.

### 4. Check Missing data

In [None]:
# Cell-by-cell boolean mask: True where a value is missing.
data_raw.isnull()

In [None]:
# Count of missing values per column.
data_raw.isnull().sum()

####  Many rows don't have a province value. 


In [None]:
# All-column summary of the raw data, shown again before rows are dropped.
data_raw.describe(include="all")

In [None]:
# Keep only rows that are complete in every column.
# NOTE(review): a row missing a value in any single column is discarded
# entirely - confirm that losing those observations is intended.
data_no_miss_val = data_raw.dropna(axis=0, how="any")

### 5. Check for negative values in Confirmed cases

In [None]:
# Missing-value count per column after dropna().
data_no_miss_val.isnull().sum()

In [None]:
# All-column summary of the rows that survived dropna().
data_no_miss_val.describe(include="all")

In [None]:
# Rows whose Confirmed count is below zero (despite the name, these are the
# negative-valued rows).
data_positive_min = data_no_miss_val.loc[data_no_miss_val["Confirmed"].lt(0)]
data_positive_min.head()

### 6. Check for negative values in Recovered cases

In [None]:
# Rows whose Recovered count is below zero.
data_recovered_min = data_no_miss_val.loc[data_no_miss_val["Recovered"].lt(0)]
data_recovered_min.head()

### 7. Check for negative values in Deaths cases

In [None]:
# Rows whose Deaths count is below zero.
data_deaths_min = data_no_miss_val.loc[data_no_miss_val["Deaths"].lt(0)]
data_deaths_min.head()

### 8. Get positive value from Confirmed cases

In [None]:
# Keep rows whose Confirmed count is zero or greater, then summarise them.
data_conf_pos = data_no_miss_val.loc[data_no_miss_val["Confirmed"].ge(0)]
data_conf_pos.describe()

### 9. Get Positive Values of Deaths Cases

In [None]:
# Of the rows kept so far, keep those whose Deaths count is zero or greater.
data_death_pos = data_conf_pos.loc[data_conf_pos["Deaths"].ge(0)]
data_death_pos.describe()

### 10. Get Positive Recovered Cases

In [None]:
# Final sign filter: keep rows whose Recovered count is zero or greater.
data_recove_pos = data_death_pos.loc[data_death_pos["Recovered"].ge(0)]
data_recove_pos.describe()

### 11. Shortened variable

In [None]:
# Shorter name for the sign-filtered frame; this binds the same object, it
# does not copy it.
data_pos = data_recove_pos

In [None]:
# All-column summary after the sign filters.
data_pos.describe(include="all")

## 12. Identify Outliers

In [None]:
# Distribution of Confirmed. histplot replaces distplot, which seaborn has
# deprecated; kde=True with stat="density" keeps the histogram-plus-density
# view, and bins=50 bounds the bin count the way distplot's default did.
sns.histplot(data_pos["Confirmed"], kde=True, stat="density", bins=50)

In [None]:
# Distribution of Deaths. histplot replaces distplot, which seaborn has
# deprecated; kde=True with stat="density" keeps the histogram-plus-density
# view, and bins=50 bounds the bin count the way distplot's default did.
sns.histplot(data_pos["Deaths"], kde=True, stat="density", bins=50)

In [None]:
# Distribution of Recovered. histplot replaces distplot, which seaborn has
# deprecated; kde=True with stat="density" keeps the histogram-plus-density
# view, and bins=50 bounds the bin count the way distplot's default did.
sns.histplot(data_pos["Recovered"], kde=True, stat="density", bins=50)

### 13. Removing outliers with a quantile

In [None]:
# Quantile level used as the outlier cut-off in the cells that follow.
percentage_data = 0.95

In [None]:
# Cut-off value: the `percentage_data` quantile of Confirmed.
q = data_pos["Confirmed"].quantile(percentage_data)
q

### 13.1 Remove outliers from Confirmed by keeping only the values below the 95% quantile

In [None]:
# Keep rows whose Confirmed value lies strictly below the cut-off q.
data_confirmed_rm_out = data_pos.loc[data_pos["Confirmed"].lt(q)]

In [None]:
# Distribution of Confirmed after trimming. histplot replaces the deprecated
# distplot; kde=True with stat="density" keeps the histogram-plus-density
# view, and bins=50 bounds the bin count the way distplot's default did.
sns.histplot(data_confirmed_rm_out["Confirmed"], kde=True, stat="density", bins=50)

### 13.2 Removing outlier from Deaths

In [None]:
# Cut-off value: the `percentage_data` quantile of Deaths, computed on the
# rows left after the Confirmed trim. Rebinds q.
q = data_confirmed_rm_out["Deaths"].quantile(percentage_data)
q

In [None]:
# Keep rows whose Deaths value lies strictly below the cut-off q.
data_death_rm_out = data_confirmed_rm_out.loc[data_confirmed_rm_out["Deaths"].lt(q)]

In [None]:
# Distribution of Deaths after trimming. histplot replaces the deprecated
# distplot; kde=True with stat="density" keeps the histogram-plus-density
# view, and bins=50 bounds the bin count the way distplot's default did.
sns.histplot(data_death_rm_out['Deaths'], kde=True, stat="density", bins=50)

### 13.3 Remove Recovered outlier

In [None]:
# Cut-off value: the `percentage_data` quantile of Recovered, computed on the
# rows left after the Deaths trim. Rebinds q.
q = data_death_rm_out["Recovered"].quantile(percentage_data)
q

In [None]:
# Keep rows whose Recovered value lies strictly below the cut-off q.
data_recove_rm_out = data_death_rm_out.loc[data_death_rm_out["Recovered"].lt(q)]

In [None]:
# Distribution of Recovered after trimming. histplot replaces the deprecated
# distplot; kde=True with stat="density" keeps the histogram-plus-density
# view, and bins=50 bounds the bin count the way distplot's default did.
sns.histplot(data_recove_rm_out["Recovered"], kde=True, stat="density", bins=50)

In [None]:
# Renumber the rows after filtering; drop=True discards the old index instead
# of keeping it as a column.
data_clean = data_recove_rm_out.reset_index(drop=True)

In [None]:
# All-column summary of the cleaned frame.
data_clean.describe(include="all")

## 14. Visualization

### 14.1 Let's group by year-month. We need to prepare the required columns first, so that we can plot, for example, the increase in Confirmed, Deaths, and Recovered per Year-Month

In [None]:
# Parse ObservationDate into datetime values so the .dt accessor is available.
data_clean["ObservationDate"] = pd.to_datetime(data_clean["ObservationDate"])

In [None]:
# Month as a zero-padded two-digit string (strftime returns text, not ints).
data_clean["Month"] = data_clean["ObservationDate"].dt.strftime('%m')

In [None]:
# Year as a four-digit string (strftime returns text, not ints).
data_clean["Year"] = data_clean["ObservationDate"].dt.strftime('%Y')

In [None]:
# Day of month as a zero-padded two-digit string.
data_clean["Day"] = data_clean["ObservationDate"].dt.strftime('%d')

In [None]:
# "YYYY-MM" string key; text in this format sorts chronologically.
data_clean["Year-Month"] = data_clean["ObservationDate"].dt.strftime('%Y-%m')

#### We only need the data for year 2020 for consistency in the plots, because the cumulative data is reset between the end of 2020 and the start of 2021

In [None]:
# Restrict to 2020; "Year" holds strings, so compare against the text "2020".
data_clean = data_clean.loc[data_clean["Year"].eq("2020")]

In [None]:
# Last rows of the 2020-only frame.
data_clean.tail()

In [None]:
# Summary of the 2020-only frame (default describe() column selection).
data_clean.describe()

In [None]:
# Sum the three case columns per Year-Month.
# The columns are selected BEFORE aggregating so sum() never touches the
# datetime/text columns: pandas >= 2.0 no longer drops those silently, and
# summing a datetime column raises TypeError.
# NOTE(review): if these columns hold cumulative counts, summing every
# observation within a month counts the same cases repeatedly - confirm this
# is the intended aggregate.
data_group_year_month = (
    data_clean.groupby("Year-Month")[["Confirmed", "Deaths", "Recovered"]]
    .sum()
    .reset_index()
)

In [None]:
# Display the monthly totals.
data_group_year_month

### Visualize Additional Confirmed, Deaths, Recovered

In [None]:
# Month-over-month change of each monthly total. The first month has no
# predecessor, so its NaN difference becomes 0.
# NOTE(review): the absolute value reports a decrease as a positive number.
for source_col, diff_col in (("Confirmed", "ConfirmedDiff"),
                             ("Deaths", "DeathDiff"),
                             ("Recovered", "RecoveredDiff")):
    monthly_change = data_group_year_month[source_col].diff().fillna(0)
    data_group_year_month[diff_col] = np.abs(monthly_change)

In [None]:
# First rows, now including the *Diff columns.
data_group_year_month.head()

In [None]:
# Show up to 500 rows of the monthly table.
data_group_year_month.head(500)

In [None]:
# All-column summary of the monthly table.
data_group_year_month.describe(include="all")

In [None]:
# One line per *Diff column on a shared axis; legend entries are matched to
# the lines by plotting order.
fig, ax = plt.subplots(figsize=(20, 8))
months = data_group_year_month["Year-Month"]
for diff_col in ("ConfirmedDiff", "DeathDiff", "RecoveredDiff"):
    ax.plot(months, data_group_year_month[diff_col])
ax.set_title(
    "Additional Confirmed, Deaths, Recovered Covid Cases per Year-Month In the world",
    fontweight='bold',
    fontsize=15,
)
ax.legend(("Additional Confirmed", "Additional Deaths", "Additional Recovered"))
ax.set_xlabel('Year-Month')
ax.set_ylabel('Scale')

#### It's clear that the additional Confirmed cases increased drastically between May and June, and between August and November

### 14.2. Visualize Cumulative Confirmed Covid Cases per Year-Month

In [None]:
# Single blue line: monthly Confirmed total against Year-Month.
fig, ax = plt.subplots(figsize=(20, 8))
months = data_group_year_month["Year-Month"]
confirmed_totals = data_group_year_month["Confirmed"]
ax.plot(months, confirmed_totals, color='b')
title_style = {"fontweight": 'bold', "fontsize": 15}
ax.set_title("Cumulative Confirmed Covid Cases per Year-Month", **title_style)
ax.set_xlabel('Year-Month')
ax.set_ylabel('Confirmed Cases')

This plot shows the total confirmed cases in the world per year-month. We can see the cumulative confirmed cases from January 2020 until December 2020. The significant increase in the cumulative number of cases may have been caused by the WHO's late advice for everyone to wear masks: in March 2020, the WHO advised masks only for people who were sick.

### 14.3. Visualize Covid Deaths per Month-Year

In [None]:
# Single blue line: monthly Deaths total against Year-Month.
# The axis labels and title describe what is actually plotted: the y values
# are deaths (previously labelled 'Confirmed Cases') and the x values are
# "YYYY-MM" strings (previously called 'Month-Year').
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(data_group_year_month["Year-Month"],
        data_group_year_month["Deaths"],
        color='b')
ax.set_title("Cumulative Total Covid Deaths per Year-Month",
             fontweight='bold',
             fontsize=15)
ax.set_xlabel('Year-Month')
ax.set_ylabel('Deaths')

We can see from this plot that the Cumulative amount of deaths cases in the world increased drastically between May 2020 to July 2020.

### 14.4. Visualize Covid Recovered per Month-Year

In [None]:
# Single blue line: monthly Recovered total against Year-Month.
# The axis labels and title describe what is actually plotted: the y values
# are recoveries (previously labelled 'Confirmed Cases') and the x values are
# "YYYY-MM" strings (previously called 'Month-Year').
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(data_group_year_month["Year-Month"],
        data_group_year_month["Recovered"],
        color='b')
ax.set_title("Cumulative Total Covid Recovered per Year-Month",
             fontweight='bold',
             fontsize=15)
ax.set_xlabel('Year-Month')
ax.set_ylabel('Recovered Cases')

We can see from this plot that the Cumulative amount of Recovered cases in the world increased drastically from May to July 2020.

#### We can summarize that as the Cumulative Confirmed Cases grow, the Cumulative Deaths and Recovered Cases grow as well. However, the Confirmed, Deaths, and Recovered cases are not on the same scale. For example, the Y axis of Confirmed Cases is at the scale of 1e8 (one hundred million) with an approximate max value of 3*1e8, the Y axis of Deaths is at the scale of 1e6 (one million) with an approximate max of 6.5*1e6, and the Y axis of Recovered is at the scale of 1e8 with an approximate max of 1.5*1e8.

### 14.51 Visualize Additional Confirmed, Deaths, and Recovered Difference between each month

In [None]:
# Three month-over-month series on one axis, each with a fixed colour.
# Every line carries its own label and a legend is drawn, so the series can
# be told apart; the title and y-label name all three series rather than
# only Confirmed.
fig, ax = plt.subplots(figsize=(19, 8))
for diff_col, line_color, line_label in (
        ("ConfirmedDiff", 'b', "Additional Confirmed"),
        ("DeathDiff", 'r', "Additional Deaths"),
        ("RecoveredDiff", 'g', "Additional Recovered")):
    ax.plot(data_group_year_month["Year-Month"],
            data_group_year_month[diff_col],
            color=line_color,
            label=line_label)
ax.set_title("Additional Confirmed, Deaths, Recovered Covid Cases per Year-Month",
             fontweight='bold',
             fontsize=15)
ax.legend()
ax.set_xlabel('Year-Month')
ax.set_ylabel('Additional Cases')

#### We can see that the confirmed and Recovered cases increased drastically between  May 2020 and June 2020

### 14.5. Let's try to group Cumulative Confirmed, Deaths, Recovered by country

In [None]:
# Sum the three case columns per country.
# The columns are selected BEFORE aggregating so sum() never touches the
# datetime/text columns: pandas >= 2.0 no longer drops those silently, and
# summing a datetime column raises TypeError.
data_group_by_country = (
    data_clean.groupby("Country/Region")[["Confirmed", "Deaths", "Recovered"]]
    .sum()
    .reset_index()
)

In [None]:
# First rows of the per-country totals.
data_group_by_country.head()

In [None]:
# Summary statistics of the per-country totals.
data_group_by_country.describe()

### 14.6. Visualize Cumulative Confirmed Cases Per Country


#### Sort the data by confirmed 

In [None]:
# Ascending by Confirmed, so the largest total is the last row.
data_group_by_country_conf_sort = data_group_by_country.sort_values(
    by="Confirmed", ascending=True)

In [None]:
# Summary statistics of the sorted per-country totals.
data_group_by_country_conf_sort.describe()

In [None]:
# Horizontal bars, one per country, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(20, 8))
countries = data_group_by_country_conf_sort["Country/Region"]
confirmed_totals = data_group_by_country_conf_sort["Confirmed"]
ax.barh(countries, confirmed_totals, color='maroon')
ax.set_title('Cumulative Total of Confirmed Cases per Country')
ax.set_xlabel('Confirmed')
ax.set_ylabel('Country')
plt.show()

We can see that the USA has the most confirmed cases between January 2020 and December 2020

### 14.7 .Visualize Cumulative Deaths Cases Per Country

#### We need to sort the data by Deaths

In [None]:
# Ascending by Deaths, so the largest total is the last row.
data_group_by_country_death_sort = data_group_by_country.sort_values("Deaths")
data_group_by_country_death_sort.head()

In [None]:
# Horizontal bars, one per country, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(20, 8))
countries = data_group_by_country_death_sort["Country/Region"]
death_totals = data_group_by_country_death_sort["Deaths"]
ax.barh(countries, death_totals, color='maroon')
ax.set_title('Cumulative Total of Deaths Cases per Country')
ax.set_xlabel('Deaths')
ax.set_ylabel('Country')
plt.show()

We can see that the USA has the most deaths between January 2020 and December 2020

### 14.8. Visualize Cumulative Recovered Cases Per Country


#### We need to sort the data by Recovered 

In [None]:
# Ascending by Recovered, so the largest total is the last row.
data_group_by_country_recov_sort = data_group_by_country.sort_values(
    by="Recovered", ascending=True)

In [None]:
# Horizontal bars, one per country, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(20, 8))
countries = data_group_by_country_recov_sort["Country/Region"]
recovered_totals = data_group_by_country_recov_sort["Recovered"]
ax.barh(countries, recovered_totals, color='maroon')
ax.set_title('Cumulative Total of Recovered Cases per Country')
ax.set_xlabel('Recovered')
ax.set_ylabel('Country')
plt.show()

### 14.9 Visualize Cumulative Covid By Country 

In [None]:
# Sum the three case columns per (country, Year-Month) pair.
# The columns are selected BEFORE aggregating so sum() never touches the
# datetime/text columns: pandas >= 2.0 no longer drops those silently, and
# summing a datetime column raises TypeError.
data_group_country_year_month = (
    data_clean.groupby(["Country/Region", "Year-Month"])[
        ["Confirmed", "Deaths", "Recovered"]]
    .sum()
    .reset_index()
)

In [None]:
# First rows of the per-country, per-month totals.
data_group_country_year_month.head()

### 14.91 Cumulative Covid Cases in Russia

In [None]:
# Monthly rows for Russia. The sorted view is only displayed;
# data_covid_rus itself keeps its original row order.
is_russia = data_group_country_year_month["Country/Region"].eq("Russia")
data_covid_rus = data_group_country_year_month.loc[is_russia]
data_covid_rus.sort_values(by="Deaths", ascending=False)

### 14.91 Cumulative Covid Cases in Russia

In [None]:
# Confirmed, Deaths and Recovered for Russia on one axis.
fig, ax = plt.subplots(figsize=(20, 10))
series_names = ('Confirmed', 'Deaths', 'Recovered')
for series_name in series_names:
    ax.plot(data_covid_rus['Year-Month'],
            data_covid_rus[series_name],
            label=series_name)
ax.set_title("Cumulative Covid Confirmed-Deaths-Recovered Cases in  Russia",
             fontsize=20)
ax.legend(series_names)
ax.set_xlabel('Year-Month', fontsize=15)
ax.set_ylabel('Scale', fontsize=15)

### 14.92 Cumulative Covid Cases in Mexico

In [None]:
# Monthly rows for Mexico. The sorted view is only displayed;
# data_covid_mex itself keeps its original row order.
is_mexico = data_group_country_year_month["Country/Region"].eq("Mexico")
data_covid_mex = data_group_country_year_month.loc[is_mexico]
data_covid_mex.sort_values(by="Deaths", ascending=False)

In [None]:
# Summary statistics of the Mexico monthly rows.
data_covid_mex.describe()

In [None]:
# Confirmed, Deaths and Recovered for Mexico on one axis.
fig, ax = plt.subplots(figsize=(20, 10))
series_names = ('Confirmed', 'Deaths', 'Recovered')
for series_name in series_names:
    ax.plot(data_covid_mex['Year-Month'],
            data_covid_mex[series_name],
            label=series_name)
ax.set_title("Cumulative Covid Confirmed-Deaths-Recovered Cases in  Mexico",
             fontsize=20)
ax.legend(series_names)
ax.set_xlabel('Year-Month', fontsize=15)
ax.set_ylabel('Scale', fontsize=15)

The Recovered cases seem pretty good in Mexico, but what if we compare them with the US?

### 15. Visualize Cumulative covid cases in US vs Mexico VS Russia Comparison

In [None]:
# Monthly rows for the US. The sorted view is only displayed;
# data_covid_us itself keeps its original row order.
is_us = data_group_country_year_month["Country/Region"].eq("US")
data_covid_us = data_group_country_year_month.loc[is_us]
data_covid_us.sort_values(by="Deaths", ascending=False)

In [None]:
# Summary statistics of the US monthly rows.
data_covid_us.describe()

In [None]:
# Nine lines on one axis: three series for each of US, Mexico and Russia.
# The plotting order (country by country, series by series) must match the
# order of the legend entries below, because the legend is assigned by
# position.
fig, ax = plt.subplots(figsize=(20, 10))
series_names = ('Confirmed', 'Deaths', 'Recovered')
for country_frame in (data_covid_us, data_covid_mex, data_covid_rus):
    for series_name in series_names:
        ax.plot(country_frame['Year-Month'],
                country_frame[series_name],
                label=series_name)

legend_entries = (
    'Confirmed-US', 'Deaths-US', 'Recovered-US',
    "Confirmed-Mexico", 'Deaths-Mexico', 'Recovered-Mexico',
    "Confirmed-Russia", "Deaths-Russia", "Recovered-Russia",
)
ax.legend(legend_entries)
ax.set_title("Cumulative Covid Cases in US in comparison with Mexico and Russia",
             fontsize=20)
ax.set_xlabel('Year-Month', fontsize=15)
ax.set_ylabel('Scale', fontsize=15)

We can see that even though the Cumulative Confirmed Cases in the US are far higher than in Mexico, the Cumulative Deaths in Mexico are almost the same as the Cumulative Deaths in the US. The cumulative recovered in Russia is the highest, and the cumulative recovered in the US is the lowest. The margin between Cumulative Confirmed and Cumulative Recovered in the US (the difference between Cumulative Confirmed and Cumulative Recovered) is pretty high, and higher than the margin between Cumulative Confirmed and Cumulative Deaths (the difference between Cumulative Confirmed and Cumulative Deaths) in the US between June 2020 and December 2020. It means that people who got Covid in the US had a higher chance of dying than of recovering.



### 16. Visualize Pairplot

In [None]:
# Pairwise plots across the columns of data_clean that seaborn selects by
# default.
sns.pairplot(data_clean)

### 16.1 . Relationship between Cumulative Confirmed Case and Cumulative Deaths 

In [None]:
# Deaths against Confirmed, one point per row, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(data=data_clean, x='Confirmed', y='Deaths', ax=ax)
title_style = {"fontsize": 15, "fontweight": 'bold'}
ax.set_title('Scatter Plot Cumulative Cases and Deaths ', **title_style)

We can see that as the number of Cumulative Confirmed Cases grows, the number of Cumulative Deaths also grows significantly

In [None]:
# US rows only, renumbered from zero.
us_rows = data_clean.loc[data_clean["Country/Region"].eq("US")]
data_clean_us = us_rows.reset_index(drop=True)
data_clean_us.head()

### 17. Additional Confirmed, Deaths, Recovered in US

#### We need to get the difference between Confirmed, Deaths, and Recovered in US

In [None]:
# Summary statistics of the US rows.
data_clean_us.describe()

In [None]:
# Sum the three case columns per Year-Month for the US rows.
# The columns are selected BEFORE aggregating so sum() never touches the
# datetime/text columns: pandas >= 2.0 no longer drops those silently, and
# summing a datetime column raises TypeError.
data_us_group_year_month = (
    data_clean_us.groupby("Year-Month")[["Confirmed", "Deaths", "Recovered"]]
    .sum()
    .reset_index()
)


In [None]:
# Last rows of the US monthly totals.
data_us_group_year_month.tail()

In [None]:

# Month-over-month change of each US monthly total. The first month has no
# predecessor, so its NaN difference becomes 0.
# NOTE(review): the absolute value reports a decrease as a positive number.
for source_col, diff_col in (("Confirmed", "ConfirmedDiff"),
                             ("Deaths", "DeathDiff"),
                             ("Recovered", "RecoveredDiff")):
    monthly_change = data_us_group_year_month[source_col].diff().fillna(0)
    data_us_group_year_month[diff_col] = np.abs(monthly_change)
data_us_group_year_month.head(500)

In [None]:
# Three US month-over-month series on one axis.
# The title names all three series and the US scope (it previously mentioned
# only Confirmed); each line carries its own label, so the legend does not
# depend on plotting order.
fig, ax = plt.subplots(figsize=(20, 10))
for diff_col, line_label in (("ConfirmedDiff", "Additional Confirmed"),
                             ("DeathDiff", "Additional Deaths"),
                             ("RecoveredDiff", "Additional Recovered")):
    ax.plot(data_us_group_year_month["Year-Month"],
            data_us_group_year_month[diff_col],
            label=line_label)
ax.set_title("Additional Confirmed, Deaths, Recovered Covid Cases per Year-Month in US",
             fontweight='bold',
             fontsize=15)
ax.legend()
ax.set_xlabel('Year-Month')
ax.set_ylabel('Additional Scale')

* We can see that the Confirmed Cases increased drastically between March 2020 and April 2020 across all provinces in the US. It's pretty sad that the additional Recovered and Deaths are almost the same