  <tr>
        <td width="60%">
            <img src="EDA.jpg">
        </td>
        <td>
            <div align="center">
                <font size=24px>
                    <b> Exploratory Data Analysis: Corona Cases
                    </b>
                </font>
            </div>
        </td>
    </tr>

In [None]:
# libraries for web scraping
import requests
from bs4 import BeautifulSoup

# library for advanced string manipulation
import string

# library for data manipulation
import pandas as pd

# library for advanced mathematical operations
import numpy as np

# library for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file

In [None]:
# load the per-country snapshot from disk into a DataFrame
df_corona = pd.read_csv(filepath_or_buffer='covid May-09-2020 21-13-46.csv')

In [None]:
# print a concise summary of the dataframe: every column with its
# non-null count and dtype, so we can see which features need cleaning
df_corona.info()

In [None]:
# The cleaning step that follows uses the `.str` accessor on every column,
# so each column has to hold strings. NewDeaths is cast to str here so that
# it can be cleaned together with the other features.
df_corona = df_corona.astype({'NewDeaths': str})

In [None]:
# Strip formatting characters from every column so the values can later be
# parsed as numbers:
#   * remove the thousands separator ','
#   * remove the leading '+' sign
#   * trim surrounding whitespace
# `regex=False` is passed explicitly: the default of `regex` differs between
# pandas versions, and '+' is not a valid pattern when read as a regular
# expression. We always want a literal character replacement here.
for column in df_corona.columns:
    df_corona[column] = (
        df_corona[column]
        .str.replace(',', '', regex=False)
        .str.replace('+', '', regex=False)
        .str.strip()
    )

df_corona.head()

In [None]:
# names of the cleaned columns that should hold numeric values
columns_to_convert = [
    'TotalCases',
    'NewCases',
    'NewDeaths',
    'TotalRecovered',
    'ActiveCases',
    'Serious,Critical',
    'TotCases/1M pop',
    'Deaths/1M pop',
    'TotalTests',
    'Tests/1M pop',
]

# cast all of the listed columns to float in a single call
df_corona = df_corona.astype(dict.fromkeys(columns_to_convert, float))

In [None]:
# 'TotalDeaths' contains a blank string, so a plain cast to float fails.
# pd.to_numeric with errors='coerce' parses what it can and turns every
# value that cannot be parsed into NaN.
df_corona['TotalDeaths'] = pd.to_numeric(df_corona['TotalDeaths'], errors='coerce')

In [None]:
# preview the last rows of the dataframe after coercing 'TotalDeaths' to numeric
df_corona.tail()

In [None]:
# missing 'TotalDeaths' values are filled with 0
df_corona['TotalDeaths'] = df_corona['TotalDeaths'].fillna(0)
df_corona.tail()

# Now, df_corona is ready for EDA

In [None]:
# default size (width, height in inches) for every figure created below
plt.rc('figure', figsize=(16, 8))

In [None]:
# order the rows from the highest to the lowest number of total cases
df_corona = df_corona.sort_values(by='TotalCases', ascending=False)
df_corona.head(n=15)

## Creating a barplot for the top 10 `Countries` and the `Total Cases` for each country 

In [None]:
# the dataframe is sorted by TotalCases (descending), so the first 10 rows
# are the 10 countries with the most cases
sns.barplot(x='Country', y='TotalCases', data=df_corona.head(10))

**Interpretation:** `USA` has the highest number of cases in the world i.e. more than `1200000`

In [None]:
# the dataframe is sorted by TotalCases (descending), so the last 5 rows
# are the 5 countries at the bottom of that ordering
sns.barplot(x='Country', y='TotalCases', data=df_corona.tail(5))

**Interpretation:** `Saint Pierre Miquelon` has the fewest cases in the world, i.e. `1`

## A barplot for the top 10 `Countries` and the `Total Deaths` in each country 

In [None]:
# re-order the rows from the highest to the lowest number of total deaths,
# then plot the 10 countries at the top of that ordering
df_corona = df_corona.sort_values(by='TotalDeaths', ascending=False)
sns.barplot(x='Country', y='TotalDeaths', data=df_corona.head(10))

**Interpretation:** `USA` has the highest number of deaths in the world i.e. around `80000`

## A barplot for the top 10 `Countries` and the `Active Cases` in each country 

In [None]:
# re-order the rows from the highest to the lowest number of active cases,
# then plot the 10 countries at the top of that ordering
df_corona = df_corona.sort_values(by='ActiveCases', ascending=False)
sns.barplot(x='Country', y='ActiveCases', data=df_corona.head(10))

## A barplot for the top 10 `Countries` and the `Total Recovered` cases in each country 

In [None]:
# re-order the rows from the highest to the lowest number of recovered cases,
# then plot the 10 countries at the top of that ordering
df_corona = df_corona.sort_values(by='TotalRecovered', ascending=False)
sns.barplot(x='Country', y='TotalRecovered', data=df_corona.head(10))

In [None]:
# Box plot of the distribution of TotalCases across countries.
# The series is passed explicitly as `x`: a bare positional argument is bound
# to different parameters depending on the seaborn release (`x` in older
# versions, `data` in newer ones), which changes the orientation of the plot.
sns.boxplot(x=df_corona.TotalCases)

In [None]:
# histogram with a density estimate of TotalCases, drawn in red
# NOTE(review): distplot is deprecated in recent seaborn releases - consider
# histplot/displot once the notebook's seaborn version is confirmed
sns.distplot(a=df_corona['TotalCases'], color='red')

In [None]:
# one tick per country along the axis, marking its number of active cases;
# height=0.5 makes each tick span half of the axes height
sns.rugplot(df_corona['ActiveCases'], height=0.5)

In [None]:
# Pairwise correlation between the numeric features only. Text columns
# (e.g. Country, Continent) cannot be correlated, and recent pandas versions
# raise instead of silently dropping them, so they are filtered out up front.
corrmat = df_corona.select_dtypes(include='number').corr()

# show only the strong positive correlations (> 0.8); everything else is
# masked to NaN and left blank in the heatmap
sns.heatmap(corrmat[corrmat > 0.8], annot=True)

# widen the y-limits by half a cell on each side
# NOTE(review): presumably a workaround for matplotlib releases that cut the
# first and last heatmap rows in half - confirm it is still needed
b, t = plt.ylim()  # discover the values for bottom and top
b += 0.5           # Add 0.5 to the bottom
t -= 0.5           # Subtract 0.5 from the top
plt.ylim(b, t)     # update the ylim(bottom, top) values
plt.show()

In [None]:
# Scatter plot of TotalDeaths against TotalCases with a fitted regression
# line and the marginal distribution of each variable.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, while the keyword form works on old and new versions.
sns.jointplot(x=df_corona.TotalCases, y=df_corona.TotalDeaths, kind='reg')

In [None]:
# one group of rows per distinct value of the 'Continent' column
group1 = df_corona.groupby(by='Continent')

In [None]:
# add up TotalCases within each continent group
group1['TotalCases'].agg('sum')

In [None]:
# Average of every numeric column for each continent.
# numeric_only=True skips the text columns (e.g. Country): averaging them is
# meaningless and recent pandas versions raise a TypeError when asked to.
group1.mean(numeric_only=True)

## What is this continent `0` and the value 721?

In [None]:
# inspect the rows whose Continent value is the string '0'
is_continent_zero = df_corona['Continent'] == '0'
df_corona.loc[is_continent_zero]

In [None]:
# load the per-continent snapshot from disk into a DataFrame
df_corona_cont = pd.read_csv(filepath_or_buffer='covid_per_continent May-09-2020 21-13-46.csv')

In [None]:
# display the per-continent dataframe
df_corona_cont

  <tr>
        <td width="30%">
            <img src="stage2.png" class="center" width=100%>
        </td>
   
    </tr>