In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Directory joined with the file name gives the file's full path.
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

![](https://media.graytvinc.com/images/810*455/coronavirus+outbreak+mgn.jpg)
## What is Coronavirus<br>
2019 Novel Coronavirus (2019-nCoV) is a virus (more specifically, a coronavirus) identified as the cause of an outbreak of respiratory illness first detected in Wuhan, China. Early on, many of the patients in the outbreak in Wuhan, China reportedly had some link to a large seafood and animal market, suggesting animal-to-person spread. However, a growing number of patients reportedly have not had exposure to animal markets, indicating person-to-person spread is occurring. At this time, it’s unclear how easily or sustainably this virus is spreading between people - CDC<br>
<br>
This dataset has daily level information on the number of affected cases, deaths and recovery from 2019 novel coronavirus.<br>
<br>
The data is available from 22 Jan 2020.

![](https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcR7VR8lF8-Y4kzV7lSuqJreoqphJvk6AzFf2Xa2x0oaLZ9m6m3_)

## Define the Problem<br>
Coronaviruses are a large family of viruses that are common in many different species of animals, including camels, cattle, cats, and bats. Rarely, animal coronaviruses can infect people and then spread between people such as with MERS, SARS, and now with 2019-nCoV.<br>
<br>
Outbreaks of novel virus infections among people are always of public health concern. The risk from these outbreaks depends on characteristics of the virus, including whether and how well it spreads between people, the severity of resulting illness, and the medical or other measures available to control the impact of the virus (for example, vaccine or treatment medications).<br>
<br>
This is a very serious public health threat. The fact that this virus has caused severe illness and sustained person-to-person spread in China is concerning, but it’s unclear how the situation in the United States will unfold at this time.<br>
<br>
The risk to individuals is dependent on exposure. At this time, some people will have an increased risk of infection, for example healthcare workers caring for 2019-nCoV patients and other close contacts. For the general American public, who are unlikely to be exposed to this virus, the immediate health risk from 2019-nCoV is considered low. The goal of the ongoing U.S. public health response is to prevent sustained spread of 2019-nCov in this country.<br>

## Origins<br>
Researchers think the new virus probably came from bats, as the SARS virus did, with which it shares 80 percent of its genetic makeup.
<br>
But we still don't know which animal passed it on to humans. Last week a Chinese team suggested it could be a snake, but that was immediately challenged by other experts, who think a mammal is the more likely culprit.
<br>
Identifying which animal it was could help fight the outbreak.
<br>
With SARS, it turned out to be a civet, whose meat is very popular in China.
<br>
Banning the consumption of civet and closing their breeding farms helped stop SARS from making a comeback, says Arnaud Fontanet of the Pasteur Institute in Paris.
<br>
On the other hand, one reason it was harder to stem the MERS outbreak is because it came from dromedary camels, a widely used working animal.

![](https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTshKz-MJNJgeW__aErbpFMgp0bt9TOf6sXOhggrsWRxZdNM_CU)

<br>
<br>
## Precautions<br>
Health authorities and scientists say the same precautions against other viral illnesses can be used: wash your hands frequently, cover up your coughs, try not to touch your face.
<br>
And anyone who does come down with the virus should be placed in isolation.
<br>
"Considering that substantial numbers of patients with SARS and MERS were infected in health-care settings", precautions need to be taken to prevent that happening again, the Chinese team warned in The Lancet.

![](https://www.cdc.gov/coronavirus/2019-ncov/images/infographic-symptoms.jpg)

## Gather the Data<br>
### Acknowledgements<br>
Johns Hopkins University has made the data available in Google Sheets format here. Sincere thanks to them.
<br>
Thanks to WHO, CDC, NHC and DXY for making the data available in first place.

## Prepare Data for Consumption

### Load the dataset

In [None]:
# using pandas read_csv
# Alternative path left commented out (presumably an earlier file name of the same dataset).
#dataset = '/kaggle/input/novel-corona-virus-2019-dataset/2019_nCoV_data.csv'
dataset = '/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv'

# Raw frame; later cells copy it into df_T and also modify df itself in place.
df = pd.read_csv(dataset)

#### Print the Top 5 rows from the Dataframe

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# Placeholder text that stands in for a missing Province/State value.
EMPTY_VAL = "Unknown"


def fillState(state, country):
    """Return ``country`` when ``state`` equals the EMPTY_VAL placeholder, else ``state``."""
    return country if state == EMPTY_VAL else state

In [None]:
# Work on a copy so that nothing done to df_T changes df.
df_T = df.copy()
df_T = df_T.rename(columns={'ObservationDate': 'Date', 'Country/Region': 'Country'})
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate Series and is not
# guaranteed to write through to df_T (it has no effect under copy-on-write).
df_T['Province/State'] = df_T['Province/State'].fillna(EMPTY_VAL)
# infer_datetime_format is deprecated (format inference is the default in
# pandas >= 2.0), so it is no longer passed.
df_T['Date'] = pd.to_datetime(df_T['Date'])

In [None]:
# Column-wise maximum per (Country, Province/State), then summed per (Date, Country);
# the 15 rows with the highest Confirmed value are shown with a colour gradient.
(
    df_T.loc[:, ['Date', 'Country', 'Province/State', 'Confirmed', 'Deaths', 'Recovered']]
    .groupby(['Country', 'Province/State'])
    .max()
    .groupby(['Date', 'Country'])
    .sum()
    .sort_values(by='Confirmed', ascending=False)
    .reset_index()
    .head(15)
    .style.background_gradient(cmap='rainbow')
)

In [None]:
#df_T.loc[: , ['Date', 'Country', 'Province/State', 'Confirmed', 'Deaths', 'Recovered']].groupby(['Country', 'Province/State']).max().sort_values(by='Confirmed', ascending=False).reset_index()[:15].style.background_gradient(cmap='rainbow')

In [None]:
import plotly.express as px

# Rank countries by their overall Confirmed total: take each province's maximum,
# then add the provinces up per country. Grouping by Country alone (not by Date
# as well) yields exactly one row per country, so the list below always holds
# distinct countries and each country's total is not split across several rows.
top_10_countries = (
    df_T.groupby(['Country', 'Province/State'])[['Confirmed', 'Deaths', 'Recovered']]
    .max()
    .groupby(level='Country')
    .sum()
    .sort_values(by='Confirmed', ascending=False)
    .reset_index()
    .loc[:, 'Country']
    .head(10)
)

# Daily series for those countries: per-province value for each day, summed per country.
df_plot = (
    df_T.loc[df_T.Country.isin(top_10_countries),
             ['Date', 'Country', 'Province/State', 'Confirmed', 'Deaths', 'Recovered']]
    .groupby(['Date', 'Country', 'Province/State'])
    .max()
    .groupby(['Date', 'Country'])
    .sum()
    .reset_index()
)

fig = px.line(df_plot, x="Date", y="Confirmed", color='Country')
fig.update_layout(title='No.of Confirmed Cases per Day for Top 10 Countries',
                   xaxis_title='Date',
                   yaxis_title='No.of Confirmed Cases')
fig.show()

In [None]:
# Deaths over time, one line per country, drawn from whatever df_plot currently holds
# (df_plot is reassigned by several cells, so this depends on the cell run just before).
fig = px.line(df_plot, x="Date", y="Deaths", color='Country')
fig.update_layout(title='No.of Death Cases per Day for Top 10 Countries',
                   xaxis_title='Date',
                   yaxis_title='No.of Death Cases')
fig.show()

In [None]:
# Preview the first five and the last five rows in one table.
# np.r_[-5:0] expands to [-5, -4, -3, -2, -1]; a -6:-1 slice stops before -1
# and would leave out the final row.
df.iloc[np.r_[0:5, -5:0], :]

### Dataframe feature Definition
<br>
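**SNo** - Serial number<br>
**ObservationDate** - Date of observation (renamed to **Date** later in this notebook)<br>
**Province/State** - Province or State of observation<br>
**Country/Region** - Country of observation (renamed to **Country** later in this notebook)<br>
**Last Update** - Time at which the record was last updated<br>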
**Confirmed** - Number of confirmed cases<br>
**Deaths** - Number of deaths<br>
**Recovered** - Number of recovered cases<br>

In [None]:
df

#### Information from dataframe

In [None]:
df.info()

1. Province/State has few empty values.

#### Descriptive Statistics from dataframe

In [None]:
df.describe()

#### Observations<br>
1. The dataset has 64133 rows. Each row records the number of confirmed Coronavirus cases, the number of deaths that occurred and the number of people who recovered.
2. The Dataset is collected from countries like China, Japan, United States, etc.
3. The number of Recovered cases is very low compared with the number of Confirmed cases, which conveys the seriousness of the virus spreading without any prevention or control mechanisms.

**How many entries were collected?**

In [None]:
df.shape

### Data Cleaning: Correcting, Completing, Creating, and Converting

**Whether any feature has empty values**

In [None]:
df.isnull().sum()

The feature Province/State does have EMPTY values

### Correcting

**No.of Persons affected with Virus**

In [None]:
# Column-wise maximum per (Country, Province/State), summed per (Date, Country);
# the final select_dtypes(...).sum() adds up every float64 column over all of those rows.
df_T.loc[: , ['Date', 'Country', 'Province/State', 'Confirmed', 'Deaths', 'Recovered']].groupby(['Country', 'Province/State']).max().groupby(['Date', 'Country']).sum().reset_index().select_dtypes(include='float64').sum()

### Clean Data

**Remove the serial number feature from the dataframe**

In [None]:
# Let's get rid of the SNo column as it's redundant.
# errors='ignore' keeps the cell re-runnable: when the column is already gone,
# a second execution is a no-op instead of a KeyError.
df.drop(columns=['SNo'], inplace=True, errors='ignore')

**Remove the Last Update feature from the dataframe as it's redundant with ObservationDate**

In [None]:
# Drop the 'Last Update' column. errors='ignore' keeps the cell re-runnable:
# when the column is already gone, a second execution is a no-op instead of a KeyError.
df.drop(columns=['Last Update'], inplace=True, errors='ignore')

In [None]:
df.tail()

**Replace empty values of State with the most frequent State value (the mode)**

In [None]:
df['Province/State'].value_counts()

In [None]:
# Most frequent Province/State value(s); Series.mode() returns a Series, so ties give several entries.
tempState = df['Province/State'].mode()
#print(tempState)
# The fill step below is disabled, so missing Province/State values stay missing in df.
#df['Province/State'].fillna(tempState, inplace=True)

In [None]:
df.tail()

### Converting

In [None]:
df.rename(columns={'ObservationDate':'Date', 'Country/Region':'Country'}, inplace=True)

**Convert the Date Time stamp to Date**

In [None]:
from datetime import datetime
# 1/22/2020 12:00
# 1/26/2020 23:00
# 1/23/20 12:00 PM
# 2020-01-02 23:33:00
# 
def try_parsing_date_time(text):
    """Parse a date or date-time string by trying a list of known formats in order.

    Parameters:
        text: the string to parse. A value that is already a ``datetime``
            (which includes pandas Timestamps) is returned unchanged, so the
            conversion can safely be applied to a column more than once.

    Returns:
        The ``datetime`` produced by the first format that matches.

    Raises:
        ValueError: if no format matches.
    """
    if isinstance(text, datetime):
        return text

    formats = (
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%m/%d/%Y',
        '%m/%d/%Y %H:%M',
        '%m/%d/%Y %H:%M:%S',
        '%m/%d/%y %I:%M %p',
        '%m/%d/%Y %I:%M %p',
        # ISO-style timestamps are year-month-day; they are tried before the
        # year-day-month fallback so '2020-01-02 23:33:00' is read as 2 January.
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%dT%H:%M:%S',
        # Fallback kept for inputs that only parse as year-day-month.
        '%Y-%d-%m %H:%M:%S',
    )
    # '%m/%d/%Y %h:%m' was removed from the list: %h is not a valid strptime
    # directive, so that entry always raised ValueError and never matched.
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            pass
    raise ValueError('no valid date time format found', text)


def try_parsing_date(text):
    """Parse a date-only string by trying a list of known formats in order.

    Parameters:
        text: the string to parse. A value that is already a ``datetime``
            (which includes pandas Timestamps) is returned unchanged.

    Returns:
        The ``datetime`` produced by the first format that matches.

    Raises:
        ValueError: if no format matches.
    """
    if isinstance(text, datetime):
        return text

    # '%Y-%m-%d' comes before '%Y-%d-%m' so ISO dates such as '2020-01-02' are
    # read as 2 January, in line with try_parsing_date_time; the year-day-month
    # form remains as a fallback for inputs that only parse that way.
    for fmt in ('%m/%d/%Y', '%m/%d/%y', '%Y-%m-%d', '%Y-%d-%m', '%d.%m.%Y'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            pass
    raise ValueError('no valid date format found', text)

In [None]:
'''
dateTime = df['Last Update'].apply(try_parsing_date)
df['DateTime'] = dateTime
df['Time'] = dateTime.dt.time
df['Date'] = dateTime.dt.date
df.drop('Last Update', axis=1, inplace=True)
'''

In [None]:
df['Date']

In [None]:
df['Date'] = df['Date'].apply(try_parsing_date_time)

In [None]:
df['Date']

Replace the Country for the States of Taiwan and Hong Kong

In [None]:
# Chained indexing (df[mask]['Country'] = ...) assigns into a temporary copy and
# leaves df unchanged; a single .loc call writes into df itself.
df.loc[df['Province/State'] == 'Taiwan', 'Country'] = 'Taiwan'
df.loc[df['Province/State'] == 'Hong Kong', 'Country'] = 'Hong Kong'

In [None]:
df.replace({'Country': 'Mainland China'}, 'China', inplace=True)

In [None]:
#df[['Confirmed', 'Deaths', 'Recovered']].astype('int32')

**Double Check Cleaned Data**

In [None]:
df.info()

In [None]:
#df['Date'].unique()

**List the Countries that are affected with Virus**

In [None]:
df.Country.unique()

In [None]:
df[['Country', 'Confirmed']].groupby(['Country']).count()

**List all the Provinces/States that were affected with Virus**

In [None]:
df['Province/State'].unique()

In [None]:
# India rows from 2020-06-10 onwards. The explicit .copy() makes df_IN an
# independent frame, so adding a column to it does not trigger SettingWithCopyWarning.
df_IN = df.loc[(df.Country == 'India') & (df.Date >= '2020-06-10'), :].copy()
# Percentage change of Confirmed against the previous row, rounded to a whole
# number; the first row has no predecessor and therefore stays NaN.
df_IN['%Change'] = df_IN['Confirmed'].pipe(lambda x: (100 * (x - x.shift(1)) / x.shift(1)).round())
df_IN

In [None]:
import plotly.express as px
%matplotlib inline

df_plot = df_IN.loc[:,['Date', 'Confirmed', '%Change']].copy()
# Plain column assignment replaces the datetime column with its "MMDD" strings.
# Assigning through .loc[:, 'Date'] tries to write into the existing datetime
# column first in recent pandas versions, which can leave the dtype unchanged.
df_plot['Date'] = df_plot.Date.dt.strftime("%m%d")

px.bar(df_plot, x='Date', y='Confirmed')

In [None]:
px.bar(df_plot, x='Date', y='%Change')

## Perform Exploratory Analysis

**Which Country has the most records in the dataset?**

In [None]:
df['Country'].value_counts()

**No.of Countries that were affected with Virus?**

In [None]:
df.Country.nunique()

***Mainland China*** has the most records, and the virus has spread across the globe.

In [None]:
import plotly.express as px
# Sample gapminder table shipped with plotly; only its country / iso_alpha pairs are used.
pxdf = px.data.gapminder()

# One row per distinct (country, iso_alpha) pair, indexed by the country name.
country_isoAlpha = (
    pxdf[['country', 'iso_alpha']]
    .drop_duplicates()
    .rename(columns={'country': 'Country'})
    .set_index('Country')
)
# Shape: {country name: {'iso_alpha': code}}
country_map = country_isoAlpha.to_dict('index')

In [None]:
def getCountryIsoAlpha(country):
    """Return the ``iso_alpha`` code stored for ``country`` in ``country_map``.

    When the country has no entry, the input value is returned unchanged so the
    caller still gets a usable label.
    """
    try:
        return country_map[country]['iso_alpha']
    except (KeyError, TypeError):
        # KeyError: no entry for this name; TypeError: value cannot be a dict key.
        # Anything else (e.g. country_map not defined yet) is a real error and
        # is allowed to surface instead of being silently swallowed.
        return country

In [None]:
df['iso_alpha'] = df['Country'].apply(getCountryIsoAlpha)
df.info()

### Rise of Coronavirus Confirmed cases around the Globe

In [None]:
df_plot = df.loc[:,['Date', 'Country', 'Confirmed']].copy()
# Plain column assignment so the column really becomes "YYYY-MM-DD" strings for the
# animation frames. Assigning through .loc[:, 'Date'] tries to write into the
# existing datetime column first in recent pandas versions, which parses the
# strings straight back into timestamps.
df_plot['Date'] = df_plot.Date.dt.strftime("%Y-%m-%d")
# Marker size: China keeps its Confirmed value, every other country is scaled up
# by a factor of 200.
df_plot['Size'] = np.where(df_plot['Country']=='China', df_plot['Confirmed'], df_plot['Confirmed']*200)
# NOTE(review): max() per (Date, Country) keeps the largest single row of a country
# for each day rather than the sum of its provinces — confirm this is intended.
fig = px.scatter_geo(df_plot.groupby(['Date', 'Country']).max().reset_index(),
                     locations="Country",
                     locationmode = "country names",
                     hover_name="Country",
                     color="Confirmed",
                     animation_frame="Date", 
                     size='Size',
                     #projection="natural earth",
                     title="Rise of Coronavirus Confirmed Cases")
fig.show()


#### No.of persons **confirmed** with Virus around the World

In [None]:
df[['Date', 'iso_alpha', 'Confirmed']].groupby('iso_alpha').max()

In [None]:
# Per-country maximum of the count columns. The columns are selected explicitly so
# that text columns (e.g. Province/State, which mixes strings with missing values)
# are never passed to max(), where they cannot be compared.
df_plot = df.groupby('iso_alpha')[['Confirmed', 'Deaths', 'Recovered']].max().reset_index()
fig = px.choropleth(df_plot, locations="iso_alpha",
                    color="Confirmed", 
                    hover_name="iso_alpha", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

#### No.of persons **confirmed** with Virus around the World (excluding China)

# Same map as above without China. The count columns are selected explicitly so
# that text columns (e.g. Province/State, which mixes strings with missing values)
# are never passed to max(), where they cannot be compared.
df_plot = df[df.Country != 'China'].groupby('iso_alpha')[['Confirmed', 'Deaths', 'Recovered']].max().reset_index()
fig = px.choropleth(df_plot, locations="iso_alpha",
                    color="Confirmed", 
                    hover_name="iso_alpha", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

#### No.of persons **died** with Virus around the World

In [None]:
# Per-country maximum of the count columns. The columns are selected explicitly so
# that text columns (e.g. Province/State, which mixes strings with missing values)
# are never passed to max(), where they cannot be compared.
df_plot = df.groupby('iso_alpha')[['Confirmed', 'Deaths', 'Recovered']].max().reset_index()
fig = px.choropleth(df_plot, locations="iso_alpha",
                    color="Deaths", 
                    hover_name="iso_alpha", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

#### No.of persons **recovered** from Virus around the World

In [None]:
# Per-country maximum of the count columns. The columns are selected explicitly so
# that text columns (e.g. Province/State, which mixes strings with missing values)
# are never passed to max(), where they cannot be compared.
df_plot = df.groupby('iso_alpha')[['Confirmed', 'Deaths', 'Recovered']].max().reset_index()
fig = px.choropleth(df_plot, locations="iso_alpha",
                    color="Recovered", 
                    hover_name="iso_alpha")
fig.show()

**Which Country has maximum no.of people Confirmed with virus?**

In [None]:
df.groupby('Country')['Confirmed'].max()

***China*** has the most no.of Virus Confirmations


**List top 10 countries in the world affected with Virus**

In [None]:
df.groupby('Country')['Confirmed'].max().sort_values(ascending=False)[0:10]

**Which Country has highest no.of deaths due to virus?**

In [None]:
df.groupby('Country')['Deaths'].max().sort_values(ascending=False)

***Mainland China*** is the country that has reported the highest number of deaths from the virus

**Which Country has most no.of people Recovered from virus?**

In [None]:
df.groupby('Country')['Recovered'].max()

***Mainland China*** has the highest number of people recovered from the virus.<br>
***Thailand*** has reported the second-highest number of recoveries.

**Which Country's States were affected the most?**


In [None]:
# df[df.Country == 'China'][['Province/State', 'Confirmed']].groupby('Province/State').max()

In [None]:
df['Province/State'].unique()


In [None]:
import plotly.graph_objs as go
# Builds (but does not display) a province-level map figure centred on China.
# NOTE(review): geojson is the plain string 'china'; plotly presumably needs a GeoJSON
#   dict or a URL to one here — confirm, otherwise no shapes can be drawn.
# NOTE(review): `locations` lists every distinct Province/State in df (all countries),
#   while `z` is a one-column DataFrame of per-province maxima for China rows only,
#   so the two are not aligned one-to-one.
# NOTE(review): zmax=17 caps the colour scale at 17 — confirm this magic number.
fig = go.Figure(go.Choroplethmapbox(geojson='china', locations=df['Province/State'].unique(), z=df[df.Country == 'China'][['Province/State', 'Confirmed']].groupby('Province/State').max(),
                                    colorscale='Cividis', zmin=0, zmax=17,
                                    marker_opacity=0.5, marker_line_width=0))
fig.update_layout(mapbox_style="carto-positron",
                  mapbox_zoom=3, mapbox_center = {"lat": 35.8617, "lon": 104.1954})

**When was the Virus first Confirmed?**


In [None]:
# Earliest observation date. Indexing the sorted Series with [0] would select the
# row whose index label is 0, which is not necessarily the first row after sorting.
df['Date'].min()

**When was the Virus Confirmed recently?**

In [None]:
df['Date'].max()

**How many total no.of persons were identified with Virus on each day**

In [None]:
# Add up all rows of each day to get that day's totals across every country and
# province; max() would only report the single largest row of the day.
df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

In [None]:
# One line for India: add up all India rows of each day, then plot Confirmed over time.
# The caption goes in `title`; `legend` is only an on/off switch, so text passed
# there is never displayed.
df[df.Country == 'India'].groupby('Date')['Confirmed'].sum().plot(kind='line', legend=True, title='Confirmed cases in India')

## Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

plt.rcParams["figure.figsize"] = (16,9)
plt.figure(figsize=(16,9));

**Show the no.of Persons affected with Virus**

In [None]:
df[['Confirmed', 'Deaths', 'Recovered']].max().plot(kind='bar')

**How many entries are being recorded per Country?**

In [None]:
# Bar chart of the number of rows per Country in df.
plt.figure(figsize=(12,7))
chart = sns.countplot(data=df, x='Country', palette='Set1')
# Re-apply the existing tick labels rotated by 45 degrees; the trailing ';' hides the returned list.
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', fontweight='light');

In [None]:
# Row counts per Country, faceted with one subplot row per distinct Date value.
# NOTE(review): with many distinct dates this builds one facet row per date —
# confirm the figure size and runtime are acceptable.
chart = sns.catplot(
    data=df,
    x='Country',
    kind='count',
    palette='Set1',
    row='Date',
    aspect=3,
    height=3
)

**List the Records entered per day for each Country using Widgets**

In [None]:
df.loc[(df['Country'] == "China") & (df['Date'] == '2020-01-23 12:00:00')]

In [None]:
df.loc[df.Country == "China"]

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

from IPython.display import display

In [None]:
# Sentinel dropdown entry meaning "no filter".
ALL = 'ALL'
def uniqueSortedValues(fromList):
    """Return the sorted distinct non-null values of a Series, with ALL placed first.

    Parameters:
        fromList: a pandas Series (for example a DataFrame column).

    Returns:
        A new list: ALL followed by the unique non-null values in ascending order.

    The input is not modified: missing values are dropped on a copy, whereas
    dropna(inplace=True) would alter the caller's Series (and, for a DataFrame
    column, the column object the frame hands out).
    """
    values = sorted(fromList.dropna().unique().tolist())
    values.insert(0, ALL)
    return values

def showRecordsPerDay():
    """Filter df by country and state and aggregate per Country; always returns None."""
    # NOTE(review): `country` and `state` are neither parameters nor locals, so they are
    #   looked up as globals when this runs — confirm they exist before any call.
    # NOTE(review): another function with this same name is defined later in the
    #   notebook and replaces this one once its cell has run.
    # The aggregated frame is assigned to a local and is never used or returned.
    df_plot = df.loc[(df['Country'] == country) & (df['Province/State'] == state)].groupby(['Country']).max()
   
    return None

In [None]:
#df['Province/State'].unique()

In [None]:
dropdown_country = widgets.Dropdown(options = uniqueSortedValues(df['Country']), description = 'Country')

In [None]:
dropdown_state = widgets.Dropdown(options = uniqueSortedValues(df['Province/State']), description = 'State')

In [None]:
output = widgets.Output()

In [None]:
def common_filtering_output(country, state):
    """Show the rows of df matching the selected country and state in ``output``.

    The value 'ALL' for either argument means "do not filter on this field".
    The output widget is cleared first, then the selected rows are displayed in it.
    """
    output.clear_output()

    # Collect one boolean condition per active filter.
    conditions = []
    if country != 'ALL':
        conditions.append(df.Country == country)
    if state != 'ALL':
        conditions.append(df['Province/State'] == state)

    selected = df
    if conditions:
        mask = conditions[0]
        for extra in conditions[1:]:
            mask = mask & extra
        selected = df[mask]

    with output:
        display(selected)
    

In [None]:
def dropdown_country_eventhandler(change):
    """Refresh the filtered table for a new Country selection.

    ``change.new`` is passed as the country; the State value is read from
    ``dropdown_state`` as it currently stands.
    """
    common_filtering_output(change.new, dropdown_state.value)

In [None]:
def dropdown_state_eventhandler(change):
    """Refresh the filtered table for a new State selection.

    ``change.new`` is passed as the state; the Country value is read from
    ``dropdown_country`` as it currently stands.
    """
    common_filtering_output(dropdown_country.value, change.new)

In [None]:
# Register the handlers for changes of each dropdown's `value` trait.
dropdown_country.observe(dropdown_country_eventhandler, names='value')
dropdown_state.observe(dropdown_state_eventhandler, names='value')

In [None]:
#display(dropdown_country)
#display(dropdown_state)

In [None]:
df.loc[df['Country'] == 'China']

In [None]:
df[df.Country == 'China'].tail(10)

In [None]:
'''
column = 'Country'
country = 'China'
start_date = '2020-02-09 23:20:00'
df_plot = df.loc[(df[column] == country) & (df['Date'].dt.strftime('%Y-%m-%d') == '2020-02-08')]
chart = sns.catplot(
                data=df_plot,
                x='Country',
                y='Confirmed',
                kind = 'bar',
                palette='viridis',
                row='Date',
                aspect=1,
                height=5)
'''

In [None]:
@interact
def showRecordsPerDay(column = ['Confirmed', 'Deaths', 'Recovered'], country = df.Country.unique(), start_date = widgets.DatePicker(value = pd.to_datetime('2020-01-22')) ):
    """
    Display a bar chart of one count column for one country on one day.

    Parameters (each is turned into a widget by ``@interact``):
        column: the count column to plot ('Confirmed', 'Deaths' or 'Recovered').
        country: the country whose rows are shown.
        start_date: the day to show; handled gracefully when no date is selected.

    Prints a message instead of plotting when no date is selected or when no
    rows match the selection. Always returns None.
    """
    # Without this guard an empty date picker would fail on .strftime below.
    if start_date is None:
        print("No date selected")
        return None

    selected_day = start_date.strftime("%Y-%m-%d")
    df_plot = df.loc[(df['Country'] == country) & (df['Date'].dt.strftime('%Y-%m-%d') == selected_day)]

    # Check for "nothing to plot" explicitly instead of catching every exception,
    # so genuine errors are no longer reported as missing records.
    if df_plot.empty:
        print("No records exists for", country, "on", selected_day)
        return None

    sns.catplot(
        data=df_plot,
        x='Country',
        y=column,
        kind = 'bar',
        palette='viridis',
        row='Date',
        aspect=1,
        height=3)

    return None


In [None]:
byState = df.groupby(['Country', 'Province/State']).size().unstack()
#print(byState)

In [None]:
# Heatmap of byState (row counts per Country x Province/State, built in the previous cell).
plt.figure(figsize=(10,10))
g = sns.heatmap(
    byState, 
    square=True,
    cbar_kws={'fraction' : 0.01},
    cmap='RdYlGn_r',
    linewidth=1
)

# Optional tick-label rotation, currently disabled.
#g.set_xticklabels(g.get_xticklabels(), rotation=45, horizontalalignment='right')
#g.set_yticklabels(g.get_yticklabels(), rotation=0, horizontalalignment='right')

**List the States in China which were affected**

In [None]:
df[df.Country == 'China'][['Province/State', 'Deaths', 'Recovered']].groupby('Province/State').max().plot(kind='bar')

**List the States in China which were affected except Hubei**

In [None]:
# Three pie charts (Confirmed, Deaths, Recovered) of the per-province maxima for
# China without Hubei.
f, axes = plt.subplots(1,3, figsize=(30,20))
# Both conditions are combined into a single mask: applying a full-length mask to
# an already filtered frame forces pandas to re-align it (with a warning).
# The columns are selected with a list; tuple-style selection on a groupby
# (gb['a', 'b']) is no longer supported by current pandas.
(
    df[(df.Country == 'China') & (df['Province/State'] != 'Hubei')]
    .groupby('Province/State')[['Confirmed', 'Deaths', 'Recovered']]
    .max()
    .plot(kind='pie', subplots=True, ax=axes, legend=None)
);

**No.of Confirmed vs Deaths vs Recovered except China**

In [None]:
# Daily maximum of the three count columns for all rows outside China. The columns
# are selected explicitly so text columns are never passed to max().
df[df.Country != 'China'].groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].max().plot(kind='line')

**No.of Confirmed vs Deaths vs Recovered in China**

In [None]:
# Daily maximum of the three count columns for the China rows. The columns are
# selected explicitly so text columns are never passed to max().
df[df.Country == 'China'].groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].max().plot(kind='line')

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
plt.rcParams['figure.figsize'] = (13, 13)
# The input text is every row's Country value joined by single spaces, so a country
# is repeated once per row it has in df.
# NOTE(review): multi-word country names are presumably split into separate words by
# WordCloud, and ' '.join(...) requires every Country value to be a string — confirm.
wordcloud = WordCloud(stopwords=STOPWORDS,background_color = 'black', width = 1000,  height = 1000, max_words = 121).generate(' '.join(df['Country']))
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Popular Countries affected with Coronavirus',fontsize = 30)
plt.show()

**No.of Confirmed vs Deaths vs Recovered in India**