# Data Source

https://github.com/CSSEGISandData/COVID-19

# Libraries

In [570]:
# import libraries
# ================

# for date and time operations
from datetime import datetime, timedelta
# for file and folder operations
import os
# for regular expression operations
import re
# for listing files in a folder
import glob
# for getting web contents
import requests 
# storing and analysing data
import pandas as pd
# for scraping web contents
from bs4 import BeautifulSoup
# to download data
import wget
# numerical analysis
import numpy as np

# Downloading data

In [571]:
# remove all existing csv files
# (done in Python rather than with the shell's `rm`, so the cell does not
#  depend on a Unix shell and stays quiet when there is nothing to delete)
for stale_csv in glob.glob('*.csv'):
    os.remove(stale_csv)

# urls of the files
urls = ['https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv', 
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv']

# download files into the current working directory
for url in urls:
    filename = wget.download(url)

# Dataframes

In [572]:
# dataset
# ======

# one frame per downloaded time series: confirmed, deaths, recovered
conf_df, deaths_df, recv_df = map(pd.read_csv, (
    'time_series_covid19_confirmed_global.csv',
    'time_series_covid19_deaths_global.csv',
    'time_series_covid19_recovered_global.csv',
))

In [573]:
# conf_df.head()
# deaths_df.head()
# recv_df.head()

In [574]:
# conf_df.columns
# deaths_df.columns
# recv_df.columns

In [575]:
# conf_df.columns[4:]
# deaths_df.columns[4:]
# recv_df.columns[4:]

# Merging dataframes

In [576]:
# every column after the first four is treated as a date column
dates = conf_df.columns[4:]

# columns that identify a location and are kept as-is while melting
id_cols = ['Province/State', 'Country/Region', 'Lat', 'Long']

# melt dataframes into longer format
# ==================================
conf_df_long = pd.melt(conf_df, id_vars=id_cols, value_vars=dates,
                       var_name='Date', value_name='Confirmed')

deaths_df_long = pd.melt(deaths_df, id_vars=id_cols, value_vars=dates,
                         var_name='Date', value_name='Deaths')

recv_df_long = pd.melt(recv_df, id_vars=id_cols, value_vars=dates,
                       var_name='Date', value_name='Recovered')

# the recovered series is used without its Canada rows
recv_df_long = recv_df_long.loc[recv_df_long['Country/Region'].ne('Canada')]

# (rows, columns) of the three long tables, one per line
print(conf_df_long.shape, deaths_df_long.shape, recv_df_long.shape, sep='\n')

(54264, 6)
(54264, 6)
(51408, 6)


In [577]:
# merge dataframes
# ================

# confirmed is the base table; deaths and recovered are left-joined onto it
# using the location columns plus the date
merge_keys = ['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
full_table = (
    conf_df_long
    .merge(deaths_df_long, how='left', on=merge_keys)
    .merge(recv_df_long, how='left', on=merge_keys)
)

full_table.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.93911,67.709953,1/22/20,0,0,0.0
1,,Albania,41.1533,20.1683,1/22/20,0,0,0.0
2,,Algeria,28.0339,1.6596,1/22/20,0,0,0.0
3,,Andorra,42.5063,1.5218,1/22/20,0,0,0.0
4,,Angola,-11.2027,17.8739,1/22/20,0,0,0.0


# Preprocessing

In [578]:
# parse the date labels into datetime values
full_table['Date'] = pd.to_datetime(full_table['Date'])

# rows without a recovered count (left-join misses) get 0,
# after which the column can be stored as integers
full_table['Recovered'] = full_table['Recovered'].fillna(0).astype('int')

In [579]:
# fixing Country names
# ====================

# renaming countries: 'Korea, South' -> 'South Korea', 'Mainland China' -> 'China'
full_table['Country/Region'] = full_table['Country/Region'].replace(
    {'Korea, South': 'South Korea', 'Mainland China': 'China'}
)

# Greenland is listed as a province; label it as its own country
full_table.loc[full_table['Province/State'].eq('Greenland'), 'Country/Region'] = 'Greenland'

In [580]:
# removing
# =======

# removing canada's recovered values (province label contains 'Recovered');
# rows with a missing province are kept
full_table = full_table[~full_table['Province/State'].str.contains('Recovered', na=False)]

# removing county wise data (province label contains a comma) to avoid double counting
full_table = full_table[~full_table['Province/State'].str.contains(',', na=False)]

In [581]:
# Active cases are the confirmed ones that are neither dead nor recovered
full_table['Active'] = full_table['Confirmed'].sub(full_table['Deaths']).sub(full_table['Recovered'])

# filling missing values 
# ======================
# a missing province/state becomes ''
full_table['Province/State'] = full_table['Province/State'].fillna('')
# a missing count becomes 0
cols = ['Confirmed', 'Deaths', 'Recovered', 'Active']
full_table[cols] = full_table[cols].fillna(0)

# Recovered is kept as an integer column
full_table['Recovered'] = full_table['Recovered'].astype('int')

# six random rows as a sanity check
full_table.sample(n=6)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
40160,,Vietnam,14.058324,108.277199,2020-06-20,349,0,327,22
40884,Aruba,Netherlands,12.5211,-69.9683,2020-06-23,101,3,98,0
16959,,Paraguay,-23.4425,-58.4438,2020-03-25,37,3,0,34
27045,,Monaco,43.7333,7.4167,2020-05-02,95,4,78,13
48148,,Algeria,28.0339,1.6596,2020-07-21,24278,1100,16646,6532
16816,Chongqing,China,30.0572,107.874,2020-03-25,578,6,570,2


# Fixing off data

In [582]:
# replacement confirmed count, keyed by province
feb_12_conf = dict(Hubei=34874)

In [583]:
# function to change value
def change_val(date, ref_col, val_col, dtnry, table=None):
    """Overwrite ``val_col`` on the rows of one date that match the given keys.

    Parameters
    ----------
    date
        Value compared with ``==`` against the 'Date' column.
    ref_col
        Name of the column whose values are matched against the keys of ``dtnry``.
    val_col
        Name of the column that receives the replacement values.
    dtnry
        Dict mapping a ``ref_col`` value to the new ``val_col`` value.
    table
        DataFrame to modify in place. When omitted, the notebook-level
        ``full_table`` is modified, which is what the original function did.

    Returns
    -------
    None
        The table is changed in place.
    """
    if table is None:
        table = full_table
    for key, val in dtnry.items():
        # only rows matching both the date and the reference key are touched
        table.loc[(table['Date']==date) & (table[ref_col]==key), val_col] = val

In [584]:
# apply the replacement confirmed counts for 2/12/20, matched on province
change_val(date='2/12/20', ref_col='Province/State', val_col='Confirmed', dtnry=feb_12_conf)

In [585]:
# show the Hubei row for 2/12/20 to confirm the new value is in place
full_table.loc[(full_table['Date'] == '2/12/20') & (full_table['Province/State'] == 'Hubei')]

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
5655,Hubei,China,30.9756,112.2707,2020-02-12,34874,1068,2686,29612


In [586]:
# Ship
# ====

# boolean mask of the rows that refer to a ship, whether the ship name
# appears in the province column or in the country column
ship_rows = (
    full_table['Province/State'].str.contains('Grand Princess')
    | full_table['Province/State'].str.contains('Diamond Princess')
    | full_table['Country/Region'].str.contains('Diamond Princess')
    | full_table['Country/Region'].str.contains('MS Zaandam')
)

# all ship rows
ship = full_table.loc[ship_rows]

# ship rows of the most recent date only
ship_latest = ship.loc[ship['Date'] == max(ship['Date'])]
# ship_latest.style.background_gradient(cmap='Pastel1_r')

# the main table continues without the ship rows
full_table = full_table.loc[~ship_rows]

# WHO Region

https://en.wikipedia.org/wiki/WHO_regions

In [587]:
def _region_members(names):
    """Split a comma-separated string of country names into a list of trimmed names."""
    return [name.strip() for name in names.split(',')]


# African Region AFRO
afro = _region_members("Algeria, Angola, Cabo Verde, Eswatini, Sao Tome and Principe, Benin, South Sudan, Western Sahara, Congo (Brazzaville), Congo (Kinshasa), Cote d'Ivoire, Botswana, Burkina Faso, Burundi, Cameroon, Cape Verde, Central African Republic, Chad, Comoros, Ivory Coast, Democratic Republic of the Congo, Equatorial Guinea, Eritrea, Ethiopia, Gabon, Gambia, Ghana, Guinea, Guinea-Bissau, Kenya, Lesotho, Liberia, Madagascar, Malawi, Mali, Mauritania, Mauritius, Mozambique, Namibia, Niger, Nigeria, Republic of the Congo, Rwanda, São Tomé and Príncipe, Senegal, Seychelles, Sierra Leone, Somalia, South Africa, Swaziland, Togo, Uganda, Tanzania, Zambia, Zimbabwe")

# Region of the Americas PAHO
paho = _region_members('Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Canada, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, United States, US, Uruguay, Venezuela')

# South-East Asia Region SEARO
searo = _region_members('Bangladesh, Bhutan, North Korea, India, Indonesia, Maldives, Myanmar, Burma, Nepal, Sri Lanka, Thailand, Timor-Leste')

# European Region EURO
euro = _region_members('Albania, Andorra, Greenland, Kosovo, Holy See, Liechtenstein, Armenia, Czechia, Austria, Azerbaijan, Belarus, Belgium, Bosnia and Herzegovina, Bulgaria, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Georgia, Germany, Greece, Hungary, Iceland, Ireland, Israel, Italy, Kazakhstan, Kyrgyzstan, Latvia, Lithuania, Luxembourg, Malta, Monaco, Montenegro, Netherlands, North Macedonia, Norway, Poland, Portugal, Moldova, Romania, Russia, San Marino, Serbia, Slovakia, Slovenia, Spain, Sweden, Switzerland, Tajikistan, Turkey, Turkmenistan, Ukraine, United Kingdom, Uzbekistan')

# Eastern Mediterranean Region EMRO
emro = _region_members('Afghanistan, Bahrain, Djibouti, Egypt, Iran, Iraq, Jordan, Kuwait, Lebanon, Libya, Morocco, Oman, Pakistan, Palestine, West Bank and Gaza, Qatar, Saudi Arabia, Somalia, Sudan, Syria, Tunisia, United Arab Emirates, Yemen')

# Western Pacific Region WPRO
wpro = _region_members('Australia, Brunei, Cambodia, China, Cook Islands, Fiji, Japan, Kiribati, Laos, Malaysia, Marshall Islands, Micronesia, Mongolia, Nauru, New Zealand, Niue, Palau, Papua New Guinea, Philippines, South Korea, Samoa, Singapore, Solomon Islands, Taiwan, Taiwan*, Tonga, Tuvalu, Vanuatu, Vietnam')

# country name -> WHO region label.
# The regions are applied in this order, so a name that appears in more than
# one list (e.g. Somalia in both AFRO and EMRO) ends up with the later label.
who_region = {}
for region_name, members in [
    ('Africa', afro),
    ('Americas', paho),
    ('South-East Asia', searo),
    ('Europe', euro),
    ('Eastern Mediterranean', emro),
    ('Western Pacific', wpro),
]:
    who_region.update(dict.fromkeys(members, region_name))

In [588]:
# tag every row with the WHO region looked up from its country name
full_table['WHO Region'] = full_table['Country/Region'].map(who_region)

# country names that have no entry in who_region
full_table.loc[full_table['WHO Region'].isna(), 'Country/Region'].unique()

array([], dtype=object)

In [589]:
# Cleaning data
# =============

# country labels: Greenland becomes its own country, 'Mainland China' becomes 'China'
full_table.loc[full_table['Province/State'].eq('Greenland'), 'Country/Region'] = 'Greenland'
full_table['Country/Region'] = full_table['Country/Region'].replace({'Mainland China': 'China'})

# Active is recomputed from the current counts: confirmed - deaths - recovered
full_table['Active'] = full_table['Confirmed'].sub(full_table['Deaths']).sub(full_table['Recovered'])

# missing provinces become '', missing counts become 0
full_table['Province/State'] = full_table['Province/State'].fillna('')
full_table = full_table.fillna({'Confirmed': 0, 'Deaths': 0, 'Recovered': 0, 'Active': 0})

# Recovered is kept as an integer column
full_table['Recovered'] = full_table['Recovered'].astype('int')

full_table.sample(n=6)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region
7385,,Philippines,12.879721,121.774017,2020-02-18,3,1,1,1,Western Pacific
27325,,Niger,17.607789,8.081666,2020-05-03,750,36,518,196,Africa
32277,,Congo (Brazzaville),-0.228,15.8277,2020-05-22,469,16,137,316,Africa
31578,,Netherlands,52.1326,5.2913,2020-05-19,44249,5715,0,38534,Europe
48785,,Egypt,26.820553,30.802498,2020-07-23,90413,4480,31066,54867,Eastern Mediterranean
41657,,Lebanon,33.8547,35.8623,2020-06-26,1697,33,1144,520,Eastern Mediterranean


# Saving final data

In [590]:
# write the cleaned table to disk, without the row index
full_table.to_csv(path_or_buf='covid_19_clean_complete.csv', index=False)

# Full Grouped

In [591]:
# Grouped by day, country
# =======================

# One row per (date, country) with the counts summed over provinces.
# The value columns are selected with a list: selecting several columns of a
# GroupBy with a bare tuple of names was deprecated and is rejected by
# current pandas.
full_grouped = full_table.groupby(['Date', 'Country/Region'])[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()

# new cases ======================================================

# GroupBy over (country, date); it is aggregated in the following cells
temp = full_grouped.groupby(['Country/Region', 'Date', ])[['Confirmed', 'Deaths', 'Recovered']]
temp.tail(10)


  after removing the cwd from sys.path.
  


Unnamed: 0,Confirmed,Deaths,Recovered
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
38143,15184,105,8369
38144,10,1,8
38145,1841,528,937
38146,8501,246,7233


In [592]:
# percentage change of the daily new counts
#
# Both steps are computed per country. A plain .diff().pct_change() over the
# whole table would difference each country's first row against the previous
# country's last row, and that cross-country value would then feed into the
# percentage change of the country's second row.
pc = (
    temp.sum()
        .groupby(level='Country/Region').diff()        # new counts per day, per country
        .groupby(level='Country/Region').pct_change()  # relative change of those new counts
)
pc = pc.reset_index()
pc.columns = ['Country/Region', 'Date','% diff new cases', '% diff new deaths', '% diff new recovered']
pc.tail(10)

Unnamed: 0,Country/Region,Date,% diff new cases,% diff new deaths,% diff new recovered
38138,Zimbabwe,2020-08-03,-0.412214,9.0,7.2
38139,Zimbabwe,2020-08-04,-0.051948,-0.9,3.414634
38140,Zimbabwe,2020-08-05,-1.0,-1.0,-1.0
38141,Zimbabwe,2020-08-06,inf,inf,inf
38142,Zimbabwe,2020-08-07,-0.050847,5.0,2.115385
38143,Zimbabwe,2020-08-08,0.107143,-1.0,-0.123457
38144,Zimbabwe,2020-08-09,-0.403226,inf,-0.704225
38145,Zimbabwe,2020-08-10,0.337838,-1.0,3.142857
38146,Zimbabwe,2020-08-11,-0.292929,,-1.0
38147,Zimbabwe,2020-08-12,0.071429,inf,inf


In [593]:
# day-over-day change of the summed counts; at this point the first row of
# each country is still differenced against the row just above it
temp = (
    temp.sum()
        .diff()
        .reset_index()
)

temp.head(10)



Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered
0,Afghanistan,2020-01-22,,,
1,Afghanistan,2020-01-23,0.0,0.0,0.0
2,Afghanistan,2020-01-24,0.0,0.0,0.0
3,Afghanistan,2020-01-25,0.0,0.0,0.0
4,Afghanistan,2020-01-26,0.0,0.0,0.0
5,Afghanistan,2020-01-27,0.0,0.0,0.0
6,Afghanistan,2020-01-28,0.0,0.0,0.0
7,Afghanistan,2020-01-29,0.0,0.0,0.0
8,Afghanistan,2020-01-30,0.0,0.0,0.0
9,Afghanistan,2020-01-31,0.0,0.0,0.0


In [594]:
# put the percentage changes next to the daily differences
temp = temp.merge(pc, on=['Country/Region', 'Date'])

temp.tail(10)

Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,% diff new cases,% diff new deaths,% diff new recovered
38138,Zimbabwe,2020-08-03,154.0,10.0,41.0,-0.412214,9.0,7.2
38139,Zimbabwe,2020-08-04,146.0,1.0,181.0,-0.051948,-0.9,3.414634
38140,Zimbabwe,2020-08-05,0.0,0.0,0.0,-1.0,-1.0,-1.0
38141,Zimbabwe,2020-08-06,118.0,3.0,26.0,inf,inf,inf
38142,Zimbabwe,2020-08-07,112.0,18.0,81.0,-0.050847,5.0,2.115385
38143,Zimbabwe,2020-08-08,124.0,0.0,71.0,0.107143,-1.0,-0.123457
38144,Zimbabwe,2020-08-09,74.0,2.0,21.0,-0.403226,inf,-0.704225
38145,Zimbabwe,2020-08-10,99.0,0.0,87.0,0.337838,-1.0,3.142857
38146,Zimbabwe,2020-08-11,70.0,0.0,0.0,-0.292929,,-1.0
38147,Zimbabwe,2020-08-12,75.0,18.0,96.0,0.071429,inf,inf


In [595]:
# True wherever the country differs from the row above,
# i.e. on the first row of every country
mask = temp['Country/Region'].ne(temp['Country/Region'].shift(1))

mask

0         True
1        False
2        False
3        False
4        False
         ...  
38143    False
38144    False
38145    False
38146    False
38147    False
Name: Country/Region, Length: 38148, dtype: bool

In [596]:
# the first row of a country has no previous day of the same country,
# so all of its difference columns are blanked
temp.loc[mask, ['Confirmed', 'Deaths', 'Recovered',
                '% diff new cases', '% diff new deaths', '% diff new recovered']] = np.nan

temp.tail(10)

Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,% diff new cases,% diff new deaths,% diff new recovered
38138,Zimbabwe,2020-08-03,154.0,10.0,41.0,-0.412214,9.0,7.2
38139,Zimbabwe,2020-08-04,146.0,1.0,181.0,-0.051948,-0.9,3.414634
38140,Zimbabwe,2020-08-05,0.0,0.0,0.0,-1.0,-1.0,-1.0
38141,Zimbabwe,2020-08-06,118.0,3.0,26.0,inf,inf,inf
38142,Zimbabwe,2020-08-07,112.0,18.0,81.0,-0.050847,5.0,2.115385
38143,Zimbabwe,2020-08-08,124.0,0.0,71.0,0.107143,-1.0,-0.123457
38144,Zimbabwe,2020-08-09,74.0,2.0,21.0,-0.403226,inf,-0.704225
38145,Zimbabwe,2020-08-10,99.0,0.0,87.0,0.337838,-1.0,3.142857
38146,Zimbabwe,2020-08-11,70.0,0.0,0.0,-0.292929,,-1.0
38147,Zimbabwe,2020-08-12,75.0,18.0,96.0,0.071429,inf,inf


In [597]:


# renaming columns: the differenced counts are the "new" figures of each day
temp = temp.rename(columns={'Confirmed': 'New cases',
                            'Deaths': 'New deaths',
                            'Recovered': 'New recovered'})
# =================================================================
temp.tail(10)

Unnamed: 0,Country/Region,Date,New cases,New deaths,New recovered,% diff new cases,% diff new deaths,% diff new recovered
38138,Zimbabwe,2020-08-03,154.0,10.0,41.0,-0.412214,9.0,7.2
38139,Zimbabwe,2020-08-04,146.0,1.0,181.0,-0.051948,-0.9,3.414634
38140,Zimbabwe,2020-08-05,0.0,0.0,0.0,-1.0,-1.0,-1.0
38141,Zimbabwe,2020-08-06,118.0,3.0,26.0,inf,inf,inf
38142,Zimbabwe,2020-08-07,112.0,18.0,81.0,-0.050847,5.0,2.115385
38143,Zimbabwe,2020-08-08,124.0,0.0,71.0,0.107143,-1.0,-0.123457
38144,Zimbabwe,2020-08-09,74.0,2.0,21.0,-0.403226,inf,-0.704225
38145,Zimbabwe,2020-08-10,99.0,0.0,87.0,0.337838,-1.0,3.142857
38146,Zimbabwe,2020-08-11,70.0,0.0,0.0,-0.292929,,-1.0
38147,Zimbabwe,2020-08-12,75.0,18.0,96.0,0.071429,inf,inf


In [598]:

# attach the daily "new" figures and their percentage changes to the totals
full_grouped = full_grouped.merge(temp, on=['Country/Region', 'Date'])

# missing values become 0
full_grouped = full_grouped.fillna(0)

# the daily figures are stored as integers
cols = ['New cases', 'New deaths', 'New recovered']
full_grouped[cols] = full_grouped[cols].astype('int')

# a negative number of new cases is reported as 0
full_grouped['New cases'] = full_grouped['New cases'].clip(lower=0)

full_grouped.tail()

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,% diff new cases,% diff new deaths,% diff new recovered
38143,2020-08-12,West Bank and Gaza,15184,105,8369,6710,309,1,188,-0.153425,-0.75,0.382353
38144,2020-08-12,Western Sahara,10,1,8,1,0,0,0,0.0,0.0,0.0
38145,2020-08-12,Yemen,1841,528,937,376,10,5,22,-11.0,0.0,inf
38146,2020-08-12,Zambia,8501,246,7233,1022,226,5,229,2.476923,inf,0.133663
38147,2020-08-12,Zimbabwe,4893,122,1620,3151,75,18,96,0.071429,inf,inf


In [599]:
# tag every row with the WHO region looked up from its country name
full_grouped['WHO Region'] = full_grouped['Country/Region'].map(who_region)

# country names that have no entry in who_region
full_grouped.loc[full_grouped['WHO Region'].isna(), 'Country/Region'].unique()

array([], dtype=object)

In [600]:
#adding percentage difference for confirmed,deaths, recovered,active #changes made to obtain quick stats on covid project


#percentage difference in confirmed cases

# e.g. (252337 - 234639) / 234639 * 100, i.e.
# (new cases on the last day - new cases on the day before) / new cases on the day before * 100


#full_grouped['new cases % diff'] = full_grouped['']


In [601]:
# write the per-country, per-day table to disk, without the row index
full_grouped.to_csv(path_or_buf='full_grouped.csv', index=False)

In [602]:
# (rows, columns) of the per-country, per-day table
full_grouped.shape

(38148, 13)

# Day wise

In [608]:
# Day wise
# ========

# One row per date with the counts summed over all countries.
# The value columns are selected with a list: selecting several columns of a
# GroupBy with a bare tuple of names was deprecated and is rejected by
# current pandas.
day_wise = full_grouped.groupby('Date')[['Confirmed', 'Deaths', 'Recovered',
                                         'Active', 'New cases', 'New deaths', 'New recovered']].sum().reset_index()

# number cases per 100 cases
day_wise['Deaths / 100 Cases'] = round((day_wise['Deaths']/day_wise['Confirmed'])*100, 2)
day_wise['Recovered / 100 Cases'] = round((day_wise['Recovered']/day_wise['Confirmed'])*100, 2)
day_wise['Deaths / 100 Recovered'] = round((day_wise['Deaths']/day_wise['Recovered'])*100, 2)

# no. of countries with at least one confirmed case on each date.
# The counts are aligned to day_wise by date instead of being assigned
# positionally, so a date on which no country has a confirmed case gets 0
# rather than causing a length mismatch.
countries_per_day = full_grouped[full_grouped['Confirmed']!=0] \
                        .groupby('Date')['Country/Region'] \
                        .nunique()
day_wise['No. of countries'] = day_wise['Date'].map(countries_per_day).fillna(0).astype(int)

# fillna by 0 (0/0 ratios above come out as NaN)
cols = ['Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered']
day_wise[cols] = day_wise[cols].fillna(0)

day_wise.head()

  """


Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries
0,2020-01-22,555,17,28,510,0,0,0,3.06,5.05,60.71,6
1,2020-01-23,654,18,30,606,99,1,2,2.75,4.59,60.0,8
2,2020-01-24,941,26,36,879,287,8,6,2.76,3.83,72.22,9
3,2020-01-25,1434,42,39,1353,493,16,3,2.93,2.72,107.69,11
4,2020-01-26,2118,56,52,2010,684,14,13,2.64,2.46,107.69,13


In [609]:
# write the per-day table to disk, without the row index
day_wise.to_csv(path_or_buf='day_wise.csv', index=False)

# Country wise latest

In [610]:
# Country wise
# ============

full_grouped['Date'] = pd.to_datetime(full_grouped['Date'])

# most recent date in the table, computed once and reused below
latest_date = full_grouped['Date'].max()

# getting latest values
country_wise = full_grouped[full_grouped['Date']==latest_date] \
                    .reset_index(drop=True) \
                    .drop('Date', axis=1)

print(country_wise.shape)

# group by country
# (the value columns are selected with a list: selecting several columns of a
#  GroupBy with a bare tuple of names was deprecated and is rejected by
#  current pandas)
country_wise = country_wise.groupby('Country/Region')[['Confirmed', 'Deaths',
                                                       'Recovered', 'Active',
                                                       'New cases', 'New deaths', 'New recovered']].sum().reset_index()
print(country_wise.shape)


# per 100 cases
country_wise['Deaths / 100 Cases'] = round((country_wise['Deaths']/country_wise['Confirmed'])*100, 2)
country_wise['Recovered / 100 Cases'] = round((country_wise['Recovered']/country_wise['Confirmed'])*100, 2)
country_wise['Deaths / 100 Recovered'] = round((country_wise['Deaths']/country_wise['Recovered'])*100, 2)

# 0/0 ratios above come out as NaN and are reported as 0
cols = ['Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered']
country_wise[cols] = country_wise[cols].fillna(0)


# 1 week increase and % change
# ============================

# confirmed totals on the latest date ...
today = full_grouped[full_grouped['Date']==latest_date] \
            .reset_index(drop=True) \
            .drop('Date', axis=1)[['Country/Region', 'Confirmed']]

# ... and exactly seven days earlier
last_week = full_grouped[full_grouped['Date']==latest_date-timedelta(days=7)] \
                .reset_index(drop=True) \
                .drop('Date', axis=1)[['Country/Region', 'Confirmed']]

temp = pd.merge(today, last_week, on='Country/Region', suffixes=(' today', ' last week'))
temp['1 week change'] = temp['Confirmed today'] - temp['Confirmed last week']
temp = temp[['Country/Region', 'Confirmed last week', '1 week change']]

country_wise = pd.merge(country_wise, temp, on='Country/Region')
# relative to last week's total
country_wise['1 week % increase'] = round(country_wise['1 week change']/country_wise['Confirmed last week']*100, 2)

country_wise['WHO Region'] = country_wise['Country/Region'].map(who_region)
# country names without an entry in who_region (not displayed: only the last
# expression of a cell is rendered)
country_wise[country_wise['WHO Region'].isna()]['Country/Region'].unique()

country_wise.head()

(187, 12)
(187, 8)


  


Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,37345,1354,26694,9297,76,10,279,3.63,71.48,5.07,36829,516,1.4,Eastern Mediterranean
1,Albania,6817,208,3552,3057,141,3,72,3.05,52.11,5.86,5889,928,15.76,Europe
2,Algeria,36699,1333,25627,9739,495,11,364,3.63,69.83,5.2,33055,3644,11.02,Africa
3,Andorra,977,53,855,69,14,1,16,5.42,87.51,6.2,939,38,4.05,Europe
4,Angola,1762,80,577,1105,27,0,2,4.54,32.75,13.86,1395,367,26.31,Africa


In [611]:
# write the latest per-country table to disk, without the row index
country_wise.to_csv(path_or_buf='country_wise_latest.csv', index=False)

# Country wise data

In [612]:
# # china
# # =====

# china_province_wise = full_table[full_table['Country/Region']=='China']
# china_province_wise['Province/State'].unique()
# china_province_wise.to_csv('china_province_wise.csv', index=False)

In [613]:
# # Australia
# # =========

# australia_state_wise = full_table[full_table['Country/Region']=='Australia']
# australia_state_wise['Province/State'].unique()
# australia_state_wise.to_csv('australia_state_wise.csv', index=False)

In [614]:
# # Canada
# # ======

# canada_state_wise = full_table[full_table['Country/Region']=='Canada']
# canada_state_wise['Province/State'].unique()
# canada_state_wise.to_csv('canada_state_wise.csv', index=False)

# USA data

In [615]:
# download data
# =============

# US time series to fetch: confirmed cases first, deaths second
urls = [
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv',
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv',
]

# fetch each file in turn; `filename` ends up holding the last download's return value
for url in urls:
    filename = wget.download(url)

In [616]:
# read data
# =========

# one frame per downloaded US time series: confirmed, then deaths
us_conf_df, us_deaths_df = [
    pd.read_csv(csv_name)
    for csv_name in ('time_series_covid19_confirmed_US.csv',
                     'time_series_covid19_deaths_US.csv')
]

In [617]:
# the first eleven columns of the confirmed table are kept as identifiers
ids = us_conf_df.columns[:11]
# every column after them is melted as a date
us_dates = us_conf_df.columns[11:]

# long format: one row per identifier combination and date
us_conf_df_long = pd.melt(us_conf_df, id_vars=ids, value_vars=us_dates,
                          var_name='Date', value_name='Confirmed')
us_deaths_df_long = pd.melt(us_deaths_df, id_vars=ids, value_vars=us_dates,
                            var_name='Date', value_name='Deaths')

In [618]:
# preview of the long-format confirmed table
us_conf_df_long.head(n=5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",1/22/20,0
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",1/22/20,0
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",1/22/20,0
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",1/22/20,0
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",1/22/20,0


In [619]:
# place the Deaths column next to the confirmed table; rows are paired by index
us_full_table = pd.concat(
    objs=[us_conf_df_long, us_deaths_df_long[['Deaths']]],
    axis='columns',
)

# preview of the combined table
us_full_table.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",1/22/20,0,0
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",1/22/20,0,0
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",1/22/20,0,0
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",1/22/20,0,0
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",1/22/20,0,0


In [620]:
# write the combined US table to disk, without the row index
us_full_table.to_csv(path_or_buf='usa_county_wise.csv', index=False)