In [1]:
# Dependencies; add others as needed
from config import username, password
import pycountry # please install this dependency
import country_converter as coco # please install this dependency
import pandas as pd

# Extract Data

### Extract Coronavirus (nCOV19) data

In [2]:
# Reading Coronavirus data downloaded from Kaggle
nCov19_df = pd.read_csv('./Resources/2019_nCoV_data.csv')

# Check dataframe
nCov19_df.head()

Unnamed: 0,Sno,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020 12:00:00,Anhui,China,01/22/2020 12:00:00,1.0,0.0,0.0
1,2,01/22/2020 12:00:00,Beijing,China,01/22/2020 12:00:00,14.0,0.0,0.0
2,3,01/22/2020 12:00:00,Chongqing,China,01/22/2020 12:00:00,6.0,0.0,0.0
3,4,01/22/2020 12:00:00,Fujian,China,01/22/2020 12:00:00,1.0,0.0,0.0
4,5,01/22/2020 12:00:00,Gansu,China,01/22/2020 12:00:00,0.0,0.0,0.0


In [3]:
# Check data type
nCov19_df.dtypes

Sno                 int64
Date               object
Province/State     object
Country            object
Last Update        object
Confirmed         float64
Deaths            float64
Recovered         float64
dtype: object

In [4]:
# Getting a summary
nCov19_df.describe()

Unnamed: 0,Sno,Confirmed,Deaths,Recovered
count,1719.0,1719.0,1719.0,1719.0
mean,860.0,454.596859,10.441536,44.361838
std,496.376873,3712.529032,106.638286,351.078713
min,1.0,0.0,0.0,0.0
25%,430.5,2.0,0.0,0.0
50%,860.0,12.0,0.0,0.0
75%,1289.5,109.0,0.0,7.0
max,1719.0,59989.0,1789.0,7862.0


In [5]:
# Checking number of rows, grouped by country
country_ncov19_df = nCov19_df.groupby('Country')
country_ncov19_df['Sno'].count()

Country
Australia                84
Belgium                  14
Brazil                    1
Cambodia                 22
Canada                   59
China                    34
Egypt                     4
Finland                  20
France                   25
Germany                  22
Hong Kong                26
India                    19
Italy                    19
Ivory Coast               1
Japan                    27
Macau                    26
Mainland China          801
Malaysia                 25
Mexico                    1
Nepal                    24
Others                   11
Philippines              20
Russia                   18
Singapore                26
South Korea              27
Spain                    18
Sri Lanka                22
Sweden                   18
Taiwan                   26
Thailand                 27
UK                       18
US                      188
United Arab Emirates     20
Vietnam                  26
Name: Sno, dtype: int64

### Extract SARS Outbreak data

In [None]:
# Reading SARS outbreak data downloaded from Kaggle


# Check dataframe


# Transform Data

### Getting a standardized country list

In [6]:
### Importing country list using pycountry and creating a dataframe for later use
pycntrylst = list(pycountry.countries)
name = []
common_name = []
official_name = []
    
for i in pycntrylst:
    name.append(i.name)
    if hasattr(i, "common_name"):
        common_name.append(i.common_name)
    else:
        common_name.append("")
    if hasattr(i, "official_name"):
        official_name.append(i.official_name)
    else:
        official_name.append("")

# Check list
name

['Aruba',
 'Afghanistan',
 'Angola',
 'Anguilla',
 'Åland Islands',
 'Albania',
 'Andorra',
 'United Arab Emirates',
 'Argentina',
 'Armenia',
 'American Samoa',
 'Antarctica',
 'French Southern Territories',
 'Antigua and Barbuda',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Burundi',
 'Belgium',
 'Benin',
 'Bonaire, Sint Eustatius and Saba',
 'Burkina Faso',
 'Bangladesh',
 'Bulgaria',
 'Bahrain',
 'Bahamas',
 'Bosnia and Herzegovina',
 'Saint Barthélemy',
 'Belarus',
 'Belize',
 'Bermuda',
 'Bolivia, Plurinational State of',
 'Brazil',
 'Barbados',
 'Brunei Darussalam',
 'Bhutan',
 'Bouvet Island',
 'Botswana',
 'Central African Republic',
 'Canada',
 'Cocos (Keeling) Islands',
 'Switzerland',
 'Chile',
 'China',
 "Côte d'Ivoire",
 'Cameroon',
 'Congo, The Democratic Republic of the',
 'Congo',
 'Cook Islands',
 'Colombia',
 'Comoros',
 'Cabo Verde',
 'Costa Rica',
 'Cuba',
 'Curaçao',
 'Christmas Island',
 'Cayman Islands',
 'Cyprus',
 'Czechia',
 'Germany',
 'Djibouti',
 'Dominica'

### Creating a function to identify invalid country names

In [7]:
def country_name_check(input_country_list):
    invalid_countrynames =[]
    for j in input_country_list:
        if j not in (name):
            invalid_countrynames.append(j)
    invalid_countrynames = list(set(invalid_countrynames))
    return invalid_countrynames

### Creating a function to convert the invalid names to standard names to reduce manual comparison

In [8]:
def convert_name(invalid_countrynames):
    standard_names = coco.convert(names=invalid_countrynames, to='name_short')
    return standard_names

## Clean, standardize, and organize nCOV19 data

In [9]:
print("This is the list of invalid names: ")

# run the country name check function to identify invalid country names/labels
country_name_check(nCov19_df.Country)

This is the list of invalid names: 


['Taiwan',
 'UK',
 'Macau',
 'Others',
 'Russia',
 'Mainland China',
 'US',
 'Ivory Coast',
 'Vietnam',
 'South Korea']

In [10]:
# copy the invalid list and run the convert_name function to find alternative OR refer to the "name" list
invalid_list_ncov19 = ['Russia',
 'Mainland China',
 'Macau',
 'Taiwan',
 'Ivory Coast',
 'Others',
 'Vietnam',
 'US',
 'UK',
 'South Korea']

print("This is an option that can be matched to the standardized list: ")

convert_name(invalid_list_ncov19)



This is an option that can be matched to the standardized list: 




['Russia',
 'China',
 'Macao',
 'Taiwan',
 "Cote d'Ivoire",
 'not found',
 'Vietnam',
 'United States',
 'not found',
 'South Korea']

In [11]:
# rename the invalid names to the standardize list
updated_nCov19_df = nCov19_df.replace({'Country': {"Russia": "Russian Federation", "Mainland China": "China",
                                                  "Macau":"Macao","Taiwan":"Taiwan, Province of China",
                                                  "Ivory Coast":"Côte d'Ivoire", "Vietnam":"Viet Nam",
                                                  "US":"United States", "UK":"United Kingdom",
                                                  "South Korea":"Korea, Republic of"}})

In [12]:
# running country name check function again to make sure names are correctly renamed/replaced
country_name_check(updated_nCov19_df.Country)

['Others']

In [13]:
# show larger dataframe
pd.set_option('display.max_rows', 500)

# perform a quick check on the country names
updated_nCov19_df.groupby(['Province/State','Country']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sno,Date,Last Update,Confirmed,Deaths,Recovered
Province/State,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anhui,China,27,27,27,27,27,27
Arizona,United States,6,6,6,6,6,6
Bavaria,Germany,5,5,5,5,5,5
Beijing,China,27,27,27,27,27,27
"Boston, MA",United States,17,17,17,17,17,17
British Columbia,Canada,21,21,21,21,21,21
California,United States,6,6,6,6,6,6
Chicago,United States,1,1,1,1,1,1
"Chicago, IL",United States,17,17,17,17,17,17
Chongqing,China,27,27,27,27,27,27


In [14]:
# based on the quick check, Hong Kong, Macao and Taiwan and grouped into China in one instance each
# rename country of these instances to their respective countries

# Hong Kong
updated_nCov19_df.loc[updated_nCov19_df['Province/State'] == 'Hong Kong', 'Country'] = "Hong Kong"

# Macao
updated_nCov19_df.loc[updated_nCov19_df['Province/State'] == 'Macau', 'Country'] = "Macao"

# Taiwan
updated_nCov19_df.loc[updated_nCov19_df['Province/State'] == 'Taiwan', 'Country'] = "Taiwan, Province of China"

# perform a quick check on the country names
updated_nCov19_df.groupby(['Province/State','Country']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sno,Date,Last Update,Confirmed,Deaths,Recovered
Province/State,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anhui,China,27,27,27,27,27,27
Arizona,United States,6,6,6,6,6,6
Bavaria,Germany,5,5,5,5,5,5
Beijing,China,27,27,27,27,27,27
"Boston, MA",United States,17,17,17,17,17,17
British Columbia,Canada,21,21,21,21,21,21
California,United States,6,6,6,6,6,6
Chicago,United States,1,1,1,1,1,1
"Chicago, IL",United States,17,17,17,17,17,17
Chongqing,China,27,27,27,27,27,27


In [15]:
# copy dataframe
nCov19_df_subset = updated_nCov19_df[['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered']].copy()

# aggregate and sum the number of confirmed cases, deaths, and recoveries
nCov19_df_subset = nCov19_df_subset.groupby(['Country','Date']).sum()
nCov19_df_subset.reset_index(inplace = True)

In [29]:
# aggregate by country and get the last value
final_nCov19_df = nCov19_df_subset.groupby('Country').last()
final_nCov19_df.reset_index(inplace = True)

# remove the index column
final_nCov19_df = final_nCov19_df.drop(columns=['index','Date'])

# rename columns
final_nCov19_df = final_nCov19_df.rename(columns = {'Confirmed':'Confirmed_NCOV','Deaths':'Deaths_NCOV',
                                                    'Recovered':'Recovered_NCOV'})
final_nCov19_df

Unnamed: 0,Country,Confirmed_NCOV,Deaths_NCOV,Recovered_NCOV
0,Australia,15.0,0.0,10.0
1,Belgium,1.0,0.0,1.0
2,Brazil,0.0,0.0,0.0
3,Cambodia,1.0,0.0,1.0
4,Canada,8.0,0.0,1.0
5,China,72364.0,1863.0,12455.0
6,Côte d'Ivoire,0.0,0.0,0.0
7,Egypt,1.0,0.0,0.0
8,Finland,1.0,0.0,1.0
9,France,12.0,1.0,4.0


## Clean, standardize, and organize SARS outbreak data

In [None]:
print("This is the list of invalid names: ")

# run the country name check function to identify invalid country names/labels


In [None]:
print("This is an option that can be matched to the standardized list: ")

# copy the invalid list and run the convert_name function to find alternative OR refer to the "name" list
invalid_list_sars = 

convert_name(invalid_list_sars)

In [None]:
# rename the invalid names to the standardize list

In [None]:
# drop data without any country information

In [None]:
# aggregate and sum the number of confirmed cases, deaths, and recoveries

# Load data

In [None]:
# create connection
rds_connection_string = f"{username}:{password}@localhost:5432/sars_ncov19_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# check for tables
engine.table_names()

In [None]:
# Load nCov19 dataframe into database



# Confirm data has been added by querying the nCOV19 table



In [None]:
# Load SARS outbreak dataframe into database



# Confirm data has been added by querying the SARS outbreak table