In [None]:
# Dependencies; add others as needed
from urllib.parse import quote_plus

import country_converter as coco # please install this dependency
import pandas as pd
import pycountry # please install this dependency
from sqlalchemy import create_engine, inspect

from config import username, password

# Extract Data

### Extract Coronavirus (nCOV19) data

In [None]:
# Reading Coronavirus data downloaded from Kaggle
# The path is relative, so the notebook must be run from the folder that contains ./Resources
nCov19_df = pd.read_csv('./Resources/2019_nCoV_data.csv')

# Check dataframe (first rows only)
nCov19_df.head()

In [None]:
# Check the data type pandas inferred for each column
nCov19_df.dtypes

In [None]:
# Getting summary statistics for the dataframe
nCov19_df.describe()

In [None]:
# Checking number of rows, grouped by country
# Note: despite the "_df" suffix, country_ncov19_df holds a GroupBy object, not a DataFrame
country_ncov19_df = nCov19_df.groupby('Country')
# Count the non-null 'Sno' values in each country group
country_ncov19_df['Sno'].count()

### Extract SARS Outbreak data

In [None]:
# Load the SARS outbreak data downloaded from Kaggle
original_sars03_df = pd.read_csv('./Resources/sars_2003_complete_dataset_clean.csv')

# Work on a renamed copy so the frame as loaded stays available
sars03_df = original_sars03_df.rename(
    columns={
        'Country': 'country',
        'Cumulative number of case(s)': 'confirmed_sars',
        'Number of deaths': 'deaths_sars',
        'Number recovered': 'recovered_sars',
    }
).copy()

# Check dataframe
sars03_df.head()

# Transform Data

### Getting a standardized country list

In [None]:
### Importing the country list from pycountry and building name lists for later use
pycntrylst = list(pycountry.countries)

# Three parallel lists, one entry per country; an empty string stands in
# wherever a country does not expose the optional attribute.
name = [entry.name for entry in pycntrylst]
common_name = [getattr(entry, "common_name", "") for entry in pycntrylst]
official_name = [getattr(entry, "official_name", "") for entry in pycntrylst]

# Check list
name

### Creating a function to identify invalid country names

In [None]:
def country_name_check(input_country_list, valid_names=None):
    """Return the distinct labels in ``input_country_list`` that are not valid names.

    Parameters
    ----------
    input_country_list : iterable
        Country labels to validate, e.g. a DataFrame column.
    valid_names : iterable, optional
        Names accepted as valid. When omitted, the notebook-level ``name``
        list is used, so existing one-argument calls behave as before.

    Returns
    -------
    list
        Each unrecognised label exactly once, in no particular order.
    """
    if valid_names is None:
        # Fall back to the notebook-level list of standard country names.
        valid_names = name
    # Build the lookup once so each membership test is a hash lookup
    # instead of a scan over the whole list.
    valid_lookup = set(valid_names)
    return list({label for label in input_country_list if label not in valid_lookup})

### Creating a function to convert the invalid names to standard names to reduce manual comparison

In [None]:
def convert_name(invalid_countrynames):
    """Look up a short standard name for each of the given country labels.

    Passes ``invalid_countrynames`` to ``coco.convert`` with
    ``to='name_short'`` and returns that call's result unchanged.
    """
    return coco.convert(names=invalid_countrynames, to='name_short')

## Clean, standardize, and organize nCOV19 data

In [None]:
print("This is the list of invalid names: ")

# List the nCov19 country labels that are not in the standard name list
country_name_check(nCov19_df['Country'])

In [None]:
# Labels copied by hand from the invalid-name check above; convert_name suggests
# alternatives for them (the "name" list can also be consulted directly)
invalid_list_ncov19 = [
    'Russia', 'Mainland China', 'Macau', 'Taiwan', 'Ivory Coast',
    'Others', 'Vietnam', 'US', 'UK', 'South Korea',
]

print("This is an option that can be matched to the standardized list: ")

convert_name(invalid_list_ncov19)

In [None]:
# Replace the non-standard labels in the 'Country' column with standardized names.
# 'Others' has no entry in the mapping, so it is left as it is.
updated_nCov19_df = nCov19_df.replace(
    {
        'Country': {
            "Russia": "Russian Federation",
            "Mainland China": "China",
            "Macau": "Macao",
            "Taiwan": "Taiwan, Province of China",
            "Ivory Coast": "Côte d'Ivoire",
            "Vietnam": "Viet Nam",
            "US": "United States",
            "UK": "United Kingdom",
            "South Korea": "Korea, Republic of",
        }
    }
)

In [None]:
# Re-run the name check on the renamed frame to see which labels are still non-standard
country_name_check(updated_nCov19_df['Country'])

In [None]:
# perform a quick check on the country names
# (shows, per country label, the number of non-null values in every other column)
updated_nCov19_df.groupby('Country').count()

In [None]:
# Aggregate per country by summing the numeric columns (the confirmed cases,
# deaths, and recoveries this step is meant to total), instead of repeating
# the row counts shown by the quick check. numeric_only=True keeps text
# columns out of the sum.
# NOTE(review): if each row is a cumulative figure per date, summing across all
# dates over-counts; confirm, and restrict to the latest date first if so.
updated_nCov19_df.groupby('Country').sum(numeric_only=True)

## Clean, standardize, and organize SARS outbreak data

In [None]:
print("This is the list of invalid names: ")

# List the SARS country labels that are not in the standard name list
country_name_check(sars03_df['country'])

In [None]:
print("This is an option that can be matched to the standardized list: ")

# Labels copied by hand from the invalid-name check above; convert_name suggests
# alternatives for them (the "name" list can also be consulted directly)
invalid_list_sars03 = [
    'Taiwan, China',
    'Macao SAR, China',
    'Hong Kong SAR, China',
    'Republic of Ireland',
    'Republic of Korea',
]

convert_name(invalid_list_sars03)

In [None]:
# Replace the non-standard labels with standardized names.
# The mapping must be keyed on 'country' (lower case): that is the column name
# in sars03_df after the rename at load time. A key that matches no column is
# skipped by replace() without any error, leaving every label unchanged.
# NOTE(review): 'Hong Kong SAR, China' is folded into 'China' while Macao keeps
# its own entry; confirm this merge is intended.
updated_sars03_df = sars03_df.replace({'country': {'Taiwan, China':'Taiwan, Province of China',
                                                  'Macao SAR, China':'Macao',
                                                  'Hong Kong SAR, China':'China',
                                                  'Republic of Ireland':'Ireland',
                                                  'Republic of Korea':'Korea, Republic of'}})

In [None]:
# Re-run the name check on the renamed frame to see which labels are still non-standard
country_name_check(updated_sars03_df['country'])

In [None]:
# perform a quick check on the country names
# (shows, per country label, the number of non-null values in every other column)

updated_sars03_df.groupby('country').count()

In [None]:
# TODO: drop rows without any country information (not yet implemented)

In [None]:
# Sum the columns for every (Date, country) pair, then move 'country' back
# into a regular column so that only 'Date' remains as the index
aggregate_by_date_sars03 = (
    updated_sars03_df
    .groupby(['Date', 'country'])
    .sum()
    .reset_index(level='country')
)
aggregate_by_date_sars03

In [None]:
# Find the last date in the DataFrame.
# NOTE(review): if 'Date' holds strings, max() compares them as text; that
# matches chronological order only for ISO-style dates. Confirm the format.
index_date_sars03 = aggregate_by_date_sars03.reset_index()
last_date_sars03 = index_date_sars03['Date'].max()

# Filter the DataFrame by the last date in the 'Date' index.
# The list selector [last_date_sars03] makes .loc return a DataFrame even when
# only one row matches; a scalar label would collapse that case to a Series,
# and dropping the 'Date' column below would then fail.
final_table_sars03 = (
    aggregate_by_date_sars03
    .loc[[last_date_sars03]]
    .reset_index()
    .drop(['Date'], axis=1)
    .sort_values(['country'], ascending=True)
)

# Load data

In [None]:
# Create the database connection.
# The credentials are percent-encoded so that characters such as '@', ':' or '/'
# in the username or password cannot be misread as URL delimiters.
# NOTE(review): assumes config.py stores the raw (not already encoded) values.
rds_connection_string = f"{quote_plus(username)}:{quote_plus(password)}@localhost:5432/sars_ncov19_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# Check for tables.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported way to list table names.
inspect(engine).get_table_names()

In [None]:
# TODO: load the nCov19 dataframe into the database (not yet implemented)



# TODO: confirm the data has been added by querying the nCOV19 table (not yet implemented)



In [None]:
# Load SARS outbreak dataframe into database
# if_exists='append' adds rows to an existing 'sars03_data' table, so running this
# cell again inserts the same rows a second time.
# index=False keeps the DataFrame index from being written as a column.
final_table_sars03.to_sql(name='sars03_data', con=engine, if_exists='append', index=False)

# Confirm data has been added by querying the SARS outbreak table (first rows only)
pd.read_sql_query('select * from sars03_data', con=engine).head()