# 0. Setup

### 0.1 Install and load libraries

In [None]:

from numpy import inf

# time operations
from datetime import timedelta

# for numerical analysis
import numpy as np

# to store and process data in dataframe
import pandas as pd

# basic visualization package
import matplotlib.pyplot as plt

# interactive visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
# import plotly.figure_factory as ff
#from plotly.subplots import make_subplots

# for offline ploting
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

# hide warnings
import warnings
warnings.filterwarnings('ignore')

# to interface with operating system
import os

# for advanced visualization
import seaborn as sns; sns.set()

# for trendlines
import statsmodels

# data manipulation
from datetime import datetime as dt
from scipy.stats.mstats import winsorize

### 0.1 Define Parameters

In [None]:
# Colour palette for the case-type charts.
# rec, dth and act colour the Recovered, Deaths and Active areas respectively;
# cnf is presumably meant for confirmed cases (not used in the cells shown here).
cnf = '#393e46'
dth = '#ff2e63'
rec = '#21bf73'
act = '#fe9801'

### 0.2 Import Data

Please add Covid 19 data from https://www.kaggle.com/imdevskp/corona-virus-report

and Worldometer snapshot data from https://www.kaggle.com/selfishgene/covid19-worldometer-snapshots-since-april-18

In [None]:
# list files
# ==========
# IPython shell escape: print the contents of the COVID-19 report dataset folder
# so the CSV names used by the read_csv calls below can be checked by eye.

!ls ../input/corona-virus-report

In [None]:
# Walk the economic/financial dataset folder and collect the full path of every
# file in it, sorted alphabetically so the order is stable between runs.
files = sorted(
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('../input/korea-econfin-data')
    for file_name in file_names
)
files

In [None]:
# Names for the economic/financial series. They are attached to the CSV files
# purely by position, so this list must follow the alphabetical order of `files`.
series_name = ['btc', 'cpi', 'gold', 'korea', 'high_yield_bond', 'inv_grade_bond', 'moderna', 'employment', 'tesla_robinhood', 
               'trea_20y_bond', 'trea_10y_yield', 'tesla_stock', 'korea_m1', 'wti']

# zip() silently stops at the shorter input, which would leave series unnamed
# or shift every name onto the wrong file. Fail loudly instead.
if len(files) != len(series_name):
    raise ValueError(
        'Expected ' + str(len(series_name)) + ' data files but found ' + str(len(files))
        + '; cannot pair files with series names by position.'
    )

# A lone '.' in a cell is read as a missing value.
series = [pd.read_csv(f, na_values=['.']) for f in files]
series_dict = dict(zip(series_name, series))

### 0.3 Wrangle Data

In [None]:

# Full table (one row per reporting location and date)
# =========

# Load the complete cleaned COVID-19 table; gt_n() later aggregates it by country and date.
full_table = pd.read_csv('../input/corona-virus-report/covid_19_clean_complete.csv')

# Deep dive into the DataFrame
# Examine DataFrame (object type, shape, columns, dtypes)
full_table.info()

# Other quick inspections that can be run here:
# type(full_table)
# full_table.shape
# full_table.columns
# full_table.dtypes
# full_table.head(20)

In [None]:
# Latest snapshot per country
# ===========================

# Load the table, turn empty strings into NaN, then replace every NaN with 0.
country_wise = (
    pd.read_csv('../input/corona-virus-report/country_wise_latest.csv')
    .replace('', np.nan)
    .fillna(0)
)

# Inspect the structure, then preview the first ten rows.
country_wise.info()
country_wise.head(10)

In [None]:
# Daily figures grouped by date and country
# =========================================

full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')

# First look: at this point `Date` is still whatever dtype read_csv inferred.
full_grouped.info()
full_grouped.head(10)

# Parse `Date` into a datetime column, then show the metadata again to confirm.
full_grouped = full_grouped.assign(Date=pd.to_datetime(full_grouped['Date']))
full_grouped.info()

In [None]:
# Daily COVID-19 series for South Korea
# =====================================

# `full_grouped` was loaded, and its `Date` column parsed to datetime, in the
# previous cell, so it is reused here instead of reading the same CSV again.

# NOTE: despite its name, `us_covid` holds the South Korea rows. The name is kept
# because the later merge into `baseline2020` refers to it.
# .copy() makes this an independent frame rather than a slice of `full_grouped`.
us_covid = full_grouped[full_grouped['Country/Region']=="South Korea"].copy()
us_covid.info()
us_covid.tail()

In [None]:
# Worldometer data
# ================

worldometer_data = pd.read_csv('../input/corona-virus-report/worldometer_data.csv')

# Replace missing values '' with NAN and then 0
# What are the alternatives? Drop or impute. Do they make sense in this context?
worldometer_data = worldometer_data.replace('', np.nan).fillna(0)

# Ratios rounded to two decimals. A zero denominator yields inf (x/0) or NaN (0/0).
worldometer_data['Case Positivity'] = round(worldometer_data['TotalCases']/worldometer_data['TotalTests'],2)
worldometer_data['Case Fatality'] = round(worldometer_data['TotalDeaths']/worldometer_data['TotalCases'],2)

# Case Positivity is infinity when TotalTests is zero. Drop those rows here.
# Overwriting the whole row with 0 would also wipe the country name and leave
# placeholder zeros that distort the quantile bins computed next.
worldometer_data = worldometer_data[worldometer_data['Case Positivity'] != inf].copy()

# Qcut is quantile cut. Here we specify three equally sized bins and label them low, medium, and high, respectively.
worldometer_data['Case Positivity Bin'] = pd.qcut(worldometer_data['Case Positivity'], q=3, labels=["low", "medium", "high"])

# Population Structure
worldometer_pop_struc = pd.read_csv('../input/covid19-worldometer-snapshots-since-april-18/population_structure_by_age_per_contry.csv')

# Replace missing values with zeros
worldometer_pop_struc = worldometer_pop_struc.fillna(0)

# Merge worldometer_data with worldometer_pop_struc
# Inner means keep only common key values in both datasets
worldometer_data = worldometer_data.merge(worldometer_pop_struc, how='inner', left_on='Country/Region', right_on='Country')

# Keep observations where column "Country/Region" is not 0
# (a 0 key can only come from a missing country name filled in by fillna(0) above)
worldometer_data = worldometer_data[worldometer_data["Country/Region"] != 0]

# Inspect worldometer_data's metadata
worldometer_data.info()

# Further inspections that can be run here:
# worldometer_data.tail(20)
# worldometer_data["Case Positivity"].describe()
worldometer_data.tail()

The series below are the same data as in Problem Set 1.

In [None]:
def _add_price_features(raw, name, price_col, date_col='Date', patch_gaps=False):
    """Derive return and volatility columns for one price series.

    The same recipe is applied to every asset, so it lives in one place instead
    of being repeated per asset.

    Parameters
    ----------
    raw : DataFrame as loaded from CSV. It is modified in place (the `Date`
        column is added or parsed, the price column is renamed, and the derived
        columns are added), exactly as the per-asset code did before.
    name : short asset name; used as the price column name and as the prefix
        or infix of every derived column.
    price_col : name of the price column in `raw`.
    date_col : name of the column holding the dates in `raw`.
    patch_gaps : when True, a missing price is filled from the previous row and,
        if that is missing too, from two rows back (both taken from the series
        as it was before filling); the price is then cast to float64.

    Returns
    -------
    A new DataFrame with columns `Date`, `<name>`, `<name>_return`,
    `<name>_volatility_1m`, `<name>_volatility_1y` and
    `one_month_forward_<name>_return`.
    """
    return_col = name + '_return'
    vol_1m_col = name + '_volatility_1m'
    vol_1y_col = name + '_volatility_1y'

    raw['Date'] = pd.to_datetime(raw[date_col])
    raw.rename(columns={price_col: name}, inplace=True)

    if patch_gaps:
        raw[name + '_lag1'] = raw[name].shift(1)
        raw[name + '_lag2'] = raw[name].shift(2)
        raw[name] = raw[name].fillna(raw[name + '_lag1'])
        raw[name] = raw[name].fillna(raw[name + '_lag2'])
        raw[name] = raw[name].astype('float64')

    # Row-over-row simple return
    raw[return_col] = raw[name].pct_change()
    # Rolling standard deviation of returns, scaled by the square root of the window length
    raw[vol_1m_col] = raw[return_col].rolling(20).std() * 20 ** (1/2)
    raw[vol_1y_col] = raw[return_col].rolling(252).std() * 252 ** (1/2)

    # Explicit copy: the forward-return column below is added to an independent
    # frame, not to a slice of `raw`.
    features = raw[['Date', name, return_col, vol_1m_col, vol_1y_col]].copy()

    # Sum of returns over the current row and the 19 rows after it: reverse the
    # series, take a trailing 20-row rolling sum, then reverse back.
    features['one_month_forward_' + name + '_return'] = (
        features[return_col][::-1].rolling(window=20, min_periods=1).sum()[::-1]
    )
    return features


# 1. Korean KS11
korea = _add_price_features(series_dict['korea'], 'korea', 'Adj Close')

# 2. Bitcoin
btc = _add_price_features(series_dict['btc'], 'btc', 'Adj Close')

# 3. Gold (this file has gaps in the price, so missing values are patched first)
gold = _add_price_features(series_dict['gold'], 'gold', 'GOLDPMGBD228NLBM',
                           date_col='DATE', patch_gaps=True)

# 4. High Yield Bond
high_yield_bond = _add_price_features(series_dict['high_yield_bond'], 'high_yield_bond', 'Adj Close')

# 5. Investment Grade Bond
inv_grade_bond = _add_price_features(series_dict['inv_grade_bond'], 'inv_grade_bond', 'Adj Close')

# 6. Crude Oil WTI
wti = _add_price_features(series_dict['wti'], 'wti', 'WTISPLC', date_col='DATE')

New series added in this problem set:

In [None]:
# Each stanza below takes one frame from series_dict, adds a parsed `Date`
# column from the file's `DATE` column, renames the value column in place
# (so the frame stored in series_dict is modified too), and keeps only
# `Date` plus the renamed value column.

# 7. Inflation
# NOTE(review): 'CUUR0000SEHE' looks like a US CPI component series id - confirm
# that this is the intended inflation measure for Korea.
cpi = series_dict['cpi']
cpi['Date'] = pd.to_datetime(cpi['DATE'])
cpi.rename(columns={'CUUR0000SEHE':'cpi'}, inplace=True)
cpi = cpi[['Date','cpi']]

# 8. Korean Employment
# NOTE(review): the source column is named 'PAYEMS_CHG', which looks like a US
# payroll series id - confirm that the file really holds Korean employment data.
employment = series_dict['employment']
employment['Date'] = pd.to_datetime(employment['DATE'])
employment.rename(columns={'PAYEMS_CHG':'employment'}, inplace=True)
employment = employment[['Date','employment']]

# 9. Korean M1 (stored under the key 'korea_m1')
# NOTE(review): the variable is called fed_bs and the source column 'WALCL';
# both look like leftovers from a Fed balance-sheet series. If the file has no
# 'WALCL' column, the rename is a no-op and the column selection below raises KeyError.
fed_bs = series_dict['korea_m1']
fed_bs['Date'] = pd.to_datetime(fed_bs['DATE'])
fed_bs.rename(columns={'WALCL':'korea_m1'}, inplace=True)
fed_bs = fed_bs[['Date','korea_m1']]

In [None]:
# NBER-based recession indicators at monthly and daily frequency.
nber_recession_indicator_month = pd.read_csv('../input/nber-based-recession-indicators-united-states/USRECM.csv')
nber_recession_indicator_day = pd.read_csv('../input/nber-based-recession-indicators-united-states/USRECD.csv')

# Reduce the daily file to two columns: a parsed `Date` and a boolean `recession` flag.
nber_recession_indicator_day = (
    nber_recession_indicator_day
    .assign(
        Date=lambda daily: pd.to_datetime(daily["date"]),
        recession=lambda daily: daily["value"].astype('bool'),
    )
    [["Date", "recession"]]
)

In [None]:
# Build the baseline table: start from the KS11 series and left-join every other
# series on `Date`, so the KS11 dates define the rows.
baseline = pd.merge(korea, nber_recession_indicator_day, how='left', on='Date')
for extra_series in (btc, cpi, gold, high_yield_bond, inv_grade_bond, wti, employment, fed_bs):
    baseline = pd.merge(baseline, extra_series, how='left', on='Date')

# Recession flag as a single boolean column:
#   - dates with no indicator value after the left join count as "no recession"
#   - everything from 2020-03-01 onwards is flagged as recession
# Writing 1/0 into the column would mix True/False with integers in an object
# column, which makes grouping and plot legends inconsistent.
in_covid_period = baseline['Date'] >= '2020-03-01'
baseline["recession"] = baseline["recession"].eq(True) | in_covid_period

baseline.info()

# 2020 COVID-19 period: baseline rows from 2020 onwards joined with the daily
# South Korea case counts (held in `us_covid`).
baseline2020 = baseline[baseline['Date'] >= '2020-01-01']
baseline2020 = pd.merge(baseline2020, us_covid, how='left', on='Date')
# Dates without a COVID record are treated as zero new cases.
baseline2020['New cases'] = baseline2020['New cases'].fillna(0)

# 1. How does Korea's pandemic curve look? 

In [None]:
# Boolean mask: True for the rows whose country name contains 'South Korea'.
selected = full_grouped['Country/Region'].str.contains('South Korea')

# Apply the mask and take an explicit copy, so the new column below is written
# to an independent frame rather than to a slice of `full_grouped`.
# NOTE: this rebinds `korea`, which held the KS11 price series until here.
# Cells that need the price series must run before this one.
korea = full_grouped[selected].copy()

# Day-over-day change in the number of active cases.
korea["New active"] = korea["Active"].diff()

korea.info()
korea.tail(10)

In [None]:
# Daily totals per case type. The columns are selected with a list: selecting
# several groupby columns with a bare tuple raises an error on pandas >= 2.0.
case_types = ['Recovered', 'Deaths', 'Active']
temp = korea.groupby('Date')[case_types].sum().reset_index()

# Long format: one row per (Date, Case) pair with its Count, as px.area expects.
temp = temp.melt(id_vars="Date", value_vars=case_types,
                 var_name='Case', value_name='Count')

# Plot a stack area graph with the three types of cases (i.e., recovered, deaths, and active)
fig = px.area(temp, x="Date", y="Count", color='Case', height=600, width=700,
             title='Cases over time - S. Korea', color_discrete_sequence = [rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

## 1.1 How does it compare to a country similar to Korea?

In [None]:
# Boolean mask: True for the rows whose country name contains 'Denmark'.
selected2 = full_grouped['Country/Region'].str.contains('Denmark')

# Apply the mask and take an explicit copy, so the new column below is written
# to an independent frame rather than to a slice of `full_grouped`.
denmark = full_grouped[selected2].copy()

# Day-over-day change in the number of active cases.
denmark["New active"] = denmark["Active"].diff()

denmark.info()
denmark.tail(10)

In [None]:
# Daily totals per case type. The columns are selected with a list: selecting
# several groupby columns with a bare tuple raises an error on pandas >= 2.0.
case_types = ['Recovered', 'Deaths', 'Active']
temp = denmark.groupby('Date')[case_types].sum().reset_index()

# Long format: one row per (Date, Case) pair with its Count, as px.area expects.
temp = temp.melt(id_vars="Date", value_vars=case_types,
                 var_name='Case', value_name='Count')

# Plot a stack area graph with the three types of cases (i.e., recovered, deaths, and active)
fig = px.area(temp, x="Date", y="Count", color='Case', height=600, width=700,
             title='Cases over time - Denmark', color_discrete_sequence = [rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

## 1.2 What may explain the similarity or difference?

### Similarity:


### Difference:



# 2. Are the reported confirmed cases and deaths reliable? Why? 

In [None]:
def plot_hbar_wm(col, n, min_pop=50000000, max_pop=100000000):
    """Horizontal bar chart of the n largest values of `col` among mid-sized countries.

    Parameters
    ----------
    col : column of `worldometer_data` to rank and plot.
    n : number of countries to show (those with the largest `col` values).
    min_pop, max_pop : only countries whose Population is strictly between
        these two bounds are considered.

    Prints the metadata of the selected rows and shows the figure; returns nothing.
    """
    in_range = (worldometer_data['Population']>min_pop)&(worldometer_data['Population']<max_pop)
    df = worldometer_data[in_range]
    # Ascending sort + tail(n) keeps the n rows with the largest values.
    df = df.sort_values(col, ascending=True).tail(n)
    df.info()
    fig = px.bar(df,
                 x=col, y="Country/Region", color='WHO Region',  
                 text=col, orientation='h', width=700, 
                 color_discrete_sequence = px.colors.qualitative.Dark2)
    # The closing ')' balances the parenthesis opened in the title text.
    fig.update_layout(title=col+' (Only countries with Population > ' + str(min_pop)+' and < '+str(max_pop)+')', 
                      xaxis_title="", yaxis_title="", 
                      yaxis_categoryorder = 'total ascending',
                      uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()
    
def plot_histogram_wm(col, bins):
    """Show a histogram of one `worldometer_data` column.

    col  : the variable of interest (a column name).
    bins : the number of bins requested from plotly.
    """
    column_values = worldometer_data[col]
    px.histogram(column_values, x=col, nbins=bins).show()

In [None]:
# Bar chart of the 10 countries with the highest case fatality rate, restricted to
# countries with a population between 40 and 60 million (i.e. close to Korea's).
plot_hbar_wm('Case Fatality', 10, 40000000,60000000)

In [None]:
def plot_hbar_w(col, n, min_pop=1000000):
    """Horizontal bar chart of the n largest values of `col` among larger countries.

    Parameters
    ----------
    col : column of `worldometer_data` to rank and plot.
    n : number of countries to show (those with the largest `col` values).
    min_pop : only countries whose Population is strictly above this are considered.

    Prints the metadata of the selected rows and shows the figure; returns nothing.
    """
    df = worldometer_data[worldometer_data['Population']>min_pop]
    # Ascending sort + tail(n) keeps the n rows with the largest values.
    df = df.sort_values(col, ascending=True).tail(n)
    df.info()
    fig = px.bar(df,
                 x=col, y="Country/Region", color='WHO Region',  
                 text=col, orientation='h', width=700, 
                 color_discrete_sequence = px.colors.qualitative.Dark2)
    # The closing ')' balances the parenthesis opened in the title text.
    fig.update_layout(title=col+' (Only countries with Population > ' + str(min_pop)+')', 
                      xaxis_title="", yaxis_title="", 
                      yaxis_categoryorder = 'total ascending',
                      uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()


In [None]:
# Top 15 countries by tests per million people, among countries with a population above 1,000,000
plot_hbar_w('Tests/1M pop', 15, 1000000)

In [None]:
# Top 15 countries by case positivity (total cases / total tests), among countries with a population above 1,000,000
plot_hbar_w('Case Positivity', 15, 1000000)

In [None]:
# Histogram of the case fatality rate (total deaths / total cases) across all countries, using 50 bins
plot_histogram_wm("Case Fatality",50)

In [None]:
def plot_histogram_wma(col, bins,n):
    """Show a histogram of `col` for countries whose Case Positivity is below `n`.

    The filter is always applied to the 'Case Positivity' column, whichever
    column is being plotted. `bins` is the number of bins requested from plotly.
    """
    below_cap = worldometer_data['Case Positivity'] < n
    px.histogram(worldometer_data.loc[below_cap, col], x=col, nbins=bins).show()

# Case positivity distribution, leaving out values of 5 or more
plot_histogram_wma("Case Positivity",50,5)

In [None]:
def plot_histogram_wmn(col, bins,n):
    """Show a histogram of `col` for countries with a Population above `n`.

    `bins` is the number of bins requested from plotly.
    """
    large_enough = worldometer_data['Population'] > n
    px.histogram(worldometer_data.loc[large_enough, col], x=col, nbins=bins).show()

# Tests per million people, for countries with more than one million inhabitants
plot_histogram_wmn("Tests/1M pop",50,1000000)

In [None]:
# Row masks for the two countries being compared.
# regex=False matches the text literally (otherwise the '.' in 'S. Korea' is a
# regex wildcard); na=False makes any non-string country value count as "no match"
# so the mask is always purely boolean.
k1 = worldometer_data['Country/Region'].str.contains('S. Korea', regex=False, na=False)
k2 = worldometer_data['Country/Region'].str.contains('Denmark', regex=False, na=False)

# Show the worldometer row(s) for South Korea
worldometer_data[k1]

In [None]:
# Show the worldometer row(s) for Denmark (mask k2 was built in the previous cell)
worldometer_data[k2]

# 3.1 What are the economic and financial impacts of Covid19 on your country?

In [None]:
# Dual-axis chart: KS11 index level on the left axis, daily new COVID-19 cases on the right.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# (baseline2020 column, legend/axis label, drawn on the secondary axis?)
line_specs = (
    ('korea', 'KS11', False),
    ('New cases', 'New COVID19 Cases', True),
)

# One line per spec, in the order listed
for column, label, on_secondary in line_specs:
    fig.add_trace(
        go.Scatter(x=baseline2020['Date'], y=baseline2020[column], name=label),
        secondary_y=on_secondary,
    )

# Figure title and x-axis title
fig.update_layout(title_text="KS11 and New COVID19 Cases")
fig.update_xaxes(title_text="Date")

# Bold y-axis titles, one per axis
for _, label, on_secondary in line_specs:
    fig.update_yaxes(title_text="<b>" + label + "</b>", secondary_y=on_secondary)

fig.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the KS11 daily returns over the whole baseline history
baseline['korea_return'].describe()

In [None]:
# How extreme was the worst day of 2020, measured in standard deviations of the
# full KS11 return history?
worst_2020_return = baseline2020['korea_return'].min()
historical_std = baseline['korea_return'].std()
std_multiple = round(abs(worst_2020_return / historical_std), 2)

print("The worst single-day return in 2020 is ", str(std_multiple),
      " X standard deviations of KS11 historical returns!")

In [None]:
# Dates on which a KS11 return is available; the first and last bound the sample.
dates_with_return = baseline.loc[baseline['korea_return'].notnull(), 'Date']
first_day = dates_with_return.min().date()
last_day = dates_with_return.max().date()
print("KS11 historical daily returns from " + str(first_day) + ' to ' + str(last_day))

# Distribution of the daily returns
fig = px.histogram(baseline, x="korea_return")
fig.show()

In [None]:
def plot_chart(series):
    """Scatter one `baseline` column against Date, coloured by the recession flag.

    series : name of the `baseline` column to plot; rows where it is missing are skipped.
    Shows the figure and returns nothing.
    """
    has_value = baseline[series].notnull()
    fig = px.scatter(baseline[has_value], x="Date", y=series, color='recession', width=1000)
    # Small markers only (no connecting lines); the column name doubles as the title.
    fig.update_traces(mode='markers', marker_size=4)
    fig.update_layout(title=series, xaxis_title="", yaxis_title="")
    fig.show()

In [None]:
# Employment series over time, coloured by the recession flag
plot_chart("employment")

In [None]:
# korea_m1 series over time, coloured by the recession flag
plot_chart('korea_m1')

# 3.2 What are the plausible reasons for the observed impacts? 

# 4. What do you want to find out more? What do you find?

How can we find a country whose COVID-19 situation is close to Korea's?

In [None]:
def gt_n(n,m):
    """Plot confirmed cases for countries in a situation similar to Korea's.

    A country qualifies if, on at least one day, it had more than `n` recovered
    cases and fewer than `m` confirmed cases. For each qualifying country, only
    the days that satisfy both conditions are plotted. The x-axis counts days
    since the first such day for that country.

    Parameters
    ----------
    n : lower bound (exclusive) on recovered cases.
    m : upper bound (exclusive) on confirmed cases.

    Shows a line chart and returns nothing.
    """
    # Countries with at least one day where Recovered > n and Confirmed < m
    in_window = (full_grouped['Recovered']>n) & (full_grouped['Confirmed']<m)
    countries = full_grouped[in_window]['Country/Region'].unique()

    # Rows of the full table for those countries, with Date parsed up front so
    # that sorting and the per-country minimum below are chronological rather
    # than string comparisons.
    temp = full_table[full_table['Country/Region'].isin(countries)]
    temp = temp.assign(Date=pd.to_datetime(temp['Date']))

    # Sum confirmed and recovered cases by country and date.
    # The columns are selected with a list: a bare tuple raises on pandas >= 2.0.
    temp = temp.groupby(['Country/Region', 'Date'])[['Confirmed','Recovered']].sum().reset_index()

    # Keep only the days inside the window
    temp = temp[(temp['Recovered']>n) & (temp['Confirmed']<m)]

    # First in-window date for each country
    min_date = temp.groupby('Country/Region')['Date'].min().reset_index()
    min_date.columns = ['Country/Region', 'Min Date']

    # Attach each country's start date to its rows
    from_nth_case = pd.merge(temp, min_date, on='Country/Region')

    # Number of days elapsed since the country's first in-window date
    from_nth_case['N days'] = (from_nth_case['Date'] - from_nth_case['Min Date']).dt.days

    # One line per country (system-determined colours); x is days since the
    # start date, y is confirmed cases.
    fig = px.line(from_nth_case, x='N days', y='Confirmed', color='Country/Region', 
                  title='N days from '+ str(n) +' case', height=600)
    fig.show()

In [None]:
# Call gt_n(10000, 15000): countries with more than 10,000 recovered cases and
# fewer than 15,000 confirmed cases, i.e. a situation close to Korea's
gt_n(10000,15000)

We set the constraints to select countries with fewer than 15,000 confirmed cases and more than 10,000 recovered cases, and then plot them on the same chart. In the plot above, the lines for Denmark and Korea are very close.

# Relation between GDP and Case Positivity & Case Fatality