In [None]:
# data wrangling
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# offline interactive visualization
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

# regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.graphics.api as smg

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Worldometer data
# ================
# Country-level snapshot; only 'Country/Region' and 'Population' are used below.

worldometer_data = pd.read_csv('../input/corona-virus-report/worldometer_data.csv')

# Replace missing values '' with NaN and then 0
worldometer_data = worldometer_data.replace('', np.nan).fillna(0)

# Align country names with the spelling used in full_grouped.
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: a chained in-place update acts on a temporary object and is
# not guaranteed to reach the DataFrame (it never does under Copy-on-Write).
country_aliases = {
    'USA': 'US',
    'UAE': 'United Arab Emirates',
    'S. Korea': 'South Korea',
    'UK': 'United Kingdom',
}
worldometer_data['Country/Region'] = worldometer_data['Country/Region'].replace(country_aliases)

# Grouped by day, country
# =======================

full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')

# Merge in population data.
# Left join: every full_grouped row is kept; countries without a match get NaN Population.
full_grouped = full_grouped.merge(worldometer_data[['Country/Region', 'Population']], how='left', on='Country/Region')

full_grouped['Date'] = pd.to_datetime(full_grouped['Date'], format = '%Y-%m-%d')

In [None]:
# Quick look at New Zealand's active cases over time, then the value on 2020-07-14.
is_nz = full_grouped['Country/Region'] == 'New Zealand'
nz_rows = full_grouped[is_nz]
_ = plt.plot(nz_rows['Date'], nz_rows['Active'])
plt.show()

on_wave_start = full_grouped['Date'] == '2020-07-14'
print(full_grouped[is_nz & on_wave_start]['Active'])

In [None]:
def plot_country(country, date):
    """Plot confirmed and recovered counts for one country, split at ``date``.

    Rows of ``full_grouped`` for ``country`` are flagged with ``recent_wave``
    (1 when ``Date >= date``, else 0) and the flag is used as the line colour,
    so the period before and after ``date`` is drawn as two separate traces.

    Parameters
    ----------
    country : value compared against ``full_grouped['Country/Region']``.
    date : value compared against ``full_grouped['Date']`` (e.g. '2020-07-14').

    Returns
    -------
    tuple
        ``(country, date)`` exactly as passed in.
    """
    # .copy() so the helper column is added to an independent frame rather than
    # to a filtered slice of full_grouped (avoids SettingWithCopy ambiguity).
    temp = full_grouped[full_grouped['Country/Region'] == country].copy()
    temp['recent_wave'] = np.where(temp['Date'] >= date, 1, 0)

    # One figure per metric; the title prefix is the only thing that differs.
    for column, title_prefix in (('Confirmed', 'Infections for '),
                                 ('Recovered', 'Recovered Patients ')):
        fig = px.line(temp, x='Date', y=column, color='recent_wave',
                      title=title_prefix + str(country), height=600)
        fig.show()

    return country, date

In [None]:
# Plot New Zealand, flagging observations on/after 2020-07-14 as the recent wave.
# NOTE(review): this rebinds `date`, shadowing the `date` class imported from
# `datetime` at the top of the notebook.
country, date = plot_country('New Zealand', '2020-07-14')

In [None]:
# Calibrate model

def estimate_sir_param(country, date):
    """Estimate SIR parameters for ``country`` from observations on/after ``date``.

    Two no-intercept OLS regressions are fitted on the daily series in
    ``full_grouped``:

    * ``dS/dt`` on ``S*I/N``  -> slope is ``-beta`` (transmission rate)
    * ``d(R+D)/dt`` on ``I``  -> slope is ``gamma`` (removal rate)

    where ``N`` is the country's maximum ``Population`` value,
    ``I`` is ``Active``, ``R`` is ``Recovered``, ``D`` is ``Deaths`` and
    ``S = N - I - (R + D)``. Both regression summaries and the resulting
    beta, gamma and ``beta/gamma`` are printed.

    Parameters
    ----------
    country : value compared against ``full_grouped['Country/Region']``.
    date : start of the wave; a 'YYYY-MM-DD' string or any date-like object
        accepted by ``pd.to_datetime``.

    Returns
    -------
    tuple
        ``(beta, gamma, start_date)`` where beta and gamma are float arrays
        (the fitted parameter vectors, beta with its sign flipped to be the
        positive transmission rate) and ``start_date`` is a ``datetime.date``.

    Raises
    ------
    ValueError
        If the number of rows on/after ``date`` does not equal the number of
        calendar days from ``date`` to the latest observation, or if fewer
        than two such rows exist.
    """
    # Accept strings as well as date/datetime objects so the cell can be re-run
    # after `date` has been rebound to the returned datetime.date.
    start = pd.to_datetime(date)

    # .copy(): the helper column below must not be written into a slice of full_grouped
    temp = full_grouped[full_grouped['Country/Region'] == country].copy()
    temp['recent_wave'] = np.where(temp['Date'] >= start, 1, 0)
    wave = temp[temp['recent_wave'] == 1]

    # Assume everyone is at risk: N is the maximum population recorded for the country
    population = temp["Population"].max()
    latest_date = temp["Date"].max()
    time_series_length = (latest_date - start).days + 1

    # Apply the condition N = S + I + (R + D), restricted to the recent wave
    I = np.array(wave['Active'])
    R = np.array(wave['Recovered'])
    D = np.array(wave['Deaths'])

    # The regressions assume exactly one observation per calendar day (dt = 1).
    if len(I) != time_series_length or len(I) < 2:
        raise ValueError(
            f"Expected {time_series_length} daily rows for {country} from "
            f"{start.date()}, found {len(I)}; need one row per day and at least two rows."
        )

    N = np.full(time_series_length, population)
    dt = 1  # one day between consecutive observations

    # R and D are both "removed" compartments, so they are pooled
    S = N - I - (R + D)

    ## 1. Estimate beta

    # Regressor at time t (drop the last point: it has no forward difference)
    x = ((S * I) / N)[:-1]
    y = np.diff(S) / dt

    results = sm.OLS(y, x, missing='drop').fit()
    # dS/dt = -beta * S * I / N, so the fitted slope is -beta; flip the sign
    # so the reported transmission rate is the positive quantity.
    beta = -results.params
    print(results.summary())
    print('\n')
    print('*'*80)
    print(f"Transmission rate or Beta is: {beta}")
    print('*'*80)

    ## 2. Estimate gamma

    x = I[:-1]
    y = np.diff(R + D) / dt

    results = sm.OLS(endog=y, exog=x, missing='drop').fit()
    gamma = results.params
    print (results.summary())
    print('\n')
    print('*'*80)
    print(f"Recovery (and Mortality) rate or Gamma is: {gamma}")
    print('*'*80)

    #3. Calculate R

    print('\n')
    print('*'*80)
    print(f"Reproduction number or R is: {beta/gamma}")
    print('*'*80)

    return beta.astype('float'), gamma.astype('float'), start.date()


In [None]:
# Fit beta and gamma for New Zealand from the wave start date.
# NOTE(review): `date` is rebound to the third return value, so the argument
# passed on a re-run of this cell is not the same object as on the first run.
beta, gamma, date = estimate_sir_param('New Zealand', date)

In [None]:
def sir_model(I0=0.01, beta=0.6, gamma=0.1, days=365, date=None):
    """Simulate a discrete-time SIR model, print key milestones and plot the curves.

    The population is normalised to 1 (i.e. 100%). Each simulated day applies
    ``new_inf = I*S*beta/N`` and ``new_rec = I*gamma``.

    Parameters
    ----------
    I0 : initial infectious fraction of the population (default 1%).
    beta : transmission rate.
    gamma : recovery rate.
    days : number of days to simulate; must be at least 1.
    date : ``datetime.date`` for day 0, used to convert day counts into
        calendar dates. Defaults to today's date, resolved at call time.

    Prints
    ------
    * the peak infectious share (%) and the day/date it occurs,
    * the total share ever infected (%) over the simulated horizon,
    * the first day/date on which cumulative cases reach 80% of that total.

    Returns
    -------
    None. Results are printed and plotted only.

    Raises
    ------
    ValueError
        If ``days`` is less than 1.
    """
    if days < 1:
        raise ValueError("days must be at least 1")
    if date is None:
        # Resolved per call; a `date.today()` default would be frozen at
        # definition time and depends on what `date` is bound to at that moment.
        date = datetime.today().date()

    ## Initialize model state
    N = 1          #Total population in percentage, i.e., 1 = 100%
    I = I0         #Initial state of I default value 1% of population, i.e., I0 = 0.01
    S = N - I      #Initial state of S
    R = 0          #Initial State of R
    C = I          #Initial State of Total Cases

    ## Daily trajectories
    inf  = []       # Infectious population for each day
    day  = []       # Time period in day
    suc  = []       # Susceptible population for each day
    rec  = []       # Recovered population for each day
    conf = []       # Cumulative cases for each day

    ## Project into the future
    for i in range(days):
        day.append(i)
        inf.append(I)
        suc.append(S)
        rec.append(R)
        conf.append(C)

        new_inf = I*S*beta/N            #New infections equation (1)
        new_rec = I*gamma               #New recoveries equation (2)

        I = I + new_inf - new_rec       #Infectious population for next day
        S = max(min(S - new_inf, N), 0) #Susceptible population for next day, clamped to [0, N]
        R = min(R + new_rec, N)         #Recovered population for next day, capped at N

        C = C + new_inf                 #Cumulative cases for next day

    ## Pinpoint important milestones
    peak_inf = np.array(inf).max()
    total_conf = np.array(conf).max()
    max_inf = round(peak_inf*100, 2)                   #Peak infectious population in percentage
    inflection_day = inf.index(peak_inf)               #Day on which the infectious population peaks
    max_conf = round(total_conf*100, 2)                #Overall infected population in percentage
    #First day on which cumulative cases reach 80% of their final level
    plateau_day = int(np.flatnonzero(np.array(conf) >= 0.8*total_conf)[0])

    print(f"Maximum Infectious population at a time :{max_inf}%")
    print(f"Number of Days to Reach Maximum Infectious Population (Inflection Point):{inflection_day} days or {date + timedelta(days=inflection_day)}")
    print(f"Total Infected population :{max_conf}%")
    print(f"Number of Days to Reach 80% of the Projected Confirmed Cases (Plateau Point):{plateau_day} days or {date + timedelta(days=plateau_day)}")

    ## Visualize the model outputs
    sns.set(style="darkgrid")
    plt.figure(figsize=(10,6))
    plt.title(f"SIR Model: R = {round(beta/gamma,2)}", fontsize=18)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
    sns.lineplot(x=day, y=inf, label="Infectious")
    sns.lineplot(x=day, y=suc, label="Susceptible")
    sns.lineplot(x=day, y=rec, label="Recovered")

    plt.legend()
    plt.xlabel("Time (in days)")
    plt.ylabel("Fraction of Population")
    plt.show()

In [None]:
# Project 730 days ahead with the fitted parameters; `.item()` converts the
# fitted arrays (presumably one element each) to plain scalars.
# NOTE(review): I0=0.000006 is an unexplained constant - presumably the initial
# infectious fraction of the population; confirm where it comes from.
sir_model(I0=0.000006, beta = beta.item(), gamma = gamma.item(), days=730, date = date)

In [None]:
Q1: The estimated recovery rate for New Zealand is very high, which we attribute to its strong medical treatment system,
    so the model reaches the inflection point on day 1. The plateau point is also reached quickly, on 2020-08-25, which is 42 days
    after the starting point (2020-07-14).

**Q2**

In [None]:
# Load the NZ50 index history and derive the return series used below
nz50 = pd.read_csv('../input/nz50data/NZ50.csv')
nz50['Date'] = pd.to_datetime(nz50['Date'])

# Daily price return of the NZ50 (dividends excluded).
# The first row has no previous close, so its return is NaN.
close = nz50['Close']
nz50['day_return'] = close / close.shift(1) - 1

# Growth of 1 unit invested at the start of the sample
nz50 = nz50.loc[:].copy()
nz50['cum_return'] = np.cumprod(1 + nz50['day_return'])

nz50.info()
nz50.tail()

In [None]:
# Peak-to-peak "negative runs" in the NZ50.
# For every day, neg_run holds the compounded loss since the previous all-time
# high of cum_return, or 0 when the index is at (or above) that high.
neg_run = []

# Running all-time high of the cumulative return
max_cum_nz50_now = nz50['cum_return'].iloc[0]

# Walk the daily and cumulative returns side by side
for t, (val, cum_now) in enumerate(zip(nz50['day_return'], nz50['cum_return'])):

    # First observation: there is no earlier tally to compound with,
    # so record the return itself if it is a loss, otherwise zero
    if t == 0:
        neg_run.append(val if val < 0 else 0)
        continue

    if cum_now < max_cum_nz50_now:
        # Still below the previous high: compound today's return onto
        # yesterday's tally
        neg_run.append((1 + neg_run[-1]) * (1 + val) - 1)
    else:
        # Back at or above the previous high: the run is over, reset the
        # tally and remember the new high
        neg_run.append(0)
        max_cum_nz50_now = cum_now

# Attach the loss tally to the dataframe
nz50['neg_run'] = neg_run

In [None]:
# NZ50 closing level over time
sns.lineplot(data=nz50, x='Date', y='Close', color='red')

In [None]:
# Compounded loss since the previous all-time high (0 while at a high)
sns.lineplot(data=nz50, x='Date', y='neg_run', color='red')

In [None]:
# Number each peak-to-peak negative run sequentially (1, 2, 3, ...).
# Days inside a run carry the run's number; days with a zero loss tally carry 0.
# The number is the groupby key used later to summarise each run.

# Number to give to the current / next run
label = 1

# Whether the previous day was inside a run
within_negative_run = False

# Run number per day
neg_run_num = []

for loss in nz50['neg_run']:

    in_drawdown = loss < 0

    # Inside a run -> current label; otherwise 0
    neg_run_num.append(label if in_drawdown else 0)

    # The label advances exactly once, on the first zero after a run.
    # Further zeros (new all-time highs in a row) leave it untouched.
    if within_negative_run and not in_drawdown:
        label += 1

    within_negative_run = in_drawdown

nz50['neg_run_num'] = neg_run_num

In [None]:
# Label the peak-to-trough leg of every peak-to-peak run.
# Within run k, every day up to and including the first day on which the loss
# tally equals the run's minimum (the maximum drawdown) is tagged k; the
# recovery days after that trough are tagged 0, as are days outside any run.

# Lowest loss tally of each run, computed once.
# (Filtering the whole dataframe inside the loop made the pass quadratic.)
trough_by_run = (
    nz50[nz50['neg_run_num'] > 0]
    .groupby('neg_run_num')['neg_run']
    .min()
    .to_dict()
)

# Whether the trough of the current run has already been passed
is_neg_run_min = False

# Peak-to-trough number per day
peak_trough_num = []

for val, loss in zip(nz50['neg_run_num'], nz50['neg_run']):

    # Inside a peak-to-peak negative run
    if val > 0:

        if is_neg_run_min:
            # Past the trough: this is the recovery leg
            peak_trough_num.append(0)
        else:
            # On the way down (the trough day itself is included)
            peak_trough_num.append(val)
            if loss == trough_by_run[val]:
                is_neg_run_min = True

    # Outside a run: reset the search for the next run
    else:
        is_neg_run_min = False
        peak_trough_num.append(val)

nz50['peak_trough_num'] = peak_trough_num

In [None]:
# Duration and depth of every market decline identified above
# (the original analysis reports 263 peak-to-peak negative runs)

# Loss tallies grouped by peak-to-peak run number
loss_by_run = nz50[nz50['neg_run_num'] > 0].groupby('neg_run_num')['neg_run']

# Days between the two peaks of each run
run_len = loss_by_run.count()

# Lowest cumulative return of each run (i.e., maximum drawdown, as a fraction)
maximum_drawdown = loss_by_run.min()

# Days from the peak to the trough of each run
peak_trough_dur = nz50[nz50['peak_trough_num'] > 0].groupby('peak_trough_num')['neg_run'].count()

# One panel per metric, each sorted from largest to smallest value
fig, ax = plt.subplots(3)
panels = (
    (run_len, "Time between Two Peaks (Days)"),
    (peak_trough_dur, "Time to Maximum Drawdown (Days)"),
    (maximum_drawdown, "Maximum Drawdown (%)"),
)
for panel_ax, (metric, panel_title) in zip(ax, panels):
    panel_ax.plot(metric.sort_values(ascending=False).reset_index(drop=True))
    panel_ax.set_title(panel_title)
fig.tight_layout()

In [None]:
# Collect the per-run metrics in one dataframe, one row per run
# (indexed by run number, taken from run_len)
declines_df = run_len.to_frame('run_len').assign(
    maximum_drawdown=maximum_drawdown,
    peak_trough_dur=peak_trough_dur,
)

declines_df.tail(10)

In [None]:
# Bucket each run by the magnitude of its maximum drawdown.
# A run gets the position of the first floor its drawdown is at or above:
#   0: >= 0%   1: >= -5%   2: >= -10%   3: >= -20%   4: >= -30%   5: anything worse
bucket_floors = (0.00, -0.05, -0.10, -0.20, -0.30)

drawdown_bin = [
    next(
        (bucket for bucket, floor in enumerate(bucket_floors) if drawdown >= floor),
        len(bucket_floors),
    )
    for drawdown in maximum_drawdown
]

declines_df['drawdown_bin'] = drawdown_bin

In [None]:
# Overall means for drawdown metrics (one mean per column).
# DataFrame.mean() is used instead of np.mean(declines_df): NumPy forwards
# axis=None, which recent pandas versions interpret as "reduce over both axes"
# and collapse the whole frame into a single number.
declines_df.mean()

In [None]:
# Number of runs that fall into each drawdown bucket
declines_df.groupby('drawdown_bin')['run_len'].count()

In [None]:
# Plot the share of declines that falls into each magnitude bucket

# Labels for drawdown bins 1..5, in bin order
bin_names = ['-5% or Better','-5% to -10%','-10% to -20%','-20% to -30%','-30% or Worse']
bin_ids = range(1, len(bin_names) + 1)

# Probability of a run landing in each bin, relative to all runs.
# Reindex on the full list of bins so a bin with no runs shows up as 0 instead
# of silently shifting every later probability onto the wrong label (or
# breaking the plot because the lengths no longer match).
bin_counts = declines_df.groupby('drawdown_bin')['run_len'].count()
prob_bucket = bin_counts.reindex(bin_ids, fill_value=0) / bin_counts.sum()

# Plot the probabilities for each drawdown bin
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(x=prob_bucket.to_numpy(), y=bin_names, ax=ax)
ax.set_xlabel("Probability",fontsize=14)
ax.set_ylabel("Drawdown Bin",fontsize=14)

# Probability is between 0 and 1 - limit the range of possible value for x-axis
ax.set_xlim(0, 1)

plt.tight_layout()

In [None]:
# What happens after the market has already dropped by 5%

# Drop the first bucket and renormalise the remaining probabilities so they sum to 1
beyond_first_bucket = prob_bucket.iloc[1:]
worst_probs = beyond_first_bucket / sum(beyond_first_bucket)

# Probability mass left after skipping one more bucket (decline of more than 10%)
prob_over_10 = sum(worst_probs.iloc[1:])
print("The probability of a further decline of more than 10% is", prob_over_10)

# Probability mass left after skipping two more buckets (decline of more than 20%)
prob_over_20 = sum(worst_probs.iloc[2:])
print("The probability of a further decline of more than 20% is", prob_over_20)

In [None]:
# Average maximum drawdown of the runs in each drawdown bucket
declines_df.groupby('drawdown_bin')['maximum_drawdown'].mean()

In [None]:
# Per-bucket duration metrics, stored in a dataframe for plotting

# Average peak-to-trough and peak-to-peak duration of the runs in each bucket
duration_df = (
    declines_df
    .groupby('drawdown_bin')[['peak_trough_dur', 'run_len']]
    .mean()
    .reset_index()
)

# Time to recover (in days): whatever is left of the run after the trough
duration_df['recover_dur'] = duration_df['run_len'] - duration_df['peak_trough_dur']

# Time to recover relative to time to the trough
duration_df['recover_to_peak_trough_ratio'] = duration_df['recover_dur'] / duration_df['peak_trough_dur']

In [None]:
# Plot the average recovery time of each drawdown bucket
fig, ax = plt.subplots(figsize=(10,6))

# Labels for drawdown bins 1..5, in bin order
bin_names = ['-5% or Better','-5% to -10%','-10% to -20%','-20% to -30%','-30% or Worse']
bin_ids = range(1, len(bin_names) + 1)

# Look the values up by bin number instead of relying on row position:
# if a bin has no runs, duration_df has fewer rows than there are labels and a
# positional match would mislabel the bars (or fail on the length mismatch).
# Bins without runs become NaN and are drawn without a bar.
recover_by_bin = duration_df.set_index('drawdown_bin')['recover_dur'].reindex(bin_ids)

sns.barplot(x=bin_names, y=recover_by_bin.to_numpy(), order=bin_names, ax=ax)
ax.set_xlabel("Market Decline Bin",fontsize=14)
ax.set_ylabel("Recovery Time in Days",fontsize=14)

plt.tight_layout()

In [None]:
# Display the per-bucket duration summary table
duration_df

In [None]:
# Number and fraction of negative daily returns.
# Rows without a return (the first row has no previous close, so its return is
# NaN) are excluded, so the denominator counts actual returns rather than rows.
daily_returns = nz50['day_return'].dropna()
n_negative = int((daily_returns < 0).sum())
n_returns = len(daily_returns)

print("The number of negative daily returns: ", n_negative)
print("The number of daily returns: ", n_returns)
# Guard against an empty return series instead of dividing by zero
print("The fraction of negative daily returns: ", n_negative / n_returns if n_returns else float('nan'))

In [None]:
# Average number of days from a peak to the following trough, across all runs
avg_peak_trough_days = declines_df['peak_trough_dur'].mean()
print("The average length of peak-to-trough market downturn: ", avg_peak_trough_days, "days")