In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
from scipy.special import expit
from scipy.optimize import curve_fit
def check(df, n=5):
    """Print the shape of ``df`` and display a compact preview of it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    n : int, default 5
        Number of rows taken from the top and from the bottom.

    The preview is head(n) followed by tail(n) when the frame has at least
    ``2 * n`` rows; shorter frames are displayed whole so that no row is
    shown twice.
    """
    print('shape =', df.shape)
    # The threshold scales with n, so head and tail can never overlap.
    dft = pd.concat([df.head(n), df.tail(n)]) if len(df) >= 2 * n else df
    display(dft)


# Expose the helper as a DataFrame method so cells can call `some_df.check()`.
pd.core.frame.DataFrame.check = check

In [None]:
# Competition inputs: training history, rows to forecast, and submission.csv.
df_train = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-3/train.csv',
    dtype={'Id': int, 'ConfirmedCases': int, 'Fatalities': int},
)
df_test = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-3/test.csv',
    dtype={'ForecastId': int},
)
df_sub = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/submission.csv')

# Parse dates first so day0 (earliest training date) can anchor both frames.
for frame in (df_train, df_test):
    frame['Date'] = pd.to_datetime(frame['Date'])
day0 = df_train['Date'].min()

for frame in (df_train, df_test):
    # Integer day offset from day0; train and test share the same day axis.
    frame['days'] = (frame['Date'] - day0).dt.days
    # One key per location; a missing province becomes the literal 'NaN'.
    frame['unique_key'] = frame['Country_Region'] + '_' + frame['Province_State'].fillna('NaN')

df_train.check()
df_test.check()

In [None]:
# Quick overview: number of locations in each frame and the day span each covers.
print('N unique keys (train, test):', df_train['unique_key'].nunique(), ',', df_test['unique_key'].nunique())
print('train days range:', df_train['days'].min(), '-', df_train['days'].max())
print('test  days range:', df_test['days'].min(), '-', df_test['days'].max())

# Every location key seen in training, in order of first appearance.
locations = df_train['unique_key'].unique()
# Day axis 0..last test day, used when drawing fitted curves.
days_range = np.arange(df_test['days'].max()+1)
days_range

In [None]:
print(df_test['Country_Region'].unique())
# Spot-check the test rows of a single US state.
is_alabama = (df_test['Country_Region'] == 'US') & (df_test['Province_State'] == 'Alabama')
df_test.loc[is_alabama].check()

### Fitting function

In [None]:
def mysigmoid(x, a, b, c):
    """Scaled logistic curve ``a * expit(b * (x - c))``.

    ``a`` is the plateau, ``b`` the growth rate and ``c`` the midpoint on the
    x axis.
    """
    return a * expit(b * (x - c))


def log1p_sigmoid(x, a, b, c):
    """``log1p`` of the scaled logistic curve with the same parameters."""
    return np.log1p(mysigmoid(x, a, b, c))

In [None]:
# Preview both curve families on the training day axis with hand-picked parameters.
x = np.arange(df_train['days'].max() + 1)
a, b, c = 60, 0.2, 80
for curve in (mysigmoid, log1p_sigmoid):
    plt.plot(x, curve(x, a, b, c))
    plt.grid()
    plt.show()

## Example of data

In [None]:
plt.figure(figsize=(12, 6))
# Overlay the raw confirmed-case curves of ten randomly drawn locations.
sampled_locations = np.random.choice(locations, 10, replace=False)
for l in sampled_locations:
    df = df_train[df_train['unique_key'] == l]
    plt.plot(df['days'], df['ConfirmedCases'], label=l, alpha=0.5)
plt.xlabel('days')
plt.ylabel('confirmed cases')
plt.legend()
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
# Same overlay as the linear plot, but on a log1p scale.
sampled_locations = np.random.choice(locations, 10, replace=False)
for l in sampled_locations:
    df = df_train[df_train['unique_key'] == l]
    log_cases = np.log1p(df['ConfirmedCases'])
    plt.plot(df['days'], log_cases, label=l, alpha=0.5)
plt.xlabel('days')
plt.ylabel('log1p(confirmed cases)')
plt.legend()
plt.grid()
plt.show()

### Find the center

In [None]:
def find_center(sq, r=0.5):
    """Estimate the sigmoid centre (c) of a series.

    Returns the position of the element of ``sq`` that lies closest to
    ``r`` times the series maximum (half of the maximum by default).
    """
    target = sq.max() * r
    distance = np.abs(sq - target)
    return np.argmin(distance)

In [None]:
plt.figure(figsize=(16, 8))
sampled_locations = np.random.choice(locations, 10, replace=False)
for plt_subplot, l in enumerate(sampled_locations, start=1):
    plt.subplot(2, 5, plt_subplot)
    df = df_train.loc[df_train['unique_key'] == l]
    days, cases = df['days'], df['ConfirmedCases']
    # Position of the point closest to half of the series maximum.
    b = find_center(cases.values)
    plt.plot(days, cases, label=l, alpha=0.5)
    plt.plot(days.values[b], cases.values[b], '*')
    plt.xlabel('days')
    plt.ylabel('confirmed cases')
    plt.legend()
    plt.grid()
plt.show()

In [None]:
# Inspect one location by hand: data, estimated centre and the initial-guess curve.
issue_case = 'Bahamas_NaN'
df = df_train.loc[df_train['unique_key'] == issue_case]
cases = df['ConfirmedCases']
a, b, c = cases.max() * 4, 0.2, find_center(cases.values)
# First day on which the count exceeds a tenth of the maximum.
threshold = max(cases.max() / 10, 0)
min_day = df.loc[cases > threshold, 'days'].min()
print('%39s'%issue_case, 'min_day=%3d'%min_day, 'init=', (a, b, c), end=' ')

x = df['days'].values
plt.plot(x, cases, label=issue_case, alpha=0.5)
plt.plot(x[c], cases.values[c], '*', label='center')
plt.plot(x, mysigmoid(x, a, b, c), label='init')
plt.yscale('log')
plt.xlabel('days')
plt.ylabel('confirmed cases')
plt.legend()
plt.grid()
plt.show()

### Confirmed cases

In [None]:
# days_low: location key -> lowest day to include in the curve fitting.
# Keys that are absent get a data-driven default inside the fitting loop.
# (alternative: days_low = {key: 21 for key in df_train['unique_key'].unique()})
days_low = {
    # adding manual days_low here
}

In [None]:
# key = 'Australia_New South Wales'
# df = df_train.loc[df_train['unique_key'] == key]
# min_day = df.loc[df['ConfirmedCases'] > df['ConfirmedCases'].max() / 10, 'days'].min()

In [None]:
# df.loc[df['days'] >= min_day, 'days']

In [None]:
# params, _ = curve_fit(
#             log1p_sigmoid, 
#             df.loc[df['days'] >= min_day, 'days'], 
#             np.log1p(df.loc[df['days'] >= min_day, 'ConfirmedCases']), 
#             p0=[a, b, c], bounds=(0, np.inf))

In [None]:
%%time
# Fit log1p_sigmoid to the confirmed-case series of every location.
#   params_confirmed : key -> (a, b, c); the initial guess is kept when the fit fails
#   issue_locations  : keys whose fit raised, to be retried by hand later
params_confirmed = {}
issue_locations = []
location_keys = df_train['unique_key'].unique()
for i, key in enumerate(location_keys):
    df = df_train.loc[df_train['unique_key'] == key]
    # Initial guess: plateau at twice the observed maximum, fixed growth rate,
    # midpoint where the series is closest to half of its maximum.
    a = df['ConfirmedCases'].max() * 2
    b = 0.2
    c = find_center(df['ConfirmedCases'].values)
    if key not in days_low:
        # Default window start: first day exceeding 10% of the maximum.
        min_day = df.loc[df['ConfirmedCases'] > df['ConfirmedCases'].max() / 10, 'days'].min()
        if pd.isna(min_day):
            # No row exceeds the threshold (e.g. an all-zero series); fall back
            # to the first available day so the '%3d' format below cannot fail.
            min_day = df['days'].min()
        days_low[key] = min_day
    else:
        min_day = days_low[key]
    print('%39s'%key, 'min_day=%3d'%min_day, 'init=', (a, b, c), end=' ')

    fit_mask = df['days'] >= min_day
    try:
        params, _ = curve_fit(
            log1p_sigmoid,
            df.loc[fit_mask, 'days'],
            np.log1p(df.loc[fit_mask, 'ConfirmedCases']),
            p0=[a, b, c], bounds=(0, np.inf))
        a, b, c = params
        print('fit = (%6d, %.3f, %.1f)'%(a, b, c))
    except Exception:
        # Best effort: keep the initial guess and flag the location, but let
        # KeyboardInterrupt / SystemExit propagate.
        print('fit = (%6d, %.3f, %.1f) Not Optimal'%(a, b, c))
        issue_locations.append(key)
    params_confirmed[key] = (a, b, c)

    # Five locations per figure row.
    if i % 5 == 0:
        plt.figure(figsize=(20, 4))
    plt.subplot(151 + i % 5)
    plt.title(key)
    plt.scatter(df['days'], df['ConfirmedCases'], label='true', s=10, alpha=0.8)
    plt.scatter(df.loc[fit_mask, 'days'], df.loc[fit_mask, 'ConfirmedCases'],
                label='true train', s=10, alpha=0.7)
    pred = mysigmoid(days_range, a, b, c)
    plt.plot(days_range, pred, label='pred')
    plt.grid(); plt.yscale('log')
    if i % 5 == 4:
        plt.legend(); plt.show()
# Flush the last row only when it was left partially filled; a completed row
# has already been shown inside the loop.
if len(location_keys) % 5 != 0:
    plt.legend(); plt.show()

In [None]:
# Show the (a, b, c) tuple stored for every location key (fitted values, or
# the initial guess for locations whose fit failed).
display(params_confirmed)

## Issue fix round 1

In [None]:
print('Locations need manually fix:', len(issue_locations))
# Emit each flagged location as a ready-to-paste dict entry.
for l in issue_locations:
    entry_prefix = "    '%s': " % l
    print(entry_prefix, params_confirmed[l], sep='', end=',\n')

In [None]:
# Average b (for b without fitting issue)
b_mean = []
for l in params_confirmed:
    if l not in issue_locations:
        b_mean.append(params_confirmed[l][1])
b_mean = np.mean(b_mean)
print('Global average b:', b_mean)

In [None]:
# Modify params_confirmed here
fix_init_params = {
    'Angola_NaN': (34, 0.2, 80),
    'Bangladesh_NaN': (328, 0.2, 80),
    'Belarus_NaN': (1722, 0.2, 80),
    'Belize_NaN': (14, 0.2, 90),
    'Benin_NaN': (52, 0.2, 90),
    'Bhutan_NaN': (10, 0.2, 70),
    'Botswana_NaN': (12, 0.2, 80),
    'Cameroon_NaN': (1316, 0.2, 80),
    'Canada_Northwest Territories': (10, 0.2, 80),
    'Central African Republic_NaN': (16, 0.2, 80),
    'China_Hong Kong': (1870, 0.2, 64),
    'China_Macau': (88, 0.2, 60),
    'Congo (Brazzaville)_NaN': (90, 0.2, 80),
    'Denmark_NaN': (10142, 0.2, 80),
    'Djibouti_NaN': (180, 0.2, 80),
    'Ethiopia_NaN': (104, 0.2, 80),
    'Fiji_NaN': (30, 0.2, 80),
    'France_Saint Barthelemy': (12, 0.2, 80),
    'Gabon_NaN': (60, 0.2, 80),
    'Georgia_NaN': (392, 0.2, 90),
    'Guinea-Bissau_NaN': (66, 0.2, 80),
    'Guyana_NaN': (66, 0.2, 80),
    'India_NaN': (10622, 0.2, 80),
    'Japan_NaN': (7812, 0.2, 80),
    'Kuwait_NaN': (1486, 0.2, 80),
    'Liberia_NaN': (28, 0.2, 80),
    'Moldova_NaN': (2112, 0.2, 80),
    'Nepal_NaN': (18, 0.2, 65),
    'Papua New Guinea_NaN': (4, 0.2, 58),
    'Peru_NaN': (5908, 0.2, 71),
    'Qatar_NaN': (4114, 0.2, 95),
    'Saint Lucia_NaN': (28, 0.2, 67),
    'Saint Vincent and the Grenadines_NaN': (16, 0.2, 72),
    'Sudan_NaN': (28, 0.2, 69),
    'US_District of Columbia': (2422, 0.15, 90),
    'US_Nebraska': (894, 0.2, 70),
    'United Arab Emirates_NaN': (4718, 0.2, 72),
    'United Kingdom_Cayman Islands': (90, 0.2, 70),
    'United Kingdom_Turks and Caicos Islands': (16, 0.2, 66),
    'Uzbekistan_NaN': (1040, 0.2, 73),
    'West Bank and Gaza_NaN': (522, 0.2, 80),
}
days_low.update({
    'Angola_NaN': 60, 'Bangladesh_NaN': 70, 'Belarus_NaN': 65, 'China_Macau': 50, 'China_Hong Kong': 50, 
    'Denmark_NaN': 51, 'Djibouti_NaN': 63, 'Ethiopia_NaN': 50, 'Georgia_NaN': 50, 'India_NaN': 42, 'Moldova_NaN': 59, 'Peru_NaN': 59,
    'Nepal_NaN': 61, 'Qatar_NaN': 65, 'US_District of Columbia': 57, 'US_Nebraska': 50, 'Uzbekistan_NaN': 58, 'West Bank and Gaza_NaN': 60, 
})
# Refit the manually listed locations, this time with bounded parameters:
#   a in [observed max, 1e9], b in [0, 0.3], c in [9, 120].
# issue_locations2 collects the keys whose refit still fails; those keep the
# hand-written initial guess as their final parameters.
issue_locations2 = []
for i, (key, params0) in enumerate(fix_init_params.items()):
    df = df_train.loc[df_train['unique_key'] == key]
    if key not in days_low:
        # Default window start: first day exceeding 10% of the maximum.
        min_day = df.loc[df['ConfirmedCases'] > df['ConfirmedCases'].max() / 10, 'days'].min()
        days_low[key] = min_day
    else:
        min_day = days_low[key]
    print('%39s'%key, 'min_day=%3d'%min_day, 'init=', params0, end=' ')

    fit_mask = df['days'] >= min_day
    try:
        params, _ = curve_fit(
            log1p_sigmoid,
            df.loc[fit_mask, 'days'],
            np.log1p(df.loc[fit_mask, 'ConfirmedCases']),
            p0=params0, bounds=([df['ConfirmedCases'].max(), 0, 9], [1e9, 0.3, 120]), ftol=0.00001)
        a, b, c = params
        print('fit = (%6d, %.3f, %.1f)'%(a, b, c))
    except Exception:
        # Best effort: fall back to the manual guess, but do not swallow
        # KeyboardInterrupt / SystemExit as a bare `except:` would.
        a, b, c = params0
        print('fit = (%6d, %.3f, %.1f) Not Optimal'%params0)
        issue_locations2.append(key)
    params_confirmed[key] = (a, b, c)

    # Five locations per figure row.
    if i % 5 == 0:
        plt.figure(figsize=(20, 4))
    plt.subplot(151 + i % 5)
    plt.title(key)
    plt.scatter(df['days'], df['ConfirmedCases'], label='true', s=10, alpha=0.8)
    plt.scatter(df.loc[fit_mask, 'days'], df.loc[fit_mask, 'ConfirmedCases'],
                label='true train', s=10, alpha=0.7)
    pred = mysigmoid(days_range, a, b, c)
    plt.plot(days_range, pred, label='pred')
    plt.yscale('log'); plt.ylim(0.5, max(pred.max(), df['ConfirmedCases'].max()) * 1.2)
    plt.grid()
    if i % 5 == 4:
        plt.legend(); plt.show()
# Flush the last row only when it was left partially filled; using the dict
# length avoids depending on the loop variable after the loop.
if len(fix_init_params) % 5 != 0:
    plt.legend(); plt.show()

In [None]:
print('Locations need manually fix:', len(issue_locations2))
# Emit each still-failing location as a ready-to-paste dict entry.
for l in issue_locations2:
    entry_prefix = "    '%s': " % l
    print(entry_prefix, params_confirmed[l], sep='', end=',\n')

In [None]:
# Modify params_confirmed here
# params_confirmed.update({
#     'Angola_NaN': (34, 0.2, 80),
#     'Bangladesh_NaN': (328, 0.2, 90),
#     'Belarus_NaN': (1722, 0.2, 90),
#     'Belize_NaN': (14, 0.2, 90),
#     'Benin_NaN': (52, 0.2, 90),
#     'Bhutan_NaN': (10, 0.2, 58),
#     'Botswana_NaN': (240, 0.24296055599297614, 100),
#     'Cameroon_NaN': (1316, 0.2, 71),
#     'Canada_Northwest Territories': (10, 0.2, 70),
#     'Central African Republic_NaN': (16, 0.2, 58),
#     'China_Hong Kong': (1135.6527048366922, 0.16459610931574645, 65.94539031100251),
#     'China_Macau': (47.71918636875172, 0.18733368420124386, 60.670161620607324),
#     'Congo (Brazzaville)_NaN': (90, 0.2, 71),
#     'Denmark_NaN': (10142, 0.2, 68),
#     'Djibouti_NaN': (120, 0.3, 72),
#     'Ethiopia_NaN': (160, 0.15, 80),
#     'Fiji_NaN': (30, 0.2, 71),
#     'France_Saint Barthelemy': (12, 0.2, 42),
#     'Gabon_NaN': (60, 0.2, 69),
#     'Georgia_NaN': (600, 0.12, 80),
#     'Guinea-Bissau_NaN': (66, 0.2, 72),
#     'Guyana_NaN': (66, 0.2, 60),
#     'India_NaN': (15000, 0.2, 80),
#     'Japan_NaN': (7812, 0.2, 69),
#     'Kuwait_NaN': (5000, 0.15, 90),
#     'Liberia_NaN': (28, 0.2, 72),
#     'Moldova_NaN': (1182, 0.2, 68),
#     'Nepal_NaN': (9.98185012162651, 0.22852344998257335, 67.6680944469595),
#     'Papua New Guinea_NaN': (4, 0.2, 58),
#     'Peru_NaN': (3190, 0.2, 67),
#     'Qatar_NaN': (2150, 0.2, 63),
#     'Saint Lucia_NaN': (28, 0.2, 67),
#     'Saint Vincent and the Grenadines_NaN': (16, 0.2, 72),
#     'Sudan_NaN': (28, 0.2, 69),
#     'US_District of Columbia': (2422, 0.2, 70),
#     'US_Nebraska': (558, 0.2, 68),
#     'United Arab Emirates_NaN': (2528, 0.2, 68),
#     'United Kingdom_Cayman Islands': (90, 0.2, 70),
#     'United Kingdom_Turks and Caicos Islands': (16, 0.2, 66),
#     'Uzbekistan_NaN': (1040, 0.2, 73),
#     'West Bank and Gaza_NaN': (388, 0.2, 66),
# })


# Redraw every flagged location with its current parameters, five per row.
for i, key in enumerate(issue_locations):
    slot = i % 5
    if slot == 0:
        plt.figure(figsize=(20, 4))
    df = df_train.loc[df_train['unique_key'] == key]
    plt.subplot(1, 5, slot + 1)
    plt.title(key)
    plt.scatter(df['days'], df['ConfirmedCases'], label='true', s=10, alpha=0.8)
    a, b, c = params_confirmed[key]
    pred = [mysigmoid(x, a, b, c) for x in days_range]
    plt.plot(days_range, pred, label='pred')
    plt.grid()
    plt.legend()
    if slot == 4:
        plt.show()
if i%5 != 4: plt.show()

In [None]:
# Keys of all locations whose key starts with 'China'.
locations_china = list(filter(lambda key: key.startswith('China'), locations))
print(locations_china)

### China

In [None]:
# Emit the current parameters of every China location as dict entries.
for l in locations_china:
    entry_prefix = "    '%s': " % l
    print(entry_prefix, params_confirmed[l], sep='', end=',\n')

In [None]:
# Modify params_confirmed here
# Hand-set (a, b, c) for a few China locations, then redraw all of them.
china_overrides = {
    'China_Beijing': (600, 0.16008712959269888, 16),
    'China_Gansu': (150.68773390606425, 0.15846088303130867, 15.428679276487077),
    'China_Inner Mongolia': (120.6382433255926, 0.17158973997712573, 16),
    'China_Shanghai': (580, 0.20081457526501872, 14),
    'China_Tianjin': (170.7370847679753, 0.17889575614705566, 14.99427817956847),
}
params_confirmed.update(china_overrides)


for i, key in enumerate(locations_china):
    slot = i % 5
    if slot == 0:
        plt.figure(figsize=(20, 4))
    df = df_train.loc[df_train['unique_key'] == key]
    plt.subplot(1, 5, slot + 1)
    plt.title(key)
    plt.scatter(df['days'], df['ConfirmedCases'], label='true', s=10, alpha=0.8)
    a, b, c = params_confirmed[key]
    pred = [mysigmoid(x, a, b, c) for x in days_range]
    plt.plot(days_range, pred, label='pred')
    plt.yscale('log')
    # Keep the lower limit positive for the log axis.
    y_floor = max(0.5, df['ConfirmedCases'].min() * 0.1)
    y_ceiling = max(max(pred), df['ConfirmedCases'].max()) * 1.2
    plt.ylim(y_floor, y_ceiling)
    plt.grid()
    plt.legend()
    if slot == 4:
        plt.show()
if i%5 != 4: plt.show()

### Fatalities

In [None]:
# days_low: location key -> lowest day to include in the curve fitting.
# For fatalities every location starts from day 35 unless overridden below.
days_low = dict.fromkeys(df_train['unique_key'].unique(), 35)
days_low.update({
    # adding manual days_low here
})

In [None]:
multiple = 4.6  # use to linear fit 

# Per-location fatalities model, stored in params_fatalities:
#   * keys containing "China": no entry; the plot is flat at the observed maximum
#   * at most 3 fatalities: [fmax, fmax * multiple] (a two-element list)
#   * otherwise: (a, b, c) of a sigmoid whose plateau a is pinned and whose
#     b, c are fitted (the initial b, c are kept when the fit fails)
params_fatalities = {}
for key in df_train['unique_key'].unique():
    print(key)

    df = df_train.loc[df_train['unique_key'] == key]
    fmax = df['Fatalities'].max()

    # The plateau is fixed, not fitted.
    a = fmax * (2**5.5)
    # Initial guesses; c is NaN when the location has no fatalities, but that
    # case never reaches the fit (it is handled by the fmax <= 3 branch).
    b, c = 0.5, df.loc[df['Fatalities'] != 0, 'days'].min() + 21

    def mysigmoid_fixa_fat(x, b, c, a=a):
        """Sigmoid in (b, c) with the plateau bound to this location's ``a``."""
        return a * expit(b * (x - c))

    plt.plot(df['days'], df['Fatalities'], '*', label='actual data')

    if "China" in key:
        pred = [fmax for _ in range(100)]
    elif fmax <= 3:
        params_fatalities[key] = [fmax, fmax*multiple]
        # NOTE(review): the plotted ramp ends at fmax * (1 + multiple), while the
        # stored endpoint is fmax * multiple; only the plot uses this ramp.
        pred = [fmax for x in range(69)] + [fmax + (x-69)/(99-69)*(multiple)*fmax for x in range(69,100)]
    else:
        try:
            fit_mask = df['days'] >= days_low[key]
            params, _ = curve_fit(mysigmoid_fixa_fat, df.loc[fit_mask, 'days'], df.loc[fit_mask, 'Fatalities'], p0=[b, c])
            b, c = params
        except Exception:
            # Best effort: keep the initial b, c, but let KeyboardInterrupt /
            # SystemExit propagate.
            print('Warning: for key {} cannot find curve, manually write one below. (a0, b0, c0) {}, {}, {}'.format(key, a, b, c))
        params_fatalities[key] = (a, b, c)
        pred = [mysigmoid_fixa_fat(x, b, c) for x in range(100)]

    plt.plot(range(100), pred, label='my-prediction cuve')
    plt.title(key)
    plt.show()

In [None]:
# Show the stored fatalities parameters: a two-element list for low-count
# locations, an (a, b, c) tuple for sigmoid-fitted ones.
display(params_fatalities)

#### Modify params_fatalities

In [None]:
# Location keys whose fatalities parameters are printed and re-plotted for review.
# NOTE(review): the manual override cell also sets 'Japan_NaN' and
# 'Philippines_NaN', which are not listed here and so are never plotted.
f_issue = ['Costa Rica_NaN', 'Diamond Princess_NaN', 'France_NaN', 'Ghana_NaN', 'Iran_NaN', 'Taiwan*_NaN']

In [None]:
# Emit the current fatalities parameters of each flagged location as dict entries.
for l in f_issue:
    entry_prefix = "    '%s': " % l
    print(entry_prefix, params_fatalities[l], sep='', end=',\n')

In [None]:
# Modify params_fatalities here

# Hand-written sigmoid parameters (a, b, c) that replace the automatic
# fatalities fit for these locations.
manual_fatality_params = {
    'Costa Rica_NaN': (4, 0.2, 70),
    'Diamond Princess_NaN': (12, 0.12, 50),
    'France_NaN': (33000, 0.2, 80),
    'Ghana_NaN': (7, 0.2, 65),
    'Iran_NaN': (20000, 0.09, 90),
    'Japan_NaN': (1000, 0.06, 115),
    'Philippines_NaN': (2400, 0.1, 100),
    'Taiwan*_NaN': (8, 0.2, 70),
}
params_fatalities.update(manual_fatality_params)

# Only France is drawn in this cell: actual data against the manual curve.
key = 'France_NaN'
df = df_train.loc[df_train['unique_key'] == key]
a, b, c = params_fatalities[key]
pred = [mysigmoid(x, a, b, c) for x in range(100)]
plt.plot(df['days'], df['Fatalities'], '*', label='actual data')
plt.plot(range(100), pred, label='my-prediction cuve')
plt.show()

In [None]:
# One figure per flagged location: observed fatalities against the stored sigmoid.
for l in f_issue:
    df = df_train[df_train['unique_key'] == l]
    plt.plot(df['days'], df['Fatalities'], '*', label='actual data')
    params = params_fatalities[l]
    day_axis = range(100)
    pred = [mysigmoid(day, *params) for day in day_axis]
    plt.plot(day_axis, pred, label='pred')
    plt.title(l)
    plt.show()

### Generate submission

In [None]:
# Build the submission: one row per test row with predicted confirmed cases
# and fatalities, rounded to integers.
pred_fid = []
pred_confirmed = []
pred_fatalities = []

# Maximum recorded fatalities per location, keyed by unique_key. This must be
# looked up by location: a test-row index label does not identify the matching
# rows in df_train.
fatalities_max = df_train.groupby('unique_key')['Fatalities'].max().to_dict()

for fid, key, day in zip(df_test['ForecastId'], df_test['unique_key'], df_test['days']):
    pred_fid.append(fid)

    # confirmed
    a, b, c = params_confirmed[key]
    pred_confirmed.append(mysigmoid(day, a, b, c))

    # fatalities
    if "China" in key:
        # Held flat at the location's observed maximum.
        pred_fatalities.append(fatalities_max[key])
    elif len(params_fatalities[key]) == 2:
        # Two-element entries are [y0, y1]; the prediction is held at y0.
        # NOTE(review): y1 (the linear-ramp endpoint) is stored but not used
        # for the submitted value - confirm this is intended.
        pred_fatalities.append(params_fatalities[key][0])
    else:
        a, b, c = params_fatalities[key]
        pred_fatalities.append(mysigmoid(day, a, b, c))

# out
df_out = pd.DataFrame({'ForecastId': pred_fid,
                       'ConfirmedCases': np.around(pred_confirmed).astype('int'),
                       'Fatalities': np.around(pred_fatalities).astype('int')})
display(df_out.head(10)); display(df_out.tail(10))
df_out.to_csv('submission.csv',index=False)