In [None]:
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
from scipy import stats

import os
import sys

# Silence every warning unless the interpreter was started with explicit
# warning options (sys.warnoptions is non-empty in that case).
# NOTE(review): this blanket filter also hides library deprecation warnings,
# so removed pandas/matplotlib APIs only show up later as hard errors.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
def print_summary(title, series):
    """Print the median and interquartile range of ``series`` on one line.

    Args:
        title: Label printed in front of the numbers.
        series: pandas Series whose ``describe()`` output provides the
            ``25%``, ``50%`` and ``75%`` entries.
    """
    quartiles = series.describe()
    median, lower, upper = (quartiles[key] for key in ('50%', '25%', '75%'))
    print(f'{title}: {median} (IQR {lower}-{upper})')

def print_count(title, count, total):
    """Print ``count`` together with its share of ``total`` as a percentage.

    Args:
        title: Label printed in front of the numbers.
        count: Number of items in the subset.
        total: Size of the whole population used as the denominator.

    The percentage is shown with one decimal place. When ``total`` is zero
    there is no meaningful share, so ``n/a`` is printed instead of dividing
    by zero.
    """
    if total == 0:
        print(f'{title}: {count} (n/a)')
        return
    print(f'{title}: {count} ({count/total * 100:.1f}%)')

def convert_dates(df):
    """Parse every column whose name ends in ``_date`` into datetimes, in place.

    Args:
        df: DataFrame to modify. Columns are overwritten on this object;
            nothing is returned.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``).
    Non-string column labels (e.g. integer positions from a header-less CSV)
    are skipped instead of raising ``AttributeError`` on ``.endswith``.
    """
    date_columns = [
        col for col in df.columns.values
        if isinstance(col, str) and col.endswith('_date')
    ]
    for col in date_columns:
        print('Converting to date', col)
        df[col] = pd.to_datetime(df[col], errors='coerce')

def plot_column(df, column, title, dimensions=['confirmed_week', 'age', 'sex']):
    """Draw a histogram of ``column`` followed by one box plot per dimension.

    Args:
        df: DataFrame containing ``column`` and every column in ``dimensions``.
        column: Name of the numeric column to visualise.
        title: Title of the histogram panel.
        dimensions: Grouping columns; each gets its own box plot via ``boxplot``.

    Panels are laid out on a two-column grid. When the number of panels is
    odd, the leftover grid cell is hidden so no empty frame is rendered.
    """
    ncols = 2
    n_panels = len(dimensions) + 1  # histogram + one box plot per dimension
    nrows = math.ceil(n_panels / ncols)
    _, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, nrows * 5))
    ax = ax.ravel()
    ax[0].set_title(title)
    df[column].hist(ax=ax[0])

    for i, dim in enumerate(dimensions):
        boxplot(df, column, dim, ax[1 + i])

    # Any axes beyond the last panel would otherwise show as an empty box.
    for unused in ax[n_panels:]:
        unused.set_visible(False)

def boxplot(df, column, group, ax):
    """Box plot of ``column`` per value of ``group``, with the raw points overlaid.

    Args:
        df: DataFrame holding ``column``, ``group`` and ``patient_id``.
        column: Numeric column shown on the y axis.
        group: Column whose distinct values define the boxes; rows where it
            is missing are dropped first.
        ax: Matplotlib axes to draw on.

    The title reports the number of non-null ``patient_id`` values left after
    dropping rows without a group value.
    """
    grouped = df[df[group].notna()]
    grouped.pivot(columns=group, values=column).plot.box(ax=ax)
    n_rows = grouped.patient_id.count()
    ax.set_title(f'{column} by {group} (n={n_rows})')

    # Overlay the individual observations (with a little horizontal jitter)
    # so the sample size behind each box is visible.
    levels = grouped[group].sort_values().unique()
    for position, level in enumerate(levels, start=1):
        points = grouped.loc[grouped[group] == level, column]
        jittered_x = np.random.normal(position, 0.04, size=len(points))
        ax.plot(jittered_x, points, 'r.', alpha=0.2)
    
def analyze_interval(df, t1, t2, dimensions=['confirmed_week', 'age', 'sex'], total=None):
    """Summarise and plot the number of days between two dated events.

    Args:
        df: Patient table with datetime columns ``<t1>_date`` and
            ``<t2>_date`` plus a ``patient_id`` column.
        t1: Name of the earlier event, without the ``_date`` suffix.
        t2: Name of the later event, without the ``_date`` suffix.
        dimensions: Grouping columns forwarded to ``plot_column``.
        total: Denominator for the coverage percentage. When omitted, the
            notebook-level ``n_patients`` is used, so that global must be
            defined before calling without ``total``.

    Returns:
        A copy of the rows that have both dates, with an added
        ``<t1>_to_<t2>`` column holding the interval in whole days, or
        ``None`` when no row has both dates.
    """
    col1 = t1 + '_date'
    col2 = t2 + '_date'
    if total is None:
        total = n_patients

    has_both_dates = df[col1].notna() & df[col2].notna()
    # Copy so the new interval column never leaks into the caller's frame.
    df = df[has_both_dates].copy(deep=True)
    n_valid = df.patient_id.count()
    print_count(f'Number of patients with {col1} and {col2}', n_valid, total)
    if n_valid == 0:
        return None

    timediff = (df[col2] - df[col1]).dt.days
    column = f'{t1}_to_{t2}'
    df[column] = timediff
    title = f'Days between {t1} and {t2}'
    print_summary(title, timediff)
    plot_column(df, column, title, dimensions)
    return df
    
# Stamp the run date so readers can tell how fresh the outputs are.
run_date = pd.to_datetime('today').strftime('%m/%d/%Y')
print(f"Last updated on {run_date}")

# Folder holding the dataset CSVs; file names are appended directly to this
# string, so the trailing slash is required.
data_dir = '/kaggle/input/coronavirusdataset/'

# Comparing Case (PatientInfo.csv) and Time Series (Time.csv) Data
Time.csv provides more complete data on total case counts over time, while PatientInfo.csv provides valuable info on the progression of individual cases. Most of the analysis in this notebook focuses on PatientInfo.csv unless otherwise indicated.

TODO: Analyze TimeAge, TimeGender to gain more insights.

In [None]:
# Load the per-patient table and parse all *_date columns into datetimes.
patients = pd.read_csv(data_dir + 'PatientInfo.csv')
convert_dates(patients)
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week gives the same ISO week number (as a nullable
# integer column, <NA> where confirmed_date is missing).
patients['confirmed_week'] = patients.confirmed_date.dt.isocalendar().week

# Only patients with a confirmation date count towards the total.
n_patients = patients.confirmed_date.count()
print('PatientInfo.csv - Total cases as of', patients.confirmed_date.max().strftime('%Y-%m-%d'), ': ', n_patients)
for state in ['isolated', 'released', 'deceased']:
    print_count(state, patients[patients.state == state].state.count(), n_patients)

In [None]:
# Cumulative national counts per day. `isolated` is derived as whatever is
# neither released nor deceased; missing values are zero-filled afterwards.
time_df = (
    pd.read_csv(data_dir + 'Time.csv')
    .assign(isolated=lambda t: t.confirmed - t.released - t.deceased)
    .fillna(0)
)

# The last row is taken as the most recent day.
latest = time_df.iloc[-1]
print('Time.csv - Total cases as of', latest.date, ': ', latest.confirmed)

for state in ('isolated', 'released', 'deceased'):
    print_count(state, latest[state], latest['confirmed'])

print_count('\nIncluded in PatientInfo.csv', n_patients, latest.confirmed)

In [None]:
# Daily number of confirmations / releases / deaths recorded in PatientInfo.csv.
states = ['confirmed', 'released', 'deceased']
# Passing the dict itself to concat labels each column with its state name.
# This does not depend on how value_counts() names its result, which changed
# in pandas 2.0 (the Series is now called "count", so renaming "*_date"
# columns no longer works there).
counts = (
    pd.concat({state: patients[state + '_date'].value_counts() for state in states}, axis=1)
    .fillna(0)
    .sort_index()        # value_counts orders by frequency; plot chronologically
    .rename_axis('date')
    .reset_index()
)
counts['datestr'] = counts.date.dt.strftime('%Y-%m-%d')
# Deaths are scaled x10 so they are visible on the same axis as the other series.
counts['deceasedx10'] = counts.deceased * 10
counts[counts.date > pd.to_datetime('2020-02-16')].plot(x='datestr', y=['confirmed', 'released', 'deceasedx10'], figsize=(20, 4))
plt.title('Daily change in case counts from PatientInfo.csv')

# We'll use this later.
patients_daily = counts.copy(deep=True)

In [None]:
# Re-read Time.csv (this rebinds `time_df` without the derived `isolated` column).
time_df = pd.read_csv(data_dir + 'Time.csv')
# Cumulative share of tests that came back positive.
time_df['positive_rate'] = time_df.confirmed / (time_df.confirmed + time_df.negative)
time_df_diff = time_df[['confirmed', 'released', 'deceased']].diff() # Daily change.
time_df_diff['date'] = time_df.date
time_df_diff['positive_rate'] = time_df.positive_rate
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the replacement.
time_df_diff['week'] = pd.to_datetime(time_df_diff.date).dt.isocalendar().week
# Deaths are scaled x10 so they are visible next to the other series.
time_df_diff['deceasedx10'] = time_df_diff.deceased * 10
# Only plot from ISO week 8 onwards.
filtered = time_df_diff[time_df_diff.week>7]
filtered.plot(x='date', y=['confirmed', 'released', 'deceasedx10'], figsize=(20, 4), title='Daily change in case counts from Time.csv')
_ = plt.plot(filtered.date, filtered.positive_rate * 10000, 'k--', label='positive tests per 10,000', alpha=0.5)
_ = plt.legend()


# Comparing PatientInfo.csv and TimeProvince.csv
After noticing that PatientInfo.csv has a higher death rate and a lower release rate than Time.csv, I wanted to see whether this is due to different reporting methods across the provinces. It turns out that "released" info is lacking for some provinces. **Daegu**, in particular, is missing PatientInfo for most cases: it has 20 deaths out of 63 confirmed cases in the PatientInfo table. Because of this, we will exclude Daegu from some of the analysis in the following sections.

Please also note that the <span style='color:green;'>released</span> status is not fully updated in the PatientInfo table either: at least 400 more patients (as of 2020-04-13) have been released than are recorded there.

In [None]:
# Official per-province totals: keep only the rows of the most recent date.
province = pd.read_csv(data_dir + 'TimeProvince.csv')
most_recent = province.date.max()
province_last = province[province.date == most_recent]

# Per-province totals rebuilt from PatientInfo.csv. A patient is counted only
# when patient_id is present; released/deceased additionally require that state.
id_present = patients.patient_id.notna()
patients_province = (
    pd.DataFrame({
        'province': patients.province,
        'confirmed': id_present.astype('int64'),
        'released': (id_present & (patients.state == 'released')).astype('int64'),
        'deceased': (id_present & (patients.state == 'deceased')).astype('int64'),
    })
    .groupby('province', as_index=False)
    .sum()
)

# Side-by-side comparison, largest official case count first.
province_j = (
    province_last
    .merge(patients_province, on='province')
    .sort_values('confirmed_x', ascending=False)
)
def highlight_err(s):
    """Return per-cell CSS that shades a row whose PatientInfo counts look incomplete.

    Args:
        s: One row with ``confirmed_x``/``released_x`` and
            ``confirmed_y``/``released_y`` entries.

    Returns:
        A list with one style string per entry of ``s``: a light-red
        background when fewer than 80% of confirmed cases or fewer than 50%
        of released cases are present in the ``*_y`` counts, otherwise empty
        strings.
    """
    confirmed_short = s.confirmed_y / s.confirmed_x < .8
    released_short = s.released_y / s.released_x < .5
    cell_style = 'background-color: #ffcccc' if (confirmed_short | released_short) else ''
    return [cell_style] * len(s)
# Explain the merge suffixes, then show the table with incomplete rows shaded.
suffix_note = '*_x contains counts from TimeProvince.csv on the last day, *_y contains counts from PatientInfo.csv.'
print(suffix_note)
province_j.style.apply(highlight_err, axis=1)

In [None]:
# Keep the unfiltered table around; `patients` is rebound below, so every
# later cell works on the data without Daegu.
# NOTE: re-running this cell without re-running the load cell first makes
# `patients_all` a copy of the already-filtered frame.
patients_all = patients.copy(deep=True)

patients = patients[patients.province!='Daegu'].copy(deep=True)

# Daily number of confirmations / releases / deaths, excluding Daegu.
states = ['confirmed', 'released', 'deceased']
# Passing the dict itself to concat labels each column with its state name.
# This does not depend on how value_counts() names its result, which changed
# in pandas 2.0 (the Series is now called "count", so renaming "*_date"
# columns no longer works there).
counts = (
    pd.concat({state: patients[state + '_date'].value_counts() for state in states}, axis=1)
    .fillna(0)
    .sort_index()        # value_counts orders by frequency; plot chronologically
    .rename_axis('date')
    .reset_index()
)
counts['datestr'] = counts.date.dt.strftime('%Y-%m-%d')
# Deaths are scaled x10 so they are visible on the same axis as the other series.
counts['deceasedx10'] = counts.deceased * 10
counts[counts.date > pd.to_datetime('2020-02-16')].plot(x='datestr', y=['confirmed', 'released', 'deceasedx10'], figsize=(20, 4))
plt.title('Daily change in case counts from PatientInfo.csv, excluding Daegu')

# We'll use this later.
patients_daily = counts.copy(deep=True)

# From here on the patient total also excludes Daegu.
n_patients = patients.confirmed_date.count()
print('PatientInfo.csv excluding Daegu - Total cases as of', patients.confirmed_date.max().strftime('%Y-%m-%d'), ': ', n_patients)
for state in ['isolated', 'released', 'deceased']:
    print_count(state, patients[patients.state == state].state.count(), n_patients)

# Number of patients in each state by date of confirmation
From this graph you can see that some patients have been isolated for over 40 days, and a significant percentage of patients admitted in late February are still isolated. However, as we've seen above in the by-province table, at least 400 more patients have been released than are recorded in PatientInfo.csv.

In [None]:
# One row per confirmation date with the number of patients in each state.
# A patient is counted only when patient_id is present.
id_present = patients.patient_id.notna()
per_patient = pd.DataFrame({
    'date': patients.confirmed_date,
    'confirmed': id_present.astype('int64'),
})
for status in ['released', 'deceased', 'isolated']:
    per_patient[status] = (id_present & (patients.state == status)).astype('int64')

time_series = per_patient.groupby('date', as_index=False).sum()
time_series['day'] = time_series.date.dt.strftime('%m/%d/%Y')
_ = time_series.plot.bar(
    x='day',
    y=['isolated', 'released', 'deceased'],
    stacked=True,
    figsize=(20, 4),
)
_ = plt.title('Number of patients in each state by date of confirmation')

## A note about the *released* status
*Released* seems to mean released from quarantine (a patient may be discharged from the hospital before then and self-isolate at home). Time to release is not the same as time to recovery and is influenced by [quarantine/isolation guidelines](https://covidtranslate.org/assets/CovidPlaybook_EN_v0.9.pdf) from the government. For example, a patient whose symptoms have disappeared is still required to receive 2 negative tests 24hrs apart in order to be released. Asymptomatic individuals may be required to quarantine for a fixed period of time, then obtain 2 negative results 24hrs apart. In general we should expect the recovery time to be shorter than the release time. Please see the guidelines for more details: [Korean](https://www.cdc.go.kr/board/board.es?mid=a20507020000&bid=0019&act=view&list_no=366712&tag=&nPage=1) | [English](https://covidtranslate.org/)

# Time from symptom onset to confirmation

In [None]:
# Days from symptom onset to confirmation, for patients that have both dates.
# analyze_interval prints the coverage and median/IQR, draws the plots, and
# returns the filtered frame with a `symptom_onset_to_confirmed` column
# (None when no patient has both dates).
onset_to_confirmed = analyze_interval(patients, 'symptom_onset', 'confirmed')

# Time from confirmation to resolution (released or deceased)

In [None]:
# Days from confirmation to release, for patients that have both dates.
# The result carries a `confirmed_to_released` column (None when no patient
# has both dates).
confirmed_to_released = analyze_interval(patients, 'confirmed', 'released')

In [None]:
# Days from confirmation to death, for patients that have both dates.
# The result carries a `confirmed_to_deceased` column (None when no patient
# has both dates).
confirmed_to_deceased = analyze_interval(patients, 'confirmed', 'deceased')

# Symptom onset to resolution

In [None]:
# Same interval analysis, measured from symptom onset instead of confirmation.
# Each call returns the filtered frame with a `symptom_onset_to_<outcome>`
# day-count column, or None when no patient has both dates.
onset_to_released = analyze_interval(patients, 'symptom_onset', 'released')
onset_to_deceased = analyze_interval(patients, 'symptom_onset', 'deceased')

# Estimating CFR

In [None]:
# Two naive case-fatality estimates, each computed from both data sources.
# When the cells are run in order, `patients` and `n_patients` exclude Daegu
# at this point, while `latest` is the last row of the national Time.csv.
deceased_mask = patients.state == 'deceased'
released_mask = patients.state == 'released'
n_deceased = patients[deceased_mask].patient_id.count()
n_released = patients[released_mask].patient_id.count()
n_resolved = n_deceased + n_released

# A: deaths over all confirmed cases.
cfr_all_patients = n_deceased / n_patients * 100
cfr_all_time = latest.deceased / latest.confirmed * 100
print('A. Simplest formula, total deceased / total confirmed ',
      f'\n    {cfr_all_patients:.2f}%  (PatientInfo.csv)',
      f'\n    {cfr_all_time:.2f}%  (Time.csv)')

# B: deaths over resolved cases only (released + deceased).
cfr_resolved_patients = n_deceased / n_resolved * 100
cfr_resolved_time = latest.deceased / (latest.confirmed - latest.isolated) * 100
print('B. Other simple formula, total deceased / total resolved: ',
      f'\n    {cfr_resolved_patients:.2f}%  (PatientInfo.csv)',
      f'\n    {cfr_resolved_time:.2f}%  (Time.csv)')

Neither of these is accurate.
- **A** uses all cases in the denominator, which assumes that every case still unresolved will recover, resulting in too low an estimate.
- **B** assumes the time from confirmation to resolution is the same for both outcomes. Based on the data we currently have, the time to release is much longer, resulting in too high an estimate.