In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import warnings
import altair as alt
from datetime import datetime


# Limit displayed DataFrame/Series output to at most 50 rows.
pd.set_option("display.max_rows", 50)

In [None]:
# Load the case records from the Kaggle dataset CSV and preview the first rows.
CASES_CSV = "/kaggle/input/confirmed-covid-cases-in-thailand-as-of-20210721/covid-2021-07-21.csv"
data = pd.read_csv(CASES_CSV)
data.head()

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame.
data.dtypes

# Basic statistics

### 1. Property and Count

## Age and Sex

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Histogram of the `age` column using pandas' default binning.
data["age"].hist()

In [None]:
# List every distinct value recorded in `sex` together with how often it occurs.
data["sex"].value_counts()

In [None]:
# Keep only rows whose `sex` is "ชาย" (male) or "หญิง" (female); every other
# value is dropped from the analysis.
# `.copy()` turns the filtered result into an independent frame, so adding
# columns to `data` later does not raise pandas' SettingWithCopyWarning.
data = data[data.sex.isin(["ชาย", "หญิง"])].copy()
data.sex.value_counts()

In [None]:
# Bar chart of the number of cases per sex, with grid lines switched on.
sex_counts = data["sex"].value_counts()
sex_axes = sex_counts.plot.bar()
sex_axes.grid()

In [None]:
# Split the cases into one frame per sex ("ชาย" -> male_data, "หญิง" -> female_data).
is_male = data["sex"] == "ชาย"
is_female = data["sex"] == "หญิง"
male_data = data.loc[is_male]
female_data = data.loc[is_female]

In [None]:
# Age-bucket settings: `num_bins` equal-width buckets covering ages 0..max_range.
# The original note records the largest age in the data as 108; ages above
# `max_range` fall outside the histogram range and are ignored.
max_range, num_bins = 100, 20
print(f"bucket_size: {max_range / num_bins}")

In [None]:
# Count cases per age bucket for each sex; both calls share the same bin
# settings so the two count arrays line up bucket by bucket.
age_bin_settings = {"bins": num_bins, "range": (0, max_range)}
male_counts, male_range = np.histogram(male_data.age, **age_bin_settings)
female_counts, female_range = np.histogram(female_data.age, **age_bin_settings)

In [None]:
# Side-by-side male/female counts per age bucket. Each row is labelled with the
# upper edge of its bucket (the histogram edges minus the leading one).
sex_compar_data = pd.DataFrame(
    {"male_count": male_counts, "female_count": female_counts},
    index=male_range[1:],
)
sex_compar_data

In [None]:
# Grouped bar chart comparing male and female counts per age bucket.
age_sex_axes = sex_compar_data.plot.bar()
age_sex_axes.grid()

## Cluster

In [None]:
# Keep only the columns needed for the cluster analysis, then count how many
# cases were recorded for each `risk` value.
cluster_columns = ["risk", "announce_date", "province_of_onset"]
cluster_data = data[cluster_columns]
cluster_count_data = cluster_data["risk"].value_counts().to_frame()
cluster_count_data

In [None]:
# Move the risk labels out of the index into a regular column and give both
# columns descriptive names.
cluster_count_data = (
    cluster_count_data
    .reset_index()
    .set_axis(["cluster_name", "num_patients"], axis=1)
)
cluster_count_data

In [None]:
# Quick matplotlib bar chart of the counts, with grid lines.
cluster_axes = cluster_count_data.plot.bar()
cluster_axes.grid()

In [None]:
# Patients per risk category; bars are ordered by descending y value.
by_count_desc = {"encoding": "y", "order": "descending"}
category_axis = alt.X('cluster_name:N', sort=by_count_desc)
patients_axis = alt.Y('num_patients:Q')
alt.Chart(cluster_count_data).mark_bar().encode(x=category_axis, y=patients_axis).properties(width=1400)

The same chart with a logarithmic y-axis:

In [None]:
# Same bar chart as above, but with a logarithmic y scale.
by_count_desc = {"encoding": "y", "order": "descending"}
category_axis = alt.X('cluster_name:N', sort=by_count_desc)
patients_log_axis = alt.Y('num_patients:Q', scale=alt.Scale(type='log'))
alt.Chart(cluster_count_data).mark_bar().encode(x=category_axis, y=patients_log_axis).properties(width=1400)

## 2. Trend

In [None]:
# Parse the day/month/year `announce_date` strings into a datetime `date` column.
# pd.to_datetime converts the whole column in one vectorized call instead of
# running datetime.strptime once per row. `assign` returns a new frame rather
# than writing into the filtered `data`, which avoids SettingWithCopyWarning.
data = data.assign(date=pd.to_datetime(data.announce_date, format="%d/%m/%Y"))
data

In [None]:
# Number of cases per date, as a two-column frame: `date`, `new_cases`.
trend_data = (
    data.groupby("date")
    .size()
    .reset_index(name="new_cases")
)
trend_data

In [None]:
# Line chart of new cases over time.
date_axis = alt.X('date:T')
new_cases_axis = alt.Y('new_cases:Q')
alt.Chart(trend_data).mark_line().encode(x=date_axis, y=new_cases_axis).properties(width=1400)

The same chart with a logarithmic y-axis:

In [None]:
# Line chart of new cases over time, with a logarithmic y scale.
date_axis = alt.X('date:T')
new_cases_log_axis = alt.Y('new_cases:Q', scale=alt.Scale(type='log'))
alt.Chart(trend_data).mark_line().encode(x=date_axis, y=new_cases_log_axis).properties(width=1400)

# Cluster Analysis

Analyze each cluster from several perspectives: its total number of cases, the number of days on which it produced cases, and the number of provinces it reached.

In [None]:
# Build the frame used for the cluster analysis: risk label, parsed announcement
# date, and province of onset.
# pd.to_datetime parses the whole day/month/year column in one vectorized call
# instead of running datetime.strptime once per row. `assign` returns a new
# frame, so `cluster_data` itself is left untouched.
ccluster_data = cluster_data.assign(
    date=pd.to_datetime(cluster_data.announce_date, format="%d/%m/%Y")
)[["risk", "date", "province_of_onset"]]
ccluster_data

## 1. Cluster total cases

In [None]:
# Keep only the rows whose label contains the text "Cluster", and relabel the
# two columns as `risk` / `num_patients`.
has_cluster_label = cluster_count_data["cluster_name"].str.find("Cluster") != -1
actual_cluster_count_data = (
    cluster_count_data
    .loc[has_cluster_label]
    .set_axis(["risk", "num_patients"], axis=1)
)
actual_cluster_count_data

In [None]:
# Total patients per cluster on a log scale, bars ordered by descending count.
# The frame's label column is named `risk` (not `cluster_name`), so the x
# encoding must reference `risk`; otherwise the field does not exist in the data.
alt.Chart(actual_cluster_count_data).mark_bar().encode(
    x=alt.X('risk:N', sort={"encoding": "y", "order": "descending"}),
    y=alt.Y('num_patients:Q', scale=alt.Scale(type='log')),
).properties(width=1400)

## 2. Cluster spread period analysis

In [None]:
# Pivot to a date x risk table: one row per date, one column per risk value,
# each cell holding the number of cases for that pair.
cluster_count_by_date = pd.crosstab(
    index=ccluster_data["date"],
    columns=ccluster_data["risk"],
)
cluster_count_by_date

In [None]:
# For each risk value, count the number of dates on which it had at least one case.
had_cases = cluster_count_by_date > 0
cluster_period = had_cases.sum(axis=0).to_frame("days").reset_index()
cluster_period

In [None]:
# Days with at least one case per risk value, bars ordered by descending count.
by_days_desc = {"encoding": "y", "order": "descending"}
risk_axis = alt.X('risk:N', sort=by_days_desc)
days_axis = alt.Y('days:Q')
alt.Chart(cluster_period).mark_bar().encode(x=risk_axis, y=days_axis).properties(width=1400)

In [None]:
# Restrict to the rows whose risk label contains the text "Cluster".
is_cluster_row = cluster_period["risk"].str.find("Cluster") != -1
actual_cluster_period = cluster_period.loc[is_cluster_row]
actual_cluster_period

In [None]:
# Days with at least one case per cluster, bars ordered by descending count.
by_days_desc = {"encoding": "y", "order": "descending"}
risk_axis = alt.X('risk:N', sort=by_days_desc)
days_axis = alt.Y('days:Q')
alt.Chart(actual_cluster_period).mark_bar().encode(x=risk_axis, y=days_axis).properties(width=1400)

## 3. Number of provinces each cluster spread to

In [None]:
# For each cluster, count the number of distinct provinces of onset among its cases.
cluster_rows = ccluster_data[ccluster_data.risk.str.find("Cluster") != -1]
actual_cluster_spread = (
    cluster_rows
    .groupby("risk")["province_of_onset"]
    .nunique()
    .to_frame("num_spread")
    .reset_index()
)
actual_cluster_spread

In [None]:
# Number of distinct provinces per cluster, bars ordered by descending count.
# Charts `actual_cluster_spread`, the frame built in the previous cell; the
# name `cluster_spread` is never defined and raised NameError on a fresh kernel.
alt.Chart(actual_cluster_spread).mark_bar().encode(
    x=alt.X('risk:N', sort={"encoding": "y", "order": "descending"}),
    y=alt.Y('num_spread:Q'),
).properties(width=1400)

## Join

In [None]:
# Combine the three per-cluster tables (patients, days, provinces) into one
# frame. They are aligned on `risk`, and the inner join keeps only the clusters
# present in all three.
to_join = [actual_cluster_count_data, actual_cluster_period, actual_cluster_spread]

indexed_by_risk = []
for table in to_join:
    indexed_by_risk.append(table.set_index("risk"))

combinded = pd.concat(indexed_by_risk, axis=1, join="inner").reset_index()
combinded

In [None]:
# Bubble chart: x = days with cases, y = provinces reached, bubble size =
# patients, colour = cluster. Clusters with more than 100 patients also get
# two text labels above their bubble (name and patient count).
labelled_clusters = combinded[combinded.num_patients > 100]


def _text_layer(field, dy):
    """Return a black text layer showing `field` for the labelled clusters, offset by `dy`."""
    return (
        alt.Chart(labelled_clusters)
        .mark_text(dy=dy, color="black")
        .encode(x='days', y='num_spread', text=field)
    )


chart = alt.Chart(combinded).mark_circle().encode(
    x='days',
    y='num_spread',
    color='risk',
    size='num_patients',
)
text1 = _text_layer('risk', -30)
text2 = _text_layer('num_patients', -15)

(chart + text1 + text2).properties(width=1000).interactive()

### Thanks to Botnoi DSE for sharing the dataset and knowledge.