In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.basemap import Basemap
from tqdm.notebook import tqdm
# Register the `progress_apply` helpers on pandas objects. `pandas` is meant to be called on the
# tqdm class itself; calling it on a throw-away instance also opens an empty progress bar.
tqdm.pandas()
import swifter

## Read all CSV 

In [None]:
# Load the six CSV files found under /kaggle/input/novel-corona-virus-2019-dataset into DataFrames.
# Every variable carries the name of the file it is read from.
time_series_covid_19_confirmed = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/time_series_covid_19_confirmed.csv')
time_series_covid_19_recovered = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/time_series_covid_19_recovered.csv')
covid_19_data = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv')
time_series_covid_19_deaths = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/time_series_covid_19_deaths.csv')
COVID19_line_list_data = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/COVID19_line_list_data.csv')
COVID19_open_line_list = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/COVID19_open_line_list.csv')

In [None]:
time_series_covid_19_confirmed.head()

In [None]:
time_series_covid_19_recovered.head()

In [None]:
covid_19_data.head()

In [None]:
time_series_covid_19_deaths.head()

In [None]:
COVID19_line_list_data.head()

In [None]:
COVID19_open_line_list.head()

## Exploring TimeSeries DataSet
1. time_series_covid_19_confirmed
2. time_series_covid_19_recovered

### Total Length each Dataframe

* 分別檢查兩份資料個數
* 兩者皆為 `442` 筆，其中可能包含 `nan` 的 row

In [None]:
total_size_of_recovered = len(time_series_covid_19_recovered)
total_size_of_confirmed = len(time_series_covid_19_confirmed)
print('recovered size = {:_}\nconfirmed size = {:_}'.format(total_size_of_recovered, total_size_of_confirmed))

### Filter `Nan`

只有 `Province/State` 有 `nan` ，仍然能確定是屬於何種國家。

In [None]:
time_series_covid_19_recovered.isnull().sum().sort_values(ascending = False).head()

In [None]:
time_series_covid_19_confirmed.isnull().sum().sort_values(ascending = False).head()

### Peek unique discrete features except periods

In [None]:
time_series_covid_19_recovered.nunique()[:4]

* `Province/State` : 總共包含 `305` 個不同的省/州（行政區）
* `Country/Region` : 總共包含 `143` 個不同的國家
* `Lat & Long` : 包含不同的經緯度。在這裡意義不大，之後會深入探討分布的位置情形。

### Recovered & Confirmed Distribution For each Country

* 分析武漢肺炎在各個國家的分布情形
* 將康復/確診（recovered/confirmed）比例低於 1% 的國家列為 `Others`
* 從結果不難發現，美國占了最大的比例，其次為 `Others` ，第三為中國

In [None]:
country_recovered = time_series_covid_19_recovered['Country/Region'].value_counts()
country_confirmed = time_series_covid_19_confirmed['Country/Region'].value_counts()

def thresholdAggre(df, percent=0.01):
    """Collapse the small entries of a Series into a single ``'others'`` entry.

    Parameters
    ----------
    df : pandas.Series
        Numeric values indexed by label (despite the parameter name, a Series is
        expected). The input is left unmodified.
    percent : float, default 0.01
        Fraction of the Series total; entries strictly below ``percent * df.sum()``
        are folded into ``'others'``.

    Returns
    -------
    pandas.Series
        Values summed per label, with every small entry merged under ``'others'``.
        Labels come back in the sorted order produced by the group-by.
    """
    threshold = df.sum(skipna=True) * percent
    # Decide row by row instead of looking each label up again: the caller's index is
    # never renamed in place, and repeated labels cannot make the comparison ambiguous.
    is_small = (df < threshold).to_numpy()
    labels = pd.Index(
        ['others' if small else label for label, small in zip(df.index, is_small)],
        name=df.index.name,
    )
    return df.groupby(labels).sum()


country_recovered = thresholdAggre(country_recovered)
country_confirmed = thresholdAggre(country_confirmed)

# One pie per file, side by side. Each chart gets a title, otherwise the two pies
# cannot be told apart once the notebook is skimmed.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, shares) in zip(axes, [('Rows per country (recovered)', country_recovered),
                                      ('Rows per country (confirmed)', country_confirmed)]):
    ax.pie(shares, autopct='%1.1f%%', shadow=True, startangle=140, labels=shares.index)
    ax.axis('equal')  # keep the pie circular
    ax.set_title(title)
    ax.legend()

fig.subplots_adjust(wspace=1)
plt.show()

**Note** : 注意到這裡的統計數據是基於**有發生的地區**而非**有發生地區的人數**。也就是說，從上圖我們可以得知`US`占了最大的比例，
    但不見得有最多的患者，只能說明COVID19在這個地區分布的很廣。因此，為了探討病例的數量，我們必須將時間序列的資料列入計算。

In [None]:
# Row-wise sum of every column after the four identifier columns, done with a vectorised
# pandas sum instead of a Python-level sum per row. A 'total_nums' column left by an earlier
# run is excluded, so re-executing the cell does not fold the previous total into the new one.
# skipna=False keeps the NaN propagation of the builtin sum() that was used before.
# NOTE(review): if the per-date columns hold cumulative counts (TODO confirm against the dataset
# description), summing them over-counts and the last date column would be the real total.
for _frame in (time_series_covid_19_recovered, time_series_covid_19_confirmed):
    _value_columns = [column for column in _frame.columns[4:] if column != 'total_nums']
    _frame['total_nums'] = _frame[_value_columns].sum(axis=1, skipna=False)

In [None]:
country_people_recovered = time_series_covid_19_recovered.groupby('Country/Region')['total_nums'].sum()
country_people_confirmed = time_series_covid_19_confirmed.groupby('Country/Region')['total_nums'].sum()

In [None]:
country_people_recovered = thresholdAggre(country_people_recovered)
country_people_confirmed = thresholdAggre(country_people_confirmed)

# One pie per file, side by side. Each chart gets a title, otherwise the two pies
# cannot be told apart once the notebook is skimmed.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, shares) in zip(axes, [('Summed counts per country (recovered)', country_people_recovered),
                                      ('Summed counts per country (confirmed)', country_people_confirmed)]):
    ax.pie(shares, autopct='%1.1f%%', shadow=True, startangle=140, labels=shares.index)
    ax.axis('equal')  # keep the pie circular
    ax.set_title(title)
    ax.legend()

fig.subplots_adjust(wspace=1)
plt.show()

鏘鏘！當初比例最高的 `US`，在改以病例數計算後佔比低於總數的 `1%`，因此被歸類為 `Others`。接下來我們從全球地圖來觀察病例的分布。

### Correlation

從經緯度資料與總發現病例來做關聯分析，這裡使用correlation matrix來呈現

In [None]:
sns.heatmap(time_series_covid_19_recovered[['Lat','Long', 'total_nums']].corr(), annot=True)

### Using Basemap to see the variation along date

In [None]:
from itertools import chain

def draw_map(m, scale=0.2):
    """Draw a shaded-relief background plus a faint white graticule on ``m``.

    Parameters
    ----------
    m : map object
        Must provide ``shadedrelief``, ``drawparallels`` and ``drawmeridians``
        (the notebook passes a ``Basemap``).
    scale : float, default 0.2
        Forwarded to ``m.shadedrelief``.
    """
    m.shadedrelief(scale=scale)

    # 13 evenly spaced parallels and meridians covering the whole globe.
    parallels = m.drawparallels(np.linspace(-90, 90, 13))
    meridians = m.drawmeridians(np.linspace(-180, 180, 13))

    # Both calls return a dictionary whose values start with the list of drawn lines;
    # restyle every one of them, parallels first and meridians second.
    for graticule in (parallels, meridians):
        for drawn in graticule.values():
            for grid_line in drawn[0]:
                grid_line.set(linestyle='-', alpha=0.3, color='w')

In [None]:
lat = time_series_covid_19_recovered['Lat'].values
lon = time_series_covid_19_recovered['Long'].values
population = time_series_covid_19_recovered['total_nums'].values

In [None]:
fig = plt.figure(figsize=(20, 6), edgecolor='w')
m = Basemap(projection='moll', resolution=None,
            lat_0=0, lon_0=0)

# Colour encodes log(total_nums). Rows whose total is 0 are turned into NaN before the log:
# log(0) is -inf and raises a divide-by-zero RuntimeWarning. Either way the value is non-finite,
# so which points end up drawn is unchanged.
m.scatter(lon, lat, latlon=True,
          c=np.log(np.where(population > 0, population, np.nan)), s=50,
          cmap='Reds', alpha=0.8)

draw_map(m)

In [None]:
# Keep only the per-date columns. They are recognised by their m/d/yy header instead of by
# position, so the selection does not depend on which helper columns were appended to the frame.
_is_date_column = pd.to_datetime(time_series_covid_19_recovered.columns,
                                 format='%m/%d/%y', errors='coerce').notna()
time_series = time_series_covid_19_recovered.loc[:, _is_date_column]
# NOTE(review): if the per-date columns are already cumulative (TODO confirm against the dataset
# description), this cumsum accumulates them a second time.
time_series = time_series.cumsum(axis=1)
# 30-column rolling maximum along the date axis. `rolling(axis=1)` is deprecated, so the window
# runs on the transposed frame. Leading dates without a full window are all-NaN and get dropped.
x = time_series.T.rolling(30).max().T.dropna(axis=1, how='all')
with plt.style.context('Solarize_Light2'):
    for date in x.columns:
        counts = x[date]
        # Mask zeros before the log: log10(0) is -inf and raises a divide-by-zero RuntimeWarning.
        # Either way the value is non-finite, so which points end up drawn is unchanged.
        m.scatter(lon, lat, latlon=True, c=np.log10(counts.where(counts > 0)), s=50, cmap='Reds', alpha=0.8)
        plt.title(date)
        draw_map(m)
        plt.show()


從 `2020.2.20 - 2020.3.14` 的區間中，我們對比較顯著的變化來做case by case的討論:
1. 美國西岸病例數量持續增加，大約從3月開始，數量已經超越美國東岸
2. 歐洲也是從沿岸往內陸感染
3. 澳洲東南岸有疫情
4. 中東地區與中國接壤處疫情逐漸嚴重
5. 中國幾乎全境感染

不難發現在沿岸地區普遍發生病例，而且集中於人口密集地區。

### Population & GDP extra-data with Country all over the world
* World Population Review
    * url: https://worldpopulationreview.com/countries/countries-by-density/
* World GDP Review
    * url: https://worldpopulationreview.com/countries/countries-by-gdp/

In [None]:
population2020 = pd.read_csv("/kaggle/input/population2020/population2020.csv")
GDP2020 = pd.read_csv("/kaggle/input/gdp2020/GDP2020.csv")

In [None]:
population2020.head()

In [None]:
GDP2020.head()

In [None]:
len(population2020['name']), len(GDP2020['country'])

前面有統計過不同國家的數量 `Country/Region` 為 `143` 個。
然而，2020人口的dataset包含了 `232` 個國家，看起來是足夠的，但我們還是得檢查是否涵蓋了全部的國家。

In [None]:
set(time_series_covid_19_recovered['Country/Region']) - set(population2020['name'])

有 `13` 個國家沒有包含到，可能是名字不同，例如 `US` 等同於 `United States`，我們必須找到與之對應的名字。像是島嶼國家例如: `Guernsey` `Jersey` 從人口分布資料中未能找到，因此將他並為所屬的 `United Kingdom`。
有趣的是，其中包含了一些屬於 `Cruise Ship` 的移動式國家lol

In [None]:
set(time_series_covid_19_recovered['Country/Region']) - set(GDP2020['country'])

In [None]:
# Spelling used in the COVID-19 files -> spelling used in the population / GDP tables.
map_state = {'US':'United States', 
             'Korea, South':'South Korea',
             'Cote d\'Ivoire':'Ivory Coast',
             'Czechia':'Czech Republic',
             'Eswatini':'Swaziland',
             'Guernsey':'United Kingdom',
             'Holy See':'Vatican City',
             'Jersey':'United Kingdom',
             'North Macedonia':'Macedonia',
             'Taiwan*':'Taiwan',
             'occupied Palestinian territory':'Palestine'
            }
# Reverse lookup used to rename the external tables to the COVID-19 spelling.
# Only one-to-one aliases can be inverted. 'United Kingdom' is the target of both 'Guernsey'
# and 'Jersey'; inverting it would rename the real United Kingdom row to one of the islands,
# so it would no longer match the 'United Kingdom' rows of the COVID-19 files.
_alias_targets = list(map_state.values())
map_state_rev = {target: alias for alias, target in map_state.items()
                 if _alias_targets.count(target) == 1}

In [None]:
# Give the population table the same key column name and country spelling as the COVID-19 files.
population2020.rename(columns={'name':'Country/Region'},inplace=True)
# A plain dictionary lookup per value; names without an alias are kept unchanged.
population2020['Country/Region'] = population2020['Country/Region'].map(lambda name: map_state_rev.get(name, name))

In [None]:
# Give the GDP table the same key column name and country spelling as the COVID-19 files.
GDP2020.rename(columns={'country':'Country/Region'},inplace=True)
# A plain dictionary lookup per value; names without an alias are kept unchanged.
GDP2020['Country/Region'] = GDP2020['Country/Region'].map(lambda name: map_state_rev.get(name, name))

In [None]:
# Left-join the population table and then the GDP table onto the recovered time series,
# matching on 'Country/Region', and drop the 'pop' column from the result.
# NOTE(review): the frame is overwritten with the merged result, so running this cell a second
# time merges the same tables again; re-run the notebook from the top instead.
# NOTE(review): presumably each country appears once in both lookup tables; duplicate keys there
# would duplicate time-series rows. Verify against the CSV files.
time_series_covid_19_recovered = time_series_covid_19_recovered.merge(population2020, how='left', on='Country/Region')
time_series_covid_19_recovered = time_series_covid_19_recovered.merge(GDP2020, how='left', on='Country/Region').drop(['pop'],axis=1)

In [None]:
time_series_covid_19_recovered

In [None]:
sns.heatmap(time_series_covid_19_recovered[['Lat','Long','pop2020','density','area','total_nums','gdpPerCapita']].corr(), annot=True)

### Map corresponds to Population

In [None]:
# Only the two columns that are needed get their gaps filled, not the whole frame.
total_population = time_series_covid_19_recovered['pop2020'].fillna(0).values
area = time_series_covid_19_recovered['area'].fillna(0).values

fig = plt.figure(figsize=(20, 6), edgecolor='w')
m = Basemap(projection='moll', resolution=None,
            lat_0=0, lon_0=0)

# Background layer: colour = log(population), marker size proportional to area.
# Zeros (including the gaps filled above) become NaN before the log: log(0) is -inf and raises
# a divide-by-zero RuntimeWarning. Either way the value is non-finite, so the drawn points are unchanged.
m.scatter(lon, lat, latlon=True,
          c=np.log(np.where(total_population > 0, total_population, np.nan)), s=area/area.max() * 1000,
          cmap='YlGnBu', alpha=0.4)

# Foreground layer: one 'x' per row, coloured by log(total_nums), with the same zero masking.
m.scatter(lon, lat, latlon=True,
          c=np.log(np.where(population > 0, population, np.nan)), s=50,
          cmap='Reds', alpha=0.9, marker='x')
draw_map(m)

### Map corresponds to GDP

In [None]:
# Only the two columns that are needed get their gaps filled, not the whole frame.
gdpPerCapita = time_series_covid_19_recovered['gdpPerCapita'].fillna(0).values
area = time_series_covid_19_recovered['area'].fillna(0).values

fig = plt.figure(figsize=(20, 6), edgecolor='w')
m = Basemap(projection='moll', resolution=None,
            lat_0=0, lon_0=0)

# Background layer: colour = log(GDP per capita), marker size = GDP per capita / 100.
# Zeros (including the gaps filled above) become NaN before the log: log(0) is -inf and raises
# a divide-by-zero RuntimeWarning. Either way the value is non-finite, so the drawn points are unchanged.
m.scatter(lon, lat, latlon=True,
          c=np.log(np.where(gdpPerCapita > 0, gdpPerCapita, np.nan)), s=gdpPerCapita/100,
          cmap='YlGnBu', alpha=0.4)

# Foreground layer: one 'x' per row, coloured by log(total_nums), with the same zero masking.
m.scatter(lon, lat, latlon=True,
          c=np.log(np.where(population > 0, population, np.nan)), s=50,
          cmap='Reds', alpha=0.9, marker='x')
draw_map(m)

從結果上我們不能說人口密集程度與病例正相關，因為在中國和美國，兩個同樣是人口密度高的國家，但具有截然不同的病例數。但是我們可以說幾乎人口聚集的地方通常會有病例。

### Top 10 Countries by COVID-19 Density (cases per unit area)

In [None]:
time_series_covid_19_recovered['average_covid19_in_country'] = time_series_covid_19_recovered['total_nums'] / time_series_covid_19_recovered['area']

In [None]:
time_series_covid_19_recovered.sort_values('average_covid19_in_country',ascending=False)[['Country/Region','average_covid19_in_country']].head(10)

天哪！新加坡每單位土地面積的病例人數遠高於其他國家，但實際上並不是病例最多的國家，可能是因為國土面積較小，造成每單位土地面積的病例數偏高。

In [None]:
start = time_series_covid_19_recovered.columns.get_loc('1/22/20')
end = time_series_covid_19_recovered.columns.get_loc('3/14/20')
# `end + 1` because a positional slice excludes its stop: the '3/14/20' column belongs to the window.
timeseries = time_series_covid_19_recovered.iloc[:, start:end + 1]

# First date on which a row reports a non-zero value. idxmax on the boolean mask returns the
# earliest True column, so the result does not rely on the counts being monotonic. Rows that
# are zero (or missing) on every date get NaN instead of a made-up date.
_reported = timeseries.fillna(0).ne(0)
_first_nonzero_date = _reported.idxmax(axis=1).where(_reported.any(axis=1))
# Store real dates rather than the m/d/yy header text, so that sorting by 'first_burst' is
# chronological (as text, '2/10/20' sorts before '2/2/20').
time_series_covid_19_recovered['first_burst'] = pd.to_datetime(_first_nonzero_date, format='%m/%d/%y')
time_series_covid_19_recovered_drop_nan = time_series_covid_19_recovered.dropna(subset=['first_burst'])

### First and Last Occurrence in Each Country

In [None]:
time_series_covid_19_recovered_drop_nan.sort_values(by='first_burst')[['Country/Region', 'Province/State' ,'first_burst']].head(15)

從結果來看，我們可以發現從既有的資料中，中國是最早發生病毒的。從資料中的 `2020.1.22 - 2020.3.14` 的期間內，最早發生是在 `2020.1.22` 直到 `2020.1.26` 才在中國以外的國境內看到。

In [None]:
time_series_covid_19_recovered_drop_nan.sort_values(by='first_burst')[['Country/Region', 'Province/State' ,'first_burst']].tail(15)