

이 자료는 [COVID19-Explained through Visualizations](https://www.kaggle.com/anshuls235/covid19-explained-through-visualizations/data) 을 기반으로 Python Visualization 을 공부하기 위해 구성했습니다

사용한 [DataSet](https://www.kaggle.com/c/covid19-global-forecasting-week-4) 은 Kaggle 의 COVID19 Global Forecasting (Week 4) 입니다

# Install Package

In [31]:
!pip install pycountry_convert
!pip install plotly==4.6.0



# Import Library

In [0]:
import pandas as pd
import numpy as np
import datetime as dt
import requests
import sys
from itertools import chain
import pycountry
import pycountry_convert as pc
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OrdinalEncoder
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import GridSearchCV

In [0]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
# NOTE(review): google.colab is only importable on Colab; this cell will not run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
# Make the project folder on the mounted Drive the working directory, so the
# relative './data/...' paths used when loading the CSV files resolve against it.
# NOTE(review): hard-coded, account-specific location -- adjust before running elsewhere.
FOLDER = '/content/drive/My Drive/You Play AI'
os.chdir(FOLDER)

In [0]:
# Training table: one row per (Province_State, Country_Region, Date) with
# cumulative ConfirmedCases and Fatalities (see the preview below).
df_train = pd.read_csv('./data/COVID19/train.csv') 
# NOTE(review): presumably South Korea case-level and patient-level tables
# (the names suggest so) -- confirm the source of these two files.
df_sk_cases = pd.read_csv('./data/COVID19/Case.csv')
df_sk = pd.read_csv('./data/COVID19/PatientInfo.csv')

In [0]:
# Quick look at the training data: first rows, then summary statistics.
display(df_train.head())
display(df_train.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
df_train.info()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0


Unnamed: 0,Id,ConfirmedCases,Fatalities
count,26292.0,26292.0,26292.0
mean,17826.5,1186.183896,62.265594
std,10300.678012,8549.128727,695.049077
min,1.0,0.0,0.0
25%,8913.75,0.0,0.0
50%,17826.5,1.0,0.0
75%,26739.25,117.0,1.0
max,35652.0,203020.0,21067.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26292 entries, 0 to 26291
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              26292 non-null  int64  
 1   Province_State  11172 non-null  object 
 2   Country_Region  26292 non-null  object 
 3   Date            26292 non-null  object 
 4   ConfirmedCases  26292 non-null  float64
 5   Fatalities      26292 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 1.2+ MB


None

In [0]:
# Date is read from the CSV as a string column (dtype object); convert it to
# datetime so that min()/max() and date comparisons work on real timestamps.
df_train['Date'] = pd.to_datetime(df_train['Date'], format = '%Y-%m-%d')

In [0]:
# First and last date present in the training set.
train_date_min = df_train['Date'].min()
train_date_max = df_train['Date'].max()
print(f'Minimum date from training set: {train_date_min}')
print(f'Maximum date from training set: {train_date_max}')

Minimum date from training set: 2020-01-22 00:00:00
Maximum date from training set: 2020-04-14 00:00:00


In [0]:
class country_utils():
    """Resolve dataset country names to an ISO alpha-3 code and a continent name.

    Lookups go through ``pycountry`` / ``pycountry_convert`` and are cached in
    ``self.d``, keyed by the country name exactly as it was passed in. Each
    cache entry is a dict with the keys ``'code'`` and ``'continent'``.
    Names that cannot be resolved are mapped to themselves for both values.
    """

    # Dataset spellings that the direct lookup does not resolve, mapped to the
    # name that is handed to pycountry's fuzzy search instead.
    _SEARCH_ALIASES = {
        'Korea, South': 'Korea, Republic of',
        'South Korea': 'Korea, Republic of',
        'Taiwan*': 'Taiwan',
        'Burma': 'Myanmar',
        'West Bank and Gaza': 'Gaza',
    }

    def __init__(self):
        # country name -> {'code': <alpha-3 or name>, 'continent': <continent or name>}
        self.d = {}

    def get_dic(self):
        """Return the cache dict itself (not a copy)."""
        return self.d

    @staticmethod
    def _code_and_continent(country_obj):
        """Return (alpha_3, continent name) for a pycountry country object."""
        continent_code = pc.country_alpha2_to_continent_code(country_obj.alpha_2)
        continent = pc.convert_continent_code_to_continent_name(continent_code)
        return country_obj.alpha_3, continent

    def get_country_details(self, country):
        """Return ``(alpha_3 code, continent name)`` for ``country``.

        The name is first looked up as-is (exact name, then fuzzy search).
        If that fails, a small set of known dataset spellings is retried
        under an alias. Anything still unresolved -- for example the cruise
        ships 'Diamond Princess' and 'MS Zaandam' -- yields
        ``(country, country)`` rather than raising.
        """
        try:
            country_obj = pycountry.countries.get(name=country)
            if country_obj is None:
                country_obj = pycountry.countries.search_fuzzy(country)[0]
            return self._code_and_continent(country_obj)
        except Exception:
            # Best effort: fall through to the alias handling below.
            pass

        try:
            if 'Congo' in country:
                # The dataset has two different countries containing "Congo".
                # Searching for plain "Congo" resolves both to the same entry,
                # so pick the ISO code explicitly.
                is_drc = 'Kinshasa' in country or 'Democratic' in country
                country_obj = pycountry.countries.get(alpha_3='COD' if is_drc else 'COG')
            elif country in self._SEARCH_ALIASES:
                alias = self._SEARCH_ALIASES[country]
                country_obj = pycountry.countries.search_fuzzy(alias)[0]
            else:
                return country, country
            return self._code_and_continent(country_obj)
        except Exception:
            # Even the alias could not be resolved: use the same fallback as
            # for unknown names instead of aborting the caller's apply/map.
            return country, country

    def get_iso3(self, country):
        """Return the cached code for ``country`` (KeyError if not cached)."""
        return self.d[country]['code']

    def get_continent(self, country):
        """Return the cached continent for ``country`` (KeyError if not cached)."""
        return self.d[country]['continent']

    def add_values(self, country):
        """Resolve ``country`` and store the result in the cache.

        The lookup runs before the cache is touched, so a failing lookup
        cannot leave a half-initialised (empty) entry behind.
        """
        code, continent = self.get_country_details(country)
        self.d[country] = {'code': code, 'continent': continent}

    def _ensure_cached(self, country):
        """Populate the cache entry for ``country`` if it is missing."""
        if country not in self.d:
            self.add_values(country)

    def fetch_iso3(self, country):
        """Return the code for ``country``, resolving and caching it on first use."""
        self._ensure_cached(country)
        return self.get_iso3(country)

    def fetch_continent(self, country):
        """Return the continent for ``country``, resolving and caching it on first use."""
        self._ensure_cached(country)
        return self.get_continent(country)

# COVID-19 Visualization with plotly.express

In [0]:
# Snapshot of the latest available date. max() is used instead of a fixed
# date because the dataset is updated over time.
date = df_train['Date'].max()
df_tm = df_train[df_train['Date'] == date].copy()

obj = country_utils()
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute-accessed column: the chained in-place form is not guaranteed to
# modify df_tm and would leave NaN in the treemap path.
df_tm['Province_State'] = df_tm['Province_State'].fillna('')
df_tm['continent'] = df_tm['Country_Region'].map(obj.fetch_continent)
# Single root node so the treemap hierarchy starts at "World".
df_tm['world'] = 'World'

fig = px.treemap(df_tm, path=['world', 'continent', 'Country_Region', 'Province_State'],
                 values='ConfirmedCases', color='ConfirmedCases',
                 hover_data=['Country_Region'], color_continuous_scale='dense',
                 title='Current share of Worldwide COVID19 Cases')
fig.show()

In [0]:
# Same hierarchy as the cases treemap, sized and coloured by fatalities.
treemap_path = ['world', 'continent', 'Country_Region', 'Province_State']
fig = px.treemap(
    df_tm,
    path=treemap_path,
    values='Fatalities',
    color='Fatalities',
    hover_data=['Country_Region'],
    color_continuous_scale='matter',
    title='Current share of Worldwide COVID19 Deaths',
)
fig.show()

In [0]:
def add_daily_measures(df):
    """Add day-over-day increments of the cumulative counts to ``df``.

    Adds the columns 'Daily Cases' and 'Daily Deaths', computed as the
    difference between each row and the previous row of 'ConfirmedCases'
    and 'Fatalities'. The first row is set to 0 because it has no
    predecessor. Rows are taken in their current order, so the frame is
    expected to be sorted by date already.

    The frame is modified in place and also returned. Any index is
    accepted (it does not have to be 0..n-1), and an empty frame simply
    gains the two empty columns.
    """
    column_pairs = (('ConfirmedCases', 'Daily Cases'), ('Fatalities', 'Daily Deaths'))
    for cumulative_col, daily_col in column_pairs:
        increments = df[cumulative_col].diff()
        if len(increments):
            # diff() leaves NaN in the first position; the first day counts as 0.
            increments.iloc[0] = 0
        df[daily_col] = increments
    return df

In [0]:
# Worldwide cumulative totals per date, plus their day-over-day increments.
# The columns are selected with a list ([[...]]): selecting several columns
# from a GroupBy with a bare tuple is deprecated and rejected by newer pandas.
df_world = df_train.groupby('Date', as_index=False)[['ConfirmedCases', 'Fatalities']].sum()
df_world = add_daily_measures(df_world)

In [0]:
# Worldwide daily new cases and deaths, one bar trace per measure.
fig = go.Figure()
fig.add_trace(go.Bar(name='Cases', x=df_world['Date'], y=df_world['Daily Cases']))
fig.add_trace(go.Bar(name='Deaths', x=df_world['Date'], y=df_world['Daily Deaths']))

# Draw the two bar series on top of each other instead of side by side.
fig.update_layout(
    barmode='overlay',
    title='Worldwide daily Case and Death count',
)
fig.show()

In [0]:
# Per-country cumulative totals for every date. Date is converted to a string
# here; it is later used as the animation frame of the choropleth maps.
# The columns are selected with a list ([[...]]): selecting several columns
# from a GroupBy with a bare tuple is deprecated and rejected by newer pandas.
df_map = (
    df_train
    .assign(Date=df_train['Date'].astype(str))
    .groupby(['Date', 'Country_Region'], as_index=False)[['ConfirmedCases', 'Fatalities']]
    .sum()
)

In [0]:
# Location code for the choropleth maps. Only one column is needed, so
# Series.map is used instead of a row-wise DataFrame.apply; results are
# cached per country inside `obj`.
df_map['iso_alpha'] = df_map['Country_Region'].map(obj.fetch_iso3)

In [0]:
# Natural-log versions of the cumulative counts for the colour scale.
# 1 is added first so that a count of 0 does not produce log(0).
for measure in ('ConfirmedCases', 'Fatalities'):
    df_map[f'ln({measure})'] = np.log(df_map[measure] + 1)

## 확진자 수 시각화

In [0]:
# Animated world map (one frame per date), coloured by log-scaled confirmed
# cases; the raw count is shown on hover.
confirmed_map = px.choropleth(
    df_map,
    locations='iso_alpha',
    color='ln(ConfirmedCases)',
    hover_name='Country_Region',
    hover_data=['ConfirmedCases'],
    animation_frame='Date',
    color_continuous_scale=px.colors.sequential.dense,
    title='Total Confirmed Cases growth(Logarithmic Scale)',
)
confirmed_map

## 사망자 수 시각화

In [0]:
# Animated world map (one frame per date), coloured by log-scaled fatalities;
# the raw count is shown on hover.
fatalities_map = px.choropleth(
    df_map,
    locations='iso_alpha',
    color='ln(Fatalities)',
    hover_name='Country_Region',
    hover_data=['Fatalities'],
    animation_frame='Date',
    color_continuous_scale=px.colors.sequential.OrRd,
    title='Total Deaths growth(Logarithmic Scale)',
)
fatalities_map

## 치사율 시각화

In [0]:
# Fatalities as a percentage of confirmed cases per (date, country), rounded
# to two decimals. Division by zero is not guarded: rows without confirmed
# cases do not get a finite rate.
mortality_ratio = df_map['Fatalities'] / df_map['ConfirmedCases']
df_map['Mortality Rate%'] = (mortality_ratio * 100).round(2)

In [0]:
# Animated world map (one frame per date), coloured by mortality rate; both
# underlying counts are shown on hover.
mortality_map = px.choropleth(
    df_map,
    locations='iso_alpha',
    color='Mortality Rate%',
    hover_name='Country_Region',
    hover_data=['ConfirmedCases', 'Fatalities'],
    animation_frame='Date',
    color_continuous_scale=px.colors.sequential.Magma_r,
    title='Worldwide Daily Variation of Mortality Rate%',
)
mortality_map

# COVID19 상위 10 개국 시각화

In [0]:
# The ten countries with the most confirmed cases on the latest date.
# Columns are selected with a list ([[...]]): selecting several columns from
# a GroupBy with a bare tuple is deprecated and rejected by newer pandas.
last_date = df_train['Date'].max()
df_countries = (
    df_train[df_train['Date'] == last_date]
    .groupby('Country_Region', as_index=False)[['ConfirmedCases', 'Fatalities']]
    .sum()
    .nlargest(10, 'ConfirmedCases')
)

# Full time series per country, restricted to those ten countries. Filtering
# with isin() keeps only the trend columns, so no merge suffixes (_x/_y) have
# to be dropped or renamed afterwards.
df_trend = (
    df_train
    .groupby(['Date', 'Country_Region'], as_index=False)[['ConfirmedCases', 'Fatalities']]
    .sum()
)
df_trend = (
    df_trend[df_trend['Country_Region'].isin(df_countries['Country_Region'])]
    .rename(columns={'Country_Region': 'Country', 'ConfirmedCases': 'Cases', 'Fatalities': 'Deaths'})
    .reset_index(drop=True)
)

# Log scale; 1 is added first so that a count of 0 does not produce log(0).
df_trend['ln(Cases)'] = np.log(df_trend['Cases'] + 1)
df_trend['ln(Deaths)'] = np.log(df_trend['Deaths'] + 1)

In [0]:
# Cumulative confirmed cases over time, one line per top-10 country.
px.line(
    df_trend,
    x='Date',
    y='Cases',
    color='Country',
    title='COVID19 Total Cases growth for top 10 worst affected countries',
)

In [0]:
# Cumulative deaths over time, one line per top-10 country.
px.line(
    df_trend,
    x='Date',
    y='Deaths',
    color='Country',
    title='COVID19 Total Deaths growth for top 10 worst affected countries',
)

# 한국 시각화

다른 자료에서는 한국 데이터를 시각화하지 않았으므로, 여기서 한국의 일별 확진자·사망자 추이와 환자 정보를 시각화합니다.

In [0]:
# Daily series for South Korea taken from the training table.
# A separate name (df_sk_daily) is used on purpose: df_sk already holds the
# PatientInfo table loaded earlier, and the cells below still need it.
# reset_index(drop=True) gives the rows a fresh 0..n-1 index.
df_sk_daily = (
    df_train
    .query("Country_Region=='Korea, South'")
    .reset_index(drop=True)
)
df_sk_daily = add_daily_measures(df_sk_daily)

fig = go.Figure(data=[
    go.Bar(name='Cases', x=df_sk_daily['Date'], y=df_sk_daily['Daily Cases']),
    go.Bar(name='Deaths', x=df_sk_daily['Date'], y=df_sk_daily['Daily Deaths'])
])
# Draw the two bar series on top of each other instead of side by side.
fig.update_layout(barmode='overlay', title='Daily Case and Death count(South Korea)')
fig.show()

In [0]:
# Assumes df_sk is the PatientInfo table (columns age, sex, country, province,
# city, infection_case, patient_id) -- TODO confirm it has not been rebound.
# Fold the age label '66s' into '60s' (presumably a data-entry typo). The
# result is assigned back rather than replaced in place through attribute
# access, which is not guaranteed to modify df_sk.
df_sk['age'] = df_sk['age'].replace('66s', '60s')

# Number of patients per combination of demographic and infection attributes.
df_sk_age = df_sk.groupby(
    ['age', 'sex', 'country', 'province', 'city', 'infection_case'],
    as_index=False,
)['patient_id'].count()

In [41]:
# 2x2 grid of subplots. The plot type of every cell is declared in `specs`;
# the data for each trace is then prepared and attached with add_trace.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Age-wise distribution of Cases", "Province-wise distribution of Cases",
                    "Infection Origin", "Gender-wise distribution of Patient Statuses"),
    specs=[[{"type": "bar"}, {"type": "pie"}],
           [{"type": "pie"}, {"type": "bar"}]],
)

# trace-1: patients per age group, split by sex.
# In df_sk_age the 'patient_id' column already holds patient COUNTS, so the
# counts are summed here.
df_males = df_sk_age[df_sk_age['sex'] == 'male'].groupby('age', as_index=False)['patient_id'].sum()
df_females = df_sk_age[df_sk_age['sex'] == 'female'].groupby('age', as_index=False)['patient_id'].sum()
fig.add_trace(go.Bar(name='Males', x=df_males['age'], y=df_males['patient_id']), row=1, col=1)
fig.add_trace(go.Bar(name='Females', x=df_females['age'], y=df_females['patient_id']), row=1, col=1)

# trace-2: patients per province.
df_province = df_sk_age.groupby('province', as_index=False)['patient_id'].sum()
fig.add_trace(go.Pie(labels=df_province['province'], values=df_province['patient_id'], hole=0.3), row=1, col=2)

# trace-3: patients per infection origin; origins with fewer than 50 patients
# are relabelled 'etc'.
df_inf_case = df_sk_age.groupby('infection_case', as_index=False)['patient_id'].sum()
df_inf_case.loc[df_inf_case['patient_id'] < 50, 'infection_case'] = 'etc'
fig.add_trace(go.Pie(labels=df_inf_case['infection_case'], values=df_inf_case['patient_id'], hole=0.3), row=2, col=1)

# trace-4: patients per status, split by sex.
# Assumes df_sk is the raw PatientInfo table, where 'patient_id' is an
# identifier -- TODO confirm. Patients are therefore counted here; summing the
# identifiers would not give a number of people.
df_males_state = df_sk[df_sk['sex'] == 'male'].groupby('state', as_index=False)['patient_id'].count()
df_females_state = df_sk[df_sk['sex'] == 'female'].groupby('state', as_index=False)['patient_id'].count()
fig.add_trace(go.Bar(name='Males', x=df_males_state['state'], y=df_males_state['patient_id']), row=2, col=2)
fig.add_trace(go.Bar(name='Females', x=df_females_state['state'], y=df_females_state['patient_id']), row=2, col=2)

fig.update_xaxes(title_text="Age", row=1, col=1)
fig.update_xaxes(title_text="Status", row=2, col=2)

fig.update_yaxes(title_text="Confirmed Cases", row=1, col=1)
fig.update_yaxes(title_text="Total number of People", row=2, col=2)

# Layout adjustments: overall title, figure height, and no legend.
fig.update_layout(title_text="South Korea: Some more visualizations", height=700, showlegend=False)

fig.show()