Content:

1. [Patient Info](#1)
2. [Region](#2)
3. [Search Trend](#3)
4. [Time Gender](#4)
5. [Weather](#5)
6. [Case](#6)
7. [Time](#7)
8. [Time Province](#8)
9. [Time Age](#9)
10. [Patient Route](#10)
11. [Seoul Floating](#11)
12. [Policy](#12)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import os

In [None]:
# Load each CSV table of the dataset into its own DataFrame.

# Patient-level tables
df_patient_info = pd.read_csv("/kaggle/input/coronavirusdataset/PatientInfo.csv")
df_patient_route = pd.read_csv("/kaggle/input/coronavirusdataset/PatientRoute.csv")

# Time-series tables
df_time = pd.read_csv("/kaggle/input/coronavirusdataset/Time.csv")
df_time_age = pd.read_csv("/kaggle/input/coronavirusdataset/TimeAge.csv")
df_time_gender = pd.read_csv("/kaggle/input/coronavirusdataset/TimeGender.csv")
df_time_province = pd.read_csv("/kaggle/input/coronavirusdataset/TimeProvince.csv")

# Case, region and auxiliary tables
df_case = pd.read_csv("/kaggle/input/coronavirusdataset/Case.csv")
df_region = pd.read_csv("/kaggle/input/coronavirusdataset/Region.csv")
df_weather = pd.read_csv("/kaggle/input/coronavirusdataset/Weather.csv")
df_search_trend = pd.read_csv("/kaggle/input/coronavirusdataset/SearchTrend.csv")
df_seoul_floating = pd.read_csv("/kaggle/input/coronavirusdataset/SeoulFloating.csv")
df_policy = pd.read_csv("/kaggle/input/coronavirusdataset/Policy.csv")

<a id="1"></a>
## 1. Patient Info 

In [None]:
df_patient_info.head()

## Variables
1) patient_id: the ID of the patient

2) global_num: the number given by KCDC

3) sex: the sex of the patient

4) birth_year: the birth year of the patient

5) age: the age of the patient

6) country: the country of the patient

7) province: the province of the patient

8) city: the city of the patient

9) disease: TRUE: underlying disease / FALSE: no disease

10) infection_case: the case of infection

11) infection_order: the order of infection

12) infected_by: the ID of who infected the patient

13) contact_number: the number of contacts with people

14) symptom_onset_date: the date of symptom onset

15) confirmed_date: the date of being confirmed

16) released_date: the date of being released

17) deceased_date: the date of being deceased

18) state: isolated / released / deceased

In [None]:
# Remove the identifier columns from the patient table, in place.
df_patient_info.drop(
    columns=["patient_id", "global_num", "infected_by"],
    inplace=True,
)

In [None]:
df_patient_info.head()

In [None]:
df_patient_info.tail()

*Let's see how many null values there are.*

In [None]:
df_patient_info.info()

In [None]:
df_patient_info.isna().sum()

## ***There is too much missing data in the following columns, so let's remove them***

1) birth_year  

2) disease

3) infection_case         

4) infection_order

5) contact_number

6) symptom_onset_date

7) released_date    

8) deceased_date         


In [None]:
# Drop the columns judged too sparse to analyse. Each label appears exactly
# once ("released_date" used to be listed twice).
df_patient_info.drop(
    columns=[
        "birth_year",
        "disease",
        "infection_case",
        "infection_order",
        "contact_number",
        "symptom_onset_date",
        "released_date",
        "deceased_date",
    ],
    inplace=True,
)

In [None]:
df_patient_info.head()

## **now let's look at the data types**

In [None]:
df_patient_info.dtypes

## *now converting the age variable to a numeric variable*

In [None]:
df_patient_info.head()

***Let's strip the non-digit characters (e.g. the trailing "s") from the age values***

In [None]:
# Remove every non-digit character from the age labels.
# regex=True is required: without it recent pandas treats r'\D' as a literal
# two-character string and leaves the labels unchanged.
df_patient_info['age'] = df_patient_info['age'].str.replace(r'\D', '', regex=True)

In [None]:
df_patient_info.head()

In [None]:
df_patient_info.dtypes

In [None]:
df_patient_info.age.isna().sum()

***First, let's fill in the missing ages, even though imputing them is not ideal***

In [None]:
def impute_median(series):
    """Return *series* as numbers with missing values replaced by the median.

    The input is first coerced with ``pd.to_numeric(..., errors="coerce")`` so
    that digit strings are treated as numbers and anything unparsable (for
    example an empty string) counts as missing. The median of the remaining
    values is then used to fill every gap.

    Parameters
    ----------
    series : pandas.Series
        Numeric values, or text that holds numbers.

    Returns
    -------
    pandas.Series
        Numeric series of the same length and index with no gaps, unless
        every value was missing (the median is then NaN as well).
    """
    numeric = pd.to_numeric(series, errors="coerce")
    return numeric.fillna(numeric.median())

In [None]:
# Fill the missing ages with the column median.
# impute_median works on the whole Series, so call it directly rather than
# through Series.transform, which tries the function on each element first.
df_patient_info["age"] = impute_median(df_patient_info["age"])

In [None]:
df_patient_info.age.unique()

In [None]:
# Cast the age column to floating point.
df_patient_info["age"] = df_patient_info["age"].astype(float)

In [None]:
df_patient_info.dtypes

In [None]:
df_patient_info.isna().sum()

***let's fill other empty places***

In [None]:
# Replace missing values in each categorical column with that column's most
# frequent value. The result is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is a chained assignment and does
# not reliably update the parent DataFrame.
for column in ("sex", "country", "city", "confirmed_date"):
    most_common = str(df_patient_info[column].mode().values[0])
    df_patient_info[column] = df_patient_info[column].fillna(most_common)

In [None]:
df_patient_info.isna().sum()

In [None]:
df_patient_info.head()

***now let's set the gender to 1 and 0***

In [None]:
# Encoding: "male" -> 1, anything else -> 0 (surrounding whitespace ignored).

df_patient_info["sex"] = df_patient_info["sex"].map(
    lambda value: int(value.strip() == "male")
)

In [None]:
df_patient_info.head()

In [None]:
df_patient_info.age.describe()

In [None]:
len(df_patient_info[df_patient_info.age==0.0])

***Let's arrange ages according to their groups***

In [None]:
# Age brackets: consecutive bin edges, with one label per interval.
bins = [0, 14, 28, 42, 56, 70, 100]
labels = ["Child", "Young", "Young Adult", "Early Adult", "Adult", "Senior"]

# Intervals are right-closed, so without include_lowest=True an age of exactly
# 0 falls outside the first bin and becomes NaN instead of "Child".
df_patient_info['age_group'] = pd.cut(
    df_patient_info['age'],
    bins,
    labels=labels,
    include_lowest=True,
)


In [None]:
df_patient_info.head()

In [None]:
df_patient_info.age_group.isna().sum()

In [None]:
df_patient_info.age_group.unique()

***Let's fill the missing age groups with the most common group***

In [None]:
# Fill any unlabelled age group with the most frequent group. The result is
# assigned back to the frame because fillna(inplace=True) on a selected column
# is a chained assignment and does not reliably update the parent DataFrame.
most_common_group = str(df_patient_info["age_group"].mode().values[0])
df_patient_info["age_group"] = df_patient_info["age_group"].fillna(most_common_group)

In [None]:
df_patient_info.country.dtypes

In [None]:
df_patient_info.dtypes


In [None]:
df_patient_info.country.unique()

***Let's organize the countries according to their continents***

In [None]:
# Lookup table: country label -> continent label.
# NOTE(review): "Foreign" is mapped to "Asia" — confirm this is intended.
_CONTINENT_BY_COUNTRY = {
    "Korea": 'Asia',
    "China": 'Asia',
    "United States": 'North America',
    "France": 'Europe',
    "Thailand": 'Asia',
    "Canada": 'North America',
    "Switzerland": 'Europe',
    "Indonesia": 'Asia',
    "Mongolia": "Asia",
    "Spain": "Europe",
    "Foreign": "Asia",
}

# One continent per row; any label missing from the table is tagged 'Failed'.
continents = [
    _CONTINENT_BY_COUNTRY.get(country, 'Failed')
    for country in df_patient_info['country']
]

# Store the result as a new column
df_patient_info['continents'] = continents

In [None]:
df_patient_info.head()

In [None]:
df_patient_info.continents.unique()

**Let's look at the days and months on which the cases were confirmed**

In [None]:
# Month = second '-'-separated field of `confirmed_date`
# (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_patient_info['months'] = df_patient_info['confirmed_date'].str.split('-').str[1].str.strip()
df_patient_info.head()

In [None]:
# Day = third '-'-separated field of `confirmed_date`
# (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_patient_info['days'] = df_patient_info['confirmed_date'].str.split('-').str[2].str.strip()
df_patient_info.head()

In [None]:
df_patient_info.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_patient_info[part] = df_patient_info[part].map(int)

One last check of the data types and missing values:


In [None]:
df_patient_info.dtypes

In [None]:
df_patient_info.isna().sum()

## Visualize

Age Group vs. Sex

In [None]:
# Bar height is the mean of the `sex` column within each age group.
fig = plt.figure(figsize=(10, 5))
sns.barplot(x='age_group', y='sex', data=df_patient_info)
# NOTE(review): label assumes sex is encoded 1 = male, 0 = female — confirm.
plt.ylabel("mean of sex (1 = male, 0 = female)")
# No plt.legend() call: the plot has no labelled artists to list.
plt.show()

In [None]:
# Plot age against the encoded sex column as a connected pentagon-marker line.
x = df_patient_info.sex
y = df_patient_info.age

marker_style = {
    'color': 'gray',
    'markersize': 15,
    'linewidth': 4,
    'markerfacecolor': 'white',
    'markeredgecolor': 'gray',
    'markeredgewidth': 3,
}
plt.plot(x, y, '-p', **marker_style)
plt.ylim(0, 100);


In [None]:
#man
df_patient_info[df_patient_info.sex==1].age.describe()

In [None]:
#women
df_patient_info[df_patient_info.sex==0].age.describe()

In [None]:
# (printed label, country value) pairs; labels are emitted verbatim.
country_labels = [
    ("Sum of Korea:", "Korea"),
    ("Sum of China:", "China"),
    ("Sum of United States", "United States"),
    ("Sum of France", "France"),
    ("Sum of Thailand", "Thailand"),
    ("Sum of Canada:", "Canada"),
    ("Sum of Switzerland:", "Switzerland"),
    ("Sum of Indonesia", "Indonesia"),
    ("Sum of Foreign", "Foreign"),
    ("Sum of Mongolia", "Mongolia"),
    ("Sum of Spain", "Spain"),
]

# Print the number of patient rows recorded for each country.
for label, country in country_labels:
    print(label, len(df_patient_info[df_patient_info.country == country]))

<a id="2"></a>
## 2. Region

In [None]:
df_region.head()

In [None]:
df_region.tail()

In [None]:
# Remove the `code` column from the region table, in place.
df_region.drop(columns=["code"], inplace=True)

In [None]:
df_region.isna().sum()

In [None]:
df_region.dtypes

<a id="3"></a>
## 3. Search Trend

In [None]:
df_search_trend.head()

In [None]:
df_search_trend.tail()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_search_trend['months'] = df_search_trend['date'].str.split('-').str[1].str.strip()
df_search_trend.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_search_trend['days'] = df_search_trend['date'].str.split('-').str[2].str.strip()
df_search_trend.head()

In [None]:
df_search_trend.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_search_trend[part] = df_search_trend[part].map(int)

In [None]:
df_search_trend.isna().sum()

<a id="4"></a>
## 4. Time Gender

In [None]:
df_time_gender.head()

In [None]:
df_time_gender.tail()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time_gender['months'] = df_time_gender['date'].str.split('-').str[1].str.strip()
df_time_gender.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time_gender['days'] = df_time_gender['date'].str.split('-').str[2].str.strip()
df_time_gender.head()

In [None]:
df_time_gender.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_time_gender[part] = df_time_gender[part].map(int)

In [None]:
df_time_gender.dtypes

In [None]:
df_time_gender.isna().sum()

<a id="5"></a>
## 5. Weather

In [None]:
df_weather.head()

In [None]:
# Remove the `code` column from the weather table, in place.
df_weather.drop(columns=["code"], inplace=True)

In [None]:
df_weather.head()

In [None]:
df_weather.isna().sum()

In [None]:
# Keep only the weather rows that have no missing value in any column.
df_weather = df_weather.dropna(axis=0, how="any")

In [None]:
df_weather.isna().sum()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_weather['months'] = df_weather['date'].str.split('-').str[1].str.strip()
df_weather.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_weather['days'] = df_weather['date'].str.split('-').str[2].str.strip()
df_weather.head()

In [None]:
df_weather.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_weather[part] = df_weather[part].map(int)

In [None]:
df_weather.dtypes

<a id="6"></a>
## 6. Case

In [None]:
df_case.head()

In [None]:
# Remove the `case_id` column from the case table, in place.
df_case.drop(columns=["case_id"], inplace=True)

In [None]:
df_case.tail()

In [None]:
df_case.isna().sum()

I will come back to this section later.

<a id="7"></a>
## 7. Time

In [None]:
df_time.head()

In [None]:
df_time.tail()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time['months'] = df_time['date'].str.split('-').str[1].str.strip()
df_time.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time['days'] = df_time['date'].str.split('-').str[2].str.strip()
df_time.head()

In [None]:
df_time.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_time[part] = df_time[part].map(int)

In [None]:
df_time.dtypes

In [None]:
df_time.isna().sum()

<a id="8"></a>
## 8. Time Province

In [None]:
df_time_province.head()

In [None]:
df_time_province.tail()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time_province['months'] = df_time_province['date'].str.split('-').str[1].str.strip()
df_time_province.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time_province['days'] = df_time_province['date'].str.split('-').str[2].str.strip()
df_time_province.head()

In [None]:
df_time_province.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_time_province[part] = df_time_province[part].map(int)

In [None]:
df_time_province.dtypes

In [None]:
df_time_province.isna().sum()

<a id="9"></a>
## 9. Time Age

In [None]:
df_time_age.head()

In [None]:
df_time_age.tail()

In [None]:
df_time_age.isna().sum()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time_age['months'] = df_time_age['date'].str.split('-').str[1].str.strip()
df_time_age.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_time_age['days'] = df_time_age['date'].str.split('-').str[2].str.strip()
df_time_age.head()

In [None]:
df_time_age.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_time_age[part] = df_time_age[part].map(int)

In [None]:
df_time_age.dtypes

In [None]:
# Remove every non-digit character from the age labels.
# regex=True is required: without it recent pandas treats r'\D' as a literal
# two-character string and leaves the labels unchanged.
df_time_age['age'] = df_time_age['age'].str.replace(r'\D', '', regex=True)

In [None]:
df_time_age.head()

In [None]:
df_time_age.dtypes

In [None]:
# Convert the age values to integers.
df_time_age["age"] = df_time_age["age"].map(int)

In [None]:
df_time_age.dtypes

<a id="10"></a>
## 10. Patient Route

In [None]:
df_patient_route.head()

In [None]:
df_patient_route.tail()

In [None]:
# Remove the identifier columns from the patient-route table, in place.
df_patient_route.drop(columns=["patient_id", "global_num"], inplace=True)

In [None]:
df_patient_route.head()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_patient_route['months'] = df_patient_route['date'].str.split('-').str[1].str.strip()
df_patient_route.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_patient_route['days'] = df_patient_route['date'].str.split('-').str[2].str.strip()
df_patient_route.head()

In [None]:
df_patient_route.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_patient_route[part] = df_patient_route[part].map(int)

In [None]:
df_patient_route.dtypes

In [None]:
df_patient_route.isna().sum()

<a id="11"></a>
## 11. Seoul Floating

In [None]:
df_seoul_floating.head()

In [None]:
df_seoul_floating.tail()

In [None]:
# Month = second '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_seoul_floating['months'] = df_seoul_floating['date'].str.split('-').str[1].str.strip()
df_seoul_floating.head()

In [None]:
# Day = third '-'-separated field of `date` (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_seoul_floating['days'] = df_seoul_floating['date'].str.split('-').str[2].str.strip()
df_seoul_floating.head()

In [None]:
df_seoul_floating.dtypes

In [None]:
# Convert the extracted date parts from text to integers.
for part in ("months", "days"):
    df_seoul_floating[part] = df_seoul_floating[part].map(int)

In [None]:
df_seoul_floating.dtypes

<a id="12"></a>
## 12. Policy

In [None]:
df_policy.head()

In [None]:
# Remove the `policy_id` column from the policy table, in place.
df_policy.drop(columns=["policy_id"], inplace=True)

In [None]:
df_policy.tail()

In [None]:
df_policy.isna().sum()

In [None]:
# Keep only the policy rows that have no missing value in any column.
df_policy = df_policy.dropna(axis=0, how="any")

In [None]:
df_policy.isna().sum()

In [None]:
# Start month = second '-'-separated field of `start_date`
# (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_policy['start_months'] = df_policy['start_date'].str.split('-').str[1].str.strip()
df_policy.head()

In [None]:
# Start day = third '-'-separated field of `start_date`
# (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_policy['start_days'] = df_policy['start_date'].str.split('-').str[2].str.strip()
df_policy.head()

In [None]:
# End month = second '-'-separated field of `end_date`
# (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_policy['end_months'] = df_policy['end_date'].str.split('-').str[1].str.strip()
df_policy.head()

In [None]:
# End day = third '-'-separated field of `end_date`
# (assumes YYYY-MM-DD text — TODO confirm).
# A single vectorised assignment fills the whole column; split() with no limit
# already splits on every '-'.
df_policy['end_days'] = df_policy['end_date'].str.split('-').str[2].str.strip()
df_policy.head()

In [None]:
df_policy.dtypes

In [None]:
# Convert the extracted start/end date parts from text to integers.
for part in ("start_months", "start_days", "end_months", "end_days"):
    df_policy[part] = df_policy[part].map(int)

In [None]:
df_policy.dtypes

In [None]:
df_policy.gov_policy.unique()

In [None]:
df_policy.detail.unique()