In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import libraries 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import re
import datetime
import folium
import geopandas as gpd
from shapely.geometry import Point, Polygon
from geopandas import GeoDataFrame
import math
from folium.plugins import HeatMap, MarkerCluster
from folium import Marker,GeoJson,Choropleth, Circle

## Import datasets

In [None]:
# Directory holding all CSV files of the dataset; defined once so it is easy to relocate.
DATA_DIR = '/kaggle/input/coronavirusdataset'

case = pd.read_csv(f'{DATA_DIR}/Case.csv')
pat_info = pd.read_csv(f'{DATA_DIR}/PatientInfo.csv')
policy = pd.read_csv(f'{DATA_DIR}/Policy.csv')
region = pd.read_csv(f'{DATA_DIR}/Region.csv')
# The frame loaded from Region.csv was originally named `religion`; the old
# name is kept as an alias so any cell that still refers to it keeps working.
religion = region
trend = pd.read_csv(f'{DATA_DIR}/SearchTrend.csv')
floating = pd.read_csv(f'{DATA_DIR}/SeoulFloating.csv')
# NOTE: `time` would shadow the standard-library module of the same name if it is ever imported.
time = pd.read_csv(f'{DATA_DIR}/Time.csv')
time_age = pd.read_csv(f'{DATA_DIR}/TimeAge.csv')
time_gender = pd.read_csv(f'{DATA_DIR}/TimeGender.csv')
time_prov = pd.read_csv(f'{DATA_DIR}/TimeProvince.csv')
weather = pd.read_csv(f'{DATA_DIR}/Weather.csv')

# 1. Analyzing the first dataset

In [None]:
case.head()

## A bar plot of confirmed cases by province shows Daegu as having the highest number of cases


In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
# Note: barplot draws the mean of `confirmed` per province (its default estimator), not the sum.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='confirmed', y='province', data=case, ax=ax)
ax.set_title('Confirmed cases by province');
 

## Most of the infections happened in Shincheonji church, then the hospitals

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(x='confirmed', y='infection_case', data=case, ax=ax)
ax.set_title('Confirmed cases by infection case');

## Most infected city is Nam-gu

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='confirmed', y='city', data=case, ax=ax)
ax.set_title('Confirmed cases by city');

## To find out which province and city Shincheonji Church (the most infected place) is in I grouped the data by province, city and infection_case.
### Shincheonji Church  is in the province of Daegu in the city of Nam-gu, all of which are the highest infected places

In [None]:
# Total the numeric columns per (province, city, infection_case).
# numeric_only=True keeps string columns (e.g. latitude/longitude before they
# are converted) from being "summed" by string concatenation in pandas >= 2.0.
d = case.groupby(['province', 'city', 'infection_case']).sum(numeric_only=True)

In [None]:
# Let pandas render the whole grouped table `d` without truncation.
# Rows are sized from d.shape[0] and columns from d.shape[1]
# (the column limit was previously derived from the row count).
pd.set_option('display.max_rows', d.shape[0] + 1)
pd.set_option('display.max_columns', d.shape[1] + 1)

In [None]:
d

## Replacing True and False values in the column 'group' with integers

In [None]:
# Encode the values of the 'group' column as integer labels.
le = LabelEncoder()
group_labels = le.fit_transform(case['group'])
case['group'] = group_labels

In [None]:
case.head()

 # In the provinces the majority of infection happened among groups 

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(6, 6))
sns.violinplot(x='group', y='province', data=case, ax=ax)
ax.set_title('Group infection flag by province');

## In the different cities, the infection also happened in groups

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(8, 6))
sns.swarmplot(x='group', y='city', data=case, ax=ax)
ax.set_title('Group infection flag by city');

## In the various specific locations, the infections also happened in groups 

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(12, 12))
sns.swarmplot(x='group', y='infection_case', data=case, ax=ax)
ax.set_title('Group infection flag by infection case');

### Most of the infection happened in groups rather than not

## Looking at the 'latitude' and 'longitude' columns to start prepping to plot a map

In [None]:
case.head(10) # They have some more or less empty spaces

### Converting the columns to numeric with `errors='coerce'` turns the empty (non-numeric) entries into `NaN`.

In [None]:
# Parse both coordinate columns as numbers; entries that cannot be parsed become NaN.
coord_cols = ['latitude', 'longitude']
case[coord_cols] = case[coord_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
case.head(10)

 ## Then I can drop the rows with null values. I tried dropping the empty rows without first converting the columns to numeric and it didn't work, so this was the solution

In [None]:
# Remove only the rows that lack a usable coordinate: the map below needs
# latitude/longitude, and a null in any other column is no reason to drop a row.
case.dropna(subset=['latitude', 'longitude'], inplace=True)

In [None]:
case.head(10)

In [None]:
# Interactive map of the infection cases, one marker per row with the
# confirmed count as popup text.
# 'Stamen Terrain' is no longer a built-in tile set in recent folium releases
# (it fails asking for an attribution), so the built-in OpenStreetMap tiles are used.
m = folium.Map(location=[37.538621, 126.992652], tiles='OpenStreetMap', zoom_start=7)

# folium rejects NaN locations, so only rows with both coordinates are plotted.
located = case.dropna(subset=['latitude', 'longitude'])
for lat, lon, confirmed in zip(located['latitude'], located['longitude'], located['confirmed']):
    Marker([lat, lon], popup=str(confirmed)).add_to(m)

m

# 2. Analyzing the second dataset: Patient Information

In [None]:
pat_info.head()  

## Checking the type of object in the released_date column

In [None]:
# type(<column>) always reports pandas' Series class; .dtype shows how the
# values themselves are stored, which is what matters before date arithmetic.
pat_info['released_date'].dtype

## My goal is to subtract the two dates to get the duration of the stay in the hospital. To do that, I have to convert both columns to datetime and then subtract them

In [None]:
# Parse the confirmation dates; values that cannot be parsed become NaT.
pat_info['confirmed_date'] = pd.to_datetime(pat_info['confirmed_date'], errors='coerce')

In [None]:
# Parse the release dates; values that cannot be parsed become NaT.
pat_info['released_date'] = pd.to_datetime(pat_info['released_date'], errors='coerce')

In [None]:
# Time between confirmation and release, stored as a timedelta column.
pat_info['num_of_days_in_hospital'] = pat_info['released_date'].sub(pat_info['confirmed_date'])

In [None]:
pat_info.head()

## Let's drop columns I don't need

In [None]:
# Identifier and raw date columns are not used in the rest of this analysis.
unused_columns = [
    'patient_id',
    'infected_by',
    'symptom_onset_date',
    'confirmed_date',
    'released_date',
    'deceased_date',
]
pat_info.drop(columns=unused_columns, inplace=True)

In [None]:
pat_info.head()

## Replace the genders with integers

In [None]:
# Encode sex as male -> 1, female -> 0.
# The result is assigned back explicitly: calling .replace(..., inplace=True)
# on `pat_info.sex` mutates an intermediate Series, which pandas 2.x warns
# about and which has no effect on the frame under Copy-on-Write.
# Values other than 'male'/'female' (including missing ones) become NaN.
pat_info['sex'] = pat_info['sex'].map({'male': 1, 'female': 0})


## Strip the 's' from the integers in the age column

In [None]:
 # Remove the trailing 's' characters from the age strings.
 age_without_suffix = pat_info['age'].str.rstrip('s')
 pat_info['age'] = age_without_suffix

### Converting the age column from strings to numeric

In [None]:
 # Convert the stripped age strings to numbers; an unparseable value raises.
 pat_info['age'] = pd.to_numeric(pat_info['age'], errors='raise')

In [None]:
pat_info.head()

## Checking to see the number of nulls in each column

In [None]:
pat_info.isna().sum()

# Analysis of the data
## I decided not to replace the nulls in this dataset, so I can see the data as it is without any manipulation 


## Distribution of the age column

In [None]:
# More people within the age of 20 and 60 were infected.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot:
# a density-normalised histogram with a KDE curve on top (the automatic bin
# choice may differ slightly from distplot's).
ax = sns.histplot(pat_info['age'], kde=True, stat='density')
ax.set_title('Age distribution of patients');

## More older people died

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.catplot(x='state', y='age', data=pat_info, kind='violin');

## Males, more than females, died. More females were isolated and released

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.violinplot(x='state', y='sex', data=pat_info);

## The median age of the infected people from the different nationalities in South Korea is about 30

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(10, 8))
sns.violinplot(x='age', y='country', data=pat_info, ax=ax)
ax.set_title('Age by country');

## There seem to be more males than females, from the different nations

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.violinplot(x='sex', y='country', data=pat_info);

## Let's convert the num_of_days_in_hospital column from a timedelta (e.g. '20 days') to a plain number of days

In [None]:
# Dividing a timedelta column by one day yields the stay as a float number of days.
# Run once only: after this the column is no longer a timedelta.
one_day = pd.Timedelta(days=1)
pat_info['num_of_days_in_hospital'] = pat_info['num_of_days_in_hospital'] / one_day

In [None]:
pat_info.head()

## The distribution of the number of days in the hospital. 
### On average, it's about 20 days

In [None]:
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot:
# a density-normalised histogram with a KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(pat_info['num_of_days_in_hospital'], kde=True, stat='density', ax=ax)
ax.set_title('Distribution of days in hospital');

## Generally, the greater the age the greater the number of days in the hospital

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(10, 8))
sns.lineplot(x='age', y='num_of_days_in_hospital', data=pat_info, ax=ax)
ax.set_title('Days in hospital by age');

## Not every place had hospitalizations; among those that did, the longest stays came from the Gyeonsang Seorin Nursing Home

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(12, 10))
sns.violinplot(x='num_of_days_in_hospital', y='infection_case', data=pat_info, ax=ax)
ax.set_title('Days in hospital by infection case');

## Those who were isolated were released from the hospital sooner. Those who died stayed in the hospital the longest.

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(x='state', y='num_of_days_in_hospital', data=pat_info, ax=ax)
ax.set_title('Days in hospital by state');

## Gender didn't really determine the length of stay in the hospital

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='sex', y='num_of_days_in_hospital', data=pat_info, ax=ax)
ax.set_title('Days in hospital by sex');

## Let's replace the 'state' column with integers

In [None]:
# Encode state as released -> 2, isolated -> 1, deceased -> 0.
# The result is assigned back explicitly: calling .replace(..., inplace=True)
# on `pat_info.state` mutates an intermediate Series, which pandas 2.x warns
# about and which has no effect on the frame under Copy-on-Write.
# Any other value (including missing ones) becomes NaN.
pat_info['state'] = pat_info['state'].map({'released': 2, 'isolated': 1, 'deceased': 0})
 

## Most of the deceased came from Mongolia; the rest of the countries were either isolated or released

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(x='state', y='country', data=pat_info, ax=ax)
ax.set_title('Encoded state by country');

## Checking type of data in 'contact_number' column

In [None]:
# type(<column>) always reports pandas' Series class; .dtype shows how the
# values themselves are stored.
pat_info['contact_number'].dtype

## Converting it to numeric

In [None]:
# Parse contact counts as numbers; entries that cannot be parsed become NaN.
pat_info['contact_number'] = pd.to_numeric(pat_info['contact_number'], errors='coerce')

In [None]:
pat_info['contact_number'].head(20)

In [None]:
# Drop, in place, every row that has a null in ANY column.
# NOTE(review): this is not limited to 'contact_number'; rows lacking e.g.
# 'num_of_days_in_hospital' are removed as well, which shrinks the data used
# by every later cell. Consider dropna(subset=['contact_number']) if only the
# contact counts are required.
pat_info.dropna(axis = 0, inplace =True)

In [None]:
pat_info['contact_number'].head(20)

## Distribution of the number of people contacted

In [None]:
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot:
# a density-normalised histogram with a KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(pat_info['contact_number'], kde=True, stat='density', ax=ax)
ax.set_title('Distribution of contact numbers');

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(x='sex', y='contact_number', data=pat_info, ax=ax)
ax.set_title('Contact number by sex');

## There's a bell curve with the age and the contact number

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='age', y='contact_number', data=pat_info, ax=ax)
ax.set_title('Contact number by age');

## There's no observable trend between the number of days in the hospital and the contact number

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='num_of_days_in_hospital', y='contact_number', data=pat_info, ax=ax)
ax.set_title('Contact number by days in hospital');

In [None]:
pat_info.head()

## Highest contact happened in the clubs

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(10, 8))
sns.violinplot(x='contact_number', y='infection_case', data=pat_info, ax=ax)
ax.set_title('Contact number by infection case');

## There isn't enough information to gain any insight into the number of contacts versus the state: when I dropped the nulls, the rows for the other states were dropped too, leaving only the released state, which is labeled 2. So we can't tell whether being deceased, released or isolated had anything to do with the number of contacts.

In [None]:
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(figsize=(10, 8))
sns.violinplot(x='state', y='contact_number', data=pat_info, ax=ax)
ax.set_title('Contact number by encoded state');

## I confirmed the types of values in the 'state' column, and it does show only 2 

In [None]:
pat_info.state.value_counts()

# Please note the other datasets will be analyzed in other notebooks