# **Datasets**
All datasets used here are taken from Kaggle; please visit the Kaggle site to download them for use in your own notebook. The COVID-19 dataset is taken from: https://www.kaggle.com/imdevskp/corona-virus-report

In [0]:
# import the necessary packages
import os
from google.colab import drive
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

In [6]:
# Load the COVID-19 case file; the 'Date' column is parsed into datetimes while reading.
covid_csv_path = '/your_google_drive_folder_path/covid_19_clean_complete.csv'
covid_data = pd.read_csv(covid_csv_path, parse_dates=['Date'])

# Preview the first rows.
covid_data.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.0,65.0,2020-01-22,0,0,0
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0


In [0]:
# Switch to short, lowercase column names so later cells can refer to them uniformly.
# 'Lat' and 'Long' are not in the mapping and keep their original names.
column_aliases = {
    'Date': 'date',
    'Province/State': 'state',
    'Country/Region': 'country',
    'Confirmed': 'confirmed',
    'Deaths': 'deaths',
    'Recovered': 'recovered',
}
covid_data = covid_data.rename(columns=column_aliases)

In [0]:
# Case types analysed in this notebook ('active' is derived below).
cases = ['confirmed', 'deaths', 'recovered', 'active']

# Fill missing counts in the reported columns *before* deriving 'active'.
# If the subtraction ran first, one missing input would make the difference
# NaN, and a later fill would silently turn that row's active count into 0.
reported_cases = ['confirmed', 'deaths', 'recovered']
covid_data[reported_cases] = covid_data[reported_cases].fillna(0)

# Active cases = confirmed - deaths - recovered
covid_data['active'] = covid_data['confirmed'] - covid_data['deaths'] - covid_data['recovered']

# Use a single label for China: 'Mainland China' -> 'China'.
covid_data['country'] = covid_data['country'].replace('Mainland China', 'China')

# Rows without a province/state get an empty string instead of NaN.
covid_data['state'] = covid_data['state'].fillna('')

In [9]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18688 entries, 0 to 18687
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   state      18688 non-null  object        
 1   country    18688 non-null  object        
 2   Lat        18688 non-null  float64       
 3   Long       18688 non-null  float64       
 4   date       18688 non-null  datetime64[ns]
 5   confirmed  18688 non-null  int64         
 6   deaths     18688 non-null  int64         
 7   recovered  18688 non-null  int64         
 8   active     18688 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 1.3+ MB


In [10]:
# Report the time span covered by the COVID data to see how recent it is.
first_entry = covid_data['date'].min()
last_entry = covid_data['date'].max()

print("External Data")
print(f"Earliest Entry: {first_entry}")
print(f"Last Entry:     {last_entry}")
print(f"Total Days:     {last_entry - first_entry}")

External Data
Earliest Entry: 2020-01-22 00:00:00
Last Entry:     2020-04-03 00:00:00
Total Days:     72 days 00:00:00


In [11]:
# Plot worldwide confirmed cases and deaths over time
# (counts are summed over all rows that share a date).
pio.templates.default = "plotly_dark"

# Select the value columns with a *list*: indexing a groupby with a bare tuple
# of names is deprecated and raises in newer pandas. 'date' is the group key,
# so it comes back as a column through reset_index() and is not selected here.
group = covid_data.groupby('date')[['confirmed', 'deaths']].sum().reset_index()

fig = px.line(group, x="date", y="confirmed",
              title="Worldwide Confirmed Cases Over Time(Serial: 1)")

fig.show()

fig = px.line(group, x="date", y="deaths",
              title="Worldwide Deaths Over Time(Serial: 2)")

fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [12]:
# Compute the mortality rate (deaths as a percentage of confirmed cases) per
# country on the most recent date, and plot the ten highest rates.
cleaned_latest = covid_data[covid_data['date'] == covid_data['date'].max()]

# Select the value columns with a *list*: indexing a groupby with a bare tuple
# of names is deprecated and raises in newer pandas.
flg = cleaned_latest.groupby('country')[['confirmed', 'deaths', 'recovered', 'active']].sum().reset_index()

# Percentage of confirmed cases that ended in death, rounded to 2 decimals.
flg['mortalityRate'] = ((flg['deaths'] / flg['confirmed']) * 100).round(2)

# Only keep countries with more than 1000 confirmed cases, highest rate first.
temp = flg[flg['confirmed'] > 1000]
temp = temp.sort_values('mortalityRate', ascending=False)

# `temp` is already sorted, so take the top ten directly; the order is then
# reversed so the highest rate ends up at the top of the horizontal bar chart.
top_ten = temp.head(10).iloc[::-1]

# The plotted value is deaths / confirmed * 100, i.e. deaths per 100 confirmed
# cases, so the title states "per 100".
fig = px.bar(top_ten,
             x='mortalityRate', y='country',
             title='Deaths per 100 Confirmed Cases', text='mortalityRate', height=800, orientation='h',
             color_discrete_sequence=['red']
            )
fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [13]:
# Load the temperature dataset (available on Kaggle) and align it with the
# COVID frame so the two can be merged on country / state / date.
temperature_csv_path = "/your_google_drive_folder_path/temperature_dataframe.csv"
weather_columns = ["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]

df_temperature = pd.read_csv(temperature_csv_path)

# Rewrite the country labels 'USA' and 'UK' as 'US' and 'United Kingdom'.
df_temperature['country'] = df_temperature['country'].replace({'USA': 'US', 'UK': 'United Kingdom'})

# Keep only the needed columns. reset_index() also materialises the row labels
# as an 'index' column; 'province' becomes 'state'.
df_temperature = (
    df_temperature[weather_columns]
    .reset_index()
    .rename(columns={'province': 'state'})
)

# Parse dates and blank out missing states.
df_temperature["date"] = pd.to_datetime(df_temperature['date'])
df_temperature['state'] = df_temperature['state'].fillna('')
df_temperature.head()

Unnamed: 0,index,country,state,date,humidity,sunHour,tempC,windspeedKmph
0,0,Afghanistan,,2020-01-22,65.0,8.7,-1.0,8.0
1,1,Afghanistan,,2020-01-23,59.0,8.7,-3.0,8.0
2,2,Afghanistan,,2020-01-24,71.0,7.1,0.0,7.0
3,3,Afghanistan,,2020-01-25,79.0,8.7,0.0,7.0
4,4,Afghanistan,,2020-01-26,64.0,8.7,-1.0,8.0


In [14]:
# Inner-join the case counts with the weather readings: only rows whose
# (country, date, state) combination appears in both frames are kept.
merge_keys = ['country', 'date', 'state']
covid_by_pop_temp = pd.merge(covid_data, df_temperature, on=merge_keys, how='inner')
covid_by_pop_temp

Unnamed: 0,state,country,Lat,Long,date,confirmed,deaths,recovered,active,index,humidity,sunHour,tempC,windspeedKmph
0,,Afghanistan,33.000000,65.000000,2020-01-22,0,0,0,0,0,65.0,8.7,-1.0,8.0
1,,Albania,41.153300,20.168300,2020-01-22,0,0,0,0,60,51.0,7.0,11.0,3.0
2,,Algeria,28.033900,1.659600,2020-01-22,0,0,0,0,120,50.0,10.5,19.0,16.0
3,,Andorra,42.506300,1.521800,2020-01-22,0,0,0,0,180,98.0,3.5,0.0,7.0
4,,Argentina,-38.416100,-63.616700,2020-01-22,0,0,0,0,300,61.0,10.5,33.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11813,,Uzbekistan,41.377500,64.585300,2020-03-21,43,0,0,43,16496,63.0,6.1,18.0,8.0
11814,,Venezuela,6.423800,-66.589700,2020-03-21,70,0,0,70,16556,75.0,8.8,25.0,6.0
11815,,Vietnam,16.000000,108.000000,2020-03-21,94,0,17,77,16616,81.0,8.9,30.0,12.0
11816,,Zambia,-15.416700,28.283300,2020-03-21,2,0,0,2,16676,85.0,7.4,21.0,9.0


In [16]:
# Use the merged COVID + weather frame as the dataset for the model.
# This is a plain assignment: train_data refers to the same DataFrame object
# as covid_by_pop_temp (no copy is made here).
train_data = covid_by_pop_temp
print(train_data.shape)

(11818, 14)


In [0]:
# Remove the columns that are not needed for this task, then discard every
# row that still contains a missing value.
unused_columns = [
    "country",
    "active",
    "recovered",
    "state",
    "Lat",
    "Long",
    "date",
    "index",
]
train_data = train_data.drop(columns=unused_columns).dropna()

In [18]:
# Preview the first rows of the reduced training frame.
train_data.head()

Unnamed: 0,confirmed,deaths,humidity,sunHour,tempC,windspeedKmph
0,0,0,65.0,8.7,-1.0,8.0
1,0,0,51.0,7.0,11.0,3.0
2,0,0,50.0,10.5,19.0,16.0
3,0,0,98.0,3.5,0.0,7.0
4,0,0,61.0,10.5,33.0,13.0


In [0]:
# Separate the targets (y) from the remaining columns, which become the features (X).
target_columns = ["confirmed", "deaths"]
y = train_data[target_columns]
X = train_data.drop(columns=target_columns)

In [20]:
# Print the shapes of the feature frame X and the target frame y.
print(X.shape, y.shape)

(11700, 4) (11700, 2)


In [0]:
# Split into training and evaluation data:
# 20% of the rows are held out for validation; rows are shuffled before the
# split and the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split as tts
from sklearn.tree import DecisionTreeRegressor
X_train, X_val, y_train, y_val = tts(X, y, test_size= 0.2, random_state=42, shuffle=True)

In [0]:
# initializing regression models
model_infected = DecisionTreeRegressor(random_state=42, criterion="mae")
model_deaths = DecisionTreeRegressor(random_state=42, criterion="mae")

In [0]:
# Train each regressor on its own target column.
# Note: the models are fitted on the full X / y, not on the X_train / y_train split.
for estimator, target_name in ((model_infected, "confirmed"), (model_deaths, "deaths")):
    estimator.fit(X, y[target_name])

In [26]:
# Rank the features by their importance in the confirmed-cases model
# (indices sorted from most to least important).
infected_importances = model_infected.feature_importances_
infected_indices = np.argsort(infected_importances)[::-1]

ranked_infected_features = X.columns[infected_indices]
ranked_infected_scores = infected_importances[infected_indices]

# Print the ranking, one feature per line.
print("Feature ranking:")
for rank, (feature, score) in enumerate(zip(ranked_infected_features, ranked_infected_scores), start=1):
    print(f"{rank}. Feature: {feature}, Importance: {score}")

# Bar chart of the same ranking, with importances scaled to percent.
fig = px.bar(range(X.shape[1]), x=ranked_infected_features, y=ranked_infected_scores * 100)
fig.update_layout(
    title="Percentage of Features for Corona Infected People(Serial: 5)",
    xaxis_title="Features",
    yaxis_title="(%)Percentage of Score",
)
fig.show()

Feature ranking:
1. Feature: humidity, Importance: 0.4370039834879456
2. Feature: tempC, Importance: 0.39290379108818785
3. Feature: sunHour, Importance: 0.09899283694797995
4. Feature: windspeedKmph, Importance: 0.07109938847588652


In [27]:
# Rank the features by their importance in the deaths model
# (indices sorted from most to least important).
death_importances = model_deaths.feature_importances_
death_indices = np.argsort(death_importances)[::-1]

ranked_death_features = X.columns[death_indices]
ranked_death_scores = death_importances[death_indices]

# Print the ranking, one feature per line.
print("Feature ranking:")
for rank, (feature, score) in enumerate(zip(ranked_death_features, ranked_death_scores), start=1):
    print(f"{rank}. Feature: {feature}, Importance: {score}")

# Bar chart of the same ranking, with importances scaled to percent.
fig = px.bar(range(X.shape[1]), x=ranked_death_features, y=ranked_death_scores * 100)
fig.update_layout(
    title="Percentage of Features for Dead People Infected by Corona(Serial: 6)",
    xaxis_title="Features",
    yaxis_title="(%) Percentage of Score",
)
fig.show()

Feature ranking:
1. Feature: tempC, Importance: 0.4386885245901657
2. Feature: sunHour, Importance: 0.43016393442622675
3. Feature: windspeedKmph, Importance: 0.0998907103825149
4. Feature: humidity, Importance: 0.031256830601092786
