# Description

Explore the relationship between common GDP and human development measures and the lethality of COVID-19 using decision tree regression.
Data are taken from the 'Impact of COVID-19 pandemic on the global economy' dataset (raw data).

# Import relevant packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE

# Read data with pandas
Assign the file to 'covid' and view head(), info(), and describe()

In [None]:
# Location of the raw CSV (relative path).
csv_path = r'../input/impact-of-covid19-pandemic-on-the-global-economy/raw_data.csv'

# Parse the 'date' column as datetimes while loading.
covid = pd.read_csv(csv_path, parse_dates=['date'])

# First look at the data: sample rows, dtypes / non-null counts, summary statistics.
display(covid.head())
covid.info()
covid.describe()

# Prepare features
Note that data is shown per country, on a daily basis
Note that some features are irrelevant (Unnamed: 9, 10, 11, 12)

In [None]:
# Columns kept for the analysis: country label, target, and the three predictors.
keep_cols = ['location', 'total_deaths', 'population', 'gdp_per_capita', 'human_development_index']

# The data holds one row per country per day; keep only the rows recorded on
# the most recent date present in the dataset.
last_date = covid['date'].max()
on_last_date = covid['date'].eq(last_date)

# Select rows and columns in one step, then discard any row with a missing value.
covid = covid.loc[on_last_date, keep_cols].dropna(axis=0)
covid.info()

# Exploratory Data Analysis

In [None]:
# Scatter of total deaths (x) against GDP per capita (y), one point per row of `covid`.
fig, ax = plt.subplots()
ax.scatter(covid['total_deaths'], covid['gdp_per_capita'])
ax.set(
    title='Total Deaths and GDP per Capita',
    xlabel='Total Deaths',
    ylabel='GDP per Capita',
    xlim=(0, 4000),  # points with more than 4000 deaths fall outside the view
)
plt.show()

In [None]:
# Scatter of total deaths (x) against Human Development Index (y), one point per row of `covid`.
fig, ax = plt.subplots()
ax.scatter(covid['total_deaths'], covid['human_development_index'])
ax.set(
    title='Total Deaths and Human Development Index',
    xlabel='Total Deaths',
    ylabel='Human Development Index',
    xlim=(0, 4000),  # points with more than 4000 deaths fall outside the view
    ylim=(0.3, 1),
)
plt.show()

In [None]:
# Scatter of total deaths (x) against population (y), one point per row of `covid`.
fig, ax = plt.subplots()
ax.scatter(covid['total_deaths'], covid['population'])
ax.set(
    title='Total Deaths and Population',
    xlabel='Total Deaths',
    ylabel='Population',
    # View is limited to at most 400 deaths and a population of at most 10 million.
    xlim=(0, 400),
    ylim=(0, 10000000),
)
plt.show()

# Prepare training and test sets

In [None]:
# Features: every column of `covid` except the country label and the target.
X = covid.drop(['location', 'total_deaths'], axis = 1)
# Target: the 'total_deaths' column.
y = covid['total_deaths']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 3)

# Regularise the tree: depth capped at 7, and a float min_samples_leaf is read by
# scikit-learn as a fraction of the training samples required in each leaf.
dt = DecisionTreeRegressor(max_depth = 7, min_samples_leaf = 0.025, random_state = 3)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# For a regressor, .score() returns the coefficient of determination (R^2).
# ROC AUC is a classification metric and does not apply here.
score = dt.score(X_test, y_test)
# Root mean squared error on the held-out set, in the same units as the target.
rmse = MSE(y_test, y_pred) ** 0.5

print('R^2 score: {:.2f}'.format(score))
print('Test RMSE: {:.2f}'.format(rmse))

Not the best score (note that `dt.score` on a regressor returns R², the coefficient of determination, not an AUC), but further analysis is needed