In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the read-only Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing data

In [None]:
# Location of the EU COVID-19 snapshot inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/latest-covid19-cases-eu-countries-data/EU_COVID_data_15July2021.csv'

data = pd.read_csv(DATA_PATH)
data.head()

#### See the type of data in the columns

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

### Obtain a list of EU countries

In [None]:
# Plain Python list of the values in the 'Country' column, in file order.
countries = data['Country'].tolist()

#### Creating new columns
* Cases ratio = Number of cases divided by the population of the country (to ensure fair comparison)
* Death rate = Number of deaths divided by the number of confirmed cases
* Test ratio = Number of tests performed divided by the population of the country
* Test positivity = Number of confirmed cases divided by the number of tests performed


In [None]:
# Add the four derived ratios in a single step.
data = data.assign(
    Cases_ratio=data['Confirmed'] / data['Population'],     # confirmed cases / population
    Death_rate=data['Deaths'] / data['Confirmed'],          # deaths / confirmed cases
    Test_ratio=data['Tests'] / data['Population'],          # tests / population
    Test_positivity=data['Confirmed'] / data['Tests'],      # confirmed cases / tests
)
data.head()

#### Sort values to create new dataframes for each category

In [None]:
def _rank_desc(frame, column):
    """Return ``frame`` sorted by ``column`` with the largest values first."""
    return frame.sort_values(by=column, ascending=False)


# One ranked view per metric; `data` itself keeps its original row order.
df_confirmed = _rank_desc(data, "Confirmed")
df_deaths = _rank_desc(data, "Deaths")
df_critical = _rank_desc(data, "Critical")
df_cases_ratio = _rank_desc(data, "Cases_ratio")
df_death_rate = _rank_desc(data, "Death_rate")
df_test_ratio = _rank_desc(data, "Test_ratio")
df_test_positivity = _rank_desc(data, "Test_positivity")

## Confirmed COVID-19 cases analysis

In [None]:
# First five rows of the frame sorted by "Confirmed" (descending).
df_confirmed.head(n=5)

In [None]:
# Last five rows of the frame sorted by "Confirmed" (descending).
df_confirmed.tail(n=5)

In [None]:
# Plot from df_confirmed (sorted by "Confirmed", descending) so the bars appear in
# ranked order for this metric instead of in test-positivity order.
fig = sns.catplot(x="Country", y="Confirmed", kind="bar",
            data=df_confirmed)
# Rotate the country names so they stay readable on the x-axis.
fig.set_xticklabels(rotation=90)

#### France, Italy, Spain and Germany had the largest number of cases. This is not surprising given their large populations. Estonia, Finland, Cyprus, Luxembourg and Malta had very few cases. 

## COVID-19 deaths analysis

In [None]:
# First five rows of the frame sorted by "Deaths" (descending).
df_deaths.head(n=5)

In [None]:
# Last five rows of the frame sorted by "Deaths" (descending).
df_deaths.tail(n=5)

In [None]:
# Plot from df_deaths (sorted by "Deaths", descending) so the bars appear in
# ranked order for this metric instead of in test-positivity order.
fig = sns.catplot(x="Country", y="Deaths", kind="bar",
            data=df_deaths)
# Rotate the country names so they stay readable on the x-axis.
fig.set_xticklabels(rotation=90)

#### In terms of the absolute number of deaths, Italy, France and Germany had the most deaths, while Estonia, Finland and Luxembourg had the fewest. In general, countries with large populations would naturally have more deaths.

## COVID-19 critical cases analysis

In [None]:
# First five rows of the frame sorted by "Critical" (descending).
df_critical.head(n=5)

In [None]:
# Last five rows of the frame sorted by "Critical" (descending).
df_critical.tail(n=5)

In [None]:
# Plot from df_critical (sorted by "Critical", descending) so the bars appear in
# ranked order for this metric instead of in test-positivity order.
fig = sns.catplot(x="Country", y="Critical", kind="bar",
            data=df_critical)
# Rotate the country names so they stay readable on the x-axis.
fig.set_xticklabels(rotation=90)

#### Here we can see that France, Spain, Germany, Portugal and Italy still have many patients in a critical condition

## COVID-19 case ratio analysis

In [None]:
# First five rows of the frame sorted by "Cases_ratio" (descending).
df_cases_ratio.head(n=5)

In [None]:
# Last five rows of the frame sorted by "Cases_ratio" (descending).
df_cases_ratio.tail(n=5)

In [None]:
# Plot from df_cases_ratio (sorted by "Cases_ratio", descending) so the bars appear
# in ranked order for this metric instead of in test-positivity order.
fig = sns.catplot(x="Country", y="Cases_ratio", kind="bar",
            data=df_cases_ratio)
# Rotate the country names so they stay readable on the x-axis.
fig.set_xticklabels(rotation=90)

#### Relative to population, we can observe that Czechia, Slovenia, Luxembourg, Sweden and Lithuania had a high number of cases. On the other hand, Ireland, Denmark, Germany, Greece and Finland had fewer cases.

## COVID-19 death rate analysis

In [None]:
# First five rows of the frame sorted by "Death_rate" (descending).
df_death_rate.head(n=5)

In [None]:
# Last five rows of the frame sorted by "Death_rate" (descending).
df_death_rate.tail(n=5)

In [None]:
# Plot from df_death_rate (sorted by "Death_rate", descending) so the bars appear
# in ranked order for this metric instead of in test-positivity order.
fig = sns.catplot(x="Country", y="Death_rate", kind="bar",
            data=df_death_rate)
# Rotate the country names so they stay readable on the x-axis.
fig.set_xticklabels(rotation=90)

#### The death rate was low in countries such as the Netherlands, Finland, Estonia, Denmark and Cyprus. In contrast, a high death rate is observed in Bulgaria, Hungary and Slovakia.

## COVID-19 test rate analysis

In [None]:
# First five rows of the frame sorted by "Test_ratio" (descending).
df_test_ratio.head(n=5)

In [None]:
# Last five rows of the frame sorted by "Test_ratio" (descending).
df_test_ratio.tail(n=5)

In [None]:
# Plot from df_test_ratio (sorted by "Test_ratio", descending) so the bars appear
# in ranked order for this metric instead of in test-positivity order.
fig = sns.catplot(x="Country", y="Test_ratio", kind="bar",
            data=df_test_ratio)
# Rotate the country names so they stay readable on the x-axis.
fig.set_xticklabels(rotation=90)

#### Considering the number of tests performed relative to population, Denmark, Cyprus and Austria performed really well, whereas countries such as Slovakia, Croatia, Romania, Bulgaria and Poland did not perform many tests in proportion to their population.

## COVID-19 test positivity analysis

In [None]:
# First five rows of the frame sorted by "Test_positivity" (descending).
df_test_positivity.head(n=5)

In [None]:
# Last five rows of the frame sorted by "Test_positivity" (descending).
df_test_positivity.tail(n=5)

In [None]:
# Bar chart of test positivity per country, drawn from the frame already
# sorted by "Test_positivity" in descending order.
fig = sns.catplot(
    data=df_test_positivity,
    x="Country",
    y="Test_positivity",
    kind="bar",
)
# Rotate the country names so they stay readable on the x-axis.
fig.set_xticklabels(rotation=90)

#### Here we can observe that countries like Denmark, Cyprus and Austria were extremely effective in testing, while countries like Slovenia, Croatia and Poland were lagging behind. Note that the WHO recommends a test positivity rate below 5%, and only a few countries meet that criterion.

## Clustering similar countries

#### By observing the data, it is not obvious which countries are similar to one another. Clustering could help uncover some hidden patterns. Let us make use of the case ratio (confirmed cases per population) and the death rate (deaths divided by confirmed cases).

In [None]:
# Keep the country label together with the two features used for clustering.
clustering_columns = ['Country', 'Cases_ratio', 'Death_rate']
clustering_data = data[clustering_columns]
clustering_data.head()

#### We can try K-means clustering with these features to see which countries end up in the same cluster

In [None]:
from sklearn.cluster import KMeans

N_CLUSTERS = 7

# random_state fixes the centroid initialisation. n_init is pinned explicitly
# because its default differs between scikit-learn releases; leaving it implicit
# can change which countries land in which numbered cluster (and emits a
# FutureWarning on some versions).
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0)

# Fit on the two numeric features and keep the per-row cluster labels.
clusters = kmeans.fit_predict(clustering_data[['Cases_ratio', 'Death_rate']])

# Sanity check: one centroid per cluster, one coordinate per feature.
kmeans.cluster_centers_.shape

#### Placing the countries in the predicted clusters

In [None]:
# Map each cluster label to the list of countries assigned to it.
# All rows are predicted in one call on the same feature columns used for
# fitting, instead of calling predict once (or twice) per row on a bare nested
# list, and setdefault replaces the bare `except:` that was used as control flow.
row_labels = kmeans.predict(clustering_data[['Cases_ratio', 'Death_rate']])

cluster_dict = {}
for country, label in zip(clustering_data['Country'], row_labels):
    # int() turns the NumPy integer label into a plain Python int key.
    cluster_dict.setdefault(int(label), []).append(country)

#### Printing the clusters

In [None]:
# Print the member countries of every cluster, in label order.
for cluster_id in range(N_CLUSTERS):
    members = cluster_dict[cluster_id]
    print(f'Cluster {cluster_id}: ', members)

#### Cluster 3 seems to contain the countries which were affected the most. Cluster 5 is a grouping of countries that were not affected to a large extent. Surprisingly, Finland and Czechia have their own single-element clusters. Clusters 0 and 1 contained countries which were moderately affected