In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the confirmed-cases time-series CSV from the Kaggle input directory
# and display the resulting DataFrame.
covid_df = pd.read_csv("/kaggle/input/covid-datasets/time_series_covid_19_confirmed.csv")
covid_df

In [None]:
# Keep only the country name and the "8/12/20" count column (presumably 12 Aug 2020
# in M/D/YY form -- confirm against the dataset), and rename both columns so the
# country column matches the happiness report's "Country or region".
# The rename is chained and non-inplace so it never mutates a slice of the original
# frame (the inplace form raises SettingWithCopyWarning).
covid_df = (
    covid_df[["Country/Region", "8/12/20"]]
    .rename(columns={"8/12/20": "confirmed_cases", "Country/Region": "Country or region"})
)
covid_df

In [None]:
# Some countries have more than one row, e.g. Denmark
covid_df.loc[covid_df["Country or region"].eq("Denmark")]

In [None]:
# Collapse the multiple rows per country into one by summing within each
# "Country or region" group; the grouping column becomes the index of the result.
summed_covid_df = covid_df.groupby("Country or region").sum()
summed_covid_df

In [None]:
# Check Denmark again: after the group-by there should be a single row for it
summed_covid_df.loc[summed_covid_df.index.isin(["Denmark"])]

In [None]:
# Top ten countries by confirmed cases
summed_covid_df.sort_values(by="confirmed_cases", ascending=False).head(10)

In [None]:
# Load the worldwide happiness report CSV from the Kaggle input directory
# and display the resulting DataFrame.
happiness_df = pd.read_csv("/kaggle/input/covid-datasets/worldwide_happiness_report.csv")
happiness_df

In [None]:
# Display the happiness data ordered from highest to lowest "GDP per capita"
# (the sorted frame is only shown, not stored).
happiness_df.sort_values(by = "GDP per capita", ascending = False)

In [None]:
# Rename countries in the happiness report so they match the spellings that the
# rest of the notebook expects (presumably the COVID dataset's names -- e.g.
# "Czechia", "US", "Taiwan*" are the forms used by the continent lists and the merge).
#
# The mapping previously also contained the reverse entry
# "Czechia" -> "Czech Republic", which contradicted "Czech Republic" -> "Czechia":
# an already-correct "Czechia" would have been renamed away from the target
# spelling. Some older pandas releases may also reject replacement dicts whose
# keys and values overlap. Only the forward mapping is kept.
country_name_fixes = {
    "Czech Republic": "Czechia",
    "Ivory Coast": "Cote d'Ivoire",
    "Palestinian Territories": "West Bank and Gaza",
    "South Korea": "Korea, South",
    "Taiwan": "Taiwan*",
    "Trinidad & Tobago": "Trinidad and Tobago",
    "United States": "US",
}
# Apply the renames to the country column only, then index by country so the
# frame can be merged with the per-country COVID totals on the index.
happiness_updated_df = (
    happiness_df
    .replace({"Country or region": country_name_fixes})
    .set_index("Country or region")
)
happiness_updated_df.sort_values(by="GDP per capita", ascending = False)

In [None]:
# Join the happiness data and the per-country COVID totals on their indexes.
# The default inner join keeps only labels present in both frames.
merged_df = pd.merge(happiness_updated_df, summed_covid_df,
                     left_index=True, right_index=True)
merged_df.sort_values("GDP per capita", ascending=False)

In [None]:
# Make sure the index has no duplicates: the number of index labels (first
# number printed) must equal the number of distinct labels (second number).
a = list(merged_df.index)
b = list(merged_df.index.unique())
print(len(a), len(b), sep="\n")

In [None]:
# Check that confirmed cases were summed and carried through the merge correctly
# for Denmark: the difference between the merged value and the summed value
# should be 0.
# .loc[row, column] returns a scalar when the index label is unique, so int() is
# no longer applied to a one-element Series (deprecated since pandas 2.1).
int(merged_df.loc["Denmark", "confirmed_cases"]) - int(summed_covid_df.loc["Denmark", "confirmed_cases"])

In [None]:
# Display merged_df ordered from highest to lowest "GDP per capita"
# (sort_values returns a new frame; merged_df itself is not reordered).
merged_df.sort_values(by = "GDP per capita", ascending = False)

In [None]:
# Display merged_df ordered from highest to lowest "confirmed_cases"
# (sort_values returns a new frame; merged_df itself is not reordered).
merged_df.sort_values(by = "confirmed_cases", ascending = False)

In [None]:
# Pop the "Overall rank" column and store it in a variable.
# NOTE: pop removes the column from merged_df in place, so running this cell a
# second time raises KeyError because the column is already gone.
overall_rank_column = merged_df.pop("Overall rank")
merged_df.sort_values(by = "GDP per capita", ascending = False)

In [None]:
# Pairwise correlation matrix of merged_df's columns, using the pandas default
# method (Pearson).
merged_df.corr()

In [None]:
# Plot correlations 
from pandas.plotting import scatter_matrix
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style

# Use the "bmh" style sheet for every figure drawn from here on.
plt.style.use("bmh")

# Columns to compare pairwise in the scatter-matrix grid.
attributes = [
    "GDP per capita",
    "Social support",
    "Freedom to make life choices",
    "Healthy life expectancy",
]

scatter_matrix(merged_df.loc[:, attributes], figsize=(18, 12))

In [None]:
# Scatter plot: GDP per capita (y) against Healthy life expectancy (x)
merged_df.plot.scatter(x="Healthy life expectancy", y="GDP per capita",
                       figsize=(16, 6), color="green")

In [None]:
# Scatter plot: GDP per capita (y) against Social support (x)
merged_df.plot.scatter(x="Social support", y="GDP per capita",
                       figsize=(16, 6), color="orange")

In [None]:
# Scatter plot: GDP per capita (y) against Freedom to make life choices (x)
merged_df.plot.scatter(x="Freedom to make life choices", y="GDP per capita",
                       figsize=(16, 6), color="grey")

In [None]:
# Print summary statistics for GDP per capita. The minimum is printed as-is;
# mean, median and max are rounded to two decimals.
gdp_per_capita = merged_df["GDP per capita"]
gdp_summary = (
    ("Minimum", gdp_per_capita.min()),
    ("Mean", round(gdp_per_capita.mean(), 2)),
    ("Median", round(gdp_per_capita.median(), 2)),
    ("Max", round(gdp_per_capita.max(), 2)),
)
for stat_name, stat_value in gdp_summary:
    print(stat_name + " GDP per capita: ", str(stat_value))

In [None]:
# Box plot of "GDP per capita", showing its median and quartiles.
# NOTE(review): x = "Score" does not appear to add anything to a box plot of a
# single column -- presumably it has no visible effect; confirm and drop if so.
merged_df.plot(figsize = (16, 6), kind = "box", x = "Score", y = "GDP per capita")

In [None]:
# Continent membership lists used to label each country in the merged DataFrame.
# Names must match the DataFrame index exactly, because membership is tested
# with index.isin(...). Corrections relative to the earlier version:
#   * "Korea South"      -> "Korea, South"   (the spelling produced by the rename step)
#   * " Kyrgyzstan"      -> "Kyrgyzstan"     (stray leading space)
#   * " Sierra Leone"    -> "Sierra Leone"   (stray leading space)
#   * '"Cote dIvoire"'   -> "Cote d'Ivoire"  (embedded quotes, missing apostrophe)
#   * "Saudi Arabia" moved from the Central America list to the Asia list
#   * "Comoros" moved from the Asia list to the Africa list
europe_list = ["Finland", "Denmark", "Norway", "Iceland", "Netherlands", "Switzerland", "Sweden",
               "Austria", "Luxembourg", "United Kingdom", "Ireland", "Germany", "Belgium",
               "Czechia", "Malta", "France", "Spain", "Italy", "Slovakia", "Poland", "Lithuania",
               "Slovenia", "Kosovo", "Romania", "Cyprus", "Latvia", "Hungary", "Portugal",
               "Serbia", "Montenegro", "Turkey", "Greece", "Ukraine", "Estonia", "Moldova",
               "Croatia", "Bosnia and Herzegovina", "Belarus", "North Macedonia", "Azerbaijan",
               "Bulgaria", "Albania", "Armenia"]

asia_list = ["Israel", "United Arab Emirates", "Taiwan*", "Singapore", "Bahrain", "Uzbekistan",
             "Kuwait", "Thailand", "Korea, South", "Japan", "Kazakhstan", "Pakistan", "Russia",
             "China", "Vietnam", "Iran", "Iraq", "Bangladesh", "India", "Afghanistan",
             "Sri Lanka", "Cambodia", "Mongolia", "Philippines", "Tajikistan", "Malaysia",
             "Kyrgyzstan", "Lebanon", "Indonesia", "Nepal", "Bhutan", "Jordan", "Laos",
             "West Bank and Gaza", "Georgia", "Yemen", "Syria", "Saudi Arabia", "Qatar"]

africa_list = ["Senegal", "Mauritius", "Nigeria", "Cameroon", "Cote d'Ivoire", "Ghana",
               "Congo (Brazzaville)", "South Africa", "Namibia", "Kenya", "Congo (Kinshasa)",
               "Tunisia", "Mozambique", "Ethiopia", "Uganda", "Egypt", "Zambia", "Togo",
               "Madagascar", "Botswana", "South Sudan", "Zimbabwe", "Burundi", "Rwanda",
               "Tanzania", "Lesotho", "Liberia", "Niger", "Libya", "Algeria", "Morocco",
               "Benin", "Gabon", "Somalia", "Burkina Faso", "Guinea", "Gambia", "Mauritania",
               "Central African Republic", "Malawi", "Sierra Leone", "Mali", "Chad", "Comoros"]

north_america_list = ["US", "Canada", "Mexico", "Trinidad and Tobago", "Jamaica",
                      "Dominican Republic", "Haiti"]

central_america_list = ["Costa Rica", "Guatemala", "Panama", "El Salvador",
                        "Nicaragua", "Honduras"]

south_america_list = ["Chile", "Brazil", "Uruguay", "Colombia", "Argentina", "Ecuador",
                      "Paraguay", "Peru", "Venezuela", "Bolivia"]

# Variable name keeps its existing spelling because later cells reference it.
australisia_list = ["New Zealand", "Australia"]

# Check total list countries. Should be equal to 151
sum(len(names) for names in (europe_list, australisia_list, asia_list, africa_list,
                             north_america_list, south_america_list, central_america_list))

In [None]:
# Create a "continent" column on merged_df, initialised to an empty string for
# every row (this modifies merged_df in place), then display the frame.
merged_df["continent"] = ""
merged_df

In [None]:
# Asia countries
# Select the rows whose index label is in asia_list and label them "asia".
# .assign returns a new frame, so the label is never written onto a filtered
# slice of merged_df (which raised SettingWithCopyWarning).
asia_mask = merged_df.index.isin(asia_list)
asia_df = merged_df[asia_mask].assign(continent="asia")
# Rows matched vs. names listed: a difference means some list entries did not
# match any index label.
print(asia_df.shape)
print(len(asia_list))
asia_df

In [None]:
# Europe countries
# Select the rows whose index label is in europe_list and label them "europe".
# .assign returns a new frame, so the label is never written onto a filtered
# slice of merged_df (which raised SettingWithCopyWarning).
europe_mask = merged_df.index.isin(europe_list)
europe_df = merged_df[europe_mask].assign(continent="europe")
# Rows matched vs. names listed: a difference means some list entries did not
# match any index label.
print(europe_df.shape)
print(len(europe_list))
europe_df

In [None]:
# North America countries
# Select the rows whose index label is in north_america_list and label them
# "north_america". .assign returns a new frame, so the label is never written
# onto a filtered slice of merged_df (which raised SettingWithCopyWarning).
north_america_mask = merged_df.index.isin(north_america_list)
north_america_df = merged_df[north_america_mask].assign(continent="north_america")
# Rows matched vs. names listed: a difference means some list entries did not
# match any index label.
print(north_america_df.shape)
print(len(north_america_list))
north_america_df

In [None]:
# Central America countries
# Select the rows whose index label is in central_america_list and label them
# "central_america". .assign returns a new frame, so the label is never written
# onto a filtered slice of merged_df (which raised SettingWithCopyWarning).
central_america_mask = merged_df.index.isin(central_america_list)
central_america_df = merged_df[central_america_mask].assign(continent="central_america")
# Rows matched vs. names listed: a difference means some list entries did not
# match any index label.
print(central_america_df.shape)
print(len(central_america_list))
central_america_df

In [None]:
# South America countries
# Select the rows whose index label is in south_america_list and label them
# "south_america". .assign returns a new frame, so the label is never written
# onto a filtered slice of merged_df (which raised SettingWithCopyWarning).
south_america_mask = merged_df.index.isin(south_america_list)
south_america_df = merged_df[south_america_mask].assign(continent="south_america")
# Rows matched vs. names listed: a difference means some list entries did not
# match any index label.
print(south_america_df.shape)
print(len(south_america_list))
south_america_df

In [None]:
# Africa countries
# Select the rows whose index label is in africa_list and label them "africa".
# .assign returns a new frame, so the label is never written onto a filtered
# slice of merged_df (which raised SettingWithCopyWarning).
africa_mask = merged_df.index.isin(africa_list)
africa_df = merged_df[africa_mask].assign(continent="africa")
# Rows matched vs. names listed: a difference means some list entries did not
# match any index label.
print(africa_df.shape)
print(len(africa_list))
africa_df

In [None]:
# Australasia countries (variable names keep the existing "australisia" spelling)
# Select the rows whose index label is in australisia_list and label them
# "australisia". .assign returns a new frame, so the label is never written
# onto a filtered slice of merged_df (which raised SettingWithCopyWarning).
australisia_mask = merged_df.index.isin(australisia_list)
australisia_df = merged_df[australisia_mask].assign(continent="australisia")
australisia_df

In [None]:
# Combine all the per-continent DataFrames into one, stacked in the order listed.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and yields the same row order.
continent_df = pd.concat([africa_df, asia_df, australisia_df, europe_df,
                          south_america_df, central_america_df, north_america_df])
continent_df

In [None]:
# Scatter plot with one colour per continent:
# GDP per capita (y) against Healthy life expectancy (x).
# Plot approach adapted from
# https://stackoverflow.com/questions/44802561/python-plot-scatter-plot-with-category-and-markersize
fig, ax = plt.subplots(figsize=(18, 10))
groups = continent_df.groupby('continent')
# One colour per continent group; groupby yields the groups in sorted key order.
colors = ['b','g','r','y', 'orange', 'brown', 'purple']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Healthy life expectancy", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])


In [None]:
# Scatter plot with one colour per continent:
# GDP per capita (y) against Freedom to make life choices (x).
fig, ax = plt.subplots(figsize=(18, 10))
groups = continent_df.groupby('continent')
# One colour per continent group; groupby yields the groups in sorted key order.
colors = ['b','g','r','y', 'orange', 'brown', 'black']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Freedom to make life choices", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])


In [None]:
# Scatter plot with one colour per continent:
# GDP per capita (y) against Social support (x).
fig, ax = plt.subplots(figsize=(18, 10))
groups = continent_df.groupby('continent')
# One colour per continent group; groupby yields the groups in sorted key order.
colors = ['b','g','r','y', 'orange', 'brown', 'black']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Social support", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])


In [None]:
# Scatter plot: GDP per capita (y) against Social support (x)
# Africa vs Europe
fig, ax = plt.subplots(figsize=(18, 10))

# Keep only the rows labelled with one of the two continents being compared.
africa_and_europe_df = continent_df.loc[continent_df["continent"].isin(["africa", "europe"])]

groups = africa_and_europe_df.groupby('continent')
# groupby yields the groups in sorted key order, so africa is 'b' and europe is 'g'.
colors = ['b','g']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Social support", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])

In [None]:
# Scatter plot: GDP per capita (y) against Social support (x)
# Africa vs Asia
fig, ax = plt.subplots(figsize=(18, 10))

# Keep only the rows labelled with one of the two continents being compared.
africa_and_asia_df = continent_df.loc[continent_df["continent"].isin(["africa", "asia"])]

groups = africa_and_asia_df.groupby('continent')
# groupby yields the groups in sorted key order, so africa is 'b' and asia is 'g'.
colors = ['b','g']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Social support", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])

In [None]:
# Scatter plot: GDP per capita (y) against Social support (x)
# Europe vs Asia
fig, ax = plt.subplots(figsize=(18, 10))

# Keep only the rows labelled with one of the two continents being compared.
europe_and_asia_df = continent_df.loc[continent_df["continent"].isin(["europe", "asia"])]

groups = europe_and_asia_df.groupby('continent')
# groupby yields the groups in sorted key order, so asia is 'b' and europe is 'g'.
colors = ['b','g']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Social support", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])

In [None]:
# Scatter plot: GDP per capita (y) against Social support (x)
# Europe vs North America
fig, ax = plt.subplots(figsize=(18, 10))

# Keep only the rows labelled with one of the two continents being compared.
europe_and_north_america_df = continent_df.loc[
    continent_df["continent"].isin(["europe", "north_america"])]

groups = europe_and_north_america_df.groupby('continent')
# groupby yields the groups in sorted key order, so europe is 'b' and north_america is 'g'.
colors = ['b','g']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Social support", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])

In [None]:
# Scatter plot: GDP per capita (y) against Social support (x)
# Europe vs North America vs Africa
fig, ax = plt.subplots(figsize=(18, 10))

# Keep only the rows labelled with one of the three continents being compared.
europe_africa_and_north_america_df = continent_df.loc[
    continent_df["continent"].isin(["europe", "north_america", "africa"])]

groups = europe_africa_and_north_america_df.groupby('continent')
# groupby yields the groups in sorted key order:
# africa is 'orange', europe is 'darkblue', north_america is 'black'.
colors = ['orange','darkblue','black']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Social support", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])

In [None]:
# Scatter plot: GDP per capita (y) against Social support (x)
# Africa vs South America
fig, ax = plt.subplots(figsize=(18, 10))

# Keep only the rows labelled with one of the two continents being compared.
africa_and_south_america_df = continent_df.loc[
    continent_df["continent"].isin(["south_america", "africa"])]

groups = africa_and_south_america_df.groupby('continent')
# groupby yields the groups in sorted key order, so africa is 'orange' and south_america is 'black'.
colors = ['orange','black']
for i, (name, group) in enumerate(groups):
    group.plot(kind='scatter', x="Social support", y='GDP per capita',
               label=name, ax=ax, color=colors[i])
lgd = ax.legend(numpoints=1)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old name; use whichever attribute this version provides.
legend_markers = getattr(lgd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgd.legendHandles
# Enlarge the legend markers so the colours are easy to tell apart.
for handle in legend_markers:
    handle.set_sizes([150.0])