In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # join the directory currently being visited with the file name
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Full dataset from github

This data also comes from ourworldindata.org, but it is a comprehensive set of all data relevant to COVID-19 for every country. This will be the dataset used for further analysis, unless specified otherwise. There is a lot of unique information, such as diabetes prevalence and hospital beds per thousand, that will allow for an in-depth look at how different countries have responded to the pandemic.

In [None]:
url='https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'

# Skip malformed rows (with a warning) instead of aborting the whole download.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0, where it
# raises TypeError; `on_bad_lines='warn'` is the documented replacement for
# error_bad_lines=False combined with the default warn_bad_lines=True.
try:
    new = pd.read_csv(url, on_bad_lines='warn')
except TypeError:
    # pandas older than 1.3 does not know `on_bad_lines`; use the keyword it does support
    new = pd.read_csv(url, error_bad_lines=False)
new

In [None]:
#this dataset is similar in structure to the first dataset, the one that was imported to kaggle from a file
#same procedure follows, except instead of 'Entity', the country column is titled 'location'

#number of rows per location, most rows first; this also fixes the order of full_list
countries=new['location'].value_counts()

#split the table into one dataframe per location in a single pass, instead of scanning the
#whole 'location' column once for every country
#each piece keeps its original rows, row order and index labels
by_location={name:frame for name,frame in new.groupby('location',sort=False)}

#full_list[i] holds every row whose location is countries.index[i]
full_list=[by_location[name] for name in countries.index]

In [None]:
#same idea as get_country method from before
def get_country_github(country):
    """Return the dataframe in ``full_list`` that holds the rows for ``country``.

    Parameters
    ----------
    country : str
        Value of the 'location' column to look for, e.g. 'South Korea'.

    Returns
    -------
    pandas.DataFrame
        The first entry of ``full_list`` whose first row has this location.
        The entry itself is returned, not a copy.

    Raises
    ------
    KeyError
        If no entry of ``full_list`` matches ``country``. Previously the function
        fell off the end and returned None, which only surfaced later as
        "'NoneType' object has no attribute 'plot'" inside the plotting methods.
    """
    for frame in full_list:
        #an empty frame has no first row to inspect, so it can never match
        if frame.empty:
            continue
        #each frame is built from a single location, so its first row identifies it
        if frame['location'].iloc[0]==country:
            return frame
    raise KeyError('no data for location {!r}'.format(country))

In [None]:
#sample output from get_country_github method: the dataframe of all rows whose location is 'South Korea'
get_country_github('South Korea')

Because this dataset is so comprehensive, all of the methods will have an 'attribute' parameter. Before, it was assumed that we were analyzing the total number of cases because that was the only piece of data, but now there are many options, so each method call will specify which attribute to use.

In [None]:
import matplotlib.pyplot as plt

In [None]:
#line chart of a single attribute over time for a single country
def country_plot_any(country,attribute):
    """Plot ``attribute`` against 'date' for ``country`` and show the figure.

    The rows are looked up with get_country_github(country). The function returns
    whatever plt.show() returns.
    """
    frame=get_country_github(country)
    frame.plot(kind='line',x='date',y=attribute)

    #the title names both the country and the attribute so the chart documents itself
    plt.title(' '.join((country,attribute)))
    plt.xticks(rotation=45)
    return plt.show()

In [None]:
#sample output for country_plot_any method: Sweden's total_cases plotted against date
country_plot_any('Sweden','total_cases')

In [None]:
#similar to the compare method from before, this method compares any two countries against any (available) attribute
def compare_any(country1,country2,attribute):
    """Plot ``attribute`` over time for two countries on one shared set of axes.

    Parameters
    ----------
    country1, country2 : str
        Locations to look up with get_country_github.
    attribute : str
        Name of the column to put on the y axis.

    Returns
    -------
    Whatever plt.show() returns.
    """
    #one axis object shared by both plots
    ax=plt.gca()
    for country in (country1,country2):
        data=get_country_github(country)
        #when 'date' holds strings, pandas plots each frame against its row number (0,1,2,...)
        #and only relabels the ticks, so two countries whose records start on different days
        #end up shifted against each other; parsing the dates puts both on a true time axis
        #assign() returns a new frame, so the shared country dataframe is left untouched
        data=data.assign(date=pd.to_datetime(data['date']))
        #x_compat=True keeps both lines on plain matplotlib date coordinates
        data.plot(x='date',y=attribute,kind='line',ax=ax,x_compat=True)
    plt.title(country1+' vs. '+country2+' '+attribute)
    plt.legend(labels=[country1,country2])
    plt.xticks(rotation=45)
    return plt.show()

In [None]:
#sample output for compare_any method: new_cases_per_million over time for Italy and the United States
compare_any('Italy','United States','new_cases_per_million')

Let's do some analysis by taking the five countries with the highest GDP per capita and comparing their rates of new cases per million.

In [None]:
#collect one record per country: its name (location) and its gdp_per_capita
#every row for a given country will have the same gdp_per_capita and the same location,
#so the first row is enough; .iloc[0] is positional, so it works even though each country's
#dataframe keeps the index labels of the big table
#the records are gathered in a list and turned into a dataframe once, because
#DataFrame.append was removed in pandas 2.0 (and copied the whole frame on every call)
gdp_records=[
    {
        'gdp_per_capita':country_df['gdp_per_capita'].iloc[0],
        'location':country_df['location'].iloc[0],
    }
    for country_df in full_list
]
gdp_pc_df=pd.DataFrame(gdp_records)

#creates a new dataframe of countries with the 5 largest gdp per capitas
top5_gdp_pc=gdp_pc_df.nlargest(5,'gdp_per_capita')
top5_gdp_pc

I want to graph all five of these countries. This suggests I create a more general method that takes any number of countries and plots all of their graphs for one attribute.

In [None]:
def compare_any_n(num_countries,attribute):
    """Plot ``attribute`` over time for ``num_countries`` countries typed in by the user.

    Parameters
    ----------
    num_countries : int
        How many country names to read; input() is called once per country.
    attribute : str
        Name of the column to put on the y axis.

    Returns
    -------
    Whatever plt.show() returns.
    """
    #create axis variable outside of for loop so all plots go on the same axis
    ax=plt.gca()

    #country names in the order they were typed; they become the legend labels
    #(a plain list: DataFrame.append, used here before, was removed in pandas 2.0)
    legend_labels=[]

    #number of loops will be the declared number of countries (num_countries)
    for _ in range(num_countries):

        #this prompts the user to type in the country each time (if num_countries=3, then user will type in the names of 3 countries)
        country=input()
        data=get_country_github(country)

        #string dates would be plotted against row numbers, shifting countries whose records
        #start on different days; parse them so every line shares a true time axis
        #assign() returns a new frame, so the shared country dataframe is left untouched
        data=data.assign(date=pd.to_datetime(data['date']))
        #x_compat=True keeps every line on plain matplotlib date coordinates
        data.plot(x='date',y=attribute,kind='line',ax=ax,x_compat=True)
        legend_labels.append(country)

    plt.legend(labels=legend_labels)
    plt.xticks(rotation=45)

    #make the title the attribute so we know what we're analyzing
    plt.title(attribute)
    return plt.show()

This is great for analyzing any list of countries, but each country has to be typed in. This can be simplified even further. I created a dataframe to see the countries with the 5 highest GDPs per capita. Let's make a method that reads the countries from a dataframe instead of having the user input them.

In [None]:
def compare_any_n_df(df,attribute):
    """Plot ``attribute`` over time for every country listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'location' column; one line is drawn per row, in row order.
    attribute : str
        Name of the column to put on the y axis.

    Returns
    -------
    Whatever plt.show() returns.
    """
    #create axis variable outside of for loop so that all plots can be put on same axis
    ax=plt.gca()

    #country names in plotting order; they become the legend labels
    #(a plain list: DataFrame.append, used here before, was removed in pandas 2.0)
    legend_labels=[]

    #dataframes extracted from larger dataframes usually retain their original indices;
    #iterating over the column's values directly visits every row in order without needing
    #to reset the index first
    for country in df['location']:

        #call get_country_github method to get country's full data
        data=get_country_github(country)

        #string dates would be plotted against row numbers, shifting countries whose records
        #start on different days; parse them so every line shares a true time axis
        #assign() returns a new frame, so the shared country dataframe is left untouched
        data=data.assign(date=pd.to_datetime(data['date']))
        #x_compat=True keeps every line on plain matplotlib date coordinates
        data.plot(x='date',y=attribute,kind='line',ax=ax,x_compat=True)
        legend_labels.append(country)

    plt.legend(labels=legend_labels)
    plt.xticks(rotation=45)
    plt.title(attribute)
    return plt.show()

In [None]:
#sample output for compare_any_n_df method: total_cases over time for each country listed in top5_gdp_pc
compare_any_n_df(top5_gdp_pc,'total_cases')

Now I can pass a dataframe to the compare method and it will compare all of the countries in the dataframe. This is much easier than having to type out every country. Now the neat part is to pick which countries to compare, and what attributes to compare.

Let's make a method that automatically generates a dataframe of the countries with the highest values of a selected attribute, just like the dataframe we built above to get the 5 highest GDPs per capita.

In [None]:
def get_top(n,attribute):
    """Return the ``n`` countries with the largest value of ``attribute``.

    The value used for a country is the one in the FIRST row of its dataframe in
    ``full_list``, so the ranking is only meaningful for attributes that are constant
    within a country; for a column that changes from row to row it ranks whatever
    the first row happens to hold.

    Parameters
    ----------
    n : int
        Number of rows to keep.
    attribute : str
        Column to rank by.

    Returns
    -------
    pandas.DataFrame
        Columns ``attribute`` and 'location', ordered by pandas' nlargest; the index
        is each country's position in ``full_list``.
    """
    #one record per country, built in a list and converted once
    #(DataFrame.append, used here before, was removed in pandas 2.0)
    records=[
        {attribute:frame[attribute].iloc[0],'location':frame['location'].iloc[0]}
        for frame in full_list
    ]
    return pd.DataFrame(records).nlargest(n,attribute)

Do the same if we want n smallest.

In [None]:
def get_bottom(n,attribute):
    """Return the ``n`` countries with the smallest value of ``attribute``.

    The value used for a country is the one in the FIRST row of its dataframe in
    ``full_list``, so the ranking is only meaningful for attributes that are constant
    within a country; for a column that changes from row to row it ranks whatever
    the first row happens to hold.

    Parameters
    ----------
    n : int
        Number of rows to keep.
    attribute : str
        Column to rank by.

    Returns
    -------
    pandas.DataFrame
        Columns ``attribute`` and 'location', ordered by pandas' nsmallest; the index
        is each country's position in ``full_list``.
    """
    #one record per country, built in a list and converted once
    #(DataFrame.append, used here before, was removed in pandas 2.0)
    records=[
        {attribute:frame[attribute].iloc[0],'location':frame['location'].iloc[0]}
        for frame in full_list
    ]
    return pd.DataFrame(records).nsmallest(n,attribute)

Let's mimic some of the above methods by allowing for attributes to be on the x axis and y axis.

In [None]:
#the "double" suffix means both axes are attributes: one column on the x axis, another on the y axis
def compare_any_double(country1,country2,xaxis,yaxis):
    """Plot ``yaxis`` against ``xaxis`` for two countries on one shared set of axes.

    Both countries are looked up with get_country_github before anything is drawn.
    The function returns whatever plt.show() returns.
    """
    names=[country1,country2]
    frames=[get_country_github(name) for name in names]

    #a single axes object so both lines land on the same chart
    shared_axes=plt.gca()
    for frame in frames:
        frame.plot(kind='line',x=xaxis,y=yaxis,ax=shared_axes)

    plt.ylabel(yaxis)
    plt.xticks(rotation=45)
    plt.legend(labels=names)
    plt.title(''.join((country1,' and ',country2,': ',yaxis,' vs. ',xaxis)))
    return plt.show()

In [None]:
#compare_any_double sample output: total_cases_per_million (y) against total_cases (x) for Sweden and Mexico
compare_any_double('Sweden','Mexico','total_cases','total_cases_per_million')

In [None]:
def compare_any_n_double(num_countries,xaxis,yaxis):
    """Plot ``yaxis`` against ``xaxis`` for ``num_countries`` countries typed in by the user.

    Parameters
    ----------
    num_countries : int
        How many country names to read; input() is called once per country.
    xaxis, yaxis : str
        Names of the columns to put on the x and y axes.

    Returns
    -------
    Whatever plt.show() returns.
    """
    #one axis object shared by every plot
    ax=plt.gca()

    #country names in the order they were typed; they become the legend labels
    #(a plain list: DataFrame.append, used here before, was removed in pandas 2.0)
    legend_labels=[]
    for _ in range(num_countries):
        country=input()
        data=get_country_github(country)
        data.plot(x=xaxis,y=yaxis,kind='line',ax=ax)
        legend_labels.append(country)

    plt.legend(labels=legend_labels)
    plt.xticks(rotation=45)
    plt.title(yaxis+' vs. '+xaxis)
    plt.ylabel(yaxis)
    return plt.show()

In [None]:
#compare_any_n_double sample output: reads 2 country names via input(), then plots
#total_cases_per_million (y) against total_cases (x) for them
compare_any_n_double(2,'total_cases','total_cases_per_million')

In [None]:
def compare_any_n_df_double(df,xaxis,yaxis):
    """Plot ``yaxis`` against ``xaxis`` for every country listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'location' column; one line is drawn per row, in row order.
    xaxis, yaxis : str
        Names of the columns to put on the x and y axes.

    Returns
    -------
    Whatever plt.show() returns.
    """
    #one axis object shared by every plot
    ax=plt.gca()

    #country names in plotting order; they become the legend labels
    #(a plain list: DataFrame.append, used here before, was removed in pandas 2.0)
    legend_labels=[]

    #iterating over the column's values visits every row in order, whatever index df carries
    for country in df['location']:
        data=get_country_github(country)
        data.plot(x=xaxis,y=yaxis,kind='line',ax=ax)
        legend_labels.append(country)

    plt.legend(labels=legend_labels)
    plt.xticks(rotation=45)
    plt.title(yaxis+' vs. '+xaxis)
    plt.ylabel(yaxis)
    return plt.show()

In [None]:
#compare_any_n_df_double sample output: total_cases_per_million (y) against total_cases (x)
#for each country listed in top5_gdp_pc
compare_any_n_df_double(top5_gdp_pc,'total_cases','total_cases_per_million')

# An overview of all the methods I have so far

* total_cases (file imported dataset)
    * get_country_dataset(country)
        * returns dataframe of data for specified country 
      
      
* df (original github import)
    * country_data(country)
        * returns dataframe of data for specified country
    * country_plot(country)
        * returns plot of total number of cases over time 
    * compare(country1,country2)
        * returns plot of total number of cases over time for two countries     
    * get_population(country)
        *  returns population of country (from imported population dataset from kaggle)    
    * compare_per_capita 
        *  returns plot of total number of cases over time per capita for two countries 
        
        
* new (full github import)
    * get_country_github(country)
        * returns full dataframe for country
    * country_plot_any(country,attribute)
        * returns plot of country attribute over time
    * compare_any(country1,country2,attribute)
        * returns plot of two countries' attribute over time
    * compare_any_n(num_countries, attribute)
        * returns plot of num_countries specified countries' attribute over time  
    * compare_any_n_df(df,attribute)
        * returns plot of an attribute over time for all countries within a dataframe
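    * get_top(n,attribute)
        * returns dataframe of top n countries in terms of specified attribute
    * get_bottom(n,attribute)
        * returns dataframe of bottom n countries in terms of specified attribute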
    * compare_any_double(country1,country2,xaxis,yaxis)
        * returns plot of two countries' yaxis vs. xaxis
    * compare_any_n_double(num_countries,xaxis,yaxis)
        *  returns plot of num_countries specified countries' yaxis vs. xaxis
    * compare_any_n_df_double(df,xaxis,yaxis)
        *  returns plot of all countries within a dataframe's yaxis vs. xaxis    

In [None]:
# Display the dataframe of all rows whose location is 'Sweden'.
# NOTE(review): the comment that used to sit here read "Set it to None to display all columns
# in the dataframe", but this cell sets no display option. Presumably a call such as
# pd.set_option('display.max_columns', None) was intended - confirm and add it if so.
get_country_github('Sweden')

In [None]:
#list the column names of the United States dataframe, i.e. the names that can be passed
#as the attribute / xaxis / yaxis arguments of the plotting methods
get_country_github('United States').columns

In [None]:
#total_tests_per_thousand over time for the United States and Denmark on one chart
compare_any('United States','Denmark','total_tests_per_thousand')

In [None]:
#new_cases_per_million over time for the United States and Denmark on one chart
compare_any('United States','Denmark','new_cases_per_million')

Let's look at how the death rate differs among countries with the oldest populations.

In [None]:
#the 8 countries with the largest aged_65_older value (get_top reads it from each country's first row)
old=get_top(8,'aged_65_older')
old

In [None]:
#total_deaths_per_million over time for each country listed in old
compare_any_n_df(old,'total_deaths_per_million')

In [None]:
#the 5 countries with the largest diabetes_prevalence value (get_top reads it from each country's first row)
diabetes=get_top(5,'diabetes_prevalence')
diabetes

In [None]:
#total_deaths_per_million over time for each country listed in diabetes
compare_any_n_df(diabetes,'total_deaths_per_million')

In [None]:
#new_cases_per_million over time for the United States
country_plot_any('United States','new_cases_per_million')