# More Covid Data Analysis
`Jordan Renaud`

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Our World in Data COVID export shipped with the Kaggle dataset.
df = pd.read_csv(
    "/kaggle/input/the-our-world-in-data-covid-vaccination-data/owid-covid-data_3.csv"
)

# Data cleaning: keep only the columns this notebook works with.
dfclean = df[
    [
        'date',
        'location',
        'continent',
        'new_cases',
        'new_deaths',
        'new_tests',
        'population',
        'gdp_per_capita',
        'extreme_poverty',
        'cardiovasc_death_rate',
        'diabetes_prevalence',
        'female_smokers',
        'male_smokers',
        'handwashing_facilities',
        'hospital_beds_per_thousand',
        'life_expectancy',
        'human_development_index',
    ]
]

In [None]:
# Show the column-reduced frame as this cell's output.
dfclean

In [None]:
# Total every numeric column per country, then sort ascending by total cases.
# numeric_only=True is spelled out because pandas 2.0 stopped skipping text
# columns by default; without it the 'date' and 'continent' strings would be
# pulled into the sum as well.
# NOTE(review): only the new_* totals are meaningful here; columns such as
# 'population' presumably repeat on every row, so their sums are not usable.
sum_country = (
    dfclean.groupby(['location'])
    .sum(numeric_only=True)
    .reset_index()  # turn 'location' back into a column so the chart can use it
    .sort_values(by="new_cases")
)
# Drop the "World" row: it is the sum of all countries, not a country.
sum_country = sum_country[sum_country["location"] != "World"]

In [None]:
def bar(x, y, data):
    """Draw a wide bar chart of column ``y`` against column ``x`` of ``data``.

    Args:
        x: Name of the column used for the categories on the x axis.
        y: Name of the column used for the bar heights.
        data: Frame handed to ``seaborn.barplot`` as ``data``.

    Returns:
        None. The chart is drawn on a newly created 25x8 figure.
    """
    plt.figure(figsize=(25, 8))
    sns.barplot(x=x, y=y, data=data)
    # Turn the category labels vertical so many bars can sit side by side.
    plt.xticks(rotation=90)
    # Show y values as plain numbers rather than in scientific notation.
    plt.ticklabel_format(style='plain', axis='y')

In [None]:
# Total cases per country, limited to countries whose total is above zero.
bar(
    x='location',
    y='new_cases',
    data=sum_country.loc[sum_country['new_cases'] > 0],
)

In [None]:
def plot_ratio(a, b):
    """Bar-chart, per location, the total of column ``a`` divided by the total of column ``b``.

    Both totals are taken over all rows of the notebook-level ``dfclean`` frame
    for each ``location``. Only locations with a finite, strictly positive
    ratio are drawn, ordered from smallest to largest.

    Args:
        a: Name of the numerator column in ``dfclean``.
        b: Name of the denominator column in ``dfclean``.

    Returns:
        None. The chart is drawn on a newly created 25x8 figure.
    """
    # numeric_only=True keeps text columns ('date', 'continent') out of the sum;
    # pandas 2.0 no longer skips them by default.
    totals = dfclean.groupby(['location']).sum(numeric_only=True).reset_index()
    totals['ratio'] = totals[a] / totals[b]

    # A zero denominator yields inf (or NaN for 0/0); neither can be drawn as a
    # bar, so keep finite positive ratios only. Filtering before sorting also
    # keeps the boolean mask aligned with the frame it is applied to.
    drawable = np.isfinite(totals['ratio']) & (totals['ratio'] > 0)
    ranked = totals[drawable].sort_values(by="ratio")

    plt.figure(figsize=(25, 8))
    sns.barplot(x="location", y="ratio", data=ranked)
    plt.xticks(rotation=90)

In [None]:
# Chart total deaths divided by total cases, one bar per country.
plot_ratio(a="new_deaths", b="new_cases")

In [None]:
def plot_cases(country, tick_every=30):
    """Print totals for one country and chart its deaths, cases and tests by date.

    Rows are taken from the notebook-level ``dfclean`` frame where ``location``
    equals ``country``.

    Args:
        country: Value to match against the ``location`` column.
        tick_every: Only every ``tick_every``-th x tick label stays visible
            (default 30), since otherwise there are too many labels to read.

    Returns:
        None. Totals are printed and the chart is drawn on a new 30x8 figure.

    Raises:
        ValueError: If ``dfclean`` has no rows for ``country``.
    """
    rows = dfclean[dfclean['location'] == country]
    if rows.empty:
        # Fail with a readable message instead of an IndexError further down.
        raise ValueError("No rows found for location {!r}".format(country))

    # Plotted columns and their legend labels, kept together so the legend
    # order below cannot drift away from the column order.
    series = [('new_deaths', 'Deaths'), ('new_cases', 'Cases'), ('new_tests', 'Tests')]
    columns = [column for column, _ in series]

    # Sum each series directly; this avoids summing the text columns.
    totals = {column: rows[column].sum() for column in columns}
    print("Cases:", totals['new_cases'], "Tests:", totals['new_tests'], "Deaths:", totals['new_deaths'], "Population:", rows['population'].iloc[-1])

    # Reshape to long form (date, variable, value) so seaborn draws one line per series.
    long_form = pd.melt(rows[['date'] + columns], ['date'])
    plt.figure(figsize=(30, 8))
    ax = sns.lineplot(data=long_form, x="date", y="value", hue="variable")
    ax.set_title(country + ": Cases, Deaths, and Tests")
    # Labels are matched to the drawn lines purely by order.
    plt.legend(labels=[label for _, label in series])
    plt.ticklabel_format(style='plain', axis='y')

    # Thin out the x tick labels (too many to read otherwise).
    for position, tick_label in enumerate(ax.get_xticklabels()):
        tick_label.set_visible(position % tick_every == 0)
    plt.xticks(rotation=30)

In [None]:
# A few countries look like statistical outliers so we'll plot some more data for each of those
"""
    First case:
    On 11 November 2020, Vanuatu confirmed its first case, an asymptomatic man who had traveled
    to the islands from the United States via Sydney and Auckland. The man had arrived in Vanuatu on 4 November and undergone
    managed isolation and quarantine with no symptoms. He tested positive on 10 November.
    
    Second and third cases:
    On 6 March 2021, Prime Minister Bob Loughman announced two new cases.
    
    Status as of late March:
    As of 23 March 2021, the total number of cases in Vanuatu was 3, with 2 active cases and 1 recovered.
    
    Fourth case:
    On 19 April 2021, Prime Minister Bob Loughman confirmed a new positive case: a deceased
    Filipino fisherman from a UK-flagged tanker, found on a beach on Efate.
"""
# Totals and time series for Vanuatu.
plot_cases(country="Vanuatu")

In [None]:
# The United States appears to test heavily. The tests line shows a clear, regular
# pattern over time, which suggests tests are administered or reported on a weekly cycle.
plot_cases(country="United States")

In [None]:
# Totals and time series for Mexico.
plot_cases(country="Mexico")

In [None]:
# Totals and time series for Yemen.
plot_cases(country="Yemen")

In [None]:
# Total cases by continent, sorted ascending by total cases.
# numeric_only=True is spelled out because pandas 2.0 stopped skipping text
# columns by default; without it the 'date' and 'location' strings would be
# pulled into the sum as well.
sum_continent = (
    dfclean.groupby(['continent'])
    .sum(numeric_only=True)
    .reset_index()  # turn 'continent' back into a column so the chart can use it
    .sort_values(by="new_cases")
)
# NOTE(review): "World" is filtered as a *location* elsewhere in this notebook;
# confirm it can ever appear as a 'continent' value, otherwise this is a no-op.
sum_continent = sum_continent[sum_continent["continent"] != "World"]

In [None]:
# Show the per-continent totals as this cell's output.
sum_continent

In [None]:
# Bar chart of the per-continent case totals computed above.
bar(x='continent', y='new_cases', data=sum_continent)

In [None]:
# Africa looks strange, let's investigate: total the numeric columns for each
# African country and sort ascending by total cases.
africa = dfclean[dfclean['continent'] == "Africa"]
# numeric_only=True keeps the text columns ('date', 'continent') out of the
# sum; pandas 2.0 no longer skips them by default.
africa = (
    africa.groupby('location')
    .sum(numeric_only=True)
    .reset_index()
    .sort_values('new_cases')
)


In [None]:
# Show the per-country totals for Africa as this cell's output.
africa

In [None]:
# Total cases for each African country.
bar(x='location', y='new_cases', data=africa)

In [None]:
# South Africa looks strange, let's investigate
"""In South Africa,the current population as of May, 2010, 2021, reached 59,945,412, with a population density 49 people per Km^2 
and an overall population that ranks them 25th in the world for the highest population density.
With the sole knowledge of this information, one could predict that they would be towards the top in the number of confirmed cases 
in all of Africa. As you can see above, our chart has South Africa at number one for the most new cases in Africa,
more than doubling the amount of cases that Morocco has, who has the second most. However, to our suprise, 
Morocco has a population density of 83 people per Km^2, which is almost twice as more as South Africa. From this research, 
we could make a very strong arugment that when you look just at the recent studies, population density seems to have
little to no effect on the amount of covid cases that have been confirmed for a country. This also may be caused by a different 
strand of covid that is more contagious in South Africa than the strand that is effecting Morocco. When you look at the Graph of South Africa, you 
will see two major rises in both of the number of cases and tests. We learned that this was because when the virus first effected them, their 
government enforced a lockdown to stop the spread as quickly as possible, however, once the numbers started to decrease, all of the policies were 
not being followed as strongly as they were and maybe should have been, which is the major cause for their second spike in confirmed cases.""" 
# Totals and time series for South Africa.
plot_cases(country='South Africa')
