In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Set style for visualizations
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# %%
# Load the dataset
try:
    df = pd.read_csv('owid-covid-data.csv')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")

# %%
# Initial exploration
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

print("\nColumns:")
print(df.columns.tolist())

print("\nMissing values:")
print(df.isnull().sum().sort_values(ascending=False)[:10])

# %% [markdown]
# ## 2. Data Cleaning

# %%
# Select key countries and columns
countries = ['Kenya', 'United States', 'India', 'Brazil', 'Germany']
cols = ['date', 'location', 'total_cases', 'new_cases', 'total_deaths', 
        'new_deaths', 'people_vaccinated', 'population']

# Filter data
df_clean = df[df['location'].isin(countries)][cols]

# Convert date and sort
df_clean['date'] = pd.to_datetime(df_clean['date'])
df_clean = df_clean.sort_values(['location', 'date'])

# Forward fill missing values for time-series continuity
df_clean = df_clean.groupby('location').apply(lambda x: x.ffill())
# Calculate derived metrics
df_clean['death_rate'] = df_clean['total_deaths'] / df_clean['total_cases']
df_clean['vaccination_rate'] = df_clean['people_vaccinated'] / df_clean['population']

# Drop remaining missing values
df_clean = df_clean.dropna()

# %% [markdown]
# ## 3. Exploratory Data Analysis

# %%
# Basic statistics
print("Global Statistics:")
display(df_clean.describe())

# %%
# Country comparison
country_stats = df_clean.groupby('location').agg({
    'total_cases': 'max',
    'total_deaths': 'max',
    'vaccination_rate': 'max'
}).sort_values('total_cases', ascending=False)

print("\nCountry Comparison:")
display(country_stats)

# %% [markdown]
# ## 4. Data Visualization

# %%
# Line Chart: Cases Over Time
plt.figure(figsize=(14, 7))
for country in countries:
    country_data = df_clean[df_clean['location'] == country]
    plt.plot(country_data['date'], country_data['total_cases'], label=country)

plt.title('COVID-19 Total Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Total Cases (Millions)')
plt.legend()
plt.show()

# %%
# Bar Chart: Total Cases by Country
country_stats['total_cases'].sort_values().plot(kind='barh', color='skyblue')
plt.title('Total COVID-19 Cases by Country')
plt.xlabel('Total Cases (Millions)')
plt.show()

# %%
# Histogram: Daily New Cases
plt.figure(figsize=(10, 6))
sns.histplot(df_clean['new_cases'], bins=30, kde=True)
plt.title('Distribution of Daily New Cases')
plt.xlabel('Daily New Cases')
plt.show()

# %%
# Scatter Plot: Cases vs Deaths
sns.scatterplot(data=df_clean, x='total_cases', y='total_deaths', hue='location')
plt.title('Total Cases vs Total Deaths')
plt.xlabel('Total Cases (Millions)')
plt.ylabel('Total Deaths (Thousands)')
plt.show()

# %%
# Vaccination Progress
plt.figure(figsize=(14, 7))
for country in countries:
    country_data = df_clean[df_clean['location'] == country]
    plt.plot(country_data['date'], country_data['vaccination_rate'], label=country)

plt.title('Vaccination Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Vaccination Rate (%)')
plt.legend()
plt.show()

# %% [markdown]
# ## 5. Choropleth Map (Optional)

# %%
# Prepare data for choropleth
latest_data = df.sort_values('date').groupby('iso_code').last().reset_index()

# Create map
fig = px.choropleth(latest_data,
                    locations="iso_code",
                    color="total_cases",
                    hover_name="location",
                    color_continuous_scale=px.colors.sequential.Plasma,
                    title="Global COVID-19 Case Distribution")
fig.show()



Dataset loaded successfully!
Dataset shape: (350085, 67)

First 5 rows:


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,



Columns:
['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_vaccinations', 'new_vaccinations_smoothed', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred

Unnamed: 0,date,total_cases,new_cases,total_deaths,new_deaths,people_vaccinated,population,death_rate,vaccination_rate
count,5042,5042.0,5042.0,5042.0,5042.0,5042.0,5042.0,5042.0,5042.0
mean,2022-06-01 22:18:19.563665152,32465600.0,37429.38,424872.4,365.459937,246541700.0,424383300.0,0.015724,0.552795
min,2020-12-13 00:00:00,106801.0,0.0,1866.0,0.0,0.0,54027480.0,0.004437,0.0
25%,2021-09-23 00:00:00,4479591.0,0.0,97998.0,0.0,22334570.0,83369840.0,0.011822,0.263618
50%,2022-06-02 00:00:00,32722170.0,499.0,476011.5,6.0,154088100.0,215313500.0,0.01654,0.721675
75%,2023-02-09 00:00:00,43149540.0,31001.75,667032.0,317.0,262988800.0,338289900.0,0.019047,0.778149
max,2023-10-23 00:00:00,103436800.0,1588891.0,1136920.0,6148.0,1027419000.0,1417173000.0,0.031996,0.880778
std,,28660720.0,105273.2,346453.7,778.795584,325324300.0,507814300.0,0.006376,0.294766


ValueError: 'location' is both an index level and a column label, which is ambiguous.