# COVID-19 Global Data Tracker

**Project goals:** Load and analyze global COVID-19 data (cases, deaths, vaccinations), perform EDA, create visualizations, and write insights.

**Instruction:** Download `owid-covid-data.csv` from Our World in Data and place it in the same folder as this notebook, or update `DATA_PATH` in the imports and settings cell.

In [None]:
# 1. Imports and settings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# IPython magic (only valid inside IPython/Jupyter): render figures inline.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots created below.
sns.set(style="whitegrid")

# Location of the OWID CSV; this is the path passed to pd.read_csv when loading.
DATA_PATH = "owid-covid-data.csv"  # change if file is in another location

In [None]:
# 2. Load dataset with error handling
# Reads the CSV at DATA_PATH into `df`. If the file is missing, a short hint
# is printed and the original FileNotFoundError is re-raised so the cell
# still fails visibly.
try:
    # low_memory=False parses the file in one pass, avoiding the mixed-dtype
    # inference that chunked parsing can produce.
    df = pd.read_csv(DATA_PATH, low_memory=False)
except FileNotFoundError:
    print(f"File not found: {DATA_PATH}")
    print("Please download owid-covid-data.csv from Our World in Data and place it in the notebook folder.")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise
else:
    # Only reached when the read succeeded.
    print("Loaded dataset with shape:", df.shape)

# Preview
df.head()

In [None]:
# 3. Explore structure & missing values
# Print the column names and dtypes, show the 30 columns with the most
# missing values, then parse the `date` column in place.
column_names = df.columns.tolist()
print("Columns:", column_names)
print("\nData types:", df.dtypes, sep="\n")

missing_per_column = df.isna().sum()
most_missing = missing_per_column.sort_values(ascending=False).head(30)
print("\nMissing values (top columns):")
display(most_missing)

# Convert date column
# errors='coerce' turns unparseable entries into NaT instead of raising.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df['date'] = parsed_dates
first_day, last_day = df['date'].min(), df['date'].max()
print("\nDate range:", first_day, "to", last_day)

In [None]:
# 4. Data cleaning (example pipeline)
# Builds `df_sub`: the subset of columns used by the analysis, with rows
# lacking a date or location removed and missing numeric values filled.
cols = ['iso_code','continent','location','date','total_cases','new_cases','total_deaths','new_deaths',
        'total_vaccinations','people_vaccinated','people_fully_vaccinated','new_vaccinations','population']

df_sub = df[cols].copy()
df_sub = df_sub.dropna(subset=['date','location'])

# Order rows chronologically within each location so the forward-fill below
# propagates values in time order.
df_sub = df_sub.sort_values(['location', 'date'])

cum_cols = ['total_cases','total_deaths','total_vaccinations','people_vaccinated','people_fully_vaccinated']
# Cumulative totals: a gap means "not reported on that row", not "zero".
# Carry the last reported value forward per location; only the leading gap
# (before a location's first report) is set to 0. Filling every gap with 0
# would make running totals drop to zero mid-series and could leave the most
# recent row of a location at 0.
df_sub[cum_cols] = df_sub.groupby('location')[cum_cols].ffill().fillna(0)

# Daily increments: a missing value is treated as no new events on that row.
daily_cols = ['new_cases', 'new_deaths', 'new_vaccinations']
df_sub[daily_cols] = df_sub[daily_cols].fillna(0)

print("After cleaning shape:", df_sub.shape)
df_sub.head()

In [None]:
# 5. Basic analysis & groupings
# Summary statistics, mean new cases by continent, and a `latest` table
# (one row per location, indexed by location) with a derived death rate.
display(df_sub.describe())

# Mean of `new_cases` over every (location, date) row within each continent;
# rows without a continent are left out (dropna=True).
continent_group = df_sub.groupby('continent', dropna=True)['new_cases'].mean().sort_values(ascending=False)
print("\nAverage new cases per day (by continent):\n", continent_group)

# Keep only rows that have a continent. In the OWID file, aggregate entities
# (world, whole continents, income groups) are published with an empty
# continent -- NOTE(review): confirm against the OWID codebook. Without this
# filter, those rollups would rank above every individual country.
country_rows = df_sub[df_sub['continent'].notna()]

# Most recent row per location.
latest = country_rows.sort_values('date').groupby('location').tail(1)
latest = latest.set_index('location')
latest['death_rate'] = latest['total_deaths'] / latest['total_cases']
# Division by zero cases yields +/-inf; report those as missing instead.
latest['death_rate'] = latest['death_rate'].replace([np.inf, -np.inf], np.nan)
latest[['total_cases','total_deaths','death_rate']].sort_values('total_cases', ascending=False).head(10)

In [None]:
# 6. Visualizations (compare a few countries)
# Locations compared in the line charts, and an independent copy of their rows.
countries = ['United States', 'India', 'Brazil', 'Germany']
is_selected = df_sub['location'].isin(countries)
df_countries = df_sub.loc[is_selected].copy()

In [None]:
# Line chart: total cases
# One line per entry in `countries`, drawn in list order on a shared axis.
fig, ax = plt.subplots(figsize=(10, 6))
for country in countries:
    country_df = df_countries.loc[df_countries['location'].eq(country)]
    ax.plot(country_df['date'], country_df['total_cases'], label=country)
ax.set_title('Total COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.legend()
plt.show()

In [None]:
# Line chart: total deaths
# Same layout as the cases chart, plotting the `total_deaths` column instead.
fig, ax = plt.subplots(figsize=(10, 6))
for country in countries:
    in_country = df_countries['location'] == country
    country_df = df_countries[in_country]
    ax.plot(country_df['date'], country_df['total_deaths'], label=country)
ax.set(
    title='Total COVID-19 Deaths Over Time',
    xlabel='Date',
    ylabel='Total Deaths',
)
ax.legend()
plt.show()

In [None]:
# Histogram: distribution of new_cases
# Take a reproducible random sample (at most 5000 values) of the strictly
# positive new_cases entries and plot log(1 + value).
positive_new_cases = df_sub.loc[df_sub['new_cases'] > 0, 'new_cases']
sample_size = min(5000, len(positive_new_cases))
sample = positive_new_cases.sample(sample_size, random_state=1)

plt.figure(figsize=(8, 5))
log_values = np.log1p(sample)
plt.hist(log_values, bins=50)
plt.title('Distribution of log(new_cases) (sampled)')
plt.xlabel('log(1 + new_cases)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Scatter: vaccinations vs cases (latest)
# Both axes are logarithmic, so only strictly positive values can be placed
# on them. The cleaning step fills missing totals with 0, which means a
# dropna() here would remove nothing; filter on > 0 instead (the comparison
# is also False for NaN, so missing values are excluded as well).
has_vaccinations = latest['total_vaccinations'] > 0
has_cases = latest['total_cases'] > 0
latest_plot = latest[has_vaccinations & has_cases]

plt.figure(figsize=(8,6))
plt.scatter(latest_plot['total_vaccinations'], latest_plot['total_cases'], alpha=0.7)
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Total Vaccinations (log scale)')
plt.ylabel('Total Cases (log scale)')
plt.title('Vaccinations vs Cases (latest by country)')
plt.show()

In [None]:
# Bar chart: top 10 countries by cases
# Rank individual countries only: rows without a continent are excluded
# first. In the OWID file those are the aggregate entities (world, whole
# continents, income groups) -- NOTE(review): confirm against the OWID
# codebook. Left in, they would take most of the top-10 slots.
country_rows = latest[latest['continent'].notna()]
top10_cases = country_rows.sort_values('total_cases', ascending=False).head(10)

plt.figure(figsize=(10,6))
# `latest` is indexed by location, so the index supplies the bar labels.
sns.barplot(x=top10_cases['total_cases'], y=top10_cases.index)
plt.title('Top 10 countries by total cases (latest)')
plt.xlabel('Total Cases')
plt.ylabel('Country')
plt.show()

# Insights & Findings

- Add 3–5 observations here based on analysis and plots.
- Examples: Vaccination rollouts slowed death rates, different waves in different regions, variation in death rate.