# COVID-19 Global Data Analysis
## Project Overview
This notebook provides a comprehensive analysis of global COVID-19 data, including cases, deaths, and vaccination progress across different countries.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import os

# Set plotting style
plt.style.use('seaborn')
%matplotlib inline

## 1. Data Loading and Initial Exploration

In [None]:
# Load the raw OWID CSV into `df`, or explain how to obtain it when it is missing.
# `df` is only defined on success; the later cells test for its existence.
data_path = '../data/owid-covid-data.csv'
if not os.path.exists(data_path):
    print(f"Error: Data file not found at {data_path}")
    print("Please download the dataset from: https://covid.ourworldindata.org/data/owid-covid-data.csv")
    print(f"And place it in the {os.path.dirname(data_path)}/ directory")
else:
    try:
        # Keep the try body down to the one call that can fail on bad input
        df = pd.read_csv(data_path)
    except (OSError, ValueError) as e:
        # OSError covers unreadable files; pandas' ParserError and
        # EmptyDataError (and decoding errors) are ValueError subclasses.
        print(f"Error loading the dataset: {str(e)}")
    else:
        print("Data loaded successfully!")
        print(f"Dataset shape: {df.shape}")
        display(df.head())
        # DataFrame.info() prints its summary and returns None, so it is
        # called directly rather than wrapped in display().
        df.info()

## 2. Data Cleaning and Preparation

In [None]:
def clean_covid_data(df):
    """
    Return a trimmed, gap-filled copy of the raw COVID-19 table.

    The input frame is not modified. The result keeps only the analysis
    columns listed below (those that are present), parses ``date`` as
    datetime, adds ``mortality_rate`` and ``vaccination_rate`` percentages
    when their source columns exist, and forward-fills missing numeric
    values within each ``location``.

    Forward-filling follows the existing row order, so rows are expected
    to already be in chronological order within each location.

    Parameters:
    - df: Raw DataFrame; must contain ``date`` and ``location`` columns.

    Returns:
    - A new DataFrame with the same index as ``df``.

    Raises:
    - KeyError: if ``date`` is missing, or if ``location`` is missing
      while there are numeric columns to fill.
    """
    # Select relevant columns
    wanted = [
        'date', 'location', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
        'total_vaccinations', 'people_vaccinated', 'population', 'population_density',
        'median_age', 'gdp_per_capita', 'life_expectancy'
    ]

    # Only keep columns that exist in the dataframe
    columns = [col for col in wanted if col in df.columns]

    # Copy *after* selecting: the subset is then an independent frame, so the
    # column assignments below cannot raise SettingWithCopyWarning, and the
    # unused columns of the raw table are never duplicated.
    df_clean = df[columns].copy()

    # Convert date column to datetime
    df_clean['date'] = pd.to_datetime(df_clean['date'])

    # Calculate additional metrics if the required columns exist.
    # A zero denominator is treated as missing so the rate comes out as NaN
    # instead of inf (which the forward fill would otherwise leave in place).
    if 'total_deaths' in df_clean.columns and 'total_cases' in df_clean.columns:
        cases = df_clean['total_cases'].replace(0, np.nan)
        df_clean['mortality_rate'] = (df_clean['total_deaths'] / cases) * 100

    if 'people_vaccinated' in df_clean.columns and 'population' in df_clean.columns:
        population = df_clean['population'].replace(0, np.nan)
        df_clean['vaccination_rate'] = (df_clean['people_vaccinated'] / population) * 100

    # Fill missing values using forward fill within each location.
    # One grouped ffill over all numeric columns replaces a Python-level
    # groupby/transform per column; values never leak across locations.
    numeric_cols = [
        col for col in df_clean.select_dtypes(include=[np.number]).columns
        if col != 'location'
    ]
    if numeric_cols:
        df_clean[numeric_cols] = df_clean.groupby('location')[numeric_cols].ffill()

    return df_clean

# Clean the data if it was loaded successfully, then pick the ten locations
# with the highest peak `total_cases` for the plots that follow.
if 'df' in locals():
    df_clean = clean_covid_data(df)
    display(df_clean.head())

    # Get list of countries with sufficient data for analysis
    if 'location' in df_clean.columns and 'total_cases' in df_clean.columns:
        peak_cases = df_clean.groupby('location')['total_cases'].max()

        # NOTE(review): in the OWID export, aggregate rows (World, continents,
        # income groups) appear to carry an empty `continent`; without this
        # filter they would crowd real countries out of the top 10. Confirm
        # against the dataset codebook.
        if 'continent' in df.columns:
            country_names = df.loc[df['continent'].notna(), 'location'].unique()
            # Only filter when it leaves something to rank
            if len(country_names) > 0:
                peak_cases = peak_cases[peak_cases.index.isin(country_names)]

        # Get top 10 countries by total cases for visualization
        top_countries = peak_cases.nlargest(10).index.tolist()
        print(f"Top 10 countries by total cases: {', '.join(top_countries)}")

## 3. Data Analysis and Visualization

In [None]:
# Static line chart of cumulative cases over time, one curve per entry in
# `top_countries`. Skipped entirely unless the cleaning cell has been run.
if 'df_clean' in locals() and 'top_countries' in locals():
    plt.figure(figsize=(14, 7))

    # Narrow the table to the selected locations once, ahead of the loop
    is_top_location = df_clean['location'].isin(top_countries)
    df_top = df_clean[is_top_location]

    # Draw the curves in the order given by top_countries so the legend matches
    for name in top_countries:
        rows = df_top[df_top['location'] == name]
        plt.plot(rows['date'], rows['total_cases'], label=name)

    # Axis text
    plt.xlabel('Date')
    plt.ylabel('Total Cases')
    plt.title('Total COVID-19 Cases Over Time (Top 10 Countries)')

    # Slant the date ticks and park the legend outside the axes on the right
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

## 4. Interactive Visualization

In [None]:
def create_interactive_plot(metric='total_cases', countries=None):
    """
    Create an interactive plot of COVID-19 metrics over time.

    The data comes from the notebook-level ``df_clean`` variable; when
    ``countries`` is omitted, the notebook-level ``top_countries`` list is
    used if it exists.

    Parameters:
    - metric: The metric to plot (e.g., 'total_cases', 'new_cases', 'total_deaths')
    - countries: List of countries to include (a single name is also accepted).
      If None, uses top 10 countries by total cases.

    Returns:
    - The Plotly figure, or None (after printing an explanation) when the
      cleaned data is missing, the metric is unknown, or no rows match.
    """
    # locals() inside a function only contains the function's own variables,
    # so notebook-level names have to be looked up in the module namespace.
    notebook_vars = globals()

    data = notebook_vars.get('df_clean')
    if data is None:
        print("Error: Cleaned data not available. Please run the data loading and cleaning steps first.")
        return None

    if countries is None:
        # Default to top 10 countries by total cases if no countries specified
        countries = notebook_vars.get('top_countries')
        if countries is None:
            countries = data.groupby('location')['total_cases'].max().nlargest(10).index.tolist()
    elif isinstance(countries, str):
        # isin() needs a list-like; wrap a bare country name
        countries = [countries]

    # Filter data for selected countries and valid metric values
    if metric not in data.columns:
        print(f"Error: Metric '{metric}' not found in the dataset.")
        print("Available metrics:", ", ".join([col for col in data.columns if col not in ['date', 'location']]))
        return None

    df_filtered = data[data['location'].isin(countries)].dropna(subset=[metric])

    if df_filtered.empty:
        print(f"No data available for the selected countries and metric: {metric}")
        return None

    # Human-readable form of the column name, shared by title and axis labels
    metric_label = metric.replace('_', ' ').title()

    # Create the plot
    fig = px.line(
        df_filtered,
        x='date',
        y=metric,
        color='location',
        title=f'COVID-19 {metric_label} Over Time',
        labels={metric: metric_label, 'date': 'Date'},
        template='plotly_white'
    )

    fig.update_layout(
        hovermode='x',
        height=600,
        width=1000,
        xaxis_title='Date',
        yaxis_title=metric_label,
        legend_title='Country',
        margin=dict(l=50, r=50, t=80, b=50)
    )

    return fig

# Example usage (uncomment to run):
# fig = create_interactive_plot('total_cases')
# if fig:
#     fig.show()

## 5. Summary and Insights

### Key Insights
1. **Global Trends**: The data shows the progression of COVID-19 cases and deaths across different countries.
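2. **Vaccination Impact**: Countries with higher vaccination rates are generally expected to show a decline in severe cases and deaths; the `vaccination_rate` and `mortality_rate` columns computed above allow this to be checked, but this notebook does not yet test it.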
3. **Regional Variations**: Different regions experienced the pandemic differently, with varying infection and mortality rates.

### Next Steps
- Perform more detailed statistical analysis
- Add more interactive visualizations
- Create a dashboard using Streamlit or Dash
- Include more detailed demographic analysis