In [2]:
# Import necessary libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- 1. Data Acquisition ---
# Using direct links to the Johns Hopkins CSSE COVID-19 Data repository on GitHub
# These datasets are time-series data for global confirmed cases and deaths.
# The 'recovered' dataset is often less consistently updated, so we will focus on confirmed and deaths.

CONFIRMED_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
DEATHS_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"

print("Fetching COVID-19 data...")

try:
    df_confirmed = pd.read_csv(CONFIRMED_URL)
    df_deaths = pd.read_csv(DEATHS_URL)
    print("Data fetched successfully!")
except Exception as e:
    print(f"Error fetching data: {e}")
    print("Please check your internet connection or the URLs.")
    exit() # Exit if data cannot be fetched

# --- 2. Data Preprocessing and Cleaning ---

# Define a function to melt the data from wide to long format
# This makes time-series analysis easier as dates become a single column.
def melt_dataframe(df, value_name):
    """
    Melts a wide-format DataFrame (with dates as columns) into a long format.

    Args:
        df (pd.DataFrame): The input DataFrame.
        value_name (str): The name for the column containing the melted values.

    Returns:
        pd.DataFrame: The melted DataFrame.
    """
    # Identify columns that are not Lat, Long, Province/State, Country/Region
    # These are our date columns
    id_vars = ['Province/State', 'Country/Region', 'Lat', 'Long']
    date_columns = df.columns.difference(id_vars)

    df_melted = df.melt(
        id_vars=id_vars,
        value_vars=date_columns,
        var_name='Date',
        value_name=value_name
    )
    # Convert 'Date' column to datetime objects
    df_melted['Date'] = pd.to_datetime(df_melted['Date'])
    return df_melted

# Apply the melting function to both confirmed and deaths dataframes
df_confirmed_melted = melt_dataframe(df_confirmed, 'Confirmed')
df_deaths_melted = melt_dataframe(df_deaths, 'Deaths')

# Merge the melted dataframes on common columns (Province/State, Country/Region, Lat, Long, Date)
# Using an outer merge to ensure all dates/regions are included, then fill NaNs with 0.
df_full = pd.merge(
    df_confirmed_melted,
    df_deaths_melted,
    on=['Province/State', 'Country/Region', 'Lat', 'Long', 'Date'],
    how='outer'
)
df_full.fillna(0, inplace=True) # Fill any missing values (e.g., if a country had cases but no deaths initially)

# Aggregate data globally and by country
print("Aggregating data...")

# Global daily totals for confirmed cases and deaths
global_data = df_full.groupby('Date')[['Confirmed', 'Deaths']].sum().reset_index()

# Country-wise daily totals for confirmed cases and deaths
country_data = df_full.groupby(['Country/Region', 'Date'])[['Confirmed', 'Deaths']].sum().reset_index()

# Calculate daily new cases and new deaths for each country
# We sort by Country/Region and Date to ensure correct difference calculation
country_data = country_data.sort_values(by=['Country/Region', 'Date'])
country_data['New_Confirmed'] = country_data.groupby('Country/Region')['Confirmed'].diff().fillna(0)
country_data['New_Deaths'] = country_data.groupby('Country/Region')['Deaths'].diff().fillna(0)

# Ensure new cases/deaths are non-negative (sometimes data anomalies can cause negative values)
country_data['New_Confirmed'] = country_data['New_Confirmed'].apply(lambda x: max(0, x))
country_data['New_Deaths'] = country_data['New_Deaths'].apply(lambda x: max(0, x))

# Get the latest data for snapshot visualizations
latest_date = global_data['Date'].max()
latest_global_data = global_data[global_data['Date'] == latest_date]
latest_country_data = country_data[country_data['Date'] == latest_date]

print(f"Data processing complete. Latest date available: {latest_date.strftime('%Y-%m-%d')}")

# --- 3. Data Visualization ---

print("Generating visualizations...")

# --- Plot 1: Global COVID-19 Confirmed Cases and Deaths Over Time ---
fig_global_trend = px.line(
    global_data,
    x='Date',
    y=['Confirmed', 'Deaths'],
    title='Global COVID-19 Confirmed Cases and Deaths Over Time',
    labels={'value': 'Number of People', 'variable': 'Metric'},
    line_dash_map={'Confirmed': 'solid', 'Deaths': 'dash'}, # Different line styles
    color_discrete_map={'Confirmed': 'blue', 'Deaths': 'red'} # Custom colors
)
fig_global_trend.update_layout(
    hovermode="x unified", # Shows all values for a given date on hover
    title_font_size=20,
    xaxis_title="Date",
    yaxis_title="Total Cases/Deaths"
)
fig_global_trend.show()


# --- Plot 2: Top N Countries by Total Confirmed Cases (Latest Data) ---
# Let's visualize the top 15 countries by latest confirmed cases
top_n = 15
top_countries_confirmed = latest_country_data.nlargest(top_n, 'Confirmed')

fig_top_countries = px.bar(
    top_countries_confirmed,
    x='Country/Region',
    y='Confirmed',
    color='Confirmed', # Color bars based on confirmed count
    color_continuous_scale=px.colors.sequential.Plasma, # Use a color scale
    title=f'Top {top_n} Countries by Total Confirmed COVID-19 Cases ({latest_date.strftime("%Y-%m-%d")})',
    labels={'Confirmed': 'Total Confirmed Cases'},
    text='Confirmed' # Display count on bars
)
fig_top_countries.update_traces(texttemplate='%{text:.2s}', textposition='outside') # Format text on bars
fig_top_countries.update_layout(
    xaxis_title="Country/Region",
    yaxis_title="Total Confirmed Cases",
    title_font_size=20
)
fig_top_countries.show()


# --- Plot 3: Daily New Cases for Selected Countries ---
# Allow selection of a few key countries to compare daily new cases trends
selected_countries = ['US', 'India', 'Brazil', 'United Kingdom', 'France', 'Germany']
filtered_country_daily_new = country_data[country_data['Country/Region'].isin(selected_countries)]

fig_new_cases_countries = px.line(
    filtered_country_daily_new,
    x='Date',
    y='New_Confirmed',
    color='Country/Region',
    title='Daily New Confirmed COVID-19 Cases in Selected Countries',
    labels={'New_Confirmed': 'Daily New Confirmed Cases'},
    hover_data={'New_Confirmed': ':,', 'Date': '|%Y-%m-%d', 'Country/Region': True} # Custom hover info
)
fig_new_cases_countries.update_layout(
    hovermode="x unified",
    title_font_size=20,
    xaxis_title="Date",
    yaxis_title="Daily New Confirmed Cases"
)
fig_new_cases_countries.show()


# --- Plot 4: Global Map of Total Confirmed Cases (Interactive Choropleth) ---
# This map visualizes the geographical distribution of confirmed cases.
# Plotly automatically handles country names if 'locationmode' is set.

fig_geo_map = px.choropleth(
    latest_country_data,
    locations="Country/Region",
    locationmode='country names', # Automatically map country names to geographical shapes
    color="Confirmed",
    hover_name="Country/Region",
    hover_data={'Confirmed': ':,', 'Deaths': ':,'}, # Include deaths in hover info
    color_continuous_scale=px.colors.sequential.PuRd, # Use a red/purple color scale
    title=f'Global COVID-19 Confirmed Cases by Country ({latest_date.strftime("%Y-%m-%d")})',
    projection='natural earth' # A nice geographical projection
)
fig_geo_map.update_layout(
    title_font_size=20
)
fig_geo_map.show()


# --- Plot 5: Case Fatality Rate (CFR) for Top Affected Countries ---
# CFR = (Deaths / Confirmed) * 100
# Filter out divisions by zero where confirmed cases might be 0.
latest_country_data['CFR'] = (latest_country_data['Deaths'] / latest_country_data['Confirmed']) * 100
latest_country_data['CFR'].fillna(0, inplace=True) # Fill NaNs (from 0/0) with 0
latest_country_data = latest_country_data[latest_country_data['Confirmed'] > 0] # Exclude countries with 0 confirmed

# Get top 20 countries by confirmed cases again, then calculate CFR for them
top_cfr_countries = latest_country_data.nlargest(20, 'Confirmed').sort_values(by='CFR', ascending=False)


fig_cfr = px.bar(
    top_cfr_countries,
    x='Country/Region',
    y='CFR',
    color='CFR',
    color_continuous_scale=px.colors.sequential.Viridis,
    title=f'Case Fatality Rate (CFR) for Top 20 Affected Countries ({latest_date.strftime("%Y-%m-%d")})',
    labels={'CFR': 'Case Fatality Rate (%)'},
    text='CFR'
)
fig_cfr.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_cfr.update_layout(
    xaxis_title="Country/Region",
    yaxis_title="CFR (%)",
    title_font_size=20
)
fig_cfr.show()


print("All visualizations generated.")

# --- End of Python Script ---

Fetching COVID-19 data...
Data fetched successfully!



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Aggregating data...
Data processing complete. Latest date available: 2023-03-09
Generating visualizations...




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



All visualizations generated.
