In [4]:
# !pip install pandas matplotlib seaborn plotly prophet scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from prophet import Prophet
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Global plotting defaults: seaborn "whitegrid" theme, then a 12x6 default
# figure size. Statement order is kept as in the original (theme first).
sns.set(style="whitegrid")
plt.rc("figure", figsize=(12, 6))

# Load Our World in Data COVID dataset
url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
df = pd.read_csv(url)

df['date'] = pd.to_datetime(df['date'])
# Preview; only rendered when it is the last expression of a notebook cell.
df.head()

# Names of individual countries/territories, captured before `continent` is
# dropped by the column selection below.
# NOTE(review): assumes OWID leaves `continent` empty for aggregate rows
# (World, continents, income groups) — confirm against the current CSV.
country_names = df.loc[df['continent'].notna(), 'location'].unique()

# Select useful columns
columns = [
    'location', 'date', 'total_cases', 'new_cases', 'total_deaths',
    'new_deaths', 'people_vaccinated', 'population', 'stringency_index'
]
# Sort by location and date so the per-location forward-fill and the
# "last row per location" snapshot below both follow chronological order.
df = df[columns].sort_values(['location', 'date'])

# Handle missing values per column type instead of a blanket fill with 0
# (a zero total is a real, wrong value and would defeat later dropna() calls):
#   * cumulative totals carry the last reported value forward within a location;
#     values before the first report stay NaN (unknown);
#   * daily counts keep the original behaviour: a missing report counts as 0;
#   * population and stringency_index are left as NaN when unknown.
cumulative_cols = ['total_cases', 'total_deaths', 'people_vaccinated']
daily_cols = ['new_cases', 'new_deaths']
df[cumulative_cols] = df.groupby('location')[cumulative_cols].ffill()
df[daily_cols] = df[daily_cols].fillna(0)

# Create latest snapshot: the most recent row of every country. Filtering on
# the single global max date would silently drop every location that did not
# report on that exact day.
is_country = df['location'].isin(country_names)
latest = df[is_country].groupby('location').tail(1).copy()

# Top 10 countries by total cases
top_cases = latest.nlargest(10, 'total_cases')
# Keep a handle on the figure and show it explicitly: a bare px.bar(...)
# expression is only rendered when it is the last statement of a notebook cell.
top_cases_fig = px.bar(top_cases, x='location', y='total_cases', title='Top 10 Countries by Total Cases')
top_cases_fig.show()

# Forecast new cases for a single country with Prophet.
country = "India"

# Build the model input: the country's date/new_cases columns renamed to
# `ds` (timestamp) and `y` (target).
country_rows = df['location'] == country
df_country = df.loc[country_rows, ['date', 'new_cases']]
df_country = df_country.rename(columns={'date': 'ds', 'new_cases': 'y'})
# Keep only rows with a strictly positive case count.
df_country = df_country.loc[df_country['y'] > 0]

model = Prophet()
model.fit(df_country)

# Extend the timeline by 30 periods, predict over it, and plot the result.
future = model.make_future_dataframe(periods=30)
forecast = model.predict(future)
fig = model.plot(forecast)

features = ['total_cases', 'total_deaths', 'people_vaccinated']

# Exclude aggregate rows by name. The previous `str.isalpha()` filter dropped
# every multi-word country (e.g. "United States", "South Africa") while
# keeping single-word aggregates such as "World" and "Asia".
# NOTE(review): list of OWID aggregate locations as known at time of writing —
# confirm against the current CSV.
owid_aggregates = {
    'World', 'International', 'Africa', 'Asia', 'Europe', 'European Union',
    'North America', 'South America', 'Oceania',
    'High income', 'Upper middle income', 'Lower middle income', 'Low income',
}
cluster_rows = latest[~latest['location'].isin(owid_aggregates)].dropna(subset=features)
# Explicit copy so adding the `Cluster` column below does not write to a slice.
df_cluster = cluster_rows[features].copy()

n_clusters = 3
if len(df_cluster) < n_clusters:
    raise ValueError(
        f"Need at least {n_clusters} locations with complete data to cluster, "
        f"got {len(df_cluster)}"
    )

# Standardise so no single metric dominates the distance computation.
scaled = StandardScaler().fit_transform(df_cluster)

# 2-D projection used only for plotting; clustering runs on all scaled features.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled)

# n_init is pinned because scikit-learn's default differs between versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
labels = kmeans.fit_predict(scaled)

df_cluster['Cluster'] = labels

# Tidy frame for plotting so each point is labelled with its location on hover.
cluster_plot = pd.DataFrame({
    'PC1': pca_result[:, 0],
    'PC2': pca_result[:, 1],
    'Cluster': labels.astype(str),
    'location': cluster_rows['location'].to_numpy(),
})
cluster_fig = px.scatter(
    cluster_plot, x='PC1', y='PC2', color='Cluster', hover_name='location',
    title='Country Clustering by COVID-19 Stats'
)
# Show explicitly: a bare px.scatter(...) expression is only rendered when it
# is the last statement of a notebook cell.
cluster_fig.show()

# Pairwise correlation (pandas default: Pearson) between the snapshot metrics.
corr_matrix = latest[features].corr()
# Pin the colour scale to the full correlation range [-1, 1]. Without this the
# diverging colormap is stretched over the data's min/max, so colours would
# not reflect the sign or absolute strength of a correlation.
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Between COVID-19 Metrics")
plt.show()

# Local imports: the dual-axis figure needs plotly's trace/subplot API, which
# plotly.express does not expose.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Explicit copy so the new column is not written to a slice of `df`; explicit
# date sort so the 7-row rolling window is a 7-step window in time order.
df_india = df[df['location'] == "India"].sort_values('date').copy()
df_india['new_cases_7day'] = df_india['new_cases'].rolling(7).mean()

# The two series are on very different scales (raw case counts vs. an index),
# so each gets its own y-axis; on a shared axis the index line is flattened.
# NOTE(review): stringency_index is presumably a 0-100 score — confirm.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x=df_india['date'], y=df_india['new_cases_7day'],
               mode='lines', name='new_cases_7day'),
    secondary_y=False,
)
fig.add_trace(
    go.Scatter(x=df_india['date'], y=df_india['stringency_index'],
               mode='lines', name='stringency_index'),
    secondary_y=True,
)
fig.update_layout(title_text='India: 7-Day Avg New Cases vs Government Stringency')
fig.update_xaxes(title_text='date')
fig.update_yaxes(title_text='New cases (7-day average)', secondary_y=False)
fig.update_yaxes(title_text='Stringency index', secondary_y=True)
fig.show()



## 🔍 Insights Summary

- Countries with higher vaccination rates tend to have lower death rates.
- K-means clustering (k = 3, chosen in advance) groups countries into three broad COVID-19 profiles based on total cases, total deaths, and people vaccinated.
- India’s case trend shows a correlation with changes in the stringency index.
- The forecast for India suggests a flattening trend in new cases, but regional variations matter.

## 📌 Next Steps

- Add interactive controls with Streamlit or Voila.
- Incorporate variant data or healthcare capacity metrics.
- Track policy impact using lagged effects or causality tests.
