<a href="https://colab.research.google.com/github/richarddushime/EDA-and-Prediction-on-Global-Data-and-Sustainable-energy-/blob/main/Covid19_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub

# Download the Kaggle dataset and keep the value kagglehub returns for it.
luisheitorribeiro_covid_19_fatalities_data_path = kagglehub.dataset_download(
    'luisheitorribeiro/covid-19-fatalities-data'
)

print('Data source import complete.')


In [None]:
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn  as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Download the COVID-19 fatalities dataset; `path` is what kagglehub returns
# and is used below as the directory containing the CSV file.
dataset_handle = "luisheitorribeiro/covid-19-fatalities-data"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# Load the global CSV that sits under the downloaded dataset path.
global_csv_file = path + "/data_global_data.csv"
covid_global_data = pd.read_csv(global_csv_file)

# Preview the first and last rows, separated by a divider and a blank line.
print(
    covid_global_data.head(),
    "--------------------------------------------------",
    "",
    covid_global_data.tail(),
    sep="\n",
)


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
covid_global_data.shape

In [None]:
# Print the frame summary produced by DataFrame.info().
covid_global_data.info()

In [None]:
# Descriptive statistics produced by DataFrame.describe().
covid_global_data.describe()

In [None]:
# Number of missing values in each column.
covid_global_data.isnull().sum()

In [None]:
# Disabled: fill missing values with the column mean for numeric columns
# and with 'Unknown' for every other column.
# for i in covid_global_data.columns:
#     if covid_global_data[i].dtype == 'float64' or covid_global_data[i].dtype == 'int64':
#         covid_global_data[i] = covid_global_data[i].fillna(covid_global_data[i].mean())
#     else:
#         covid_global_data[i] = covid_global_data[i].fillna('Unknown')


In [None]:
# Parse the `date` column into pandas datetimes so the `.dt` accessor and
# date-based plotting work on it.
parsed_dates = pd.to_datetime(covid_global_data['date'])
covid_global_data['date'] = parsed_dates

In [None]:
# Print the summary of the `date` column (including its dtype) after conversion.
covid_global_data.date.info()

In [None]:
# Narrow the frame to the country, date and death-count columns.
selected_columns = ["country", "date", "deaths", "daily_deaths"]
da_covid_global = covid_global_data[selected_columns]

In [None]:
# Print the summary of the reduced frame.
da_covid_global.info()

In [None]:
# Deaths over time
# NOTE(review): seaborn's lineplot aggregates rows that share an x value
# (mean by default), so if there are several rows per date the line is a
# per-date average rather than a sum -- confirm that is the intended quantity.
plt.figure(figsize=(12, 6))
# Give the line a label so plt.legend() has an artist to list; without one
# the legend is empty and matplotlib warns that no labelled artists were found.
sns.lineplot(data=da_covid_global, x="date", y="deaths", label="Deaths")
plt.xlabel("Dates")
plt.ylabel("Deaths over time")
plt.xticks(rotation=45)
plt.legend(title="Total Deaths")
plt.title("Deaths over time")
plt.show()

In [None]:
# Daily deaths over time
plt.figure(figsize=(12, 6))
# Give the line a label so plt.legend() has an artist to list; without one
# the legend is empty and matplotlib warns that no labelled artists were found.
sns.lineplot(data=da_covid_global, x="date", y="daily_deaths", label="Daily deaths")
plt.xlabel("Dates")
plt.ylabel("Total Daily Deaths")
# Rotate the date ticks and add a title, matching the cumulative-deaths plot.
plt.xticks(rotation=45)
plt.legend(title="Daily Deaths")
plt.title("Daily deaths over time")
plt.show()

In [None]:
# Per-country view: deaths over time for Afghanistan only.
afghanistan_data = covid_global_data[covid_global_data["country"] == "Afghanistan"]

plt.figure(figsize=(12, 6))
# Give the line a label so plt.legend() has an artist to list; without one
# the legend is empty and matplotlib warns that no labelled artists were found.
sns.lineplot(data=afghanistan_data, x="date", y="deaths", label="Afghanistan")
plt.legend(title="Total Afghanistan Deaths Over Time")
plt.xlabel("Date")
plt.ylabel("Total Afghanistan Deaths")
plt.xticks(rotation=45)
plt.title("Afghanistan deaths over time")
# Render explicitly, like the other plot cells, so the cell does not end on
# a bare expression whose repr is echoed as output.
plt.show()

In [None]:
# correlation
# Take an explicit copy of the selected columns: later cells add columns to
# this frame and drop rows from it, and doing that on a selection of another
# selection triggers pandas' SettingWithCopyWarning.
da_covid_global = da_covid_global[["date", "deaths", "daily_deaths"]].copy()
correlation_matrix = da_covid_global.corr()
plt.figure(figsize=(12, 6))
# annot=True writes each coefficient in its cell; fmt=".2f" keeps two decimals.
sns.heatmap(data=correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Preparing data for modeling: feature engineering.
# Calendar features are read from the source frame's date column; the
# assignments line up with da_covid_global through the shared row index.
date_parts = covid_global_data["date"].dt
da_covid_global["month"] = date_parts.month
da_covid_global["day_of_week"] = date_parts.dayofweek
da_covid_global["day_of_month"] = date_parts.day

In [None]:
# lag features
# Shift within each country, not over the whole frame: a plain shift() moves
# values across country boundaries, so the first rows of one country would
# receive the last values of the previous one. Rows are sorted by country and
# date first so "previous row" means "previous day" for that country.
daily_deaths_by_country = (
    covid_global_data.sort_values(["country", "date"])
    .groupby("country")["daily_deaths"]
)
# The shifted series keep the original row labels, so each assignment aligns
# with da_covid_global by index regardless of the sort order used above.
da_covid_global["deaths_lag_1"] = daily_deaths_by_country.shift(1)
da_covid_global["deaths_lag_7"] = daily_deaths_by_country.shift(7)
da_covid_global["deaths_lag_30"] = daily_deaths_by_country.shift(30)

In [None]:
# Drop every row with a missing value (the lag columns are NaN where no
# earlier observation exists). Rebinding the name instead of using
# inplace=True avoids mutating a frame that was derived from another
# selection, which is what triggers pandas' SettingWithCopyWarning.
da_covid_global = da_covid_global.dropna()


In [None]:
# Model inputs: the calendar columns followed by the lagged daily-death columns.
calendar_feature_names = ["month", "day_of_week", "day_of_month"]
lag_feature_names = ["deaths_lag_1", "deaths_lag_7", "deaths_lag_30"]
x_features = [*calendar_feature_names, *lag_feature_names]

# Column the model is trained to predict.
y_target = "daily_deaths"

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default; with lagged
# features that mixes earlier and later dates across the two splits --
# confirm a random split is acceptable here.
feature_frame = da_covid_global[x_features]
target_series = da_covid_global[y_target]
x_train, x_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 estimators and a fixed seed.
# The model is only constructed here; fitting happens in the next cell.
forest_settings = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_settings)

In [None]:
# Fit the regressor on the training split.
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted model on the held-out split with both metrics.
y_pred = model.predict(x_test)
mse, r2 = (
    metric(y_test, y_pred) for metric in (mean_squared_error, r2_score)
)

print(f"Mean Squared Error: {mse}")
print(f"R-squared : {r2}")


In [None]:
# forecast next year
# Start on the day after the last observed date: date_range includes its
# `start`, so starting at the maximum itself would make the first "future"
# row repeat the final historical day.
first_future_day = da_covid_global["date"].max() + pd.Timedelta(days=1)
future_dates = pd.date_range(start=first_future_day, periods=365, freq="D")
# initialize future data
future_data = pd.DataFrame({"date": future_dates})

# calendar features derived from the future dates
future_data["month"] = future_data["date"].dt.month
future_data["day_of_week"] = future_data["date"].dt.dayofweek
future_data["day_of_month"] = future_data["date"].dt.day

# lag features
# NOTE(review): these assignments align on the row index, so rows 0..364 of
# future_data receive the shifted values of the rows labelled 0..364 in
# covid_global_data -- i.e. rows from the start of the history, not the most
# recent observations. A genuine forecast would seed the lags from the end of
# the series and roll them forward with each prediction; confirm the intent
# before relying on the numbers produced from these columns.
future_data["deaths_lag_1"] = covid_global_data["daily_deaths"].shift(1)
future_data["deaths_lag_7"] = covid_global_data["daily_deaths"].shift(7)
future_data["deaths_lag_30"] = covid_global_data["daily_deaths"].shift(30)

future_data


In [None]:
# Keep only the rows with no missing values (the shifted lag columns begin
# with NaNs).
future_data = future_data.dropna()

In [None]:
# Run the fitted model on the prepared future rows and store its output as
# the `daily_deaths` column of the future frame.
future_feature_matrix = future_data[x_features]
future_predictions = model.predict(future_feature_matrix)
future_data["daily_deaths"] = future_predictions

future_data.head()

In [None]:
# Descriptive statistics of the future frame, including the predictions.
future_data.describe()

In [None]:
# Plot the predicted daily deaths against the forecast dates.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=future_data, x="date", y="daily_deaths")
ax.set_title("Future Daily Deaths Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Future Daily Deaths")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Plot the predicted daily deaths against the calendar month.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=future_data, x="month", y="daily_deaths")
ax.set_title("Future Daily Deaths Over Time")
ax.set_xlabel("month")
ax.set_ylabel("Future Daily Deaths")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Descriptive statistics of daily deaths: the historical column first, then
# the predicted column, separated by a divider line.
old = covid_global_data["daily_deaths"].describe()
future = future_data["daily_deaths"].describe()

print(old, "=======================================", future, sep="\n")


In [None]:
# Check for seasonal trends

# Recompute the month from the date so this cell does not depend on the
# column still being present, then average the predictions per month.
future_data['month'] = future_data['date'].dt.month
monthly_means = future_data.groupby('month')['daily_deaths'].mean()

# Plot monthly averages, with a title, axis labels and an explicit show()
# like the other plot cells.
plt.figure(figsize=(12, 6))
plt.plot(monthly_means.index, monthly_means.values, marker='o', linestyle='-')
plt.title("Average Predicted Daily Deaths by Month")
plt.xlabel("Month")
plt.ylabel("Average Future Daily Deaths")
# One tick per month that actually appears in the data.
plt.xticks(monthly_means.index)
plt.show()


In [None]:
# Compare the mean of the whole historical daily-deaths column with the mean
# of the predicted daily deaths.
historical_mean, future_mean = (
    frame['daily_deaths'].mean() for frame in (covid_global_data, future_data)
)

print(f"Historical Mean: {historical_mean}, Future Mean: {future_mean}")


In [None]:
predicted_daily_deaths = future_data['daily_deaths']

# Rows whose prediction equals the maximum (all ties are kept).
is_peak = predicted_daily_deaths == predicted_daily_deaths.max()
peak_dates = future_data[is_peak]
print("Peak Death Dates:", peak_dates)

print("==============================================")
# Rows whose prediction equals the minimum (all ties are kept).
is_low = predicted_daily_deaths == predicted_daily_deaths.min()
low_dates = future_data[is_low]
print("Lowest Death Dates:", low_dates)


In [None]:
# Conclusion
# The analysis of future COVID-19 fatalities predicts an average of [TODO: fill in from future_mean] daily deaths,
# with potential peaks reaching up to 92 deaths.