In [17]:
import pandas as pd   # data manipulation
import seaborn as sns # data visualization
import matplotlib.pyplot as plt # visualization
from sklearn.metrics import mean_squared_error # metrics
from sklearn.linear_model import LinearRegression # model
from sklearn.model_selection import train_test_split  # model

In [None]:
df = pd.read_csv("dataset.csv")  # Load the data from the CSV file

# Preview the first few rows. The label is printed on its own line so the
# table's header row is not pushed out of alignment with its columns.
print("Dataset Head:")
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit the summary first and then a stray
# "Dataset Info: None". Print the label, then call info() directly.
print("\nDataset Info:")
df.info()

In [None]:
# Cell 3 - Data Cleaning
# Build a cleaned frame without touching the raw `df`: drop rows containing
# any missing value, drop exact duplicate rows, then renumber the index from 0.
df_cleaned = df.dropna().drop_duplicates().reset_index(drop=True)

# Convert the percentage columns to a numeric dtype.
percent_columns = ['Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered']
for col in percent_columns:
    values = df_cleaned[col]
    if not pd.api.types.is_numeric_dtype(values):
        # pd.to_numeric raises on text such as "3.5%", so remove surrounding
        # whitespace and a trailing percent sign before parsing. Columns that
        # are already numeric skip this branch and are handled as before.
        values = values.astype(str).str.strip().str.rstrip('%').str.strip()
    # The default errors='raise' is kept on purpose: an unparseable entry
    # should fail here rather than silently become NaN.
    df_cleaned[col] = pd.to_numeric(values)


Cell 4 - Exploratory Data Analysis

In [None]:
# Plot 1: Top 10 Countries by Confirmed Cases
# Keep the ten rows with the largest 'Confirmed' values.
top_10_confirmed = df_cleaned.nlargest(10, 'Confirmed')

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_10_confirmed, x='Country/Region', y='Confirmed', ax=ax)
# Rotate the x tick labels by 45 degrees.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.set_title('Top 10 Countries by Confirmed Cases')
plt.show()


In [None]:
# Plot 2: Death Rate Analysis
# Keep the ten rows with the largest 'Deaths / 100 Cases' values.
top_10_death_rate = df_cleaned.nlargest(10, 'Deaths / 100 Cases')

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_10_death_rate, x='Country/Region', y='Deaths / 100 Cases', ax=ax)
# Rotate the x tick labels by 45 degrees.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.set_title('Top 10 Countries by Death Rate')
plt.show()


In [None]:
# Plot 3: Recovery Rate by Region
# One box per 'WHO Region', showing the spread of 'Recovered / 100 Cases'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_cleaned, x='WHO Region', y='Recovered / 100 Cases', ax=ax)
# Rotate the x tick labels by 45 degrees.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.set_title('Recovery Rate Distribution by WHO Region')
plt.show()

Cell 5 - Correlation Analysis

In [None]:
# Pairwise correlation between the selected numerical columns
numeric_cols = [
    'Confirmed',
    'Deaths',
    'Recovered',
    'Active',
    'Deaths / 100 Cases',
    'Recovered / 100 Cases',
]
correlation_matrix = df_cleaned.loc[:, numeric_cols].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of COVID-19 Metrics')
plt.show()

Cell 6 - Predictive Modeling

In [None]:
# Feature matrix (three count columns) and regression target (death count).
# NOTE(review): if 'Confirmed' is defined as Deaths + Recovered + Active in
# this dataset, the target is an exact linear function of the features and the
# fit will look perfect for that reason alone - confirm before interpreting it.
feature_columns = ['Confirmed', 'Recovered', 'Active']
target_column = 'Deaths'

X = df_cleaned[feature_columns]
y = df_cleaned[target_column]


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Fit an ordinary least squares regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
# Leave the estimator as the cell's last expression so it is still displayed.
model


In [None]:
# Make predictions
# Predict the target for the test-split features; y_pred is reused by the
# metric and plotting cells further down.
y_pred = model.predict(X_test)

In [None]:
# Evaluate on the test split: R^2 from the estimator's own score() and the
# mean squared error between actual and predicted targets.
r2 = model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

# Two report lines, each formatted to two decimal places.
print(
    f'Mean Squared Error: {mse:.2f}',
    f'R-squared Score: {r2:.2f}',
    sep='\n',
)


Cell 7 - Model Visualization

In [None]:
# Scatter of actual vs. predicted target values on the test split.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed red y = x reference line spanning the range of the actual values;
# points on this line are predicted exactly.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_xlabel('Actual Deaths')
ax.set_ylabel('Predicted Deaths')
ax.set_title('Model Predictions vs Actual Deaths')
plt.show()


In [None]:
# Pair each feature name with its fitted regression coefficient.
# NOTE(review): these are raw coefficients on unscaled features, so their
# magnitudes are not directly comparable as "importance" - confirm intent.
coefficient_columns = {
    'Feature': X.columns,
    'Coefficient': model.coef_,
}
feature_importance = pd.DataFrame(coefficient_columns)

In [None]:
# Print a heading (preceded by a blank line) followed by the coefficient table.
print("\nFeature Importance:", feature_importance, sep="\n")
