In [None]:
# Install necessary packages
!pip install pandas matplotlib seaborn scikit-learn xgboost

# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Generate Synthetic Data
# Seed NumPy's global RNG so every run reproduces the same dataset.
np.random.seed(42)

n = 500  # Number of daily observations
date_range = pd.date_range('2022-01-01', periods=n)

# Simulated pollutant and weather series.
# NOTE: the order of these draws matters -- they all consume the same global
# RNG stream, so reordering them would change every generated value.
pm25 = np.random.normal(50, 10, n)         # PM2.5 levels
no2 = np.random.normal(20, 5, n)           # NO2 levels
ozone = np.random.normal(30, 7, n)         # Ozone (O3) levels
temperature = np.random.normal(25, 5, n)   # Temperature in Celsius
humidity = np.random.uniform(40, 80, n)    # Humidity in %

# Dependent variable: a linear combination of the drivers plus Gaussian noise.
# The noise is drawn last, after all predictor series.
noise = np.random.normal(scale=5, size=n)
respiratory_cases = (
    10 + 0.5 * pm25 + 0.3 * no2 + 0.2 * ozone
    - 0.1 * temperature + 0.05 * humidity
    + noise
)

# Assemble everything into one DataFrame; column order follows this list.
columns = [
    ('Date', date_range),
    ('PM2.5', pm25),
    ('NO2', no2),
    ('Ozone', ozone),
    ('Temperature', temperature),
    ('Humidity', humidity),
    ('Respiratory_Cases', respiratory_cases),
]
data = pd.DataFrame(dict(columns))

# Step 3: Data Exploration
# Show the first rows as a quick sanity check of the generated table.
print("Sample Data:")
print(data.head())

# Scatter plot of respiratory cases against PM2.5 concentration.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(data['PM2.5'], data['Respiratory_Cases'], alpha=0.7, color='red')
ax.set_title('Respiratory Cases vs PM2.5 Concentration')
ax.set_xlabel('PM2.5 (µg/m³)')
ax.set_ylabel('Respiratory Cases')
plt.show()

# Step 4: Prepare Data for Machine Learning
# Predictors are the pollutant and weather columns; the target is the
# simulated respiratory case count. 'Date' is deliberately left out.
feature_columns = ['PM2.5', 'NO2', 'Ozone', 'Temperature', 'Humidity']
target_column = 'Respiratory_Cases'

X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Train Multiple Models
# Each estimator is fitted on the training split and then used to predict
# the held-out test split. `fit` returns the estimator itself, so the
# construction and the fit are chained.

# 1. Linear Regression
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# 2. Random Forest Regressor
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# 3. XGBoost Regressor
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Step 6: Evaluate Model Performance
def evaluate_model(y_test, y_pred, model_name):
    """Print the mean squared error and R-squared of a set of predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values to compare against ``y_test``.
        model_name: Label printed in front of the metrics.

    Returns:
        None. The metrics are only printed, rounded to two decimals.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    r_squared = r2_score(y_test, y_pred)
    print(
        f"{model_name} - Mean Squared Error: {squared_error:.2f}, "
        f"R-squared: {r_squared:.2f}"
    )

print("\nModel Performance:")
# Report every model against the same held-out targets, in a fixed order.
for model_label, predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(y_test, predictions, model_label)

# Step 7: Plot Actual vs Predicted Values
# Both series are drawn against their position in the test split so that
# each prediction lines up with the actual value it corresponds to.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values, label='Actual Values', marker='o', alpha=0.7)
ax.plot(y_pred_xgb, label='XGBoost Predictions', marker='x', alpha=0.7)
ax.set_title('Actual vs Predicted Respiratory Cases (XGBoost)')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Respiratory Cases')
ax.legend()
plt.show()

# Step 8: Feature Importance (Random Forest)
# Label the fitted forest's importance scores with the feature names, then
# draw the five largest as horizontal bars.
fig, ax = plt.subplots(figsize=(8, 5))
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(5)
top_features.plot.barh(ax=ax, color='teal')
ax.set_title('Feature Importance (Random Forest)')
ax.set_xlabel('Importance Score')
plt.show()
