In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [2]:
# 1. Load data
df = pd.read_csv("air_quality_health_impact_data.csv")  # change this to your real path

FileNotFoundError: [Errno 2] No such file or directory: 'air_quality_health_impact_data.csv'

In [None]:
# 2. Clean data
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [None]:
# 3. Balance dataset
df_majority = df[df['HealthImpactClass'] == 0]
df_minority = df[df['HealthImpactClass'] != 0]
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df = pd.concat([df_majority, df_minority_upsampled])

In [None]:
# 4. Rule-based health risk function
def predict_health_risk(row):
    if row['PM2_5'] > 100 or row['AQI'] > 200:
        if row['RespiratoryCases'] > 7:
            return 'High Risk for Asthma'
    if row['NO2'] > 80 or row['PM10'] > 120:
        if row['CardiovascularCases'] > 5:
            return 'High Risk for Heart Patients'
    return 'Low or Moderate Risk'

df['HealthRiskLevel'] = df.apply(predict_health_risk, axis=1)

In [None]:
# 5. Personalized Risk (based on user profile)
def personalized_risk(row, profile):
    risk = []
    if profile['has_asthma'] and row['PM2_5'] > 90:
        risk.append("⚠️ Asthma Risk")
    if profile['has_heart_disease'] and row['NO2'] > 80:
        risk.append("⚠️ Heart Risk")
    if profile['age'] > 60 and row['AQI'] > 150:
        risk.append("⚠️ Senior Risk")
    return ", ".join(risk) if risk else "✅ Safe for your profile"

user_profile = {"age": 65, "has_asthma": True, "has_heart_disease": False}
df['PersonalizedRisk'] = df.apply(lambda row: personalized_risk(row, user_profile), axis=1)

In [None]:
# 6. ML Model Training
X = df[['PM10', 'PM2_5', 'NO2', 'SO2', 'O3', 'Temperature', 'Humidity', 'WindSpeed']]
y = df['HealthImpactClass']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

In [None]:
# 7. Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.99       946
         1.0       0.99      0.98      0.99       559
         2.0       0.99      0.98      0.99       275
         3.0       1.00      0.98      0.99        89
         4.0       1.00      1.00      1.00        55

    accuracy                           0.99      1924
   macro avg       0.99      0.99      0.99      1924
weighted avg       0.99      0.99      0.99      1924



In [None]:
# 8. Save the trained model
joblib.dump(model, "health_model.pkl")
print("Model saved as 'health_model.pkl'")

Model saved as 'health_model.pkl'


In [None]:
# -----------------------------
# 🔮 Predicting on New Data
# -----------------------------
def predict_on_new_data(new_data, user_profile):
    model = joblib.load("health_model.pkl")
    ml_prediction = model.predict(new_data[X.columns])[0]

    row = new_data.iloc[0]
    rule_prediction = predict_health_risk(row)
    personalized = personalized_risk(row, user_profile)

    return {
        "ML Prediction (HealthImpactClass)": ml_prediction,
        "Rule-Based Risk": rule_prediction,
        "Personalized Risk": personalized
    }

In [None]:
# ✅ Example new input
new_data = pd.DataFrame([{
    'PM10': 130, 'PM2_5': 105, 'NO2': 90, 'SO2': 20, 'O3': 30,
    'Temperature': 33, 'Humidity': 45, 'WindSpeed': 2,
    'AQI': 250, 'RespiratoryCases': 10, 'CardiovascularCases': 6
}])

result = predict_on_new_data(new_data, user_profile)
print(result)

{'ML Prediction (HealthImpactClass)': np.float64(0.0), 'Rule-Based Risk': 'High Risk for Asthma', 'Personalized Risk': '⚠️ Asthma Risk, ⚠️ Senior Risk'}
