In [None]:
!pip install requests




In [None]:
import json
import random

# (city name, latitude, longitude) anchors around which synthetic stations are scattered.
cities = [
    ("Los Angeles", 34.0522, -118.2437),
    ("New York", 40.7128, -74.0060),
    ("Toronto", 43.6532, -79.3832),
    ("Chicago", 41.8781, -87.6298),
    ("Houston", 29.7604, -95.3698),
    # Add more cities up to 50
]

num_rows = 5000
SEED = 42  # fixed seed so "Restart & Run All" regenerates the exact same dataset


def _clamp(value, low, high):
    """Restrict ``value`` to the closed interval [low, high]."""
    return max(low, min(high, value))


def generate_mock_rows(n_rows, city_table, seed=None):
    """Build ``n_rows`` synthetic air-quality records.

    Args:
        n_rows: Number of records to generate.
        city_table: Sequence of ``(name, lat, lon)`` tuples to sample from.
        seed: Seed for a private ``random.Random`` instance. The same seed
            always yields the same records; ``None`` seeds from the OS.

    Returns:
        A list of dicts with keys ``city``, ``lat``, ``lon``,
        ``measurements``, ``weather``, ``aqi`` and ``aqi_forecast``.
    """
    # A private generator keeps the data independent of any other use of the
    # global `random` state elsewhere in the notebook.
    rng = random.Random(seed)
    rows = []

    for _ in range(n_rows):
        city, lat, lon = rng.choice(city_table)
        # Jitter the coordinates by up to +/-0.05 degrees around the city centre.
        lat_noise = lat + rng.uniform(-0.05, 0.05)
        lon_noise = lon + rng.uniform(-0.05, 0.05)

        # Gaussian pollutant readings, clipped to a fixed range per pollutant.
        pm25 = _clamp(rng.gauss(50, 25), 0, 150)
        no2 = _clamp(rng.gauss(30, 15), 0, 100)
        o3 = _clamp(rng.gauss(40, 20), 0, 120)

        temperature = rng.gauss(20, 10)
        humidity = rng.randint(10, 100)
        wind_speed = rng.uniform(0, 15)

        # Simple AQI calculation for synthetic data: a weighted sum of the
        # (unrounded) pollutant readings, truncated to an integer.
        aqi = int(0.5 * pm25 + 0.3 * no2 + 0.2 * o3)
        # Three forecast values, each the current AQI +/-10, clipped to [0, 300].
        forecast = [_clamp(aqi + rng.randint(-10, 10), 0, 300) for _ in range(3)]

        rows.append({
            "city": city,
            "lat": round(lat_noise, 4),
            "lon": round(lon_noise, 4),
            "measurements": {"PM2.5": round(pm25, 1), "NO2": round(no2, 1), "O3": round(o3, 1)},
            "weather": {"temperature": round(temperature, 1), "humidity": humidity, "wind_speed": round(wind_speed, 1)},
            "aqi": aqi,
            "aqi_forecast": forecast,
        })

    return rows


mock_data = generate_mock_rows(num_rows, cities, seed=SEED)

with open("mock_aqi_large.json", "w") as f:
    json.dump(mock_data, f)

print(f"Generated {len(mock_data)} mock AQI rows!")


Generated 5000 mock AQI rows!


In [None]:
import pandas as pd

# Read the generated records back from disk so this cell only depends on the file.
with open("mock_aqi_large.json") as f:
    data = json.load(f)

# Flatten the nested "measurements" / "weather" dicts into columns such as
# "measurements_NO2" and "weather_humidity".
df = pd.json_normalize(data, sep="_")
print(df.head())

# Features
X = df[[
    'measurements_PM2.5','measurements_NO2','measurements_O3',
    'weather_temperature','weather_humidity','weather_wind_speed'
]]
# Target
y = df['aqi']

# Train a simple regression model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# random_state pins the forest's bootstrap/feature sampling so the reported
# score is repeatable, matching the seeded split above.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# For a regressor, .score() is the R^2 on the held-out split.
print("Model trained. Score:", model.score(X_test, y_test))


          city      lat       lon  aqi  aqi_forecast  measurements_PM2.5  \
0      Houston  29.7902  -95.3380   47  [39, 41, 53]                49.5   
1      Houston  29.7240  -95.3720   59  [59, 51, 66]                71.2   
2  Los Angeles  34.0282 -118.2379   55  [51, 54, 61]                58.4   
3     New York  40.7230  -74.0329   40  [39, 50, 45]                25.1   
4      Chicago  41.8387  -87.5891   45  [37, 37, 53]                56.8   

   measurements_NO2  measurements_O3  weather_temperature  weather_humidity  \
0              49.6             40.5                  5.8                15   
1              49.4             43.7                 24.1                24   
2              61.6             37.8                  6.1                39   
3              47.7             69.8                 12.0                27   
4              42.0             23.5                 21.9                32   

   weather_wind_speed  
0                10.1  
1                13.

data preprocessing

In [None]:
def aqi_class(aqi):
    """Map a numeric AQI value to its category label.

    The bands are checked in ascending order of their inclusive upper bound;
    the first band whose bound is >= ``aqi`` wins. Any value that matches no
    band (i.e. above 300) is labelled "Hazardous".
    """
    bands = (
        (50, "Good"),
        (100, "Moderate"),
        (150, "Unhealthy for SG"),
        (200, "Unhealthy"),
        (300, "Very Unhealthy"),
    )
    for upper_bound, label in bands:
        if aqi <= upper_bound:
            return label
    return "Hazardous"

# Label every row with the AQI category derived from its numeric 'aqi' value.
df['aqi_class'] = df['aqi'].apply(aqi_class)


Encode class labels to integers for ML models:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the category strings and encode them as integer labels.
# Note: this rebinds `y`, which previously held the numeric 'aqi' regression target.
le = LabelEncoder()
y = le.fit_transform(df['aqi_class'])

XGBoost classifier

In [None]:
!pip install xgboost




In [None]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import joblib


In [None]:
# Load mock JSON data (the "mock_aqi_large.json" file written by the generation cell)
with open("mock_aqi_large.json") as f:
    data = json.load(f)

# Flatten nested JSON for ML: nested keys become "<parent>_<child>" columns.
# This rebinds `df`, so columns added to the earlier frame (e.g. 'aqi_class') are gone.
df = pd.json_normalize(data, sep="_")
print("Columns:", df.columns)
print(df.head())


Columns: Index(['city', 'lat', 'lon', 'aqi', 'aqi_forecast', 'measurements_PM2.5',
       'measurements_NO2', 'measurements_O3', 'weather_temperature',
       'weather_humidity', 'weather_wind_speed'],
      dtype='object')
          city      lat       lon  aqi  aqi_forecast  measurements_PM2.5  \
0      Houston  29.7902  -95.3380   47  [39, 41, 53]                49.5   
1      Houston  29.7240  -95.3720   59  [59, 51, 66]                71.2   
2  Los Angeles  34.0282 -118.2379   55  [51, 54, 61]                58.4   
3     New York  40.7230  -74.0329   40  [39, 50, 45]                25.1   
4      Chicago  41.8387  -87.5891   45  [37, 37, 53]                56.8   

   measurements_NO2  measurements_O3  weather_temperature  weather_humidity  \
0              49.6             40.5                  5.8                15   
1              49.4             43.7                 24.1                24   
2              61.6             37.8                  6.1                39   
3  

In [None]:
# Define AQI classes
def aqi_class(aqi):
    """Return the category label for a numeric AQI value.

    Each band is identified by its inclusive upper bound; the label of the
    first band (in ascending order) that contains ``aqi`` is returned, and
    values beyond the last bound fall through to "Hazardous".
    """
    upper_bounds = (
        (50, "Good"),
        (100, "Moderate"),
        (150, "Unhealthy for SG"),
        (200, "Unhealthy"),
        (300, "Very Unhealthy"),
    )
    matching_labels = (label for bound, label in upper_bounds if aqi <= bound)
    return next(matching_labels, "Hazardous")

# Derive the category column on the freshly reloaded frame, then preview the
# first rows of the numeric value next to its label as a sanity check.
df['aqi_class'] = df['aqi'].apply(aqi_class)
print(df[['aqi','aqi_class']].head())


   aqi aqi_class
0   47      Good
1   59  Moderate
2   55  Moderate
3   40      Good
4   45      Good


In [None]:
# Encode the category strings as integer labels for the classifiers.
# `le.classes_` lists only the categories actually present in the data.
le = LabelEncoder()
y = le.fit_transform(df['aqi_class'])
print("Encoded classes:", list(le.classes_))


Encoded classes: ['Good', 'Moderate']


In [None]:
# Model inputs: the three pollutant readings followed by the three weather readings
feature_columns = [
    'measurements_PM2.5', 'measurements_NO2', 'measurements_O3',
    'weather_temperature', 'weather_humidity', 'weather_wind_speed',
]
X = df[feature_columns]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 4000
Testing samples: 1000


In [None]:
# Initialize XGBoost Classifier
# `use_label_encoder` is intentionally not passed: the installed XGBoost reports
# it as an unused parameter, and the labels are already integer-encoded by `le`.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # multi-class classification
    num_class=len(le.classes_), # number of AQI classes present in the data
    n_estimators=200,           # number of trees
    max_depth=6,                # tree depth
    learning_rate=0.1,
    random_state=42,            # repeatable training
    eval_metric='mlogloss'      # for multi-class
)

# Train the model
xgb_model.fit(X_train, y_train)

print("XGBoost training complete!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost training complete!


In [None]:
# Score the held-out split with the trained XGBoost classifier
y_pred = xgb_model.predict(X_test)

# Passing every encoder class id via `labels` keeps `target_names` aligned
# with the rows of the report, avoiding a ValueError on a length mismatch.
report_kwargs = {
    "target_names": le.classes_,
    "labels": le.transform(le.classes_),
}
print("Classification Report:\n")
print(classification_report(y_test, y_pred, **report_kwargs))

# Confusion matrix of the held-out labels (y_test) against the predictions (y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Classification Report:

              precision    recall  f1-score   support

        Good       0.99      0.99      0.99       747
    Moderate       0.97      0.97      0.97       253

    accuracy                           0.99      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.99      0.99      0.99      1000

Confusion Matrix:
 [[740   7]
 [  7 246]]


Random Forest classifier (RFC)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
# Uses the same tree count (200) and random_state (42) as the XGBoost model,
# so the two classifiers are compared under matching, repeatable settings.
rfc_model = RandomForestClassifier(
    n_estimators=200,    # number of trees
    max_depth=None,      # no depth cap: trees keep splitting until leaves are pure
    random_state=42,     # repeatable bootstrap / feature sampling
    n_jobs=-1            # use all CPU cores
)


In [None]:
# Fit on the same training split (X_train, y_train) the XGBoost model was trained on.
rfc_model.fit(X_train, y_train)
print("Random Forest training complete!")


Random Forest training complete!


In [None]:
# Score the held-out split with the trained Random Forest classifier
y_pred_rfc = rfc_model.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

# Passing every encoder class id via `labels` keeps `target_names` aligned
# with the rows of the report.
rfc_report_kwargs = {
    "target_names": le.classes_,
    "labels": le.transform(le.classes_),
}
print("Random Forest Classification Report:\n")
print(classification_report(y_test, y_pred_rfc, **rfc_report_kwargs))

# Confusion matrix of the held-out labels (y_test) against the predictions (y_pred_rfc)
cm_rfc = confusion_matrix(y_test, y_pred_rfc)
print("Confusion Matrix:\n", cm_rfc)

Random Forest Classification Report:

              precision    recall  f1-score   support

        Good       0.98      0.99      0.98       747
    Moderate       0.97      0.94      0.95       253

    accuracy                           0.98      1000
   macro avg       0.98      0.96      0.97      1000
weighted avg       0.98      0.98      0.98      1000

Confusion Matrix:
 [[740   7]
 [ 16 237]]


In [None]:
import joblib

# Persist both classifiers plus the label encoder; the encoder is needed later
# to turn predicted integer labels back into AQI class names.
artifacts = (
    ("XGBoost model", xgb_model, "aqi_classifier_xgb.pkl"),
    ("Random Forest model", rfc_model, "aqi_classifier_rfc.pkl"),
    ("Label encoder", le, "aqi_label_encoder.pkl"),
)
for description, artifact, filename in artifacts:
    joblib.dump(artifact, filename)
    print(f"{description} saved as {filename}")


XGBoost model saved as aqi_classifier_xgb.pkl
Random Forest model saved as aqi_classifier_rfc.pkl
Label encoder saved as aqi_label_encoder.pkl


Generate Predictions & Prepare Backend JSON

In [None]:
import joblib

# Load the classifier to serve predictions with, plus the encoder that decodes its labels.
# Note: this rebinds `model`, which earlier in the notebook held the RandomForestRegressor.
model = joblib.load("aqi_classifier_xgb.pkl")  # to use the Random Forest instead, load "aqi_classifier_rfc.pkl"
le = joblib.load("aqi_label_encoder.pkl")


Preparing features for all stations

In [None]:
# Feature matrix for every row in `df` (train and test alike), using the same
# six columns, in the same order, that the classifiers were trained on.
X_all = df[['measurements_PM2.5', 'measurements_NO2', 'measurements_O3',
            'weather_temperature','weather_humidity','weather_wind_speed']]


predict AQI classes

In [None]:
# Predict numeric labels (the integer codes produced by the label encoder)
pred_labels = model.predict(X_all)

# Convert numeric labels back to AQI class names
pred_classes = le.inverse_transform(pred_labels)

# Add predictions to dataframe (mutates `df` in place by adding a column)
df['predicted_aqi_class'] = pred_classes


Disease-specific alerts

In [None]:
def generate_alerts(aqi_class):
    """Return per-condition alert flags for an AQI category label.

    Asthma and COPD alerts fire from "Unhealthy for SG" upwards; the cardio
    alert fires only from "Unhealthy" upwards. Any other label yields all
    flags False.

    Returns:
        dict with boolean values under the keys "asthma", "cardio", "copd".
    """
    cardio_levels = ("Unhealthy", "Very Unhealthy", "Hazardous")
    respiratory_levels = ("Unhealthy for SG",) + cardio_levels

    respiratory_alert = aqi_class in respiratory_levels
    return {
        "asthma": respiratory_alert,
        "cardio": aqi_class in cardio_levels,
        "copd": respiratory_alert,
    }

# One alerts dict per row, derived from the predicted (not the true) AQI class.
df['alerts'] = df['predicted_aqi_class'].apply(generate_alerts)


Preparing JSON for frontend

In [None]:
# Columns exported for the map / alerts view: location, raw readings,
# predicted class and the per-condition alert flags.
export_columns = [
    'city', 'lat', 'lon',
    'measurements_PM2.5', 'measurements_NO2', 'measurements_O3',
    'weather_temperature', 'weather_humidity', 'weather_wind_speed',
    'predicted_aqi_class', 'alerts',
]
json_data = df[export_columns]

# One dict per row, keyed by column name
backend_json = json_data.to_dict(orient='records')

# Serialise first, then write the resulting text to disk
serialized = json.dumps(backend_json, indent=4)
with open("predicted_aqi_for_map.json", "w") as f:
    f.write(serialized)

print("Backend JSON saved! Ready for frontend map.")


Backend JSON saved! Ready for frontend map.


Serve the JSON as a dynamic file

In [None]:
!pip install pyngrok requests




In [None]:
from pyngrok import ngrok
import threading, http.server, socketserver, requests, json, time

import os

# The ngrok authtoken is a credential and must never be written into the notebook.
# Get yours from https://dashboard.ngrok.com/get-started/your-authtoken and export it
# as NGROK_AUTHTOKEN before starting the kernel.
# SECURITY: a token was previously committed in this cell; treat it as leaked and
# revoke it in the ngrok dashboard.
_ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if not _ngrok_authtoken:
    raise RuntimeError(
        "NGROK_AUTHTOKEN is not set; export it in the environment before running this cell."
    )
# Stores the token in ngrok's configuration file, as `ngrok config add-authtoken` does.
ngrok.set_auth_token(_ngrok_authtoken)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import os

# The API key is a credential: read it from the environment instead of
# hardcoding it. SECURITY: a key was previously committed in this cell; treat
# it as leaked and rotate it with the provider.
API_KEY = os.environ.get("OPENWEATHER_API_KEY", "")
LAT, LON = 40.7128, -74.0060  # Example: New York City

AIR_POLLUTION_URL = "https://api.openweathermap.org/data/2.5/air_pollution"
AIR_QUALITY_JSON_PATH = "predicted_aqi_for_map.json"
REQUEST_TIMEOUT_S = 10  # without a timeout a stalled connection would hang the updater forever


def fetch_air_quality():
    """Fetches air quality data from OpenWeatherMap and saves it locally.

    Requests the air-pollution endpoint for (LAT, LON), adds a local
    "timestamp" field to the decoded response and writes it to
    ``AIR_QUALITY_JSON_PATH``. Best-effort: any failure is reported with
    ``print`` and swallowed so a periodic caller keeps running; on failure the
    existing file is left untouched.

    NOTE(review): this overwrites the same file the prediction-export cell
    writes, with a different schema — confirm that is intended.
    """
    try:
        if not API_KEY:
            raise RuntimeError("OPENWEATHER_API_KEY environment variable is not set")

        response = requests.get(
            AIR_POLLUTION_URL,
            params={"lat": LAT, "lon": LON, "appid": API_KEY},
            timeout=REQUEST_TIMEOUT_S,
        )
        # Do not overwrite good data with an HTTP error payload (401, 429, 5xx, ...).
        response.raise_for_status()
        data = response.json()

        # Add timestamp
        data["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        # Save JSON: write to a temp file and swap it in atomically so a
        # concurrent reader never sees a half-written file.
        tmp_path = AIR_QUALITY_JSON_PATH + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, AIR_QUALITY_JSON_PATH)
        print(f"✅ Updated JSON at {data['timestamp']}")

    except Exception as e:
        # Deliberately broad: this runs unattended and must not raise.
        # Exception text from `requests` can embed the full URL, so mask the key.
        message = str(e)
        if API_KEY:
            message = message.replace(API_KEY, "***")
        print(f"❌ Error updating data: {message}")


In [None]:
def auto_update(interval=3600, stop_event=None):  # 3600 seconds = 1 hour
    """Call fetch_air_quality() repeatedly, pausing ``interval`` seconds between calls.

    Args:
        interval: Seconds to wait between fetches.
        stop_event: Optional ``threading.Event``. When given, the loop exits as
            soon as the event is set (checked during the wait). When ``None``
            the loop runs forever, as before.
    """
    while True:
        fetch_air_quality()
        if stop_event is None:
            time.sleep(interval)
        elif stop_event.wait(interval):
            # wait() returns True only if the event was set during the pause.
            return

# Run the scheduler in background.
# Re-running this cell must not stack up a second updater thread, so a new one
# is started only if none from a previous run is still alive. Call
# `auto_update_stop.set()` to stop the running updater.
_previous_updater = globals().get("auto_update_thread")
if _previous_updater is None or not _previous_updater.is_alive():
    auto_update_stop = threading.Event()
    auto_update_thread = threading.Thread(
        target=auto_update,
        kwargs={"stop_event": auto_update_stop},
        daemon=True,  # do not keep the kernel process alive on shutdown
    )
    auto_update_thread.start()
print("⏳ Auto-update thread started... Fetching data hourly!")


⏳ Auto-update thread started... Fetching data hourly!


In [None]:
!pip install flask-ngrok



In [None]:
!pip install flask flask-cors pyngrok




In [None]:
from flask import Flask, jsonify, request
from flask_cors import CORS
from pyngrok import ngrok

import json

app = Flask(__name__)
CORS(app)  # allows React frontend to call API

# File produced earlier in the notebook and refreshed by the hourly updater.
AQI_JSON_PATH = "predicted_aqi_for_map.json"

# Expose localhost to internet
public_url = ngrok.connect(5000)
print("Public URL:", public_url)

# Sample GET endpoint
@app.route("/getData", methods=["GET"])
def get_data():
    """Health-check style endpoint returning a fixed message."""
    return jsonify({"message": "Backend is working!"})

# Serves the AQI JSON file. Clients were already requesting this path and
# receiving 404 because no route was registered for it.
@app.route("/airquality", methods=["GET"])
def get_air_quality():
    """Return the current contents of AQI_JSON_PATH as a JSON response.

    The file is re-read on every request so updates written to disk are
    picked up without restarting the server. Responds 503 when the file is
    missing or does not contain valid JSON.
    """
    try:
        with open(AQI_JSON_PATH) as f:
            payload = json.load(f)
    except FileNotFoundError:
        return jsonify({"error": "Air quality data is not available yet"}), 503
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass (e.g. file caught mid-write).
        return jsonify({"error": "Air quality data is temporarily unreadable"}), 503
    return jsonify(payload)

# Sample POST endpoint
@app.route("/postData", methods=["POST"])
def post_data():
    """Echo the JSON body of the request back to the caller."""
    data = request.json
    return jsonify({"status": "success", "received": data})

# Run Flask app (blocks this cell until the server is stopped)
app.run(port=5000)


Public URL: NgrokTunnel: "https://waggly-extralegal-kurt.ngrok-free.dev" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [05/Oct/2025 06:03:12] "[33mGET /airquality HTTP/1.1[0m" 404 -
