In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the four raw input tables (locations, air quality, weather, land-use
# features) from CSV into notebook-level DataFrames used by every later cell.
# NOTE(review): the paths are absolute Windows paths, so this cell only runs on
# a machine that has this exact folder — consider a configurable data directory.
locations = pd.read_csv(r"D:\AI-EnviroProject\india_locations.csv")
air = pd.read_csv(r"D:\AI-EnviroProject\india_air_quality.csv")
weather = pd.read_csv(r"D:\AI-EnviroProject\india_weather.csv")
features = pd.read_csv(r"D:\AI-EnviroProject\india_features.csv")

In [4]:
# De-duplicate every raw table in place; each frame keeps its notebook-level name.
for raw_table in (locations, air, weather, features):
    raw_table.drop_duplicates(inplace=True)

In [5]:
# Keep only the locations whose coordinates lie inside valid geographic bounds
# (both ends of each range are inclusive).
latitude_in_range = locations['latitude'].between(-90, 90)
longitude_in_range = locations['longitude'].between(-180, 180)
locations = locations[latitude_in_range & longitude_in_range]

In [6]:
# Remove duplicate rows.
# Reassigning instead of using inplace=True avoids mutating a frame that may be
# a filtered slice of an earlier one, and keeps this cell safe to re-run.
locations = locations.drop_duplicates()
air = air.drop_duplicates()
weather = weather.drop_duplicates()
features = features.drop_duplicates()

# Remove invalid latitude/longitude.
# .copy() turns each filtered result into an independent frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
locations = locations[
    locations['latitude'].between(-90, 90) & locations['longitude'].between(-180, 180)
].copy()
weather = weather[
    weather['latitude'].between(-90, 90) & weather['longitude'].between(-180, 180)
].copy()

# Remove negative or invalid pollution values.
# A NaN reading also fails the `>= 0` comparison, so rows with a missing value
# in any pollutant column are dropped here too.
# NOTE(review): because of that, the interpolation of missing pollutant values
# in the next cell has nothing left to fill — confirm which behaviour is wanted.
pollution_cols = ['PM25 AQ', 'PM10 AQ', 'CO AQ', 'NO2 AQ', 'SO2 AQ', 'O3 AQ']
air = air[(air[pollution_cols] >= 0).all(axis=1)].copy()

In [7]:
# --- Module 2: Handle Missing Values ---

# 1) Air quality: fill gaps in the pollutant readings by interpolating along
#    the row order of the table.
pollution_cols = ['PM25 AQ', 'PM10 AQ', 'CO AQ', 'NO2 AQ', 'SO2 AQ', 'O3 AQ']
air_pollutants_filled = air[pollution_cols].interpolate()
air[pollution_cols] = air_pollutants_filled

# 2) Weather: replace missing numeric readings with each column's median.
numeric_weather_cols = ['temperature','humidity','wind_speed','wind_deg']
weather_medians = weather[numeric_weather_cols].median()
weather[numeric_weather_cols] = weather[numeric_weather_cols].fillna(weather_medians)

# 3) Land-use features: same median fill, applied to every numeric column.
numeric_features = features.select_dtypes(include=np.number).columns
feature_medians = features[numeric_features].median()
features[numeric_features] = features[numeric_features].fillna(feature_medians)


In [8]:
# Standardize GPS columns: cast the coordinates of both tables to float.
# (No timestamp handling happens in this cell.)
for coord_col in ('latitude', 'longitude'):
    locations[coord_col] = locations[coord_col].astype(float)
    weather[coord_col] = weather[coord_col].astype(float)

In [9]:
# ----------------- STEP 3: Normalize Air Quality & Weather -----------------
# One MinMaxScaler instance is reused for both tables. Each fit_transform call
# re-fits it, so after this cell `scaler` holds only the weather statistics and
# the pollutant min/max values are no longer available from it.
# NOTE(review): neither fit is saved, so the scaled pollutant/weather values
# cannot be mapped back to physical units later — confirm that is acceptable.
scaler = MinMaxScaler()

# Normalize pollution (overwrites the raw pollutant readings in `air`)
air[pollution_cols] = scaler.fit_transform(air[pollution_cols])

# Normalize numeric weather columns (overwrites the raw readings in `weather`)
weather[numeric_weather_cols] = scaler.fit_transform(weather[numeric_weather_cols])

# ----------------- STEP 4: Temporal Features (from weather) -----------------
# Parse the timestamp once, then derive calendar features from it.
weather['timestamp'] = pd.to_datetime(weather['timestamp'])
weather['hour'] = weather['timestamp'].dt.hour
weather['day_of_week'] = weather['timestamp'].dt.dayofweek
weather['month'] = weather['timestamp'].dt.month
# month % 12 maps December to 0, so Dec/Jan/Feb -> 1, Mar-May -> 2,
# Jun-Aug -> 3, Sep-Nov -> 4.
weather['season'] = weather['month'] % 12 // 3 + 1  # 1: Winter, 2: Spring, 3: Summer, 4: Fall

In [10]:
# ----------------- STEP 5: Merge DataFrames -----------------
# Attach weather readings to each air-quality row by city. Columns that exist
# in both tables receive the '_air' / '_weather' suffixes.
# NOTE(review): if `weather` holds more than one row per city, this left join
# repeats each air row once per weather row — confirm the expected cardinality
# (pd.merge's `validate=` argument can enforce it).
merged_df = pd.merge(air, weather, on='city', how='left', suffixes=('_air','_weather'))

# Merge with features on 'location_id'
final_df = pd.merge(merged_df, features, on='location_id', how='left')

# Merge with locations to get latitude/longitude
final_df = pd.merge(final_df, locations[['location_id','latitude','longitude']], on='location_id', how='left')
# Keep latitude/longitude from locations and drop the suffixed copies that came
# from the air and weather tables. Reassigning (no inplace) keeps the cell
# re-runnable from `merged_df`.
final_df = final_df.drop(columns=['latitude_air', 'longitude_air', 'latitude_weather', 'longitude_weather'])

# The former rename of 'latitude'/'longitude' to the same names was a no-op and
# has been removed.

# ----------------- STEP 6: Final Check -----------------
print(final_df.shape)
print(final_df.head())
print(final_df.isnull().sum())

(7050, 37)
       city_x   PM25 AQ PM25 AQI Category   PM10 AQ PM10 AQI Category  \
0      Mumbai  0.243907          Moderate  0.220635          Moderate   
1   New Delhi  0.347064          Moderate  0.349750              Poor   
2        Pune  0.234645          Moderate  0.209854          Moderate   
3  Shrirampur  0.080488              Good  0.077687              Good   
4       Vaduj  0.273963          Moderate  0.248072          Moderate   

      CO AQ CO AQI Category    NO2 AQ NO2 AQI Category    SO2 AQ  ...  \
0  0.118402       Very Poor  0.015383             Good  0.104827  ...   
1  0.291592          Severe  0.234147             Good  0.040097  ...   
2  0.119842       Very Poor  0.035110             Good  0.048713  ...   
3  0.095073       Very Poor  0.139032             Good  0.085276  ...   
4  0.157308          Severe  0.112729             Good  0.049265  ...   

  Farmland_Count  Landfill_Count Dump_Site_Count  Distance_to_Nearest_Road_m  \
0            2.0             0.

In [11]:
# ----------------- STEP 7: Save Cleaned Data -----------------
# Persist the merged table; later cells reload this file instead of relying on
# the in-memory `final_df`. index=False keeps the row index out of the CSV.
final_df.to_csv(r"D:\AI-EnviroProject\processed_data_module2.csv", index=False)

print("✅ Module 2: Data Cleaning & Feature Engineering Complete!")

✅ Module 2: Data Cleaning & Feature Engineering Complete!


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# ----------------- LOAD CLEANED DATA -----------------
# Re-read the merged table written at the end of Module 2; this replaces the
# in-memory `final_df` (the timestamp column comes back as plain text).
final_df = pd.read_csv(r"D:\AI-EnviroProject\processed_data_module2.csv")

# ----------------- STEP 1: AQI Classification -----------------
# If 'AQI' column exists
def classify_aqi(aqi, missing_label='Unknown'):
    """Map a numeric AQI value to a category label.

    Bands (upper bounds inclusive): <= 50 'Good', <= 100 'Moderate',
    <= 200 'Poor', <= 300 'Very Poor', anything higher 'Hazardous'.

    Args:
        aqi: AQI reading. May be NaN or None when the reading is missing.
        missing_label: Label returned for a missing reading. Callers that
            model on the result may want to drop rows carrying this label.

    Returns:
        The category label as a string.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # reading would fall through all branches and be reported as 'Hazardous'.
    if pd.isna(aqi):
        return missing_label
    if aqi <= 50:
        return 'Good'
    elif aqi <= 100:
        return 'Moderate'
    elif aqi <= 200:
        return 'Poor'
    elif aqi <= 300:
        return 'Very Poor'
    else:
        return 'Hazardous'

# Label every row with its AQI band.
final_df['AQI_Class'] = final_df['AQI'].map(classify_aqi)

# ----------------- STEP 2: Prepare Features & Target -----------------
# The model inputs, grouped by origin; the concatenation order is the column
# order of X.
pollutant_features = ['PM25 AQ', 'PM10 AQ', 'CO AQ', 'NO2 AQ', 'SO2 AQ', 'O3 AQ']
weather_features = ['temperature', 'humidity', 'wind_speed', 'wind_deg']
time_features = ['hour', 'day_of_week', 'month', 'season']
count_features = ['Road_Count', 'Industrial_Count', 'Farmland_Count',
                  'Landfill_Count', 'Dump_Site_Count']
distance_features = ['Distance_to_Nearest_Road_m', 'Distance_to_Nearest_Industrial_m',
                     'Distance_to_Nearest_Farmland_m', 'Distance_to_Nearest_Landfill_m',
                     'Distance_to_Nearest_Dump_m']
feature_cols = (pollutant_features + weather_features + time_features
                + count_features + distance_features)

X = final_df[feature_cols]
y = final_df['AQI_Class']

# ----------------- STEP 3: Split Train/Test -----------------
# 80/20 split, stratified on the AQI class, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("✅ Module 3: AQI Classification & ML Data Preparation Complete!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# ----------------- STEP 4: Save for Modeling -----------------
# Features first, then targets; all without the row index.
X_train.to_csv(r"D:\AI-EnviroProject\X_train.csv", index=False)
X_test.to_csv(r"D:\AI-EnviroProject\X_test.csv", index=False)
y_train.to_csv(r"D:\AI-EnviroProject\y_train.csv", index=False)
y_test.to_csv(r"D:\AI-EnviroProject\y_test.csv", index=False)

✅ Module 3: AQI Classification & ML Data Preparation Complete!
Training set: (5640, 24)
Test set: (1410, 24)


In [13]:
# =========================
# Module 3: Source Labeling with Data-Driven Thresholds
# =========================

import pandas as pd
import numpy as np

# ----------------- LOAD DATA -----------------
final_df = pd.read_csv(r"D:\AI-EnviroProject\processed_data_module2.csv")

# ----------------- STEP 1: Derive Season from Timestamp -----------------
# Month -> season name. Any month not listed (September-November, or a missing
# month) falls back to 'Autumn'.
season_by_month = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
}

if 'timestamp' not in final_df.columns:
    # Without a timestamp every row is treated as 'Summer'.
    final_df['season'] = 'Summer'
else:
    final_df['timestamp'] = pd.to_datetime(final_df['timestamp'])
    final_df['month'] = final_df['timestamp'].dt.month
    # This replaces any existing 'season' column with season names.
    final_df['season'] = final_df['month'].map(lambda m: season_by_month.get(m, 'Autumn'))

# ----------------- STEP 2: Set Data-Driven Thresholds -----------------
# "Near" a source = at or below the lower-quartile distance;
# "high" pollutant = at or above the upper-quartile reading.
near_quantile = 0.25
high_quantile = 0.75

road_thresh = final_df['Distance_to_Nearest_Road_m'].quantile(near_quantile)
industrial_thresh = final_df['Distance_to_Nearest_Industrial_m'].quantile(near_quantile)
farmland_thresh = final_df['Distance_to_Nearest_Farmland_m'].quantile(near_quantile)
dump_thresh = final_df['Distance_to_Nearest_Dump_m'].quantile(near_quantile)

no2_thresh = final_df['NO2 AQ'].quantile(high_quantile)
so2_thresh = final_df['SO2 AQ'].quantile(high_quantile)
pm25_thresh = final_df['PM25 AQ'].quantile(high_quantile)
pm10_thresh = final_df['PM10 AQ'].quantile(high_quantile)

# ----------------- STEP 3: Define Labeling Function -----------------
def label_pollution_source(row):
    """Assign a pollution-source label to one row using rule-based thresholds.

    Rules are checked in a fixed priority order and the first match wins:
    Vehicular, Industrial, Agricultural, Burning; rows matching none of them
    are labelled 'Natural'. The thresholds (`road_thresh`, `no2_thresh`, ...)
    are read from notebook-level variables.

    Args:
        row: A row (mapping-like) exposing the distance, pollutant and
            'season' columns referenced below.

    Returns:
        The source label as a string.
    """
    # Close to a road with high NO2.
    near_road = row['Distance_to_Nearest_Road_m'] <= road_thresh
    if near_road and row['NO2 AQ'] >= no2_thresh:
        return 'Vehicular'

    # Close to an industrial site with high SO2.
    near_industry = row['Distance_to_Nearest_Industrial_m'] <= industrial_thresh
    if near_industry and row['SO2 AQ'] >= so2_thresh:
        return 'Industrial'

    # Close to farmland with high PM2.5, only in Summer/Autumn.
    near_farmland = row['Distance_to_Nearest_Farmland_m'] <= farmland_thresh
    if near_farmland and row['PM25 AQ'] >= pm25_thresh and row['season'] in ('Summer', 'Autumn'):
        return 'Agricultural'

    # Close to a dump site with high PM10.
    near_dump = row['Distance_to_Nearest_Dump_m'] <= dump_thresh
    if near_dump and row['PM10 AQ'] >= pm10_thresh:
        return 'Burning'

    return 'Natural'

# ----------------- STEP 4: Apply Labeling -----------------
# Row-wise application (axis=1): each row is passed to the rule function.
# NOTE(review): the label is a deterministic function of columns that are later
# used as model features, so near-perfect classifier accuracy on this target is
# expected and does not by itself show the model has learned real sources.
final_df['Pollution_Source'] = final_df.apply(label_pollution_source, axis=1)

# ----------------- STEP 5: Check Distribution -----------------
# Printed so class imbalance (and any label that never fires) is visible.
print("Pollution Source Counts:")
print(final_df['Pollution_Source'].value_counts())

# ----------------- STEP 6: Save Labeled Dataset -----------------
# The saved file carries the 'season' column as written by this cell.
final_df.to_csv(r"D:\AI-EnviroProject\labeled_dataset.csv", index=False)
print("✅ Module 3: Pollution Source Labeling Complete!")

Pollution Source Counts:
Pollution_Source
Natural       5168
Industrial     890
Burning        692
Vehicular      300
Name: count, dtype: int64
✅ Module 3: Pollution Source Labeling Complete!


In [22]:
# ----------------- STEP 7: Encode Labels -----------------
# Requires `final_df` with a 'Pollution_Source' column to already exist in the
# kernel (this cell does not load it).
from sklearn.preprocessing import LabelEncoder
import joblib

# Create LabelEncoder instance
le = LabelEncoder()

# Fit and transform the Pollution_Source column
# NOTE(review): this cell does not write final_df back to disk, so the encoded
# column exists only in memory unless another cell saves it — confirm.
final_df['Pollution_Source_Encoded'] = le.fit_transform(final_df['Pollution_Source'])

# Check mapping: class name -> integer code assigned by the encoder
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("✅ LabelEncoder Mapping:")
print(label_mapping)

# Optional: Save the LabelEncoder for later use (dashboard or deployment)
joblib.dump(le, r"D:\AI-EnviroProject\label_encoder.pkl")
print("✅ LabelEncoder saved as 'label_encoder.pkl'")

✅ LabelEncoder Mapping:
{'Burning': np.int64(0), 'Industrial': np.int64(1), 'Natural': np.int64(2), 'Vehicular': np.int64(3)}
✅ LabelEncoder saved as 'label_encoder.pkl'


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load labeled dataset
df = pd.read_csv(r"D:\AI-EnviroProject\labeled_dataset.csv")

# Features for ML
feature_cols = ['PM25 AQ','PM10 AQ','CO AQ','NO2 AQ','SO2 AQ','O3 AQ',
                'temperature','humidity','wind_speed','wind_deg',
                'hour','day_of_week','month','season',
                'Road_Count','Industrial_Count','Farmland_Count',
                'Landfill_Count','Dump_Site_Count',
                'Distance_to_Nearest_Road_m','Distance_to_Nearest_Industrial_m',
                'Distance_to_Nearest_Farmland_m',
                'Distance_to_Nearest_Landfill_m','Distance_to_Nearest_Dump_m']

# The labeling cell stores 'season' as season names ('Winter', 'Spring', ...)
# before writing labeled_dataset.csv, so the column read back here is text and
# cannot be fed to a numeric estimator. Encode it with the 1-4 numbering used
# when the temporal features were first derived (1: Winter, 2: Spring,
# 3: Summer, 4: Fall/Autumn). Only X is changed; `df` (and therefore
# app_daily_data.csv) keeps the readable names.
season_codes = {'Winter': 1, 'Spring': 2, 'Summer': 3, 'Autumn': 4}
X = df[feature_cols].copy()
if not pd.api.types.is_numeric_dtype(X['season']):
    X['season'] = X['season'].map(season_codes)
y = df['Pollution_Source']

# Split train/test (80/20, stratified on the source label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Save train.csv (for model training): features plus the target column
train_df = pd.concat([X_train, y_train], axis=1)
train_df.to_csv(r"D:\AI-EnviroProject\train.csv", index=False)

# Save app_daily_data.csv (all labeled data for dashboard)
df.to_csv(r"D:\AI-EnviroProject\app_daily_data.csv", index=False)

print("✅ Generated train.csv and app_daily_data.csv")

✅ Generated train.csv and app_daily_data.csv


In [42]:
# ---------------------- MODULE 4: MODEL TRAINING ----------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Requires `final_df` (the cleaned, labeled dataset from Module 3) to already
# be in the kernel; this cell does not load it.
# Model inputs: pollutant readings, weather readings, proximity distances.
pollutant_inputs = ['PM25 AQ', 'PM10 AQ', 'NO2 AQ', 'SO2 AQ', 'CO AQ', 'O3 AQ']
weather_inputs = ['temperature', 'humidity', 'wind_speed', 'wind_deg']
proximity_inputs = [
    'Distance_to_Nearest_Road_m', 'Distance_to_Nearest_Industrial_m',
    'Distance_to_Nearest_Farmland_m', 'Distance_to_Nearest_Landfill_m',
    'Distance_to_Nearest_Dump_m',
]
feature_cols = pollutant_inputs + weather_inputs + proximity_inputs

X = final_df[feature_cols]

# Encode the string source labels as integer class ids.
le = LabelEncoder().fit(final_df['Pollution_Source'])
y = le.transform(final_df['Pollution_Source'])

# 80/20 stratified split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# ------------------ DECISION TREE ------------------
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
dt_report = classification_report(y_test, y_pred_dt, target_names=le.classes_)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Classification Report:\n", dt_report)

# ------------------ RANDOM FOREST ------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf, target_names=le.classes_)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", rf_report)

# ------------------ XGBOOST ------------------
num_classes = len(le.classes_)

# Multi-class softmax objective, one output per encoded source class.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    eval_metric='mlogloss',
    random_state=42,
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_report = classification_report(y_test, y_pred_xgb, target_names=le.classes_)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report:\n", xgb_report)

# ------------------ SAVE MODELS ------------------
# Written in the same order as they were trained.
joblib.dump(dt, r"D:\AI-EnviroProject\decision_tree_model.pkl")
joblib.dump(rf, r"D:\AI-EnviroProject\random_forest_model.pkl")
joblib.dump(xgb, r"D:\AI-EnviroProject\xgboost_model.pkl")
print("Models saved successfully!")


Decision Tree Accuracy: 0.9971631205673759
Decision Tree Classification Report:
               precision    recall  f1-score   support

     Burning       1.00      0.99      0.99       138
  Industrial       1.00      1.00      1.00       178
     Natural       1.00      1.00      1.00      1034
   Vehicular       0.95      0.98      0.97        60

    accuracy                           1.00      1410
   macro avg       0.99      0.99      0.99      1410
weighted avg       1.00      1.00      1.00      1410

Random Forest Accuracy: 0.9971631205673759
Random Forest Classification Report:
               precision    recall  f1-score   support

     Burning       0.99      1.00      0.99       138
  Industrial       0.99      1.00      1.00       178
     Natural       1.00      1.00      1.00      1034
   Vehicular       1.00      0.97      0.98        60

    accuracy                           1.00      1410
   macro avg       0.99      0.99      0.99      1410
weighted avg       1.00

In [14]:
# ---------------------- MODULE 4: MODEL TRAINING WITH GRIDSEARCH & SCALER ----------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ---------------------- FEATURES & TARGET ----------------------
# Requires `final_df` with a 'Pollution_Source' column to already be in the
# kernel; this cell does not load it.
feature_cols = [
    'PM25 AQ', 'PM10 AQ', 'NO2 AQ', 'SO2 AQ', 'CO AQ', 'O3 AQ',
    'temperature', 'humidity', 'wind_speed', 'wind_deg',
    'Distance_to_Nearest_Road_m', 'Distance_to_Nearest_Industrial_m',
    'Distance_to_Nearest_Farmland_m', 'Distance_to_Nearest_Landfill_m',
    'Distance_to_Nearest_Dump_m'
]

X = final_df[feature_cols]

# Encode target
le = LabelEncoder()
y = le.fit_transform(final_df['Pollution_Source'])

# ---------------------- SPLIT, THEN SCALE ----------------------
# Split BEFORE fitting the scaler. Fitting on the full dataset lets test-set
# statistics leak into preprocessing and into the saved scaler. The split uses
# the same seed and stratification as before, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)   # statistics learned from train only
X_test = scaler.transform(X_test_raw)         # test rows reuse the train statistics
X_scaled = scaler.transform(X)                # full matrix, kept for any later use
joblib.dump(scaler, r"D:\AI-EnviroProject\scaler.joblib")
print("Scaler saved successfully!")

# ---------------------- DECISION TREE WITH GRIDSEARCH ----------------------
# 5-fold CV over depth / split / leaf sizes, scored by accuracy.
dt = DecisionTreeClassifier(random_state=42)
dt_params = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
dt_grid = GridSearchCV(dt, dt_params, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid.fit(X_train, y_train)
best_dt = dt_grid.best_estimator_

y_pred_dt = best_dt.predict(X_test)
print("Decision Tree Best Params:", dt_grid.best_params_)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dt, target_names=le.classes_))

# ---------------------- RANDOM FOREST WITH GRIDSEARCH ----------------------
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

y_pred_rf = best_rf.predict(X_test)
print("Random Forest Best Params:", rf_grid.best_params_)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf, target_names=le.classes_))

# ---------------------- XGBOOST WITH GRIDSEARCH ----------------------
num_classes = len(le.classes_)

xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    eval_metric='mlogloss',
    random_state=42
)
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, scoring='accuracy', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_

y_pred_xgb = best_xgb.predict(X_test)
print("XGBoost Best Params:", xgb_grid.best_params_)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb, target_names=le.classes_))

# ---------------------- SAVE BEST MODELS ----------------------
# Each saved model expects inputs transformed by the scaler saved above.
joblib.dump(best_dt, r"D:\AI-EnviroProject\best_decision_tree_model.pkl")
joblib.dump(best_rf, r"D:\AI-EnviroProject\best_random_forest_model.pkl")
joblib.dump(best_xgb, r"D:\AI-EnviroProject\best_xgboost_model.pkl")
print("Best models saved successfully!")

Scaler saved successfully!
Decision Tree Best Params: {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5}
Decision Tree Accuracy: 0.999290780141844
Decision Tree Classification Report:
               precision    recall  f1-score   support

     Burning       1.00      1.00      1.00       138
  Industrial       1.00      1.00      1.00       178
     Natural       1.00      1.00      1.00      1034
   Vehicular       1.00      0.98      0.99        60

    accuracy                           1.00      1410
   macro avg       1.00      1.00      1.00      1410
weighted avg       1.00      1.00      1.00      1410

Random Forest Best Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 0.9971631205673759
Random Forest Classification Report:
               precision    recall  f1-score   support

     Burning       0.99      1.00      0.99       138
  Industrial       0.99      1.00      1.00       178
     Nat

In [52]:
# ---------------------- Module 5: Geospatial Mapping and Heatmap Visualization   ----------------------
import pandas as pd
import folium
from folium.plugins import HeatMap, MarkerCluster
import os

# ------------------ LOAD DATA ------------------
# Columns used below: latitude, longitude, AQI, Pollution_Source, city_x, timestamp
final_df = pd.read_csv(r"D:\AI-EnviroProject\labeled_dataset.csv")

# The CSV stores timestamps as text; parse them back to datetimes.
final_df['timestamp'] = pd.to_datetime(final_df['timestamp'])

# ------------------ BASE SETTINGS ------------------
india_center = [20.5937, 78.9629]   # [lat, lon] used as the initial map centre
zoom_start = 5
save_path = r"D:\AI-EnviroProject\all_html_file"  # output folder for the HTML maps
os.makedirs(save_path, exist_ok=True)

# ------------------ SOURCE ICONS & COLORS ------------------
# Marker icon name per pollution source label.
source_icons = dict(
    Vehicular='car',
    Industrial='industry',
    Agricultural='leaf',
    Burning='fire',
    Natural='tree',
)

# Marker colour per pollution source label.
source_colors = dict(
    Vehicular='blue',
    Industrial='red',
    Agricultural='green',
    Burning='orange',
    Natural='gray',
)

# ------------------ FUNCTION TO CREATE MAP ------------------
def create_map(df, file_name, heatmap=False):
    """Build a folium map of the rows in `df` and save it as an HTML file.

    Every row with valid coordinates becomes a clustered marker whose popup
    shows city, timestamp, pollution source and AQI. With `heatmap=True` an
    AQI-weighted heat layer is added underneath the markers.

    Reads the notebook-level settings `india_center`, `zoom_start`,
    `save_path`, `source_colors` and `source_icons`.

    Args:
        df: DataFrame with 'latitude', 'longitude', 'AQI', 'Pollution_Source',
            'city_x' and 'timestamp' columns.
        file_name: Name of the HTML file written inside `save_path`.
        heatmap: Whether to add the AQI heat layer.
    """
    m = folium.Map(location=india_center, zoom_start=zoom_start)

    # folium rejects NaN coordinates, and the coordinates come from a left
    # join that can leave them empty — skip such rows instead of failing.
    located = df.dropna(subset=['latitude', 'longitude'])

    if heatmap:
        # Build the [lat, lon, weight] triples in one vectorised step; rows
        # without an AQI cannot contribute a weight.
        heat_data = (
            located.dropna(subset=['AQI'])[['latitude', 'longitude', 'AQI']]
            .values.tolist()
        )
        if heat_data:  # an empty filter result simply gets no heat layer
            HeatMap(heat_data, radius=15, max_zoom=13).add_to(m)

    marker_cluster = MarkerCluster().add_to(m)
    for row in located.to_dict('records'):
        source = row['Pollution_Source']
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=(f"City: {row['city_x']}<br>"
                   f"Timestamp: {row['timestamp']}<br>"
                   f"Source: {source}<br>"
                   f"AQI: {row['AQI']}"),
            # Icons use the Font Awesome prefix, so the fallback must be a
            # Font Awesome name too ('info-sign' is a Glyphicon name).
            icon=folium.Icon(color=source_colors.get(source, 'black'),
                             icon=source_icons.get(source, 'info-circle'),
                             prefix='fa')
        ).add_to(marker_cluster)

    m.save(os.path.join(save_path, file_name))
    print(f"Saved: {file_name}")

# ------------------ SAVE MAIN MAPS ------------------

# Subsets for the two threshold maps.
high_risk_df = final_df.loc[final_df['AQI'] > 150]   # high-risk zones
high_aqi_df = final_df.loc[final_df['AQI'] > 200]    # high AQI locations

# (data, output file, add heat layer?) — generated in this order:
#   1. overall heatmap
#   2. all pollution sources (markers only)
#   3. high-risk zones (AQI > 150)
#   4. high AQI locations (AQI > 200)
map_jobs = [
    (final_df, "pollution_heatmap.html", True),
    (final_df, "pollution_sources_map.html", False),
    (high_risk_df, "high_risk_zones.html", True),
    (high_aqi_df, "high_aqi_locations.html", True),
]
for map_frame, html_name, with_heat in map_jobs:
    create_map(map_frame, html_name, heatmap=with_heat)

print("✅ All four main maps saved successfully in 'all_html_file' folder!")

Saved: pollution_heatmap.html
Saved: pollution_sources_map.html
Saved: high_risk_zones.html
Saved: high_aqi_locations.html
✅ All four main maps saved successfully in 'all_html_file' folder!


In [13]:
# Inspect the column names of the in-memory final_df (displayed as the cell output).
final_df.columns

Index(['city_x', 'PM25 AQ', 'PM25 AQI Category', 'PM10 AQ',
       'PM10 AQI Category', 'CO AQ', 'CO AQI Category', 'NO2 AQ',
       'NO2 AQI Category', 'SO2 AQ', 'SO2 AQI Category', 'O3 AQ',
       'O3 AQI Category', 'AQI', 'location_id', 'temperature', 'humidity',
       'wind_speed', 'wind_deg', 'timestamp', 'hour', 'day_of_week', 'month',
       'season', 'city_y', 'Road_Count', 'Industrial_Count', 'Farmland_Count',
       'Landfill_Count', 'Dump_Site_Count', 'Distance_to_Nearest_Road_m',
       'Distance_to_Nearest_Industrial_m', 'Distance_to_Nearest_Farmland_m',
       'Distance_to_Nearest_Landfill_m', 'Distance_to_Nearest_Dump_m',
       'latitude', 'longitude', 'Pollution_Source'],
      dtype='object')