In [3]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting numpy (from xgboost)
  Downloading numpy-2.4.0-cp313-cp313-win_amd64.whl.metadata (6.6 kB)
Collecting scipy (from xgboost)
  Downloading scipy-1.16.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   - -------------------------------------- 2.1/72.0 MB 11.8 MB/s eta 0:00:06
   - -------------------------------------- 3.4/72.0 MB 8.6 MB/s eta 0:00:09
   --- ------------------------------------ 6.0/72.0 MB 9.8 MB/s eta 0:00:07
   ---- ----------------------------------- 8.7/72.0 MB 10.4 MB/s eta 0:00:07
   ----- ---------------------------------- 10.5/72.0 MB 10.0 MB/s eta 0:00:07
   ------ --------------------------------- 12.6/72.0 MB 10.3 MB/s eta 0:00:06
   -------- ------------------------------- 14.7/72.0 MB 10.4 MB/s eta 0:00:06
   --------- -------------------

In [4]:
!pip install pandas scikit-learn joblib


Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting joblib
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ------ ------

In [1]:
import xgboost
import pandas as pd
import sklearn
import joblib

print("XGBoost version:", xgboost.__version__)
print("Pandas version:", pd.__version__)
print("Scikit-learn version:", sklearn.__version__)


XGBoost version: 3.1.2
Pandas version: 2.3.3
Scikit-learn version: 1.8.0


In [4]:
# ============================================================
# XGBOOST TRAINING PIPELINE (WITH LABEL ENCODING)
# ============================================================

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import randint, uniform

# ------------------------------------------------------------
# STEP 1: LOAD LABELED DATASET
# ------------------------------------------------------------
df = pd.read_csv("Final_clean_dataset_with_source.csv")

# ------------------------------------------------------------
# STEP 2: DEFINE FEATURES AND TARGET
# ------------------------------------------------------------
target = 'pollution_source'

numeric_features = [
    'no2','so2','pm25','co',
    'dist_nearest_road_m',
    'dist_nearest_industry_m',
    'dist_nearest_agriculture_m'
]

categorical_features = ['season']

X = df[numeric_features + categorical_features]
y = df[target]

# ------------------------------------------------------------
# STEP 2a: ENCODE TARGET LABELS
# ------------------------------------------------------------
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, "label_encoder.pkl")  # Save encoder for later

# ------------------------------------------------------------
# STEP 3: TRAIN–TEST SPLIT (80/20)
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# ------------------------------------------------------------
# STEP 4: PREPROCESSING PIPELINE
# ------------------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# ------------------------------------------------------------
# STEP 5: DEFINE XGBOOST MODEL
# ------------------------------------------------------------
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),  # Number of classes
    eval_metric='mlogloss',
    use_label_encoder=False,
    n_jobs=-1,
    random_state=42
)

# Hyperparameter search space
param_dist = {
    'classifier__n_estimators': randint(100, 300),
    'classifier__max_depth': randint(3, 15),
    'classifier__learning_rate': uniform(0.01, 0.3),
    'classifier__subsample': uniform(0.6, 0.4),
    'classifier__colsample_bytree': uniform(0.6, 0.4)
}

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    random_state=42
)

# ------------------------------------------------------------
# STEP 6: TRAIN THE MODEL
# ------------------------------------------------------------
random_search.fit(X_train, y_train)

# ------------------------------------------------------------
# STEP 7: EVALUATE MODEL
# ------------------------------------------------------------
y_pred = random_search.predict(X_test)
y_pred_labels = le.inverse_transform(y_pred)  # Decode to original string labels
y_test_labels = le.inverse_transform(y_test)

print("Best Hyperparameters:", random_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_labels, y_pred_labels))
print("\nAccuracy Score:", accuracy_score(y_test_labels, y_pred_labels))

# ------------------------------------------------------------
# STEP 8: SAVE TRAINED MODEL
# ------------------------------------------------------------
joblib.dump(random_search.best_estimator_, "xgb_pollution_source_model.pkl")
print("\n✅ XGBoost model saved as: xgb_pollution_source_model.pkl")
print("✅ Label encoder saved as: label_encoder.pkl")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Hyperparameters: {'classifier__colsample_bytree': np.float64(0.9332779646944658), 'classifier__learning_rate': np.float64(0.062009396052331626), 'classifier__max_depth': 3, 'classifier__n_estimators': 263, 'classifier__subsample': np.float64(0.672894435115225)}

Classification Report:
               precision    recall  f1-score   support

     Burning       1.00      1.00      1.00       804
  Industrial       1.00      1.00      1.00       453
     Natural       1.00      1.00      1.00     10759
   Vehicular       1.00      1.00      1.00      3583

    accuracy                           1.00     15599
   macro avg       1.00      1.00      1.00     15599
weighted avg       1.00      1.00      1.00     15599


Confusion Matrix:
 [[  802     0     0     2]
 [    0   452     1     0]
 [    0     1 10745    13]
 [    0     0     1  3582]]

Accuracy Score: 0.9988460798769152

✅ XGBoost model saved as: xgb_pollution_source_model.pkl
✅ Label encoder saved as: label_encoder.pkl


In [5]:
# =============================================================
# XGBOOST TRAINING FOR POLLUTION SOURCE PREDICTION
# =============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
import joblib

# ------------------------------------------------------------
# STEP 1: LOAD LABELED DATASET
# ------------------------------------------------------------
df = pd.read_csv("Final_clean_dataset_with_source.csv")

# ------------------------------------------------------------
# STEP 2: DEFINE FEATURES AND TARGET
# ------------------------------------------------------------
features = [
    'pm25', 'pm10', 'no2', 'co', 'so2', 'o3',
    'temperature', 'humidity', 'wind_speed', 'wind_direction',
    'dist_nearest_road_m', 'dist_nearest_industry_m', 
    'dist_nearest_dump_m', 'dist_nearest_agriculture_m'
]

target = 'pollution_source'

X = df[features]
y = df[target]

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, "xgb_label_encoder.pkl")  # Save label encoder

# ------------------------------------------------------------
# STEP 3: SPLIT DATASET (80/20 TRAIN-TEST)
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ------------------------------------------------------------
# STEP 4: DEFINE XGBOOST PIPELINE & HYPERPARAMETERS
# ------------------------------------------------------------
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scale features
    ('classifier', XGBClassifier(
        objective='multi:softmax', 
        num_class=len(np.unique(y_encoded)),
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    ))
])

param_dist = {
    'classifier__n_estimators': [100, 200, 300, 400],
    'classifier__max_depth': [3, 5, 7, 9, 11],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__subsample': [0.6, 0.7, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 1.0]
}

# Randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_dist, n_iter=20, cv=3,
    scoring='accuracy', random_state=42, n_jobs=-1, verbose=1
)

# ------------------------------------------------------------
# STEP 5: TRAIN MODEL
# ------------------------------------------------------------
random_search.fit(X_train, y_train)
xgb_model = random_search.best_estimator_

print("\n✅ Best Hyperparameters:", random_search.best_params_)

# ------------------------------------------------------------
# STEP 6: EVALUATE MODEL
# ------------------------------------------------------------
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Accuracy Score:", accuracy)

print("\n✅ Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("\n✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ------------------------------------------------------------
# STEP 7: SAVE TRAINED MODEL
# ------------------------------------------------------------
joblib.dump(xgb_model, "xgb_pollution_source_model.pkl")
print("\n✅ XGBoost model saved as: xgb_pollution_source_model.pkl")

# ------------------------------------------------------------
# SUMMARY
# ------------------------------------------------------------
print("\n================ MODEL TRAINING SUMMARY ================")
print(f"Total samples: {len(df)}")
print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")
print("Features used:", features)
print("Target variable: pollution_source")
print("XGBoost trained with hyperparameter tuning using RandomizedSearchCV")
print("Evaluation metrics: Accuracy, Precision, Recall, F1-score, Confusion Matrix")
print("Model and label encoder saved for future predictions.")
print("========================================================")


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Best Hyperparameters: {'classifier__subsample': 0.8, 'classifier__n_estimators': 400, 'classifier__max_depth': 7, 'classifier__learning_rate': 0.1, 'classifier__colsample_bytree': 0.7}

✅ Accuracy Score: 0.9987178665299058

✅ Classification Report:
               precision    recall  f1-score   support

     Burning       1.00      1.00      1.00       804
  Industrial       1.00      1.00      1.00       453
     Natural       1.00      1.00      1.00     10759
   Vehicular       1.00      1.00      1.00      3583

    accuracy                           1.00     15599
   macro avg       1.00      1.00      1.00     15599
weighted avg       1.00      1.00      1.00     15599


✅ Confusion Matrix:
 [[  804     0     0     0]
 [    0   452     1     0]
 [    0     1 10745    13]
 [    0     1     4  3578]]

✅ XGBoost model saved as: xgb_pollution_source_model.pkl

Total samples: 77994
Training samples: 62395 | Testing samples: 15599
Features used: ['pm25', 'pm10', 'no2', 'co', 'so2', 