In [2]:
# ============================================================
# RANDOM FOREST TRAINING PIPELINE (FAST VERSION)
# ============================================================

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import randint

# ------------------------------------------------------------
# STEP 1: READ THE LABELED CSV INTO A DATAFRAME
# ------------------------------------------------------------
df = pd.read_csv("Final_clean_dataset_with_source.csv")

# ------------------------------------------------------------
# STEP 2: PICK THE MODEL INPUTS (X) AND THE LABEL COLUMN (y)
# ------------------------------------------------------------
# Label column the classifier is trained to predict.
target = 'pollution_source'

# Columns forwarded to the forest as-is (the ColumnTransformer in STEP 4
# only encodes the categorical columns and passes the rest through).
numeric_features = [
    'no2',
    'so2',
    'pm25',
    'co',
    'dist_nearest_road_m',
    'dist_nearest_industry_m',
    'dist_nearest_agriculture_m',
]

# Columns that are one-hot encoded in STEP 4.
categorical_features = ['season']

# Numeric columns first, categorical columns last; a missing column
# raises KeyError here rather than later during fitting.
X = df.loc[:, [*numeric_features, *categorical_features]]
y = df.loc[:, target]

# ------------------------------------------------------------
# STEP 3: HOLD OUT 20% OF THE ROWS FOR TESTING
# ------------------------------------------------------------
# Stratifying on y keeps class proportions equal in both parts; the
# fixed seed makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ------------------------------------------------------------
# STEP 4: COLUMN-WISE PREPROCESSING
# ------------------------------------------------------------
# One-hot encode the categorical columns; every other column is passed
# through unchanged. handle_unknown='ignore' keeps transform from
# raising on a category that was not present when the encoder was fit.
preprocessor = ColumnTransformer(
    transformers=[(
        'cat',
        OneHotEncoder(handle_unknown='ignore'),
        categorical_features,
    )],
    remainder='passthrough',
)

# ------------------------------------------------------------
# STEP 5: ESTIMATOR, PIPELINE AND RANDOMIZED SEARCH
# ------------------------------------------------------------
# NOTE(review): both this forest and the search below request all cores
# (n_jobs=-1) — confirm this does not oversubscribe the machine.
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

# Integer ranges sampled by the search. The 'classifier__' prefix routes
# each value to the pipeline step named 'classifier'. scipy's
# randint(low, high) excludes the upper bound.
param_dist = {
    'classifier__n_estimators': randint(100, 300),
    'classifier__max_depth': randint(5, 30),
    'classifier__min_samples_split': randint(2, 15),
    'classifier__min_samples_leaf': randint(1, 10)
}

# Chaining the encoder with the forest means each CV fold fits the
# encoder on that fold's training portion only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

# 20 sampled configurations, each scored by macro-averaged F1 over
# 3 cross-validation folds; the seed fixes which configurations are drawn.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=42,
    n_jobs=-1,
)

# ------------------------------------------------------------
# STEP 6: RUN THE HYPERPARAMETER SEARCH
# ------------------------------------------------------------
# Cross-validates every sampled configuration on the training split.
random_search.fit(X_train, y_train)

# ------------------------------------------------------------
# STEP 7: SCORE ON THE HELD-OUT TEST SPLIT
# ------------------------------------------------------------
# Predicting through the search object uses the best pipeline it found.
y_pred = random_search.predict(X_test)

print("Best Hyperparameters:", random_search.best_params_)

# Each metric takes (y_true, y_pred); computing and printing one at a
# time keeps the output order: report, confusion matrix, accuracy.
for heading, metric in (
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nAccuracy Score:", accuracy_score),
):
    print(heading, metric(y_test, y_pred))

# ------------------------------------------------------------
# STEP 8: PERSIST THE BEST PIPELINE
# ------------------------------------------------------------
# best_estimator_ is the full pipeline (preprocessor + forest), so the
# saved object is fed the same feature columns that make up X.
# NOTE(review): the In [3] cell dumps a different pipeline to this same
# file name and will overwrite this one — confirm that is intended.
joblib.dump(random_search.best_estimator_, "rf_pollution_source_model.pkl")
print("\n✅ Random Forest model saved as: rf_pollution_source_model.pkl")


Best Hyperparameters: {'classifier__max_depth': 25, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 191}

Classification Report:
               precision    recall  f1-score   support

     Burning       1.00      1.00      1.00       804
  Industrial       1.00      1.00      1.00       453
     Natural       1.00      1.00      1.00     10759
   Vehicular       1.00      1.00      1.00      3583

    accuracy                           1.00     15599
   macro avg       1.00      1.00      1.00     15599
weighted avg       1.00      1.00      1.00     15599


Confusion Matrix:
 [[  804     0     0     0]
 [    0   453     0     0]
 [    0     0 10759     0]
 [    0     0     0  3583]]

Accuracy Score: 1.0

✅ Random Forest model saved as: rf_pollution_source_model.pkl


In [3]:
# =============================================================
# RANDOM FOREST TRAINING FOR POLLUTION SOURCE PREDICTION
# =============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# ------------------------------------------------------------
# STEP 1: READ THE LABELED CSV INTO A DATAFRAME
# ------------------------------------------------------------
df = pd.read_csv("Final_clean_dataset_with_source.csv")

# ------------------------------------------------------------
# STEP 2: PICK THE MODEL INPUTS (X) AND THE LABEL COLUMN (y)
# ------------------------------------------------------------
# Label column the classifier is trained to predict.
target = 'pollution_source'

# Input columns, grouped one kind per line: pollutant readings,
# weather readings, then distances to the nearest sources.
features = [
    'pm25', 'pm10', 'no2', 'co', 'so2', 'o3',
    'temperature', 'humidity', 'wind_speed', 'wind_direction',
    'dist_nearest_road_m', 'dist_nearest_industry_m',
    'dist_nearest_dump_m', 'dist_nearest_agriculture_m',
]

X = df.loc[:, features]
y = df.loc[:, target]

# Map the label values to integer codes. The fitted encoder is written
# to disk so predictions can later be mapped back to the label names.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, "rf_label_encoder.pkl")

# ------------------------------------------------------------
# STEP 3: HOLD OUT 20% OF THE ROWS FOR TESTING
# ------------------------------------------------------------
# Stratifying on the encoded labels keeps class proportions equal in
# both parts; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# ------------------------------------------------------------
# STEP 4: PIPELINE AND RANDOMIZED SEARCH SETUP
# ------------------------------------------------------------
# Standardize every feature, then fit a seeded random forest.
# NOTE(review): tree splits should not depend on feature scale, so the
# scaler is presumably unnecessary — confirm before removing it.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Candidate values for the search. The 'classifier__' prefix routes
# each value to the pipeline step named 'classifier'; None for
# max_depth leaves tree depth unbounded.
param_dist = {
    'classifier__n_estimators': [100, 150, 200, 250],
    'classifier__max_depth': [10, 15, 20, 25, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# 20 sampled configurations, each scored by accuracy over 3 CV folds;
# verbose=1 prints the "Fitting ... folds" progress line.
# NOTE(review): the In [2] cell scores with 'f1_macro'; accuracy may be
# dominated by the largest class here — confirm the metric choice.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# ------------------------------------------------------------
# STEP 5: TRAIN MODEL
# ------------------------------------------------------------
# Cross-validates every sampled configuration on the training split and
# keeps the best pipeline (scaler + forest) for evaluation and saving.
random_search.fit(X_train, y_train)
rf_model = random_search.best_estimator_

print("\n✅ Best Hyperparameters:", random_search.best_params_)

# ------------------------------------------------------------
# STEP 6: EVALUATE MODEL
# ------------------------------------------------------------
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Accuracy Score:", accuracy)

# LabelEncoder assigns codes 0..n_classes-1 in the order of le.classes_.
# Passing the full code list pins the report rows and the confusion
# matrix shape to every known class. Without it, a class absent from
# both y_test and y_pred makes classification_report raise (its
# target_names length no longer matches) and shrinks the matrix.
class_ids = np.arange(len(le.classes_))

print(
    "\n✅ Classification Report:\n",
    classification_report(
        y_test, y_pred, labels=class_ids, target_names=le.classes_
    ),
)
# Rows are true classes, columns are predicted classes, both in
# le.classes_ order.
print("\n✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=class_ids))

# NOTE(review): the recorded run scores ~0.9999 accuracy. If the
# pollution_source labels were derived from these same features by
# fixed rules, that would explain it — confirm the labels are
# independent of the inputs before relying on this figure.

# ------------------------------------------------------------
# STEP 7: SAVE TRAINED MODEL
# ------------------------------------------------------------
# The saved pipeline outputs integer class codes; decode them with the
# label encoder stored in rf_label_encoder.pkl.
# NOTE(review): the In [2] cell writes a different pipeline to this same
# file name; whichever cell runs last wins — confirm that is intended.
joblib.dump(rf_model, "rf_pollution_source_model.pkl")
print("\n✅ Random Forest model saved as: rf_pollution_source_model.pkl")

# ------------------------------------------------------------
# SUMMARY
# ------------------------------------------------------------
print("\n================ MODEL TRAINING SUMMARY ================")
print(f"Total samples: {len(df)}")
print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")
print("Features used:", features)
# Use the variable so the summary cannot drift from the column actually used.
print(f"Target variable: {target}")
print("Random Forest trained with hyperparameter tuning using RandomizedSearchCV")
print("Evaluation metrics: Accuracy, Precision, Recall, F1-score, Confusion Matrix")
print("Model and label encoder saved for future predictions.")
print("========================================================")


Fitting 3 folds for each of 20 candidates, totalling 60 fits

✅ Best Hyperparameters: {'classifier__n_estimators': 150, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 25}

✅ Accuracy Score: 0.9998717866529906

✅ Classification Report:
               precision    recall  f1-score   support

     Burning       1.00      1.00      1.00       804
  Industrial       1.00      1.00      1.00       453
     Natural       1.00      1.00      1.00     10759
   Vehicular       1.00      1.00      1.00      3583

    accuracy                           1.00     15599
   macro avg       1.00      1.00      1.00     15599
weighted avg       1.00      1.00      1.00     15599


✅ Confusion Matrix:
 [[  803     0     1     0]
 [    0   452     1     0]
 [    0     0 10759     0]
 [    0     0     0  3583]]

✅ Random Forest model saved as: rf_pollution_source_model.pkl

Total samples: 77994
Training samples: 62395 | Testing samples: 15599
Features used: 