## 🔧 Model Training Pipeline
We will now train predictive models on the pollution dataset, compare their performance, and evaluate them using metrics such as RMSE and R².

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Core modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
import pandas as pd

# Read the pollution measurements; the relative path is resolved against
# the current working directory.
data_path = 'Dataset_Cities.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: (rows, columns) first, then a preview of the leading rows.
print(df.shape)
df.head()

(3164, 11)


Unnamed: 0,country,state,city,station,last_update,latitude,longitude,pollutant_id,pollutant_min,pollutant_max,pollutant_avg
0,India,Bihar,Aurangabad,"Gurdeo Nagar, Aurangabad - BSPCB",28-08-2025 19:00:00,24.75746,84.366208,PM10,25.0,166.0,73.0
1,India,Bihar,Begusarai,"Lohiyanagar, Begusarai - BSPCB",28-08-2025 19:00:00,25.42742,86.138861,NH3,3.0,4.0,4.0
2,India,Bihar,Bettiah,"Kamalnath Nagar, Bettiah - BSPCB",28-08-2025 19:00:00,26.80365,84.51954,PM10,5.0,59.0,32.0
3,India,Bihar,Bettiah,"Kamalnath Nagar, Bettiah - BSPCB",28-08-2025 19:00:00,26.80365,84.51954,NO2,5.0,12.0,8.0
4,India,Bihar,Bettiah,"Kamalnath Nagar, Bettiah - BSPCB",28-08-2025 19:00:00,26.80365,84.51954,CO,39.0,75.0,59.0


### 1. Preprocessing

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# 🔎 Inspect dataset columns to confirm the target and the feature set
print("Available columns:", df.columns.tolist())

# ✅ Name the target explicitly rather than taking "the last column", so the
# notebook keeps predicting the same quantity if the CSV column order changes.
target = "pollutant_avg"
if target not in df.columns:
    raise KeyError(
        f"Target column {target!r} not found; available columns: {df.columns.tolist()}"
    )

# Separate features and target
X = df.drop(columns=[target])
y = df[target]

# Numeric columns are selected positively by dtype; everything else (text,
# timestamps stored as text, ...) is treated as categorical. Selecting numerics
# via "not object" would route any non-object, non-numeric column into the
# median imputer, which cannot handle it.
# NOTE(review): the features include pollutant_min / pollutant_max of the same
# reading as the target - presumably tightly coupled to pollutant_avg; confirm
# they are really available at prediction time.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [col for col in X.columns if col not in numeric_cols]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Full preprocessing pipeline: each branch is applied to its own column list.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

Available columns: ['country', 'state', 'city', 'station', 'last_update', 'latitude', 'longitude', 'pollutant_id', 'pollutant_min', 'pollutant_max', 'pollutant_avg']


### 2. Model Training

In [4]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Drop rows where the target is NaN (features and target are filtered together
# so they stay aligned).
data = pd.concat([X, y], axis=1).dropna(subset=[target])
X = data.drop(columns=[target])
y = data[target]

# Train-test split: 80% train / 20% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each pipeline receives its own unfitted copy of the preprocessor. A Pipeline
# keeps the step objects it is handed without copying them, so passing the same
# instance to both pipelines would make the second fit() overwrite the fitted
# state that the first pipeline relies on.

# RandomForest pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestRegressor(random_state=42))
])

# Linear Regression baseline pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', LinearRegression())
])

# Fit models (the last expression also renders the fitted pipeline's repr)
rf_pipeline.fit(X_train, y_train)
lr_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Model Comparison

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.base import clone

# Load dataset; the relative path is resolved against the working directory,
# so point it elsewhere if the CSV does not sit next to the notebook.
file_path = "Dataset_Cities.csv"
df = pd.read_csv(file_path)

target = "pollutant_avg"  # column to predict
df = df.dropna(subset=[target])  # drop rows with missing target

X = df.drop(columns=[target])
y = df[target]

# Numeric columns by dtype; every remaining column is treated as categorical.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Give each model pipeline its own copy of the preprocessor: a Pipeline stores
# its steps by reference, so sharing one instance would let the second fit()
# refit the transformer the first pipeline depends on.
rf = Pipeline([("preprocessor", clone(preprocessor)),
               ("model", RandomForestRegressor(random_state=42))])

lr = Pipeline([("preprocessor", clone(preprocessor)),
               ("model", LinearRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit both models on the training split
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

def show_metrics(name, y_true, y_pred):
    """Print R2, MAE and RMSE for one set of predictions on a single line.

    Parameters
    ----------
    name : label printed in front of the scores.
    y_true : observed target values.
    y_pred : predicted values to score against ``y_true``.

    Returns
    -------
    None. The scores are only printed, each with three decimals.
    """
    scores = {
        "R2": r2_score(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        # RMSE is taken as the square root of MSE by hand, so no
        # version-specific RMSE helper or keyword is needed.
        "RMSE": mean_squared_error(y_true, y_pred) ** 0.5,
    }
    summary = " | ".join(f"{label}: {value:.3f}" for label, value in scores.items())
    print(f"{name} -> {summary}")


# Hold-out scores on the 20% test split
show_metrics("RandomForest", y_test, rf.predict(X_test))
show_metrics("LinearRegression", y_test, lr.predict(X_test))

# Mean 5-fold cross-validated R2 over the full dataset for each pipeline.
# cross_val_score fits fresh copies of the estimator, so the rf / lr objects
# fitted above are left untouched.
# NOTE(review): with an integer cv and a regressor the folds are presumably
# contiguous, unshuffled slices; the preview rows look grouped by state, so each
# fold may be a regional block - confirm that is the intended evaluation.
for label, estimator in (("RF", rf), ("LR", lr)):
    cv_r2 = cross_val_score(estimator, X, y, cv=5, scoring="r2").mean()
    print(f"CV R2 ({label}):", cv_r2)

# Status message (check-mark emoji restored from its mis-encoded form)
print("Model is trained ☑️")

RandomForest -> R2: 0.950 | MAE: 3.392 | RMSE: 6.444
LinearRegression -> R2: 0.914 | MAE: 5.033 | RMSE: 8.431
CV R2 (RF): 0.9194206631447299
CV R2 (LR): 0.8874436788433362
Model is trained ☑️
