In [1]:
# ===== Robust fixed pipeline (paste this entire cell) =====
import os, sys
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")

# Paths
csv_path = 'C:/Users/91955/Desktop/infosys_aqi_project/data/AirQualityUCI.csv'
model_path = r"C:/Users/91955/Desktop/infosys_aqi_project/models/stack_ensemble_aqi.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# --- 1) Load and canonicalize raw data ---
df = pd.read_csv(csv_path, sep=';')
df = df.replace(-200, np.nan)

# Remove trailing/leading spaces and stray semicolons in object columns
for c in df.columns:
    if df[c].dtype == object:
        df[c] = df[c].astype(str).str.strip().str.replace(';','', regex=False)

# --- 2) Convert numeric columns you plan to use to numeric (coerce invalid -> NaN) ---
candidate_features = ['CO(GT)', 'NO2(GT)', 'PT08.S5(O3)', 'T', 'RH', 'AH']
for c in candidate_features:
    if c in df.columns:
        # handle categorical dtype as well
        if pd.api.types.is_categorical_dtype(df[c]):
            df[c] = df[c].cat.as_ordered().codes
        df[c] = pd.to_numeric(df[c], errors='coerce')

# --- 3) Compute IAQI buckets (your deterministic rules) ---
# safe guards: if column missing, create NaN/placeholder
df['IAQI_CO'] = np.nan
df['IAQI_NO2'] = np.nan
df['IAQI_O3'] = np.nan

if 'CO(GT)' in df.columns:
    df.loc[:, 'IAQI_CO'] = np.where(df['CO(GT)'] <= 2, 25,
                                    np.where(df['CO(GT)'] <= 4, 75, 125))

if 'NO2(GT)' in df.columns:
    df.loc[:, 'IAQI_NO2'] = np.where(df['NO2(GT)'] <= 40, 25,
                                     np.where(df['NO2(GT)'] <= 80, 75, 125))

if 'PT08.S5(O3)' in df.columns:
    try:
        # qcut on non-NA values only; use duplicates='drop' to prevent errors
        df.loc[:, 'O3_bucket'] = pd.qcut(df['PT08.S5(O3)'], q=3, labels=[0,1,2], duplicates='drop')
        df.loc[:, 'IAQI_O3'] = df['O3_bucket'].map({0:25, 1:75, 2:125})
    except Exception:
        df.loc[:, 'IAQI_O3'] = 75
else:
    df.loc[:, 'IAQI_O3'] = 0

# --- 4) AQI and class mapping ---
df.loc[:, 'AQI_calc'] = df[['IAQI_CO','IAQI_NO2','IAQI_O3']].max(axis=1)

def aqi_to_label(v):
    if pd.isna(v): return np.nan
    if v <= 50: return 0
    if v <= 100: return 1
    return 2

df.loc[:, 'AQI_class'] = df['AQI_calc'].apply(aqi_to_label)

# --- 5) Create next-hour target so target is not identical to features (prevents trivial mapping) ---
df = df.reset_index(drop=True)
df.loc[:, 'AQI_next'] = df['AQI_class'].shift(-1)
df_time = df.dropna(subset=['AQI_next']).copy()
df_time.loc[:, 'AQI_next'] = df_time['AQI_next'].astype(int)

# --- 6) Build feature matrix & target ---
features = [f for f in candidate_features if f in df_time.columns]
if len(features) == 0:
    raise RuntimeError("No feature columns found. Check CSV columns.")

X_time = df_time[features].copy()
y_time = df_time['AQI_next'].copy()

# --- 7) Ensure feature columns are numeric (force) BEFORE split to avoid strange categorical dtypes ---
for c in X_time.columns:
    # strip & coerce once more
    X_time.loc[:, c] = pd.to_numeric(X_time[c], errors='coerce')

# --- 8) Time-based split (chronological). If later test contains <2 classes, fallback to stratified split. ---
split_idx = int(0.8 * len(X_time))
X_tr, X_te = X_time.iloc[:split_idx].copy(), X_time.iloc[split_idx:].copy()
y_tr, y_te = y_time.iloc[:split_idx].copy(), y_time.iloc[split_idx:].copy()
print("Initial time split sizes -> train:", X_tr.shape, " test:", X_te.shape)
print("Initial class counts (train/test):", y_tr.value_counts().to_dict(), y_te.value_counts().to_dict())

# If test has fewer than 2 classes, fallback to stratified random split so evaluation is meaningful
if y_te.nunique() < 2:
    print("WARNING: time-based test has <2 classes. Falling back to stratified random split.")
    X_tr, X_te, y_tr, y_te = train_test_split(X_time, y_time, test_size=0.2, random_state=42, stratify=y_time)
    print("Stratified split sizes -> train:", X_tr.shape, " test:", X_te.shape)
    print("Class counts (train/test):", y_tr.value_counts().to_dict(), y_te.value_counts().to_dict())

# --- 9) Force numeric types again and convert all to float64 to satisfy xgboost ---
for c in X_tr.columns:
    X_tr.loc[:, c] = pd.to_numeric(X_tr[c], errors='coerce').astype('float64')
    X_te.loc[:, c] = pd.to_numeric(X_te[c], errors='coerce').astype('float64')

# --- 10) Drop columns that are entirely NaN in training set (cannot impute meaningful value) ---
all_nan = [c for c in X_tr.columns if X_tr[c].isna().all()]
if all_nan:
    print("Dropping all-NaN columns:", all_nan)
    X_tr.drop(columns=all_nan, inplace=True)
    X_te.drop(columns=all_nan, inplace=True)

# --- 11) Impute medians using training set ONLY ---
numeric_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) == 0:
    raise RuntimeError("No numeric columns available for training after coercion.")
imputer = SimpleImputer(strategy='median')
imputer.fit(X_tr[numeric_cols])
X_tr.loc[:, numeric_cols] = imputer.transform(X_tr[numeric_cols])
X_te.loc[:, numeric_cols] = imputer.transform(X_te[numeric_cols])

print("Any NaN left in X_tr after impute?", X_tr.isna().any().any())
print("Any NaN left in X_te after impute?", X_te.isna().any().any())
print("Final feature dtypes:\n", X_tr.dtypes)

# --- 12) Final check: ensure y are ints and have at least 2 classes ---
y_tr = y_tr.astype(int)
y_te = y_te.astype(int)
print("Final class distribution (train):\n", y_tr.value_counts())
print("Final class distribution (test):\n", y_te.value_counts())

# --- 13) Train RandomForest baseline ---
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_tr, y_tr)
y_pred_rf = rf.predict(X_te)
print("\nRF accuracy:", round(accuracy_score(y_te, y_pred_rf),4))
print(classification_report(y_te, y_pred_rf))
print("RF confusion matrix:\n", confusion_matrix(y_te, y_pred_rf))

# --- 14) Train Stacking ensemble (ensure all columns numeric floats) ---
for c in X_tr.columns:
    X_tr.loc[:, c] = X_tr[c].astype('float64')
    X_te.loc[:, c] = X_te[c].astype('float64')

rf_b = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
xg = xgb.XGBClassifier(n_estimators=150, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
lg = lgb.LGBMClassifier(n_estimators=150, random_state=42)

estimators = [('rf', rf_b), ('xg', xg), ('lg', lg)]
stack = StackingClassifier(estimators=estimators,
                           final_estimator=LogisticRegression(max_iter=1000),
                           cv=5, n_jobs=-1, passthrough=False)

print("\nTraining stacking ensemble...")
stack.fit(X_tr, y_tr)
print("Done training.")

# --- 15) Evaluate ---
y_pred_stack = stack.predict(X_te)
acc_stack = accuracy_score(y_te, y_pred_stack)
print("\nStacking ensemble accuracy:", round(acc_stack, 4))
print(classification_report(y_te, y_pred_stack))
print("Ensemble confusion matrix:\n", confusion_matrix(y_te, y_pred_stack))

# --- 16) Save model + imputer + feature order ---
save_dict = {'model': stack, 'imputer': imputer, 'features': X_tr.columns.tolist()}
joblib.dump(save_dict, model_path)
print("\nSaved ensemble + imputer to:", model_path)

# --- 17) Example single-row inference (coerce -> impute -> predict) ---
sample = {'CO(GT)': 2.8, 'NO2(GT)': 55.0, 'PT08.S5(O3)': 650, 'T': 11.2, 'RH': 36.0, 'AH': 0.75}
sample_df = pd.DataFrame([sample])
sample_df = sample_df[[c for c in X_tr.columns if c in sample_df.columns]]
for c in sample_df.columns:
    sample_df.loc[:, c] = pd.to_numeric(sample_df[c], errors='coerce').astype('float64')
sample_df.loc[:, numeric_cols] = imputer.transform(sample_df[numeric_cols])
pred_next = stack.predict(sample_df)[0]
probs_next = stack.predict_proba(sample_df)[0]
label_map = {0:'Good', 1:'Moderate', 2:'Unhealthy'}
print("\nSample next-hour prediction:")
print(sample_df)
print("Predicted next-hour class:", pred_next, label_map[pred_next])
print("Class probs:", probs_next)


Initial time split sizes -> train: (7576, 6)  test: (1894, 6)
Initial class counts (train/test): {2.0: 7243, 1.0: 270, 0.0: 63} {2.0: 1894}
Stratified split sizes -> train: (7576, 6)  test: (1894, 6)
Class counts (train/test): {2.0: 7310, 1.0: 216, 0.0: 50} {2.0: 1827, 1.0: 54, 0.0: 13}
Any NaN left in X_tr after impute? False
Any NaN left in X_te after impute? False
Final feature dtypes:
 CO(GT)         float64
NO2(GT)        float64
PT08.S5(O3)    float64
T              float64
RH             float64
AH             float64
dtype: object
Final class distribution (train):
 AQI_next
2    7310
1     216
0      50
Name: count, dtype: int64
Final class distribution (test):
 AQI_next
2    1827
1      54
0      13
Name: count, dtype: int64

RF accuracy: 0.953
              precision    recall  f1-score   support

           0       0.09      0.08      0.08        13
           1       0.11      0.04      0.06        54
           2       0.97      0.99      0.98      1827

    accuracy      

  File "C:\Users\91955\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
  File "C:\Users\91955\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\91955\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\91955\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
            


Saved ensemble + imputer to: C:/Users/91955/Desktop/infosys_aqi_project/models/stack_ensemble_aqi.joblib

Sample next-hour prediction:
   CO(GT)  NO2(GT)  PT08.S5(O3)     T    RH    AH
0     2.8     55.0          650  11.2  36.0  0.75
Predicted next-hour class: 2 Unhealthy
Class probs: [0.00511942 0.02466155 0.97021903]


In [2]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Paths
csv_path = 'C:/Users/91955/Desktop/infosys_aqi_project/data/AirQualityUCI.csv'
model_path = r"C:/Users/91955/Desktop/infosys_aqi_project/models/stack_ensemble_aqi.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

In [23]:
# --- 1) Load and canonicalize raw data ---
df = pd.read_csv(csv_path, sep=';')
df = df.replace(-200, np.nan)
df.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,7578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,7255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,7502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,7867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,7888,,


In [5]:
# Remove trailing/leading spaces and stray semicolons in object columns
for c in df.columns:
    if df[c].dtype == object:
        df[c] = df[c].astype(str).str.strip().str.replace(';','', regex=False)

In [6]:
# --- 2) Convert numeric columns you plan to use to numeric (coerce invalid -> NaN) ---
candidate_features = ['CO(GT)', 'NO2(GT)', 'PT08.S5(O3)', 'T', 'RH', 'AH']
for c in candidate_features:
    if c in df.columns:
        # handle categorical dtype as well
        if pd.api.types.is_categorical_dtype(df[c]):
            df[c] = df[c].cat.as_ordered().codes
        df[c] = pd.to_numeric(df[c], errors='coerce')

In [7]:
# --- 3) Compute IAQI buckets (your deterministic rules) ---
# safe guards: if column missing, create NaN/placeholder
df['IAQI_CO'] = np.nan
df['IAQI_NO2'] = np.nan
df['IAQI_O3'] = np.nan

if 'CO(GT)' in df.columns:
    df.loc[:, 'IAQI_CO'] = np.where(df['CO(GT)'] <= 2, 25,
                                    np.where(df['CO(GT)'] <= 4, 75, 125))

if 'NO2(GT)' in df.columns:
    df.loc[:, 'IAQI_NO2'] = np.where(df['NO2(GT)'] <= 40, 25,
                                     np.where(df['NO2(GT)'] <= 80, 75, 125))

if 'PT08.S5(O3)' in df.columns:
    try:
        # qcut on non-NA values only; use duplicates='drop' to prevent errors
        df.loc[:, 'O3_bucket'] = pd.qcut(df['PT08.S5(O3)'], q=3, labels=[0,1,2], duplicates='drop')
        df.loc[:, 'IAQI_O3'] = df['O3_bucket'].map({0:25, 1:75, 2:125})
    except Exception:
        df.loc[:, 'IAQI_O3'] = 75
else:
    df.loc[:, 'IAQI_O3'] = 0


In [8]:
# --- 4) AQI and class mapping ---
df.loc[:, 'AQI_calc'] = df[['IAQI_CO','IAQI_NO2','IAQI_O3']].max(axis=1)

def aqi_to_label(v):
    if pd.isna(v): return np.nan
    if v <= 50: return 0
    if v <= 100: return 1
    return 2

df.loc[:, 'AQI_class'] = df['AQI_calc'].apply(aqi_to_label)

In [9]:
# --- 5) Create next-hour target so target is not identical to features (prevents trivial mapping) ---
df = df.reset_index(drop=True)
df.loc[:, 'AQI_next'] = df['AQI_class'].shift(-1)
df_time = df.dropna(subset=['AQI_next']).copy()
df_time.loc[:, 'AQI_next'] = df_time['AQI_next'].astype(int)

In [10]:
# --- 6) Build feature matrix & target ---
features = [f for f in candidate_features if f in df_time.columns]
if len(features) == 0:
    raise RuntimeError("No feature columns found. Check CSV columns.")

X_time = df_time[features].copy()
y_time = df_time['AQI_next'].copy()

In [11]:

# --- 7) Ensure feature columns are numeric (force) BEFORE split to avoid strange categorical dtypes ---
for c in X_time.columns:
    # strip & coerce once more
    X_time.loc[:, c] = pd.to_numeric(X_time[c], errors='coerce')


In [13]:
# --- 8) Time-based split (chronological). If later test contains <2 classes, fallback to stratified split. ---
split_idx = int(0.8 * len(X_time))
X_tr, X_te = X_time.iloc[:split_idx].copy(), X_time.iloc[split_idx:].copy()
y_tr, y_te = y_time.iloc[:split_idx].copy(), y_time.iloc[split_idx:].copy()
print(" train:", X_tr.shape, " test:", X_te.shape)
print("Initial class counts (train/test):", y_tr.value_counts().to_dict(), y_te.value_counts().to_dict())

# If test has fewer than 2 classes, fallback to stratified random split so evaluation is meaningful
if y_te.nunique() < 2:
    print("WARNING: time-based test has <2 classes. Falling back to stratified random split.")
    X_tr, X_te, y_tr, y_te = train_test_split(X_time, y_time, test_size=0.2, random_state=42, stratify=y_time)
    print("Stratified split sizes -> train:", X_tr.shape, " test:", X_te.shape)
    print("Class counts (train/test):", y_tr.value_counts().to_dict(), y_te.value_counts().to_dict())


 train: (7576, 6)  test: (1894, 6)
Initial class counts (train/test): {2.0: 7243, 1.0: 270, 0.0: 63} {2.0: 1894}
Stratified split sizes -> train: (7576, 6)  test: (1894, 6)
Class counts (train/test): {2.0: 7310, 1.0: 216, 0.0: 50} {2.0: 1827, 1.0: 54, 0.0: 13}


In [14]:
# --- 9) Force numeric types again and convert all to float64 to satisfy xgboost ---
for c in X_tr.columns:
    X_tr.loc[:, c] = pd.to_numeric(X_tr[c], errors='coerce').astype('float64')
    X_te.loc[:, c] = pd.to_numeric(X_te[c], errors='coerce').astype('float64')

In [15]:
# --- 10) Drop columns that are entirely NaN in training set (cannot impute meaningful value) ---
all_nan = [c for c in X_tr.columns if X_tr[c].isna().all()]
if all_nan:
    print("Dropping all-NaN columns:", all_nan)
    X_tr.drop(columns=all_nan, inplace=True)
    X_te.drop(columns=all_nan, inplace=True)

In [16]:

# --- 11) Impute medians using training set ONLY ---
numeric_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) == 0:
    raise RuntimeError("No numeric columns available for training after coercion.")
imputer = SimpleImputer(strategy='median')
imputer.fit(X_tr[numeric_cols])
X_tr.loc[:, numeric_cols] = imputer.transform(X_tr[numeric_cols])
X_te.loc[:, numeric_cols] = imputer.transform(X_te[numeric_cols])

print("Any NaN left in X_tr after impute?", X_tr.isna().any().any())
print("Any NaN left in X_te after impute?", X_te.isna().any().any())
print("Final feature dtypes:\n", X_tr.dtypes)


Any NaN left in X_tr after impute? False
Any NaN left in X_te after impute? False
Final feature dtypes:
 CO(GT)         float64
NO2(GT)        float64
PT08.S5(O3)    float64
T              float64
RH             float64
AH             float64
dtype: object


In [17]:
# --- 12) Final check: ensure y are ints and have at least 2 classes ---
y_tr = y_tr.astype(int)
y_te = y_te.astype(int)
print("Final class distribution (train):\n", y_tr.value_counts())
print("Final class distribution (test):\n", y_te.value_counts())

Final class distribution (train):
 AQI_next
2    7310
1     216
0      50
Name: count, dtype: int64
Final class distribution (test):
 AQI_next
2    1827
1      54
0      13
Name: count, dtype: int64


In [18]:
# --- 13) Train RandomForest baseline ---
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_tr, y_tr)
y_pred_rf = rf.predict(X_te)
print("\nRF accuracy:", round(accuracy_score(y_te, y_pred_rf),4))
print(classification_report(y_te, y_pred_rf))
print("RF confusion matrix:\n", confusion_matrix(y_te, y_pred_rf))


RF accuracy: 0.953
              precision    recall  f1-score   support

           0       0.09      0.08      0.08        13
           1       0.11      0.04      0.06        54
           2       0.97      0.99      0.98      1827

    accuracy                           0.95      1894
   macro avg       0.39      0.37      0.37      1894
weighted avg       0.94      0.95      0.94      1894

RF confusion matrix:
 [[   1    0   12]
 [   1    2   51]
 [   9   16 1802]]


In [19]:
# --- 14) Train Stacking ensemble (ensure all columns numeric floats) ---
for c in X_tr.columns:
    X_tr.loc[:, c] = X_tr[c].astype('float64')
    X_te.loc[:, c] = X_te[c].astype('float64')

rf_b = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
xg = xgb.XGBClassifier(n_estimators=150, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
lg = lgb.LGBMClassifier(n_estimators=150, random_state=42)

estimators = [('rf', rf_b), ('xg', xg), ('lg', lg)]
stack = StackingClassifier(estimators=estimators,
                           final_estimator=LogisticRegression(max_iter=1000),
                           cv=5, n_jobs=-1, passthrough=False)

print("\nTraining stacking ensemble...")
stack.fit(X_tr, y_tr)
print("Done training.")



Training stacking ensemble...
Done training.


In [20]:
# --- 15) Evaluate ---
y_pred_stack = stack.predict(X_te)
acc_stack = accuracy_score(y_te, y_pred_stack)
print("\nStacking ensemble accuracy:", round(acc_stack, 4))
print(classification_report(y_te, y_pred_stack))
print("Ensemble confusion matrix:\n", confusion_matrix(y_te, y_pred_stack))



Stacking ensemble accuracy: 0.9641
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.00      0.00      0.00        54
           2       0.97      1.00      0.98      1827

    accuracy                           0.96      1894
   macro avg       0.32      0.33      0.33      1894
weighted avg       0.93      0.96      0.95      1894

Ensemble confusion matrix:
 [[   0    0   13]
 [   1    0   53]
 [   1    0 1826]]


In [21]:
# --- 16) Save model + imputer + feature order ---
save_dict = {'model': stack, 'imputer': imputer, 'features': X_tr.columns.tolist()}
joblib.dump(save_dict, model_path)
print("\nSaved ensemble + imputer to:", model_path)


Saved ensemble + imputer to: C:/Users/91955/Desktop/infosys_aqi_project/models/stack_ensemble_aqi.joblib
