In [1]:
import pandas as pd
import numpy as np
import json
import joblib
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# ============================================================
# PATHS
# ============================================================
# Input CSV and the directory that receives every artifact written below.
data_path = r"C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection\archive\pollution_us_2000_2016.csv"
save_path = r"C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection"

os.makedirs(save_path, exist_ok=True)

# ============================================================
# Get feature names from the first rows of the file
# ============================================================
# nrows=20000 reads the same leading rows that the first chunk of a
# chunksize=20000 reader would yield, without leaving a half-consumed chunk
# iterator (and its open file handle) behind.
chunk0 = pd.read_csv(data_path, nrows=20000)
chunk0 = chunk0.drop(columns=["Address", "NO2 Units", "O3 Units", "SO2 Units", "CO Units"])

# Expand the date into numeric parts, mirroring the per-chunk preprocessing
# used during training so the column order is the same.
chunk0["Date Local"] = pd.to_datetime(chunk0["Date Local"])
chunk0["Year"] = chunk0["Date Local"].dt.year
chunk0["Month"] = chunk0["Date Local"].dt.month
chunk0["Day"] = chunk0["Date Local"].dt.day

# "NO2 Mean" is the regression target, so it is not a feature.
chunk0 = chunk0.drop(columns=["Date Local", "NO2 Mean"])

feature_names = list(chunk0.columns)
DIM = len(feature_names)  # length of every binary feature mask

print("Total Features:", DIM)

# ============================================================
# Define AIS Functions
# ============================================================
POP = 10          # number of candidate masks per AIS generation
AIS_GEN = 5       # number of AIS generations
MUT_RATE = 0.25   # fraction of mask positions flipped when cloning


def generate_mask():
    """Return a random integer vector of length ``DIM`` whose entries are 0 or 1."""
    return np.random.randint(low=0, high=2, size=DIM)

def evaluate_mask(mask):
    """Score a binary feature mask by streaming the CSV through an SGD regressor.

    The file at ``data_path`` is read in chunks of 20000 rows. Each chunk is
    cleaned, its date is split into Year/Month/Day, the three categorical
    columns are integer-encoded, missing numeric values are filled with the
    chunk mean, and the selected columns are min-max scaled before being
    passed to ``SGDRegressor.partial_fit`` with "NO2 Mean" as the target.

    Args:
        mask: 1-D array-like of 0/1 flags, one per feature column in the order
            the columns have after preprocessing. Columns flagged 1 are kept.

    Returns:
        Mean squared error of the trained model on the last non-empty chunk it
        was trained on (a training-set score, not a held-out one), or
        ``float("inf")`` when the mask selects no column.

    Raises:
        ValueError: if the file yields no rows with a usable target.
    """
    selected = np.asarray(mask) == 1
    # Reject an empty mask before touching the file at all.
    if not selected.any():
        return float("inf")

    model = SGDRegressor(max_iter=5, eta0=0.001)
    scaler = MinMaxScaler()
    # One label -> integer table per categorical column. The tables live for
    # the whole pass, so a given label keeps the same code in every chunk
    # (re-fitting an encoder per chunk would reassign codes each time).
    code_tables = {c: {} for c in ("State", "County", "City")}

    scaler_fitted = False
    last_X = last_y = None

    for ch in pd.read_csv(data_path, chunksize=20000):

        ch = ch.drop(columns=["Address", "NO2 Units", "O3 Units", "SO2 Units", "CO Units"])
        ch = ch.dropna(subset=["NO2 Mean"]).copy()
        if ch.empty:
            # Nothing to learn from; also avoids fitting the scaler on 0 rows.
            continue

        ch["Date Local"] = pd.to_datetime(ch["Date Local"])
        ch["Year"] = ch["Date Local"].dt.year
        ch["Month"] = ch["Date Local"].dt.month
        ch["Day"] = ch["Date Local"].dt.day
        ch = ch.drop(columns=["Date Local"])

        for col, table in code_tables.items():
            labels = ch[col].astype(str)
            for label in labels.unique():
                # New labels get the next free code; known labels are untouched.
                table.setdefault(label, len(table))
            ch[col] = labels.map(table)

        for col in ch.columns:
            if ch[col].dtype != "object":
                col_mean = ch[col].mean()
                # A column that is entirely missing in this chunk has a NaN
                # mean; fall back to 0 so no NaN reaches the model.
                ch[col] = ch[col].fillna(0.0 if pd.isna(col_mean) else col_mean)

        y = ch["NO2 Mean"]
        X = ch.drop(columns=["NO2 Mean"])

        # apply feature mask
        X = X.iloc[:, selected]

        # The scaler is fit on the first usable chunk only; later chunks are
        # transformed with those ranges and may fall outside [0, 1].
        if scaler_fitted:
            Xs = scaler.transform(X)
        else:
            Xs = scaler.fit_transform(X)
            scaler_fitted = True

        model.partial_fit(Xs, y)
        last_X, last_y = Xs, y

    if last_X is None:
        raise ValueError("No rows with a non-missing 'NO2 Mean' were found in the data file.")

    pred = model.predict(last_X)
    return mean_squared_error(last_y, pred)

# ============================================================
# Step 1 -> AIS Evolution
# ============================================================
# Clonal-selection style search: each generation keeps the best mask and
# fills the rest of the population with mutated clones of it.
print("\n=== AIS STAGE ===")

population = [generate_mask() for _ in range(POP)]
scores = [evaluate_mask(m) for m in population]

# Bits flipped per clone; at least one so a clone never equals its parent.
n_flip = min(DIM, max(1, int(DIM * MUT_RATE)))

for gen in range(AIS_GEN):
    print(f"AIS Generation {gen+1}/{AIS_GEN}")

    best_idx = int(np.argmin(scores))
    best_mask = population[best_idx].copy()

    # Elitism: the current best survives unchanged in slot 0.
    new_pop = [best_mask.copy()]
    for _ in range(POP - 1):
        child = best_mask.copy()
        mut_idx = np.random.choice(DIM, n_flip, replace=False)
        child[mut_idx] = 1 - child[mut_idx]
        new_pop.append(child)

    population = new_pop
    scores = [evaluate_mask(m) for m in population]
    print("Scores:", scores)

# Select from the scores of the LAST evaluated population. Reusing the
# in-loop `best_mask` would discard the final generation's results (and would
# be undefined when AIS_GEN is 0).
best_idx = int(np.argmin(scores))
best_mask = population[best_idx].copy()
AIS_best_mask = best_mask.copy()

np.save(os.path.join(save_path, "sequential_ais_mask.npy"), AIS_best_mask)

# ============================================================
# Step 2 -> PSO Refinement
# ============================================================
# Binary PSO seeded with the AIS result. Velocities are real-valued; each bit
# of a position is resampled as 1 with probability sigmoid(velocity).
# The best mask seen (by evaluate_mask score) is kept, so the result is never
# scored worse than the AIS mask it started from.
print("\n=== PSO STAGE ===")

PSO_ITER = 10
PSO_PARTICLES = 3    # every particle evaluation is one full pass over the CSV
PSO_INERTIA = 0.5    # weight of the previous velocity
PSO_V_INIT = 2.0     # initial |velocity|: sigmoid(2) ~ 0.88 bias toward the seed
PSO_V_MAX = 4.0      # velocity clamp so bit probabilities never saturate

gbest = np.asarray(AIS_best_mask).astype(int)
gbest_score = evaluate_mask(gbest)

# All particles start on the AIS mask, with velocities pointing at its bits
# (positive where the seed bit is 1, negative where it is 0).
pos = np.tile(gbest, (PSO_PARTICLES, 1))
vel = np.tile((2 * gbest - 1) * PSO_V_INIT, (PSO_PARTICLES, 1)).astype(float)
pbest = pos.copy()
pbest_scores = np.full(PSO_PARTICLES, gbest_score, dtype=float)

for it in range(PSO_ITER):
    for p in range(PSO_PARTICLES):
        r1, r2 = np.random.rand(DIM), np.random.rand(DIM)
        # Attraction is measured from the particle's CURRENT position to its
        # personal best and to the global best.
        vel[p] = (
            PSO_INERTIA * vel[p]
            + r1 * (pbest[p] - pos[p])
            + r2 * (gbest - pos[p])
        )
        vel[p] = np.clip(vel[p], -PSO_V_MAX, PSO_V_MAX)

        prob_one = 1.0 / (1.0 + np.exp(-vel[p]))
        pos[p] = (np.random.rand(DIM) < prob_one).astype(int)

        score = evaluate_mask(pos[p])
        if score < pbest_scores[p]:
            pbest[p] = pos[p]
            pbest_scores[p] = score
        if score < gbest_score:
            gbest = pos[p].copy()
            gbest_score = score

    print(f"PSO iteration {it+1} score:", gbest_score)

final_mask = gbest
np.save(os.path.join(save_path, "sequential_feature_mask.npy"), final_mask)

# ============================================================
# Step 3 -> FINAL TRAINING WITH BEST MASK
# ============================================================
print("\n=== FINAL TRAINING ===")

if not (np.asarray(final_mask) == 1).any():
    raise ValueError("final_mask selects no feature columns; nothing to train on.")

final_model = SGDRegressor(max_iter=5, eta0=0.001)
final_scaler = MinMaxScaler()

# Fit each encoder ONCE on every category in the file (a light pass that only
# reads the three categorical columns). Re-fitting per chunk would give the
# same label different codes in different chunks, and the pickled encoders
# would only describe the last chunk.
cat_cols = ["State", "County", "City"]
seen = {c: set() for c in cat_cols}
for part in pd.read_csv(data_path, usecols=cat_cols, chunksize=20000):
    for c in cat_cols:
        seen[c].update(part[c].astype(str).unique())
final_enc = {c: LabelEncoder().fit(sorted(seen[c])) for c in cat_cols}

first = True
last_X = last_y = None

for ch in pd.read_csv(data_path, chunksize=20000):

    ch = ch.drop(columns=["Address", "NO2 Units", "O3 Units", "SO2 Units", "CO Units"])
    ch = ch.dropna(subset=["NO2 Mean"]).copy()
    if ch.empty:
        # No usable target in this chunk; skip it rather than fit on 0 rows.
        continue

    ch["Date Local"] = pd.to_datetime(ch["Date Local"])
    ch["Year"] = ch["Date Local"].dt.year
    ch["Month"] = ch["Date Local"].dt.month
    ch["Day"] = ch["Date Local"].dt.day
    ch = ch.drop(columns=["Date Local"])

    for col in cat_cols:
        ch[col] = final_enc[col].transform(ch[col].astype(str))

    for col in ch.columns:
        if ch[col].dtype != "object":
            col_mean = ch[col].mean()
            # An all-missing column has a NaN mean; fall back to 0.
            ch[col] = ch[col].fillna(0.0 if pd.isna(col_mean) else col_mean)

    y = ch["NO2 Mean"]
    X = ch.drop(columns=["NO2 Mean"])
    X = X.iloc[:, np.asarray(final_mask) == 1]

    # The scaler is fit on the first usable chunk and reused for the rest.
    if first:
        Xs = final_scaler.fit_transform(X)
        first = False
    else:
        Xs = final_scaler.transform(X)

    final_model.partial_fit(Xs, y)
    last_X, last_y = Xs, y

if last_X is None:
    raise ValueError("No rows with a non-missing 'NO2 Mean' were found in the data file.")

# save model files
joblib.dump(final_model, os.path.join(save_path, "sequential_model.pkl"))
joblib.dump(final_scaler, os.path.join(save_path, "sequential_scaler.pkl"))
joblib.dump(final_enc, os.path.join(save_path, "sequential_encoders.pkl"))

# predictions
# NOTE: these are predictions on the last chunk the model was trained on, so
# the metrics derived from them are training-set figures.
pred = final_model.predict(last_X)
errors = abs(last_y - pred)

# ============================================================
# Save Results
# ============================================================
# Tabular dump of target vs. prediction for the last processed chunk.
actual_values = last_y.values
res_df = pd.DataFrame({"Actual": actual_values, "Predicted": pred})
res_df.to_csv(save_path + r"\sequential_pollution_results.csv", index=False)

# Same values plus summary metrics, serialized as indented JSON.
with open(save_path + r"\sequential_pollution_predictions.json", "w") as f:
    summary = {
        "actual": actual_values.tolist(),
        "predicted": pred.tolist(),
        "mse": float(mean_squared_error(last_y, pred)),
        "r2": float(r2_score(last_y, pred)),
    }
    f.write(json.dumps(summary, indent=4))

# ============================================================
# Save Graphs
# ============================================================
# Each figure is built, written to disk under save_path, and closed before
# the next one is created.

# Absolute error of the first 1000 predictions.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(errors[:1000])
ax.set_title("Sequential Hybrid Error Trend")
ax.grid()
fig.savefig(save_path + r"\sequential_error_trend.png")
plt.close(fig)

# Actual vs. predicted for the first 1000 rows.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(last_y[:1000], pred[:1000], alpha=0.3)
ax.set_title("Sequential Hybrid Scatter Plot")
ax.grid()
fig.savefig(save_path + r"\sequential_scatter.png")
plt.close(fig)

# Actual and predicted series overlaid for the first 300 rows.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(last_y.values[:300], label="Actual")
ax.plot(pred[:300], label="Predicted")
ax.legend()
ax.grid()
ax.set_title("Sequential Hybrid Comparison Graph")
fig.savefig(save_path + r"\sequential_comparison.png")
plt.close(fig)

# Histogram of the absolute errors (40 bins).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(errors, bins=40, color='orange')
ax.set_title("Sequential Hybrid Residual Distribution")
ax.grid()
fig.savefig(save_path + r"\sequential_residual.png")
plt.close(fig)

print("\nðŸŽ‰ ALL SEQUENTIAL HYBRID FILES SAVED SUCCESSFULLY!")


Total Features: 25

=== AIS STAGE ===
AIS Generation 1/5
Scores: [9.59207649437205, 29.951928604562923, 9.663675455490718, 21.60587087557883, 9.04674330791659, 9.809848097344608, 11.277763122108343, 9.63917083721433, 20.77211711229966, 21.012764644894464]
AIS Generation 2/5
Scores: [8.94784076811623, 9.670239032412248, 9.29532924997758, 9.731119160645466, 10.062030254356056, 20.98407182596011, 21.350501592044832, 9.625178102735386, 9.096431154838706, 14.177219571620926]
AIS Generation 3/5
Scores: [8.903893446938824, 9.34350683463979, 8.980190186517907, 12.168596329757628, 28.87595291112622, 22.200484173903146, 21.822015724348116, 9.214407103608993, 11.992174455123548, 9.202303631874319]
AIS Generation 4/5
Scores: [8.905242622293914, 21.425918094666727, 22.130013283495842, 9.2945608217393, 9.67191589146429, 9.342184078435226, 9.241124686842655, 9.486899177679053, 31.49215587952305, 10.479841563651988]
AIS Generation 5/5
Scores: [9.047032641137884, 11.049848603170283, 10.66829552919557, 