In [1]:
import pandas as pd
import numpy as np
import json
import joblib
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# ============================================================
# ðŸ“Œ PATHS
# ============================================================
data_path = r"C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection\archive\pollution_us_2000_2016.csv"
save_path = r"C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection"

os.makedirs(save_path, exist_ok=True)

# ============================================================
# ðŸ“Œ CONFIG
# ============================================================
target = "NO2 Mean"
chunk_size = 20000

# AIS + PSO parameters
POP = 12              # number of feature masks
MAX_GEN = 6           # number of AIS generations
MASK_MUT_RATE = 0.25  # mutation rate
PSO_ITERS = 10        # PSO refinement iterations


# ============================================================
# ðŸ“Œ LOAD FIRST CHUNK â†’ GET FEATURE NAMES
# ============================================================
df0 = next(pd.read_csv(data_path, chunksize=chunk_size))

df0 = df0.drop(columns=["Address", "NO2 Units", "O3 Units", "SO2 Units", "CO Units"])
df0["Date Local"] = pd.to_datetime(df0["Date Local"])
df0["Year"] = df0["Date Local"].dt.year
df0["Month"] = df0["Date Local"].dt.month
df0["Day"] = df0["Date Local"].dt.day
df0 = df0.drop(columns=["Date Local", target])
feature_names = list(df0.columns)

DIM = len(feature_names)
print("Total Features:", DIM)


# ============================================================
# ðŸ“Œ AIS â€” Generate Random Feature Masks
# ============================================================
def generate_mask():
    return np.random.randint(0, 2, DIM)


# ============================================================
# ðŸ“Œ Evaluate Mask (Streaming training)
# ============================================================
def evaluate_mask(mask):
    model = SGDRegressor(max_iter=5, eta0=0.001)
    scaler = MinMaxScaler()
    label_enc = {c: LabelEncoder() for c in ["State", "County", "City"]}

    first_batch = True
    last_X, last_y = None, None

    for chunk in pd.read_csv(data_path, chunksize=chunk_size):

        chunk = chunk.drop(columns=["Address","NO2 Units","O3 Units","SO2 Units","CO Units"])
        chunk = chunk.dropna(subset=[target])

        # date
        chunk["Date Local"] = pd.to_datetime(chunk["Date Local"])
        chunk["Year"] = chunk["Date Local"].dt.year
        chunk["Month"] = chunk["Date Local"].dt.month
        chunk["Day"] = chunk["Date Local"].dt.day
        chunk = chunk.drop(columns=["Date Local"])

        # encode
        for col in ["State","County","City"]:
            chunk[col] = label_enc[col].fit_transform(chunk[col].astype(str))

        # impute
        for col in chunk.columns:
            if chunk[col].dtype != "object":
                chunk[col] = chunk[col].fillna(chunk[col].mean())

        y = chunk[target]
        X = chunk.drop(columns=[target])

        # apply mask
        selected_X = X.iloc[:, mask == 1]

        if selected_X.shape[1] == 0:
            return float("inf")  # invalid mask

        # scale
        if first_batch:
            Xs = scaler.fit_transform(selected_X)
            first_batch = False
        else:
            Xs = scaler.transform(selected_X)

        # online train
        model.partial_fit(Xs, y)

        last_X, last_y = Xs, y

    pred = model.predict(last_X)
    mse = mean_squared_error(last_y, pred)
    return mse


# ============================================================
# ðŸ“Œ AIS + PSO OPTIMIZATION LOOP
# ============================================================
population = [generate_mask() for _ in range(POP)]
scores = [evaluate_mask(m) for m in population]

print("Initial scores:", scores)

for gen in range(MAX_GEN):
    print(f"\n=== AIS GENERATION {gen+1}/{MAX_GEN} ===")

    # Clone best
    sorted_idx = np.argsort(scores)
    best_mask = population[sorted_idx[0]].copy()

    # Mutations
    new_pop = [best_mask.copy()]
    for i in range(POP - 1):
        child = best_mask.copy()
        idx = np.random.choice(DIM, int(DIM * MASK_MUT_RATE), replace=False)
        child[idx] = 1 - child[idx]
        new_pop.append(child)

    # Evaluate
    new_scores = [evaluate_mask(m) for m in new_pop]

    population = new_pop
    scores = new_scores
    print("Scores:", scores)

    # PSO refinement
    vel = np.zeros(DIM)
    pbest = population[0].copy()

    for it in range(PSO_ITERS):
        r1, r2 = np.random.rand(), np.random.rand()
        vel = 0.5*vel + r1*(pbest - population[0]) + r2*(best_mask - population[0])
        population[0] = np.clip(population[0] + vel, 0, 1).round().astype(int)


# ============================================================
# ðŸ“Œ FINAL BEST MASK
# ============================================================
best_idx = np.argmin(scores)
best_mask = population[best_idx]

np.save(save_path + r"\hybrid_feature_mask.npy", best_mask)
print("\nBEST MASK SELECTED:", best_mask)


# ============================================================
# ðŸ“Œ FINAL TRAINING USING BEST MASK
# ============================================================
final_model = SGDRegressor(max_iter=5, eta0=0.001)
final_scaler = MinMaxScaler()
final_label_enc = {c: LabelEncoder() for c in ["State","County","City"]}

first = True

for chunk in pd.read_csv(data_path, chunksize=chunk_size):

    chunk = chunk.drop(columns=["Address","NO2 Units","O3 Units","SO2 Units","CO Units"])
    chunk = chunk.dropna(subset=[target])

    chunk["Date Local"] = pd.to_datetime(chunk["Date Local"])
    chunk["Year"] = chunk["Date Local"].dt.year
    chunk["Month"] = chunk["Date Local"].dt.month
    chunk["Day"] = chunk["Date Local"].dt.day
    chunk = chunk.drop(columns=["Date Local"])

    for col in ["State","County","City"]:
        chunk[col] = final_label_enc[col].fit_transform(chunk[col].astype(str))

    for col in chunk.columns:
        if chunk[col].dtype != "object":
            chunk[col] = chunk[col].fillna(chunk[col].mean())

    y = chunk[target]
    X = chunk.drop(columns=[target])
    X = X.iloc[:, best_mask == 1]

    if first:
        Xs = final_scaler.fit_transform(X)
        first = False
    else:
        Xs = final_scaler.transform(X)

    final_model.partial_fit(Xs, y)

    last_X, last_y = Xs, y


# Save hybrid model
joblib.dump(final_model, save_path + r"\hybrid_model.pkl")
joblib.dump(final_scaler, save_path + r"\hybrid_scaler.pkl")
joblib.dump(final_label_enc, save_path + r"\hybrid_label_encoders.pkl")

print("\nðŸŽ‰ HYBRID MODEL SAVED SUCCESSFULLY!")


# ============================================================
# ðŸ“Œ EVALUATION
# ============================================================
pred = final_model.predict(last_X)
errors = abs(last_y - pred)

mse = mean_squared_error(last_y, pred)
r2 = r2_score(last_y, pred)

print("\nHYBRID MSE:", mse)
print("HYBRID R2:", r2)


# ============================================================
# ðŸ“Œ SAVE RESULTS
# ============================================================
res_df = pd.DataFrame({"Actual": last_y.values, "Predicted": pred})
res_df.to_csv(save_path + r"\hybrid_pollution_results.csv", index=False)

with open(save_path + r"\hybrid_pollution_predictions.json","w") as f:
    json.dump({
        "actual": last_y.values.tolist(),
        "predicted": pred.tolist(),
        "mse": float(mse),
        "r2": float(r2)
    }, f, indent=4)


# ============================================================
# ðŸ“Œ SAVE GRAPHS
# ============================================================

# ERROR TREND
plt.figure(figsize=(10,5))
plt.plot(errors[:1000])
plt.title("Hybrid Error Trend")
plt.grid()
plt.savefig(save_path + r"\hybrid_error_trend.png")
plt.close()

# SCATTER
plt.figure(figsize=(6,6))
plt.scatter(last_y[:1000], pred[:1000], alpha=0.3)
plt.title("Hybrid Scatter Plot")
plt.grid()
plt.savefig(save_path + r"\hybrid_scatter.png")
plt.close()

# COMPARISON
plt.figure(figsize=(12,5))
plt.plot(last_y.values[:300], label="Actual")
plt.plot(pred[:300], label="Predicted")
plt.title("Hybrid Comparison")
plt.legend()
plt.grid()
plt.savefig(save_path + r"\hybrid_comparison.png")
plt.close()

# RESIDUAL
plt.figure(figsize=(10,5))
plt.hist(errors, bins=40, color='orange')
plt.title("Hybrid Residual Distribution")
plt.grid()
plt.savefig(save_path + r"\hybrid_residual.png")
plt.close()

print("\nðŸŽ‰ ALL HYBRID FILES SAVED SUCCESSFULLY!")
print("Saved in:", save_path)


Total Features: 25
Initial scores: [44.29668721856404, 11.295352807083662, 14.693262067128709, 28.863371216882694, 10.507676216885683, 29.735102003259787, 31.796723897542684, 12.684039065923441, 10.31628230402489, 9.802487226422857, 10.866355215658663, 31.688581325934123]

=== AIS GENERATION 1/6 ===
Scores: [9.810527351785622, 9.892625827737284, 22.900529523862907, 10.876308484076535, 22.476278072819547, 9.183932298765042, 9.670287160273462, 14.888711585032631, 32.98454529030254, 10.591091019885603, 49.1142818989095, 9.659027361727837]

=== AIS GENERATION 2/6 ===
Scores: [9.210090539827585, 28.956754965167686, 12.245746127865608, 12.114206803419748, 10.84440349925797, 8.923934018301248, 20.817901623023033, 9.648629862573983, 9.123099186384994, 10.63694553663413, 9.337528574935696, 9.66068320181967]

=== AIS GENERATION 3/6 ===
Scores: [8.89955923352435, 11.05047400854316, 9.010439853267993, 12.977404400515297, 10.813781270986048, 13.850417733116625, 9.44316801780704, 10.290047174819044,