In [3]:

# === Step 1: Import Libraries ===
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split



print("Libraries imported successfully!")

# === Step 2: Load Dataset ===
# The CSV location can be overridden with the AQI_DATASET_PATH environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original hard-coded location is used.
DATASET_PATH = os.environ.get(
    "AQI_DATASET_PATH",
    r"/Users/satyammukkawar/Downloads/AQI_dataset.csv",
)
df = pd.read_csv(DATASET_PATH)
print("Dataset loaded successfully!")
print(df.head())

# === Step 3: Data Preprocessing ===
target_col = "AQI"      # replace if your column name is different

# Rows without a target value cannot supervise training or evaluation.
# Filling the target with its mean would invent labels, so drop those rows.
df = df.dropna(subset=[target_col])
df = pd.get_dummies(df, drop_first=True)     # one-hot encoding

# === Step 4: Define Features & Target ===
# NOTE: X itself keeps any missing feature values; only the train/test
# partitions below are imputed.
X = df.drop(columns=[target_col])
y = df[target_col]

# === Step 5: Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute AFTER splitting, using statistics from the training partition only.
# Computing the means on the full frame would leak information about the
# test rows into training and make the reported scores optimistic.
numeric_cols = X_train.select_dtypes(include="number").columns
train_means = X_train[numeric_cols].mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

print("\nData preprocessed successfully!")

# === Step 6: Train 3 Models ===
print("\nTraining models...")

# Declare every regressor up front. The dict keeps insertion order, which
# is the order the models are fitted in here and iterated in afterwards.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# Fit each regressor on the same training partition.
for regressor in models.values():
    regressor.fit(X_train, y_train)



# === Step 7: Evaluate Models ===
print("\n=== Model Evaluation ===")

# Maps model name -> R² on the held-out split; only R² is stored, MSE is
# printed for information.
results = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
    results[name] = r2

    print(f"\n {name}")
    print(f"R² Score: {r2:.4f}")
    print(f"MSE: {mse:.4f}")

# === Step 8: Ensemble Model ===
print("\nCreating Ensemble Model...")

# One prediction vector per fitted model, in the dict's insertion order.
predictions = [fitted.predict(X_test) for fitted in models.values()]

# Unweighted average across models (axis 0 runs over the models).
ensemble_pred = np.mean(predictions, axis=0)

# Score the averaged prediction with the same metrics as the single models.
ensemble_r2 = r2_score(y_test, ensemble_pred)
ensemble_mse = mean_squared_error(y_test, ensemble_pred)

print("\n Ensemble Model")
print(f"R² Score: {ensemble_r2:.4f}")
print(f"MSE: {ensemble_mse:.4f}")

# === Step 9: Save Best Model ===
# Choose the single model with the highest test R². The averaged ensemble
# is not a candidate because it has no entry in `results`.
best_model_name = max(results.items(), key=lambda item: item[1])[0]
best_model = models[best_model_name]

# Persist the winning estimator to the working directory.
joblib.dump(best_model, "best_aqi_model.pkl")
print(f"\nBest model saved successfully! ({best_model_name})")

# === Step 10: Manual Prediction ===
# Load the model back from the file written in the previous step.
# NOTE: joblib.load unpickles arbitrary Python objects; only load model
# files that this notebook (or another trusted source) produced.
loaded_model = joblib.load("best_aqi_model.pkl")
print("\nEnter details to predict AQI:")

feature_names = X.columns.tolist()
user_input = {}

for feature in feature_names:
    try:
        value = float(input(f"Enter value for {feature}: "))
        # float() also parses "nan" and "inf". scikit-learn estimators
        # generally reject non-finite inputs at predict time, so treat them
        # like any other invalid entry instead of failing later.
        if not np.isfinite(value):
            raise ValueError(f"non-finite value for {feature}")
    except ValueError:
        print(f"Invalid input for {feature}, defaulting to 0")
        value = 0
    user_input[feature] = value

# Pin the column order to the training feature order explicitly rather than
# relying on dict insertion order.
input_df = pd.DataFrame([user_input], columns=feature_names)

predicted_aqi = loaded_model.predict(input_df)
print(f"\n Predicted Air Quality Index (AQI): {predicted_aqi[0]:.2f}")

Libraries imported successfully!
Dataset loaded successfully!
    PM2.5    PM10     NO2    SO2    CO      O3  Temperature  Humidity  \
0   81.16   28.80   98.09   6.03  0.40  128.69        15.91     59.94   
1  190.64  198.19   17.20  43.45  2.72  101.14        19.75     23.89   
2  149.08  108.02   28.44  44.17  1.57   62.62        16.20     45.25   
3  123.75  162.40  135.29  51.72  2.50  148.35        13.10     30.08   
4   39.64  274.12   92.93  58.64  1.03  126.40        14.22     24.75   

   WindSpeed     AQI  
0      14.29   65.76  
1       3.47  163.73  
2      11.74  115.13  
3      12.33  134.13  
4       8.77  120.60  

Data preprocessed successfully!

Training models...

=== Model Evaluation ===

 LinearRegression
R² Score: 1.0000
MSE: 0.0000

 RandomForest
R² Score: 0.9431
MSE: 97.5844

 GradientBoosting
R² Score: 0.9693
MSE: 52.5808

Creating Ensemble Model...

 Ensemble Model
R² Score: 0.9827
MSE: 29.7157

Best model saved successfully! (LinearRegression)

Enter details