In [65]:
import pandas as pd
import numpy as np
import joblib

In [66]:
# Readings for one city at one timestamp, keyed by the column they belong to.
single_observation = {
    'date': '2025-08-26 12:00:00',
    'city': 'dhaka',
    'pm10': 22.5,
    'pm2_5': 20.1,
    'carbon_monoxide': 365.0,
    'nitrogen_dioxide': 19.8,
    'sulphur_dioxide': 13.5,
    'ozone': 31.0,
    'uv_index_clear_sky': 0.4,
    'uv_index': 0.25,
    'dust': 1.2,
    'aerosol_optical_depth': 0.41,
}

# Column -> one-element list, i.e. the layout of a single-row frame.
data = {column: [reading] for column, reading in single_observation.items()}

# Single-row frame passed to main() for prediction.
prediction_df = pd.DataFrame(data)

In [67]:
def assign_cluster(df):
    """Return a copy of ``df`` with a ``cluster`` column from the saved clusterer.

    The scaler and the clustering model are loaded from their joblib files on
    every call. Every numeric column except ``us_aqi`` is scaled and handed to
    the clusterer, so ``df`` must contain only the numeric columns the scaler
    accepts (presumably the ones it was fitted on -- confirm against training).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to assign to clusters. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the predicted ``cluster`` column added.
    """
    clusterer = joblib.load("../models/clustering.joblib")
    scaler = joblib.load("../encoder/scaling.joblib")

    # Work on a copy: writing "cluster" into the caller's frame would turn it
    # into one more numeric column, and a second call on the same frame (e.g.
    # re-running the cell) would then feed it back into the scaler.
    result = df.copy()
    feature_columns = [
        col for col in result.select_dtypes("number").columns if col != "us_aqi"
    ]
    # .values hands the scaler a bare array, without column labels.
    scaled_features = scaler.transform(result[feature_columns].values)
    result["cluster"] = clusterer.predict(scaled_features)

    return result

In [68]:
def feature_engineering(df):
    """Add calendar features derived from the ``date`` column and return ``df``.

    ``date`` is converted to datetime in place, then six columns are appended
    to the same frame in this order: ``hour_sin``, ``hour_cos``, ``month_sin``,
    ``month_cos``, ``day_of_week`` and ``year``. The sine/cosine pairs place the
    hour on a 24-step circle and the month on a 12-step circle.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the extra columns.
    """
    df["date"] = pd.to_datetime(df["date"])
    when = df["date"].dt

    # Angle of each timestamp on the daily and yearly circles.
    hour_angle = 2 * np.pi * when.hour / 24.0
    month_angle = 2 * np.pi * when.month / 12.0

    derived = {
        "hour_sin": np.sin(hour_angle),
        "hour_cos": np.cos(hour_angle),
        "month_sin": np.sin(month_angle),
        "month_cos": np.cos(month_angle),
        "day_of_week": when.dayofweek,  # Monday=0, Sunday=6
        "year": when.year,
    }
    # Assign one column at a time so the columns are appended in the order above.
    for column, values in derived.items():
        df[column] = values

    return df

In [69]:
def label_encoding(df):
    """Add a ``city_encoded`` column produced by the saved city encoder.

    The encoder is loaded from ``../encoder/city_encoder.joblib`` on every call
    and applied to the ``city`` column. The result is written into ``df``
    itself, and that same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``city`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``city_encoded`` added.
    """
    city_encoder = joblib.load("../encoder/city_encoder.joblib")
    encoded_cities = city_encoder.transform(df["city"])
    df["city_encoded"] = encoded_cities
    return df

In [70]:
def perform_feature_selection(model):
    """Tell whether ``model`` equals the identifier ``"mlp_lstm"``.

    Despite its name, this function selects no features: it only compares its
    argument with ``"mlp_lstm"`` and returns the result of that comparison.
    """
    is_mlp_lstm = model == "mlp_lstm"
    return is_mlp_lstm

In [71]:
def evaluation_metrics(y_test, y_pred):
    """Print five regression metrics for ``y_pred`` against ``y_test`` and return them.

    Each metric is printed as soon as it is computed: MSE, RMSE, R-squared and
    MAE with two decimals, MAPE as a percentage with two decimals.

    Parameters
    ----------
    y_test
        True target values.
    y_pred
        Predicted target values.

    Returns
    -------
    tuple
        ``(mse, rmse, r2, mae, mape)``.
    """
    from sklearn import metrics as sk_metrics

    # Squared-error pair: RMSE is the square root of the MSE computed just above.
    mse = sk_metrics.mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

    # Coefficient of determination.
    r2 = sk_metrics.r2_score(y_test, y_pred)
    print(f"R-squared (R²): {r2:.2f}")

    # Absolute-error pair: in target units (MAE) and relative to the true values (MAPE).
    mae = sk_metrics.mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    mape = sk_metrics.mean_absolute_percentage_error(y_test, y_pred)
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")  # fraction shown as a percentage

    return mse, rmse, r2, mae, mape

In [72]:
def non_mlp_lstm_models(model, df):
    """Predict with ``model`` on every column of ``df`` except the non-feature ones.

    The columns ``date``, ``us_aqi`` and ``city`` are left out when present;
    the remaining columns are passed to ``model.predict`` in their existing
    order.

    Parameters
    ----------
    model
        Object exposing ``predict(X)``.
    df : pandas.DataFrame
        Preprocessed rows.

    Returns
    -------
    Whatever ``model.predict`` returns.
    """
    excluded = ("date", "us_aqi", "city")
    feature_names = [name for name in df.columns if name not in excluded]
    return model.predict(df[feature_names])

In [73]:
def _select_model_names(max_models=6):
    """Read menu choices from stdin and return the matching model artifact names.

    Prompts until ``max_models`` names have been collected or the user enters
    ``q`` (or an empty line). An entry that is not on the menu is reported and
    the prompt is repeated, so a typo does not end the selection early.
    """
    menu_choices = {
        "1": "linear_regressor",
        "2": "decision_tree_regressor",
        "3": "knn_regressor",
        "4": "random_forest_regressor",
        "5": "xgb_regressor",
        "6": "mlp_lstm",
    }
    model_names = []
    while len(model_names) < max_models:
        answer = input("Enter: ").strip()
        if answer.lower() in ("q", ""):
            break
        if answer not in menu_choices:
            print(f"Unrecognised choice {answer!r}; enter a number from 1 to 6, or 'q' to finish.")
            continue
        model_names.append(menu_choices[answer])
    return model_names


def main(prediction_df):
    """Preprocess ``prediction_df``, ask which models to run, and print their predictions.

    The frame goes through clustering, date feature engineering and city
    encoding. The user then picks models from a menu; each chosen model is
    loaded from ``../models/<name>.joblib`` and the prediction for the first
    row is printed. MLP-LSTM is offered in the menu but has no prediction path
    here, so it is reported as skipped and its artifact is not loaded.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Raw input rows. A copy is preprocessed; the caller's frame is untouched.

    Returns
    -------
    None
    """
    # Preprocess a copy: the helpers add columns to the frame they receive, and
    # those extra numeric columns would reach the cluster scaler if main() were
    # run again on the same frame.
    df_with_cluster = assign_cluster(df=prediction_df.copy())
    feature_engineered_df = feature_engineering(df=df_with_cluster)
    label_encoded_df = label_encoding(df=feature_engineered_df)

    # Built by concatenation rather than an f-string that reuses its own quote
    # character, which is a syntax error on Python versions before 3.12.
    menu = (
        "Select which of the following models you want to use for prediction:\n"
        "[1] Linear Regression\n"
        "[2] Decision Tree\n"
        "[3] K-Nearest Neighbor\n"
        "[4] Random Forest\n"
        "[5] eXtreme Gradient Boosting\n"
        "[6] MLP-LSTM\n"
        + "-" * 30
        + "\n* 'q' to end selection\n"
    )
    print(menu)

    model_names = _select_model_names()
    print("Predicting using models: ", model_names)

    for model_name in model_names:
        if perform_feature_selection(model_name):
            # No MLP-LSTM prediction path exists here; say so instead of
            # loading the model and silently producing nothing.
            print(f"Skipping {model_name}: prediction with this model is not implemented here.")
            continue
        model = joblib.load(f"../models/{model_name}.joblib")
        predicted_aqi = non_mlp_lstm_models(model, label_encoded_df)
        print(f"Predicted AQI ({model_name}): {predicted_aqi[0]}")

In [74]:
# Run the interactive prediction flow on the single-row frame built above.
main(prediction_df)

Select which of the following models you want to use for prediction:
[1] Linear Regression
[2] Decision Tree
[3] K-Nearest Neighbor
[4] Random Forest
[5] eXtreme Gradient Boosting
[6] MLP-LSTM
------------------------------
* 'q' to end selection

Predicting using models:  ['knn_regressor', 'random_forest_regressor', 'xgb_regressor']
Predicted AQI (knn_regressor): 78.71454
Predicted AQI (random_forest_regressor): 70.73243244
Predicted AQI (xgb_regressor): 63.82838439941406
