In [None]:
import os

# Resolve the project root: the directory of this file when `__file__` is
# defined, otherwise (e.g. in a notebook kernel) the current working directory.
if '__file__' in globals():
    project_root = os.path.dirname(os.path.abspath(__file__))
else:
    project_root = os.getcwd()

# Sub-directories, relative to the project root, that the project uses.
folders = ['AQI_Project/' + leaf for leaf in ('data', 'models', 'notebooks')]

# Create each directory; exist_ok=True makes re-running this cell harmless.
for folder in folders:
    target_dir = os.path.join(project_root, folder)
    os.makedirs(target_dir, exist_ok=True)

# Absolute location of the raw AQI + weather CSV inside the data folder.
DATA_PATH = os.path.join(project_root, 'AQI_Project', 'data', 'KARACHI_AQI_WEATHER_2023_TO_2025.csv')

# Summary of what was set up.
print("Folder structure created successfully!\n")
print("Project Root:", project_root)
print("Data Path:", DATA_PATH)
print("\nFolder Tree:")
print("\n".join(" - " + name for name in folders))


Folder structure created successfully!

Project Root: /content
Data Path: /content/AQI_Project/data/KARACHI_AQI_WEATHER_2023_TO_2025.csv

Folder Tree:
 - AQI_Project/data
 - AQI_Project/models
 - AQI_Project/notebooks


In [None]:
# Peek at the engineered dataset's column names.
# pandas is imported here so the cell also works on a fresh kernel; the
# original relied on an import that only happens in a later cell.
import pandas as pd

data_path = "AQI_Project/data/ENGINEERED.csv"
df_recent = pd.read_csv(data_path)
df_recent.columns

Index(['time', 'pm10', 'pm2_5', 'carbon_monoxide', 'nitrogen_dioxide',
       'sulphur_dioxide', 'ozone', 'us_aqi', 'temperature_2m',
       'relative_humidity_2m', 'wind_speed_10m', 'day', 'month', 'year',
       'day_of_week', 'temp_roll24', 'humidity_roll24', 'wind_roll24',
       'AQI_roll24', 'AQI_roll_std24', 'AQI_lag24', 'AQI_roll3',
       'AQI_trend_24h'],
      dtype='object')

In [None]:
import pandas as pd
import joblib

# Number of most-recent hourly rows to score. The slice, the fallback index
# and the printed banner all derive from this one value so they cannot drift
# apart (the earlier comments said "24" while the code used 18).
N_RECENT = 18

# 1. Load recent hourly data
data_path = "AQI_Project/data/ENGINEERED.csv"
df_recent = pd.read_csv(data_path)

# 2. Parse the timestamp column when it is present
has_time = 'time' in df_recent.columns
if has_time:
    df_recent['time'] = pd.to_datetime(df_recent['time'])

# 3. Select only the 15 trained features
trained_features = [
    'pm10', 'pm2_5', 'us_aqi', 'day', 'month', 'year',
    'day_of_week', 'temp_roll24', 'humidity_roll24', 'wind_roll24',
    'AQI_roll24', 'AQI_roll_std24', 'AQI_lag24', 'AQI_roll3', 'AQI_trend_24h'
]

X_recent = df_recent[trained_features]

# 4. Keep only the most recent N_RECENT rows.
#    (The `_24` suffix in the names below is historical and kept so that any
#    other cell referring to these variables keeps working.)
X_last_24 = X_recent.tail(N_RECENT)
if has_time:
    time_last_24 = df_recent['time'].tail(N_RECENT)
else:
    # Positional fallback sized to the rows actually selected. It must be a
    # Series: a plain `range` has no `.values`, which step 7 relies on.
    time_last_24 = pd.Series(range(len(X_last_24)))

# 5. Load trained multi-output LightGBM model
model_path = "AQI_Project/models/best_lightgbm_multioutput.pkl"
lgbm_model = joblib.load(model_path)

# 6. Predict for the selected rows; one output column per forecast horizon.
#    NOTE(review): no inverse transform is applied here, so these values are
#    presumably still on the transformed target scale - confirm.
predictions = lgbm_model.predict(X_last_24)

# 7. Convert to DataFrame for readability
pred_df = pd.DataFrame(predictions, columns=['AQI_24h', 'AQI_48h', 'AQI_72h'])
pred_df['time'] = time_last_24.values

# 8. Show results
print(f"\n Predictions for last {N_RECENT} hours:")
print(pred_df)

# 9. Optional: save predictions
pred_df.to_csv("AQI_Project/data/recent_predictions_24h.csv", index=False)



✅ Predictions for last 18 hours:
     AQI_24h   AQI_48h   AQI_72h                time
0   0.845124 -0.458906  0.433087 2025-11-01 01:00:00
1   0.798093 -0.471503  0.466497 2025-11-01 02:00:00
2   0.795671 -0.509284  0.471349 2025-11-01 03:00:00
3   0.498298 -0.474404  0.383777 2025-11-01 04:00:00
4   0.338871 -0.534999  0.350655 2025-11-01 05:00:00
5   0.317229 -0.563228  0.345803 2025-11-01 06:00:00
6   0.476656 -0.447580  0.378924 2025-11-01 07:00:00
7   0.784219 -0.455768  0.472780 2025-11-01 08:00:00
8   0.850306 -0.475203  0.472780 2025-11-01 09:00:00
9   0.457439 -0.504507  0.361096 2025-11-01 10:00:00
10  0.463472 -0.049238  0.352577 2025-11-01 11:00:00
11  0.444384 -0.067781  0.340733 2025-11-01 12:00:00
12  0.265815 -0.077725  0.203927 2025-11-01 13:00:00
13  0.364651  0.058207  0.206945 2025-11-01 14:00:00
14  0.288232  0.025326  0.194943 2025-11-01 15:00:00
15  0.232311  0.105160  0.155550 2025-11-01 16:00:00
16  0.364707  0.091352  0.249561 2025-11-01 17:00:00
17  0.2677

**FIRST, PERFORM AN INVERSE TRANSFORMATION TO RECOVER REAL AQI VALUES (TESTING)**

In [None]:
# ------------------------------------------------------------
# AQI prediction for the last day of data, mapped back to the AQI scale
# ------------------------------------------------------------

import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import PowerTransformer

# Engineered dataset (values are already transformed)
data_path = "AQI_Project/data/ENGINEERED.csv"
df_recent = pd.read_csv(data_path)

# The 15 columns the model was trained on
selected_features = [
    'pm10', 'pm2_5', 'us_aqi', 'day', 'month', 'year',
    'day_of_week', 'temp_roll24', 'humidity_roll24', 'wind_roll24',
    'AQI_roll24', 'AQI_roll_std24', 'AQI_lag24', 'AQI_roll3', 'AQI_trend_24h'
]

# Feature matrix restricted to the final 24 rows (one day of hourly records)
X_recent = df_recent[selected_features].copy().tail(24)

# Trained LightGBM model; its predictions come out on the transformed scale
model_path = "AQI_Project/models/best_lightgbm_multioutput.pkl"
lgbm_model = joblib.load(model_path)
predictions = lgbm_model.predict(X_recent)

# Fit a fresh Yeo-Johnson transformer on the raw 'us_aqi' column of the
# CLEANED dataset; it is used only to undo the target transformation.
cleaned_path = "AQI_Project/data/KARACHI-AQI-RECORDS-2023-2025-CLEANED.csv"
cleaned_df = pd.read_csv(cleaned_path)
pt_target = PowerTransformer(method='yeo-johnson').fit(cleaned_df[['us_aqi']])

# Map each output column (24h, 48h, 72h) back to the AQI scale.
# inverse_transform needs a 2-D input, hence the single-column slices.
preds_24, preds_48, preds_72 = (
    pt_target.inverse_transform(predictions[:, [col]]).ravel()
    for col in range(3)
)

# Timestamps of the scored rows, or a positional index when there are none
if 'time' in df_recent.columns:
    time_values = df_recent['time'].tail(24).values
else:
    time_values = np.arange(len(preds_24))

# Assemble the rounded predictions into one table
horizon_cols = ['AQI_24h', 'AQI_48h', 'AQI_72h']
pred_df = pd.DataFrame({
    'time': time_values,
    'AQI_24h': np.round(preds_24, 0),
    'AQI_48h': np.round(preds_48, 0),
    'AQI_72h': np.round(preds_72, 0),
})

# Display and save results
print("\nPredicted AQI values (inverse-transformed):")
print(pred_df.head(10))

pred_df.to_csv("AQI_Project/data/recent_predictions_inverse.csv", index=False)
print("\nPredictions saved successfully at: AQI_Project/data/recent_predictions_inverse.csv")

# Sanity check: the predicted range should sit inside the observed AQI range
observed = cleaned_df['us_aqi']
predicted = pred_df[horizon_cols]
print("\nCheck range comparison:")
print("Cleaned AQI range:", observed.min(), "-", observed.max())
print("Predicted AQI range:", predicted.min().min(), "-", predicted.max().max())



✅ Predicted AQI values (inverse-transformed):
                  time  AQI_24h  AQI_48h  AQI_72h
0  2025-10-31 19:00:00    105.0     88.0     97.0
1  2025-10-31 20:00:00    108.0     89.0     98.0
2  2025-10-31 21:00:00    109.0     88.0     98.0
3  2025-10-31 22:00:00    111.0     88.0     98.0
4  2025-10-31 23:00:00    110.0     87.0     99.0
5  2025-11-01 00:00:00     97.0     72.0     88.0
6  2025-11-01 01:00:00    102.0     72.0     91.0
7  2025-11-01 02:00:00    101.0     72.0     92.0
8  2025-11-01 03:00:00    101.0     71.0     92.0
9  2025-11-01 04:00:00     93.0     72.0     90.0

📁 Predictions saved successfully at: AQI_Project/data/recent_predictions_inverse.csv

🔍 Check range comparison:
Cleaned AQI range: 35 - 143
Predicted AQI range: 70.0 - 111.0




**THE FITTED TARGET TRANSFORMER IS SAVED AS `yeo_target_only_us_aqi.pkl` SO THAT PREDICTED VALUES CAN BE CONSISTENTLY RESTORED TO THEIR ORIGINAL AQI SCALE, PREVENTING DECIMAL OR NEGATIVE OUTPUTS IN FUTURE PREDICTIONS.**

In [None]:
# ===== AQI prediction: inverse-transform targets =====
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

# Number of most-recent rows to score; used for the feature slice, the
# timestamp slice and the final display so the three always agree.
N_RECENT = 18

# Paths
engineered_path = "AQI_Project/data/ENGINEERED.csv"   # already transformed & engineered; includes the 15 model features
model_path = "AQI_Project/models/best_lightgbm_multioutput.pkl"
target_transformer_path = "AQI_Project/models/yeo_target_only_us_aqi.pkl"
cleaned_path = "AQI_Project/data/KARACHI-AQI-RECORDS-2023-2025-CLEANED.csv"  # has raw us_aqi


def _to_aqi_scale(transformer, transformed_column):
    """Map one column of transformed predictions back to integer AQI values.

    Parameters
    ----------
    transformer : fitted transformer exposing ``inverse_transform``
    transformed_column : 1-D array of predictions on the transformed scale

    Returns
    -------
    1-D integer array: inverse-transformed, rounded, then clipped at 0
    (AQI cannot be negative).
    """
    # inverse_transform expects a 2-D array, hence the reshape.
    restored = transformer.inverse_transform(transformed_column.reshape(-1, 1)).flatten()
    return np.clip(np.round(restored).astype(int), 0, None)


# 1) Load engineered prediction CSV (already transformed)
df_eng = pd.read_csv(engineered_path)

# 2) Keep required 15 features (these must be the same transformed features the model expects)
selected_features = [
    'pm10', 'pm2_5', 'us_aqi', 'day', 'month', 'year',
    'day_of_week', 'temp_roll24', 'humidity_roll24', 'wind_roll24',
    'AQI_roll24', 'AQI_roll_std24', 'AQI_lag24', 'AQI_roll3', 'AQI_trend_24h'
]

X = df_eng[selected_features].copy()

# 3) Take the last N_RECENT rows for prediction
X_last = X.tail(N_RECENT).reset_index(drop=True)

# 4) Load model and predict (these predictions are in transformed target-space)
model = joblib.load(model_path)
pred_transformed = model.predict(X_last)   # one row per input row, one column per horizon

# 5) Load or fit the target-only transformer (Yeo-Johnson fitted on raw us_aqi).
#    `cleaned_df` is read lazily and at most once: here when the transformer
#    has to be fitted, otherwise only for the sanity check in step 10.
cleaned_df = None
if os.path.exists(target_transformer_path):
    pt_target = joblib.load(target_transformer_path)
    print(" Loaded saved target transformer.")
else:
    # Fit on the raw 'us_aqi' from cleaned dataset (this recreates a transformer for the target)
    cleaned_df = pd.read_csv(cleaned_path)
    if 'us_aqi' not in cleaned_df.columns:
        raise ValueError("cleaned CSV does not contain 'us_aqi' column; cannot fit target transformer.")
    pt_target = PowerTransformer(method='yeo-johnson')
    pt_target.fit(cleaned_df[['us_aqi']])
    joblib.dump(pt_target, target_transformer_path)
    print(" Fitted and saved target transformer at:", target_transformer_path)

# 6-7) Inverse transform each predicted column back to the AQI scale, round to
#      integers and clip negative values to 0.
preds_24 = _to_aqi_scale(pt_target, pred_transformed[:, 0])
preds_48 = _to_aqi_scale(pt_target, pred_transformed[:, 1])
preds_72 = _to_aqi_scale(pt_target, pred_transformed[:, 2])

# 8) Build result DataFrame with timestamps if available
if 'time' in df_eng.columns:
    times = df_eng['time'].tail(N_RECENT).reset_index(drop=True)
else:
    times = pd.RangeIndex(start=0, stop=len(preds_24))
result = pd.DataFrame({
    'time': times,
    'AQI_24h': preds_24,
    'AQI_48h': preds_48,
    'AQI_72h': preds_72
})

# 9) Save and print
out_path = "AQI_Project/data/recent_predictions_inverse.csv"
result.to_csv(out_path, index=False)
print("\n Saved inverse-transformed predictions to:", out_path)
print(result.head(N_RECENT))

# 10) Quick sanity check: compare result range with cleaned data range
if cleaned_df is None:
    cleaned_df = pd.read_csv(cleaned_path)
horizon_values = result[['AQI_24h', 'AQI_48h', 'AQI_72h']]
print("\nCleaned AQI range:", cleaned_df['us_aqi'].min(), "-", cleaned_df['us_aqi'].max())
print("Predicted AQI range:", horizon_values.min().min(), "-", horizon_values.max().max())


✅ Fitted and saved target transformer at: AQI_Project/models/yeo_target_only_us_aqi.pkl

✅ Saved inverse-transformed predictions to: AQI_Project/data/recent_predictions_inverse.csv
                  time  AQI_24h  AQI_48h  AQI_72h
0  2025-11-01 01:00:00      102       72       91
1  2025-11-01 02:00:00      101       72       92
2  2025-11-01 03:00:00      101       71       92
3  2025-11-01 04:00:00       93       72       90
4  2025-11-01 05:00:00       89       71       89
5  2025-11-01 06:00:00       88       70       89
6  2025-11-01 07:00:00       92       72       90
7  2025-11-01 08:00:00      101       72       92
8  2025-11-01 09:00:00      103       72       92
9  2025-11-01 10:00:00       92       71       89

Cleaned AQI range: 35 - 143
Predicted AQI range: 70 - 103




In [None]:
# Re-display the saved-location banner and up to the first 18 rows of `result`
# (both `out_path` and `result` come from the cell above).
banner = "\n Saved inverse-transformed predictions to:"
print(banner, out_path)
full_view = result.head(18)
print(full_view)


✅ Saved inverse-transformed predictions to: AQI_Project/data/recent_predictions_inverse.csv
                   time  AQI_24h  AQI_48h  AQI_72h
0   2025-11-01 01:00:00      102       72       91
1   2025-11-01 02:00:00      101       72       92
2   2025-11-01 03:00:00      101       71       92
3   2025-11-01 04:00:00       93       72       90
4   2025-11-01 05:00:00       89       71       89
5   2025-11-01 06:00:00       88       70       89
6   2025-11-01 07:00:00       92       72       90
7   2025-11-01 08:00:00      101       72       92
8   2025-11-01 09:00:00      103       72       92
9   2025-11-01 10:00:00       92       71       89
10  2025-11-01 11:00:00       92       80       89
11  2025-11-01 12:00:00       92       80       89
12  2025-11-01 13:00:00       87       80       86
13  2025-11-01 14:00:00       90       82       86
14  2025-11-01 15:00:00       88       82       86
15  2025-11-01 16:00:00       86       84       85
16  2025-11-01 17:00:00       90      

**TESTING PREDICTIONS WITH HOPSWORKS**

In [None]:
%pip install hopsworks

Collecting hopsworks
  Downloading hopsworks-4.4.2-py3-none-any.whl.metadata (11 kB)
Collecting pyhumps==1.6.1 (from hopsworks)
  Downloading pyhumps-1.6.1-py3-none-any.whl.metadata (3.7 kB)
Collecting furl (from hopsworks)
  Downloading furl-2.1.4-py2.py3-none-any.whl.metadata (25 kB)
Collecting boto3 (from hopsworks)
  Downloading boto3-1.40.65-py3-none-any.whl.metadata (6.6 kB)
Collecting numpy<2 (from hopsworks)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyjks (from hopsworks)
  Downloading pyjks-20.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting mock (from hopsworks)
  Downloading mock-5.2.0-py3-none-any.whl.metadata (3.1 kB)
Collecting avro==1.11.3 (from hopsworks)
  Downloading avro-1.11.3.tar.gz (90 

In [None]:
import hopsworks
import pandas as pd
import numpy as np
import joblib
from datetime import datetime

# Number of most-recent feature-store records to score.
N_RECENT = 18

# --- Connect to Hopsworks
project = hopsworks.login()
fs = project.get_feature_store()

# --- Fetch data from the feature store
# NOTE(review): `online` is passed inside `read_options` here. The run log
# reports the "Hopsworks Feature Query Service", so this may not actually be
# hitting the online store - confirm whether `read(online=True)` was intended.
feature_group = fs.get_feature_group(name="aqi_features_engineered", version=2)
df_features = feature_group.read(read_options={"online": True})

# --- Convert 'time' column to datetime (the values are treated as epoch seconds)
df_features['time'] = pd.to_datetime(df_features['time'], unit='s')

# --- Sort data in *ascending* order (oldest → newest)
df_features = df_features.sort_values(by='time', ascending=True)

# --- Select the latest N_RECENT records for prediction (still in ascending order)
df_recent = df_features.tail(N_RECENT).reset_index(drop=True)

# --- Load saved target transformer and model (each loaded exactly once)
pt_target = joblib.load("AQI_Project/models/yeo_target_only_us_aqi.pkl")
pt = pt_target  # alias kept so cells that refer to `pt` keep working
model = joblib.load("AQI_Project/models/best_lightgbm_multioutput.pkl")

# --- Select the same 15 columns used for training.
#     The feature group exposes the AQI-derived columns in lowercase.
selected_features = [
    'pm10', 'pm2_5', 'us_aqi', 'day', 'month', 'year',
    'day_of_week', 'temp_roll24', 'humidity_roll24', 'wind_roll24',
    'aqi_roll24', 'aqi_roll_std24', 'aqi_lag24', 'aqi_roll3', 'aqi_trend_24h'
]

# --- Selecting by this list fixes both the columns and their order
X_recent = df_recent[selected_features].copy()

print(" Skipping input transformation — data is already transformed and engineered.")

# --- Perform prediction (one output column per horizon: 24h, 48h, 72h)
predictions = model.predict(X_recent)

# --- Inverse-transform the predicted targets back to the AQI scale.
#     Falling back to the raw predictions is kept as a best-effort path, but
#     the failure is reported and the banner below reflects the real scale
#     instead of silently claiming the values were inverse-transformed.
try:
    preds_24 = pt_target.inverse_transform(predictions[:, 0].reshape(-1, 1)).flatten()
    preds_48 = pt_target.inverse_transform(predictions[:, 1].reshape(-1, 1)).flatten()
    preds_72 = pt_target.inverse_transform(predictions[:, 2].reshape(-1, 1)).flatten()
    scale_label = "inverse-transformed"
except Exception as exc:  # not a bare `except:` - Ctrl-C must still interrupt
    print(f" WARNING: inverse transform failed ({exc!r}); showing predictions on the transformed scale.")
    preds_24, preds_48, preds_72 = predictions[:, 0], predictions[:, 1], predictions[:, 2]
    scale_label = "transformed scale"

# --- Display results sorted in time order
pred_df = pd.DataFrame({
    'time': df_recent['time'].values,
    'Predicted AQI(Next 24h)': np.round(preds_24, 0),
    'Predicted AQI(Next 48h)': np.round(preds_48, 0),
    'Predicted AQI(Next 72h)': np.round(preds_72, 0)
})

pred_df = pred_df.sort_values(by='time', ascending=True).reset_index(drop=True)

print(f"\n Predicted AQI values ({scale_label}):")
print(pred_df.head(10))




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


Connection closed.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1256597
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.25s) 
✅ Skipping input transformation — data is already transformed and engineered.

✅ Predicted AQI values (inverse-transformed):
                 time  Predicted AQI(Next 24h)  Predicted AQI(Next 48h)  \
0 2025-11-01 01:00:00                    102.0                     72.0   
1 2025-11-01 02:00:00                    101.0                     72.0   
2 2025-11-01 03:00:00                    101.0                     71.0   
3 2025-11-01 04:00:00                     93.0                     72.0   
4 2025-11-01 05:00:00                     89.0                     71.0   
5 2025-11-01 06:00:00                     88.0                     70.0   
6 2025-11-01 07:00:00                     92.0                     72.0   
7 2025-11-01 08:00:00                    101.0                     72.0   
8 2025-11



In [None]:
# Print the prediction table again, this time up to its first 19 rows
# (`pred_df` is produced by the Hopsworks prediction cell above).
header = "\n Predicted AQI values (inverse-transformed):"
print(header)
rows_to_show = pred_df.head(19)
print(rows_to_show)


✅ Predicted AQI values (inverse-transformed):
                  time  Predicted AQI(Next 24h)  Predicted AQI(Next 48h)  \
0  2025-11-01 01:00:00                    102.0                     72.0   
1  2025-11-01 02:00:00                    101.0                     72.0   
2  2025-11-01 03:00:00                    101.0                     71.0   
3  2025-11-01 04:00:00                     93.0                     72.0   
4  2025-11-01 05:00:00                     89.0                     71.0   
5  2025-11-01 06:00:00                     88.0                     70.0   
6  2025-11-01 07:00:00                     92.0                     72.0   
7  2025-11-01 08:00:00                    101.0                     72.0   
8  2025-11-01 09:00:00                    103.0                     72.0   
9  2025-11-01 10:00:00                     92.0                     71.0   
10 2025-11-01 11:00:00                     92.0                     80.0   
11 2025-11-01 12:00:00                 

**FITTING AND SAVING THE YEO-JOHNSON TRANSFORMER FOR ALL NUMERIC COLUMNS.**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from scipy.stats import skew
import joblib  # For saving the transformer

# Load the cleaned (untransformed) dataset
cleaned_df = pd.read_csv("AQI_Project/data/KARACHI-AQI-RECORDS-2023-2025-CLEANED.csv")

# Select every numeric column, whatever its width (not only float64/int64)
numeric_cols = cleaned_df.select_dtypes(include='number').columns

# Snapshot of the numeric data before transformation, with per-column skewness
df_before = cleaned_df[numeric_cols].copy()
print("Skewness before:\n", df_before.apply(lambda x: round(skew(x.dropna()), 3)))

# Apply the Yeo-Johnson transformation to all numeric columns at once.
# The row index is carried over explicitly so that writing the result back
# into `cleaned_df` aligns row-for-row even if the index is not the default.
pt = PowerTransformer(method='yeo-johnson')
df_after = pd.DataFrame(
    pt.fit_transform(df_before),
    columns=numeric_cols,
    index=df_before.index,
)

# Save the fitted transformer (it covers every column in `numeric_cols`)
joblib.dump(pt, "AQI_Project/models/yeo_transformer.pkl")
print("\nYeo–Johnson transformer saved successfully!")

# Replace numeric columns with transformed data.
# From here on `cleaned_df` holds TRANSFORMED values despite its name.
cleaned_df[numeric_cols] = df_after

# Save transformed dataset
transformed_csv = "AQI_Project/data/KARACHI-AQI-RECORDS-2023-2025-TRANSFORMED.csv"
cleaned_df.to_csv(transformed_csv, index=False)
print("\nTransformed dataset saved successfully!")


Skewness before:
 pm10                    0.814
pm2_5                   0.862
carbon_monoxide         1.161
nitrogen_dioxide        1.117
sulphur_dioxide         0.905
ozone                   0.676
us_aqi                  0.841
temperature_2m         -0.574
relative_humidity_2m   -0.638
wind_speed_10m          0.600
dtype: float64

Yeo–Johnson transformer saved successfully!

Transformed dataset saved successfully!
