# Fixed LSTM for AQI Prediction

## Key Fixes:
1. ✅ Split data BEFORE scaling (prevents data leakage)
2. ✅ Fit scaler only on training data
3. ✅ Proper inverse transformation
4. ✅ Better evaluation with train/test metrics
5. ✅ Track best model based on test loss

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from copy import deepcopy as dc

In [2]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

## Load and Prepare Data

In [3]:
# Load data
df = pd.read_csv('data.csv')

# Process dates
df['temp_date'] = pd.to_datetime(df['Date'].astype(str) + '-2020', format='%d-%b-%Y')
years = []
current_year = 2017
prev_month = 3  

for idx, row in df.iterrows():
    current_month = row['temp_date'].month
    if current_month < prev_month:
        current_year += 1
    years.append(current_year)
    prev_month = current_month

df['Year'] = years
date_strings = df['Year'].astype(str) + df['temp_date'].dt.strftime('-%m-%d')
df['Date'] = pd.to_datetime(date_strings)
df.set_index('Date', inplace=True)
df.drop(['temp_date', 'Year'], axis=1, inplace=True)

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index[0]} to {df.index[-1]}")

Dataset shape: (1291, 41)
Date range: 2017-03-22 00:00:00 to 2020-10-07 00:00:00


In [4]:
# Create lagged features
lookback = 7
shifted_df = dc(df)

for i in range(1, lookback + 1):
    shifted_df[f'AQI-{i}'] = shifted_df['Grand Total'].shift(i)

shifted_df.dropna(inplace=True)
shifted_df.rename(columns={'Grand Total': 'AQI'}, inplace=True)

print(f"Shifted DataFrame shape: {shifted_df.shape}")
print(f"Columns: {list(shifted_df.columns)}")

Shifted DataFrame shape: (0, 48)
Columns: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '104', '105', '106', '108', '3269728', '4372603', '4438736', '4439094', '4439475', '5901326', '5917712', '7723955', '30369354', '34703847', '36200067', '36671067', '37052366', '38834077', '39168240', '39379024', 'AQI', 'AQI-1', 'AQI-2', 'AQI-3', 'AQI-4', 'AQI-5', 'AQI-6', 'AQI-7']


## ✅ FIX 1: Split BEFORE Scaling (Prevent Data Leakage)

In [5]:
# Split data first
split_ratio = 0.8
split_index = int(len(shifted_df) * split_ratio)

train_df = shifted_df.iloc[:split_index].copy()
test_df = shifted_df.iloc[split_index:].copy()

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 0, Test size: 0


## ✅ FIX 2: Fit Scaler ONLY on Training Data

In [6]:
# Fit scaler only on training data
scaler = MinMaxScaler(feature_range=(-1, 1))
train_arr = scaler.fit_transform(train_df)  # Fit only on train
test_arr = scaler.transform(test_df)  # Transform test using train statistics

print("Scaler fitted on training data only!")

ValueError: Found array with 0 sample(s) (shape=(0, 48)) while a minimum of 1 is required by MinMaxScaler.

In [None]:
# Prepare sequences
X_train = train_arr[:, 1:]  # Features (lagged values)
y_train = train_arr[:, 0]   # Target (current AQI)
X_test = test_arr[:, 1:]
y_test = test_arr[:, 0]

# Reshape for LSTM
X_train = X_train.reshape((-1, lookback, 1))
X_test = X_test.reshape((-1, lookback, 1))
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

# Convert to tensors
device = 'cuda' if torch.cuda.is_available() else 'cpu'
X_train = torch.tensor(X_train).float().to(device)
y_train = torch.tensor(y_train).float().to(device)
X_test = torch.tensor(X_test).float().to(device)
y_test = torch.tensor(y_test).float().to(device)

print(f"Using device: {device}")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

## Define LSTM Model

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, 
                           batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Initialize model
model = LSTM(1, 64, 2).to(device)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

## ✅ FIX 3: Better Training Loop with Proper Evaluation

In [None]:
num_epochs = 150
train_losses = []
test_losses = []
best_test_loss = float('inf')
best_model = None

print("Training started...")
for epoch in range(num_epochs):
    # Training
    model.train()
    optimizer.zero_grad()
    
    y_pred_train = model(X_train)
    train_loss = loss_function(y_pred_train, y_train)
    
    train_loss.backward()
    optimizer.step()
    
    # Evaluation on test set
    model.eval()
    with torch.no_grad():
        y_pred_test = model(X_test)
        test_loss = loss_function(y_pred_test, y_test)
    
    train_losses.append(train_loss.item())
    test_losses.append(test_loss.item())
    
    # Save best model
    if test_loss.item() < best_test_loss:
        best_test_loss = test_loss.item()
        best_model = dc(model.state_dict())
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss.item():.6f}, Test Loss: {test_loss.item():.6f}')

# Load best model
model.load_state_dict(best_model)
print("\nTraining complete! Best model loaded.")

## ✅ FIX 4: Proper Inverse Transformation

In [None]:
def inverse_transform_predictions(predictions_normalized, scaler, lookback):
    """
    Properly inverse transform predictions
    """
    # Create dummy array with predictions in first column
    dummy = np.zeros((len(predictions_normalized), lookback + 1))
    dummy[:, 0] = predictions_normalized.flatten()
    
    # Inverse transform and extract first column
    predictions_original = scaler.inverse_transform(dummy)[:, 0]
    return predictions_original

# Get predictions
model.eval()
with torch.no_grad():
    train_pred = model(X_train).cpu().numpy()
    test_pred = model(X_test).cpu().numpy()

# Inverse transform
y_train_actual = inverse_transform_predictions(y_train.cpu().numpy(), scaler, lookback)
y_train_predicted = inverse_transform_predictions(train_pred, scaler, lookback)
y_test_actual = inverse_transform_predictions(y_test.cpu().numpy(), scaler, lookback)
y_test_predicted = inverse_transform_predictions(test_pred, scaler, lookback)

## Evaluation Metrics

In [None]:
# Calculate metrics
train_r2 = r2_score(y_train_actual, y_train_predicted)
test_r2 = r2_score(y_test_actual, y_test_predicted)
train_rmse = np.sqrt(mean_squared_error(y_train_actual, y_train_predicted))
test_rmse = np.sqrt(mean_squared_error(y_test_actual, y_test_predicted))
train_mae = mean_absolute_error(y_train_actual, y_train_predicted)
test_mae = mean_absolute_error(y_test_actual, y_test_predicted)

print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)
print(f"Train R²: {train_r2:.4f} | Test R²: {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.2f} | Test RMSE: {test_rmse:.2f}")
print(f"Train MAE: {train_mae:.2f} | Test MAE: {test_mae:.2f}")
print("="*60)

## Visualizations

In [None]:
# Plot training history
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss', alpha=0.7)
plt.plot(test_losses, label='Test Loss', alpha=0.7)
plt.xlabel('Epoch')
plt.ylabel('MSE Loss (normalized)')
plt.title('Training History')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.plot(train_losses, label='Train Loss', alpha=0.7)
plt.plot(test_losses, label='Test Loss', alpha=0.7)
plt.xlabel('Epoch')
plt.ylabel('MSE Loss (normalized)')
plt.title('Training History (Log Scale)')
plt.yscale('log')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Plot predictions scatter
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
plt.scatter(y_train_actual, y_train_predicted, alpha=0.5, s=10)
plt.plot([y_train_actual.min(), y_train_actual.max()], 
         [y_train_actual.min(), y_train_actual.max()], 'r--', lw=2)
plt.xlabel('Actual AQI')
plt.ylabel('Predicted AQI')
plt.title(f'Train Predictions (R² = {train_r2:.4f})')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.scatter(y_test_actual, y_test_predicted, alpha=0.5, s=10)
plt.plot([y_test_actual.min(), y_test_actual.max()], 
         [y_test_actual.min(), y_test_actual.max()], 'r--', lw=2)
plt.xlabel('Actual AQI')
plt.ylabel('Predicted AQI')
plt.title(f'Test Predictions (R² = {test_r2:.4f})')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Time series plot
plt.figure(figsize=(15, 6))
train_dates = train_df.index
test_dates = test_df.index

plt.plot(train_dates[-200:], y_train_actual[-200:], label='Train Actual', alpha=0.7, linewidth=1)
plt.plot(train_dates[-200:], y_train_predicted[-200:], label='Train Predicted', alpha=0.7, linewidth=1)
plt.plot(test_dates, y_test_actual, label='Test Actual', alpha=0.7, linewidth=2)
plt.plot(test_dates, y_test_predicted, label='Test Predicted', alpha=0.7, linewidth=2)
plt.axvline(x=test_dates[0], color='red', linestyle='--', alpha=0.5, label='Train/Test Split')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.title('AQI Predictions Over Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Future Predictions

In [None]:
def predict_future(model, last_sequence, n_future, scaler, lookback, device):
    """
    Predict future AQI values
    """
    model.eval()
    predictions = []
    current_sequence = last_sequence.copy()
    
    with torch.no_grad():
        for _ in range(n_future):
            x = torch.tensor(current_sequence).float().reshape(1, lookback, 1).to(device)
            pred = model(x).cpu().numpy()[0, 0]
            predictions.append(pred)
            current_sequence = np.append(current_sequence[1:], pred)
    
    # Proper inverse transform
    predictions = np.array(predictions)
    predictions = inverse_transform_predictions(predictions, scaler, lookback)
    return predictions

# Get last sequence from test set
last_sequence = X_test[-1].cpu().numpy().flatten()

# Predict next 31 days
future_predictions = predict_future(model, last_sequence, 31, scaler, lookback, device)

# Create future dates
last_date = test_df.index[-1]
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=31, freq='D')

In [None]:
# Plot future predictions
plt.figure(figsize=(15, 5))
plt.plot(test_df.index[-100:], y_test_actual[-100:], label='Historical (Test)', alpha=0.7, linewidth=2)
plt.plot(test_df.index[-100:], y_test_predicted[-100:], label='Model Predictions (Test)', alpha=0.7, linewidth=2)
plt.plot(future_dates, future_predictions, label='Future Predictions', color='red', marker='o', linewidth=2)
plt.axvline(x=last_date, color='green', linestyle='--', alpha=0.5, label='Forecast Start')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.title('AQI Future Predictions (Next 31 Days)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nFuture AQI Predictions:")
for date, pred in zip(future_dates, future_predictions):
    print(f"{date.strftime('%Y-%m-%d')}: {pred:.2f}")