In [38]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
import json
import os
from pathlib import Path

# Visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_palette("viridis")

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [39]:
def create_lag_features(data, target_col='AQI', n_lags=7):
    """Create lag features for the target column"""
    data = data.sort_values('Date')
    for i in range(1, n_lags + 1):
        data[f'{target_col}_lag_{i}'] = data[target_col].shift(i)
    return data

# Cell 2: Load pre-split data for each city
processed_dir = '../data/processed'
cities = ['bengaluru', 'chennai', 'delhi', 'hyderabad']

# Dictionary to store data for each city
city_data = {}

for city in cities:
    city_dir = f'{processed_dir}/{city.lower()}'
    city_data[city] = {
        'train': pd.read_csv(f'{city_dir}/train.csv'),
        'val': pd.read_csv(f'{city_dir}/val.csv'),
        'test': pd.read_csv(f'{city_dir}/test.csv')
    }
    
    # Convert date columns to datetime
    for split in ['train', 'val', 'test']:
        city_data[city][split]['Date'] = pd.to_datetime(city_data[city][split]['Date'])
        
        # Check if lag features exist, if not create them
        if 'AQI_lag_1' not in city_data[city][split].columns:
            print(f"Creating lag features for {city} {split} set")
            city_data[city][split] = create_lag_features(city_data[city][split])
    
    print(f"\n{city.title()} data loaded:")
    print(f"Train: {city_data[city]['train'].shape[0]} samples")
    print(f"Validation: {city_data[city]['val'].shape[0]} samples")
    print(f"Test: {city_data[city]['test'].shape[0]} samples")

Creating lag features for bengaluru train set
Creating lag features for bengaluru val set
Creating lag features for bengaluru test set

Bengaluru data loaded:
Train: 1241 samples
Validation: 287 samples
Test: 382 samples
Creating lag features for chennai train set
Creating lag features for chennai val set
Creating lag features for chennai test set

Chennai data loaded:
Train: 1224 samples
Validation: 283 samples
Test: 377 samples
Creating lag features for delhi train set
Creating lag features for delhi val set
Creating lag features for delhi test set

Delhi data loaded:
Train: 1299 samples
Validation: 300 samples
Test: 400 samples
Creating lag features for hyderabad train set
Creating lag features for hyderabad val set
Creating lag features for hyderabad test set

Hyderabad data loaded:
Train: 1222 samples
Validation: 282 samples
Test: 376 samples


In [40]:
# Cell 3: Define utility functions
def calculate_metrics(y_true, y_pred, prefix=''):
    """Calculate comprehensive evaluation metrics with confidence intervals"""
    metrics = {}
    
    # Remove any NaN values from both arrays
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    
    if len(y_true) == 0 or len(y_pred) == 0:
        print("Warning: No valid data points after removing NaN values")
        return {f'{prefix}rmse': np.nan, f'{prefix}mae': np.nan, f'{prefix}r2': np.nan}
    
    # Calculate basic metrics
    metrics[f'{prefix}rmse'] = np.sqrt(mean_squared_error(y_true, y_pred))
    metrics[f'{prefix}mae'] = mean_absolute_error(y_true, y_pred)
    metrics[f'{prefix}r2'] = r2_score(y_true, y_pred)
    
    # Calculate confidence intervals using bootstrap
    n_iterations = 1000
    n_samples = len(y_true)
    
    # Initialize arrays to store bootstrap metrics
    rmse_boots = np.zeros(n_iterations)
    mae_boots = np.zeros(n_iterations)
    r2_boots = np.zeros(n_iterations)
    
    for i in range(n_iterations):
        # Sample with replacement
        indices = np.random.randint(0, n_samples, n_samples)
        y_true_boot = y_true[indices]
        y_pred_boot = y_pred[indices]
        
        # Calculate metrics for this bootstrap sample
        rmse_boots[i] = np.sqrt(mean_squared_error(y_true_boot, y_pred_boot))
        mae_boots[i] = mean_absolute_error(y_true_boot, y_pred_boot)
        r2_boots[i] = r2_score(y_true_boot, y_pred_boot)
    
    # Calculate 95% confidence intervals
    for metric_name, metric_boots in [('rmse', rmse_boots), ('mae', mae_boots), ('r2', r2_boots)]:
        lower, upper = np.percentile(metric_boots, [2.5, 97.5])
        metrics[f'{prefix}{metric_name}_ci'] = (lower, upper)
    
    return metrics

def print_metrics(metrics):
    """Print metrics in a formatted way"""
    for metric_name, value in metrics.items():
        if '_ci' in metric_name:
            print(f"{metric_name}: ({value[0]:.3f}, {value[1]:.3f})")
        else:
            print(f"{metric_name}: {value:.3f}")

def plot_metrics_comparison(city, results):
    """Plot comparison of metrics for different models"""
    models = ['persistence', 'moving_average']
    metrics = ['rmse', 'mae', 'r2']
    
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    
    for idx, metric in enumerate(metrics):
        # Get values and confidence intervals
        values = []
        cis = []
        for model in models:
            val_metric = results[model]['validation'][f'val_{metric}']
            val_ci = results[model]['validation'][f'val_{metric}_ci']
            values.append(val_metric)
            cis.append(val_ci)
        
        # Plot bars with error bars
        x = np.arange(len(models))
        axes[idx].bar(x, values, yerr=[(v-ci[0], ci[1]-v) for v, ci in zip(values, cis)], 
                     capsize=5, alpha=0.7)
        
        axes[idx].set_xticks(x)
        axes[idx].set_xticklabels([m.replace('_', ' ').title() for m in models])
        axes[idx].set_title(f'{metric.upper()} Comparison')
        axes[idx].set_ylabel(metric.upper())
    
    plt.suptitle(f'Model Performance Comparison - {city.title()}', y=1.05)
    plt.tight_layout()
    return fig

def plot_predictions(city, results, data_split='test'):
    """Plot predictions from baseline models"""
    plt.figure(figsize=(15, 8))
    
    # Get actual values
    actual = city_data[city][data_split]['AQI']
    dates = city_data[city][data_split]['Date']
    
    # Plot actual values
    plt.plot(dates, actual, label='Actual', color='black', alpha=0.6)
    
    # Plot predictions for each model
    for model_name, color in [('persistence', 'blue'), ('moving_average', 'red')]:
        predictions = results[model_name]['predictions'][data_split]
        plt.plot(dates, predictions, 
                label=model_name.replace('_', ' ').title(),
                color=color, alpha=0.6)
    
    plt.title(f'{city.title()} - {data_split.title()} Set Predictions')
    plt.xlabel('Date')
    plt.ylabel('AQI')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    return plt.gcf()

In [41]:

# Cell 4: Define baseline models
class PersistenceModel:
    def __init__(self):
        self.name = "Persistence"
    
    def fit(self, X, y):
        # No training needed for persistence model
        pass
    
    def predict(self, X):
        # For persistence model, we'll use the 1-day lag if it exists
        if 'AQI_lag_1' not in X.columns:
            raise ValueError("AQI_lag_1 feature not found in the data")
        return X['AQI_lag_1'].values

class MovingAverageModel:
    def __init__(self, window_size=7):
        self.name = "Moving Average"
        self.window_size = window_size
    
    def fit(self, X, y):
        # No training needed for moving average model
        pass
    
    def predict(self, X):
        # Create a rolling window of lag features
        lag_cols = [f'AQI_lag_{i}' for i in range(1, self.window_size + 1)]
        if not all(col in X.columns for col in lag_cols):
            raise ValueError(f"Required lag features not found in the data")
        
        # Calculate moving average
        lag_values = X[lag_cols].values
        return np.nanmean(lag_values, axis=1)  # Use nanmean to handle NaN values

In [42]:
# Cell 5: Function to evaluate baseline models
def evaluate_baseline_models(city_data, city_name):
    """Evaluate baseline models for a specific city"""
    results = {}
    
    # Initialize models
    persistence = PersistenceModel()
    moving_avg = MovingAverageModel(window_size=7)
    
    # Fit and evaluate persistence model
    persistence.fit(city_data['train'], city_data['train']['AQI'])
    val_pred_persistence = persistence.predict(city_data['val'])
    test_pred_persistence = persistence.predict(city_data['test'])
    
    # Fit and evaluate moving average model
    moving_avg.fit(city_data['train'], city_data['train']['AQI'])
    val_pred_ma = moving_avg.predict(city_data['val'])
    test_pred_ma = moving_avg.predict(city_data['test'])
    
    # Calculate metrics
    persistence_val_metrics = calculate_metrics(city_data['val']['AQI'].values, val_pred_persistence, 'val_')
    persistence_test_metrics = calculate_metrics(city_data['test']['AQI'].values, test_pred_persistence, 'test_')
    
    ma_val_metrics = calculate_metrics(city_data['val']['AQI'].values, val_pred_ma, 'val_')
    ma_test_metrics = calculate_metrics(city_data['test']['AQI'].values, test_pred_ma, 'test_')
    
    # Store results
    results['persistence'] = {
        'validation': persistence_val_metrics,
        'test': persistence_test_metrics,
        'predictions': {
            'val': val_pred_persistence,
            'test': test_pred_persistence
        }
    }
    
    results['moving_average'] = {
        'validation': ma_val_metrics,
        'test': ma_test_metrics,
        'predictions': {
            'val': val_pred_ma,
            'test': test_pred_ma
        }
    }
    
    return results

In [52]:
# Cell 6: Define ModelResultsManager class
class ModelResultsManager:
    def __init__(self, base_dir='results'):
        self.base_dir = base_dir
        self._create_directory_structure()
    
    def _create_directory_structure(self):
        """Create necessary directories for storing results"""
        os.makedirs(f'{self.base_dir}/baseline', exist_ok=True)
        os.makedirs(f'{self.base_dir}/model_configs', exist_ok=True)
        os.makedirs(f'{self.base_dir}/predictions/baseline', exist_ok=True)
        os.makedirs(f'{self.base_dir}/performance_metrics', exist_ok=True)
        os.makedirs(f'{self.base_dir}/plots/baseline', exist_ok=True)
    
    def _update_metadata(self, city_name, model_type, results):
        """Update metadata file with latest results"""
        metadata_file = f'{self.base_dir}/metadata.json'
        
        # Load existing metadata or create new
        if os.path.exists(metadata_file):
            with open(metadata_file, 'r') as f:
                metadata = json.load(f)
        else:
            metadata = {}
        
        # Update metadata for this city and model type
        if city_name not in metadata:
            metadata[city_name] = {}
        
        metadata[city_name][model_type] = {
            'last_updated': datetime.now().isoformat(),
            'metrics_used': ['rmse', 'mae', 'r2'],
            'results': {
                'persistence': {
                    'validation': results['persistence']['validation'],
                    'test': results['persistence']['test']
                },
                'moving_average': {
                    'validation': results['moving_average']['validation'],
                    'test': results['moving_average']['test']
                }
            }
        }
        
        # Save updated metadata
        with open(metadata_file, 'w') as f:
            json.dump(metadata, f, indent=4)
    
    def _convert_to_serializable(self, obj):
        """Convert numpy arrays and other non-serializable objects to serializable format"""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, dict):
            return {k: self._convert_to_serializable(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self._convert_to_serializable(item) for item in obj]
        elif isinstance(obj, tuple):
            return tuple(self._convert_to_serializable(item) for item in obj)
        elif isinstance(obj, (np.int64, np.int32, np.int16, np.int8,
                            np.uint64, np.uint32, np.uint16, np.uint8)):
            return int(obj)
        elif isinstance(obj, (np.float64, np.float32, np.float16)):
            return float(obj)
        elif isinstance(obj, np.bool_):
            return bool(obj)
        return obj
    
    def save_baseline_results(self, city_name, results):
        """Save baseline model results for a city"""
        # Convert results to serializable format
        serializable_results = self._convert_to_serializable(results)
        
        # Save performance metrics
        metrics_file = f'{self.base_dir}/performance_metrics/baseline_{city_name}.json'
        with open(metrics_file, 'w') as f:
            json.dump(serializable_results, f, indent=4)
        
        # Save predictions
        pred_dir = f'{self.base_dir}/predictions/baseline/{city_name}'
        os.makedirs(pred_dir, exist_ok=True)
        
        for model_name in ['persistence', 'moving_average']:
            pred_data = {
                'validation': serializable_results[model_name]['predictions']['val'],
                'test': serializable_results[model_name]['predictions']['test']
            }
            with open(f'{pred_dir}/{model_name}_predictions.json', 'w') as f:
                json.dump(pred_data, f, indent=4)
        
        # Update metadata
        self._update_metadata(city_name, 'baseline', serializable_results)
    
    def save_plots(self, city_name, fig, plot_name):
        """Save plots for a city"""
        # Create directory for city plots if it doesn't exist
        plot_dir = f'{self.base_dir}/plots/baseline/{city_name}'
        os.makedirs(plot_dir, exist_ok=True)
        
        # Save the plot
        plot_path = f'{plot_dir}/{plot_name}.png'
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)  # Close the figure to free memory
    
    def load_baseline_results(self, city_name):
        """Load baseline model results for a city"""
        metrics_file = f'{self.base_dir}/performance_metrics/baseline_{city_name}.json'
        if not os.path.exists(metrics_file):
            raise FileNotFoundError(f"No results found for {city_name}")
        
        with open(metrics_file, 'r') as f:
            return json.load(f)

In [53]:
# Cell 7: Run evaluation and save results
results_manager = ModelResultsManager()
baseline_results = {}

for city in cities:
    print(f"\nEvaluating baseline models for {city}")
    baseline_results[city] = evaluate_baseline_models(city_data[city], city)
    
    # Print metrics
    print("\nPersistence Model:")
    print("Validation Metrics:")
    print_metrics(baseline_results[city]['persistence']['validation'])
    print("\nTest Metrics:")
    print_metrics(baseline_results[city]['persistence']['test'])
    
    print("\nMoving Average Model:")
    print("Validation Metrics:")
    print_metrics(baseline_results[city]['moving_average']['validation'])
    print("\nTest Metrics:")
    print_metrics(baseline_results[city]['moving_average']['test'])
    
    # Save results
    results_manager.save_baseline_results(city, baseline_results[city])
    
    # Generate and save plots
    fig = plot_metrics_comparison(city, baseline_results[city])
    results_manager.save_plots(city, fig, 'metrics_comparison')
    
    fig = plot_predictions(city, baseline_results[city], 'test')
    results_manager.save_plots(city, fig, 'test_predictions')
    
    print(f"\nResults and plots saved for {city}")


Evaluating baseline models for bengaluru

Persistence Model:
Validation Metrics:
val_rmse: 17.738
val_mae: 12.563
val_r2: 0.512
val_rmse_ci: (15.444, 20.144)
val_mae_ci: (11.185, 14.067)
val_r2_ci: (0.358, 0.629)

Test Metrics:
test_rmse: 11.324
test_mae: 7.906
test_r2: 0.667
test_rmse_ci: (9.908, 12.823)
test_mae_ci: (7.149, 8.745)
test_r2_ci: (0.561, 0.745)

Moving Average Model:
Validation Metrics:
val_rmse: 21.624
val_mae: 16.289
val_r2: 0.275
val_rmse_ci: (19.373, 24.066)
val_mae_ci: (14.733, 18.027)
val_r2_ci: (0.134, 0.388)

Test Metrics:
test_rmse: 13.973
test_mae: 9.713
test_r2: 0.492
test_rmse_ci: (12.062, 15.925)
test_mae_ci: (8.771, 10.733)
test_r2_ci: (0.384, 0.580)

Results and plots saved for bengaluru

Evaluating baseline models for chennai

Persistence Model:
Validation Metrics:
val_rmse: 36.885
val_mae: 23.734
val_r2: 0.269
val_rmse_ci: (31.742, 41.922)
val_mae_ci: (20.489, 27.078)
val_r2_ci: (0.052, 0.447)

Test Metrics:
test_rmse: 27.243
test_mae: 16.415
test_r2: 0