In [14]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

ValueError: Mountpoint must not already contain files

In [15]:
# Access files in your Google Drive
import os
os.listdir('/content/drive/MyDrive/Colab Notebooks/flight_delay')


['plots',
 'model_params',
 '.gradio',
 'data',
 '=2.0.0',
 'models',
 'reports',
 '=4.0.0']

In [6]:
# Install required libraries
# !pip install pandas numpy scikit-learn tensorflow matplotlib seaborn joblib scikeras shap kaggle ydata-profiling>=4.0.0 category_encoders>=2.0.0 -q

# Create project directories if they don't exist
import os
import sys
from google.colab import files
# Define base directory (e.g., flight_delay_project under /content/)
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/flight_delay'
# os.makedirs(os.path.join(BASE_DIR, 'data', 'raw'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'data', 'processed'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'scripts'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'models'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'model_params'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'plots', 'eda'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'plots', 'model_evaluation'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'plots', 'explainability'), exist_ok=True)
# os.makedirs(os.path.join(BASE_DIR, 'reports'), exist_ok=True)

# Change current working directory to BASE_DIR
os.chdir(BASE_DIR)

# Upload kaggle.json
uploaded = files.upload()

# Verify file uploaded
for filename in uploaded:
    print(f'Uploaded {filename}')
# Add scripts directory to Python path
sys.path.insert(0, os.path.join(BASE_DIR, 'scripts'))

print(f"Current working directory: {os.getcwd()}")
print("Directories created and Python path updated.")

# Kaggle API Setup (IMPORTANT!)
# Follow these steps:
# 1. Go to Kaggle.com -> Your Profile -> Account.
# 2. Under 'API', click 'Create New API Token'. This downloads 'kaggle.json'.
# 3. Upload 'kaggle.json' to your Colab environment.
#    You can click the folder icon on the left sidebar in Colab, then "Upload to session storage".
#    Make sure you upload it directly into the BASE_DIR path you defined above (e.g., /content/flight_delay_project/).
# 4. Run the following commands to move and secure the file:
!mkdir -p ~/.kaggle/
!mv kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

print("Kaggle API setup complete. Ensure 'kaggle.json' was uploaded to the correct path.")

Saving kaggle.json to kaggle.json
Uploaded kaggle.json
Current working directory: /content/drive/MyDrive/Colab Notebooks/flight_delay
Directories created and Python path updated.
Kaggle API setup complete. Ensure 'kaggle.json' was uploaded to the correct path.


In [None]:
os.chdir(BASE_DIR)
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content/drive/MyDrive/Colab Notebooks/flight_delay


## Package Installation

In [12]:
!pip install pandas numpy scikit-learn tensorflow matplotlib seaborn joblib scikeras shap kaggle ydata-profiling>=4.0.0 category_encoders>=2.0.0 gradio keras-tuner wandb -q

## Data Modelling


In [8]:
%%writefile scripts/data_modelling.py

import pandas as pd
import numpy as np
import logging

# For Data Splitting and Preprocessing
from sklearn.model_selection import KFold

# For Regression Models
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

# For Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier # NEW: Import for baseline classification model

# For Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy
from tensorflow.keras.metrics import MeanAbsoluteError, BinaryAccuracy, AUC # Ensure AUC is imported
from tensorflow.keras.callbacks import EarlyStopping # Explicitly import EarlyStopping

# nn_utils for using in tuning and modelling
from scripts.nn_utils import build_nn_model # This is where your build_nn_model should be located

# Import ModelTuner
from scripts.model_tuning import ModelTuner


# Configure a logger for this module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

class DataModeling:
    """
    Handles training of various regression and classification models.
    """
    def __init__(self, tuner: ModelTuner): # Accept a ModelTuner instance
        self.tuner = tuner
        logger.info("DataModeling initialized.")

    def run_baseline_model(self, y_train: pd.Series, y_test: pd.Series) -> tuple[np.ndarray, None]:
        """
        Runs a baseline model that predicts the average of the training target (for regression).

        Args:
            y_train (pd.Series): Training target values.
            y_test (pd.Series): Test target values.

        Returns:
            tuple[np.ndarray, None]: Predicted values (no model object for baseline).
        """
        logger.info("--- Running Baseline Regression Model (Average Prediction) ---")
        y_pred_baseline = np.full(len(y_test), y_train.mean())
        return y_pred_baseline, None # No model object to return for baseline (as it's a simple constant)

    def run_baseline_model_classification(self, y_train_val: pd.Series, y_test: pd.Series) -> tuple[np.ndarray, DummyClassifier]:
        """
        Trains and returns a baseline classification model (predicts majority class).

        Args:
            y_train_val (pd.Series): Training + Validation target values to find majority class.
            y_test (pd.Series): Test target values (used for determining prediction array size).

        Returns:
            tuple[np.ndarray, DummyClassifier]: Predicted class labels for the test set, and the trained DummyClassifier model.
        """
        logger.info("--- Running Baseline Classification Model (Majority Class Classifier) ---")

        # Initialize DummyClassifier with 'most_frequent' strategy
        baseline_model = DummyClassifier(strategy='most_frequent', random_state=42)

        # Fit the model. X is not truly used, but required for the fit method.
        # We just need to fit on y_train_val to determine the most frequent class.
        # We create a dummy X with the same number of samples as y_train_val.
        dummy_X = np.zeros((len(y_train_val), 1))
        baseline_model.fit(dummy_X, y_train_val)

        # Predict on a dummy X for the test set to get the predictions
        dummy_X_test = np.zeros((len(y_test), 1))
        y_pred_baseline = baseline_model.predict(dummy_X_test)

        majority_class = baseline_model.predict(np.array([[0]]))
        if majority_class[0] == 0:
            y_pred_proba_baseline = np.zeros(len(y_test))
        else: # majority_class[0] == 1
            y_pred_proba_baseline = np.ones(len(y_test))

        logger.info(f"Baseline (majority class) for training data: {majority_class[0]}")
        logger.info("Baseline Classification Model training complete.")

        return y_pred_baseline, baseline_model # Return actual predictions and the model itself

    def run_ridge_regression(self, X_train_val: pd.DataFrame, y_train_val: pd.Series) -> Ridge:
        """
        Trains a Ridge Regression model using hyperparameter tuning on the combined
        training and validation set.

        Args:
            X_train_val (pd.DataFrame): Training + Validation features.
            y_train_val (pd.Series): Training + Validation target.

        Returns:
            Ridge: The best trained Ridge model.
        """
        logger.info("\n--- Running Ridge Regression ---")
        ridge = Ridge(random_state=42)
        # Eased out param_grid for faster tuning
        param_grid = {'alpha': [1, 10]} # Smaller, more focused grid

        # Attempt to load best params first
        best_params_loaded = self.tuner.get_best_params("ridge_regression")

        if best_params_loaded:
            logger.info("Using loaded best parameters for Ridge Regression to train final model.")
            best_ridge = Ridge(**best_params_loaded, random_state=42)
            best_ridge.fit(X_train_val, y_train_val)
        else:
            logger.info("No saved best parameters found. Performing full hyperparameter tuning for Ridge Regression.")
            best_ridge, _, _ = self.tuner.tune_model(
                model=ridge,
                param_grid=param_grid,
                X=X_train_val, # Use X_train_val for tuning
                y=y_train_val, # Use y_train_val for tuning
                model_name="ridge_regression",
                scoring='neg_mean_absolute_error' # Ensure correct scoring for regression
            )
        return best_ridge

    def run_random_forest_regression(self, X_train_val: pd.DataFrame, y_train_val: pd.Series) -> RandomForestRegressor:
        """
        Trains a Random Forest Regressor using hyperparameter tuning on the combined
        training and validation set.

        Args:
            X_train_val (pd.DataFrame): Training + Validation features.
            y_train_val (pd.Series): Training + Validation target.

        Returns:
            RandomForestRegressor: The best trained Random Forest model.
        """
        logger.info("\n--- Running Random Forest Regression ---")
        rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

        # Eased out param_grid for faster tuning
        param_grid = {
            'n_estimators': [50, 100], # Fewer estimators
            'max_depth': [5, 10],      # Fewer max depths
            'min_samples_split': [2]   # Single value or very few
        }

        # Attempt to load best params first
        best_params_loaded = self.tuner.get_best_params("random_forest_regression")

        if best_params_loaded:
            logger.info("Using loaded best parameters for Random Forest Regression to train final model.")
            best_rf = RandomForestRegressor(**best_params_loaded, random_state=42, n_jobs=-1)
            best_rf.fit(X_train_val, y_train_val)
        else:
            logger.info("No saved best parameters found. Performing full hyperparameter tuning for Random Forest Regression.")
            best_rf, _, _ = self.tuner.tune_model(
                model=rf_model,
                param_grid=param_grid,
                X=X_train_val, # Use X_train_val for tuning
                y=y_train_val, # Use y_train_val for tuning
                model_name="random_forest_regression",
                search_method='random', # Using random search
                n_iter=3, # Reduced number of iterations for random search
                scoring='neg_mean_absolute_error' # Ensure correct scoring for regression
            )
        return best_rf

    def run_logistic_regression(self, X_train_val: pd.DataFrame, y_train_val: pd.Series) -> LogisticRegression:
        """
        Trains a Logistic Regression model using hyperparameter tuning for classification.

        Args:
            X_train_val (pd.DataFrame): Training + Validation features.
            y_train_val (pd.Series): Training + Validation target.

        Returns:
            LogisticRegression: The best trained Logistic Regression model.
        """
        logger.info("\n--- Running Logistic Regression (Classification) ---")
        log_reg = LogisticRegression(random_state=42, solver='liblinear', max_iter=500, class_weight='balanced')

        # Eased out param_grid for faster tuning
        param_grid = {
            'C': [1, 10],       # Fewer C values
            'penalty': ['l1']   # Single penalty type
        }

        best_params_loaded = self.tuner.get_best_params("logistic_regression")

        if best_params_loaded:
            logger.info("Using loaded best parameters for Logistic Regression to train final model.")
            best_log_reg = LogisticRegression(**best_params_loaded, random_state=42, solver='liblinear', max_iter=500)
            best_log_reg.fit(X_train_val, y_train_val)
        else:
            logger.info("No saved best parameters found. Performing full hyperparameter tuning for Logistic Regression.")
            best_log_reg, _, _ = self.tuner.tune_model(
                model=log_reg,
                param_grid=param_grid,
                X=X_train_val,
                y=y_train_val,
                model_name="logistic_regression",
                scoring='f1' # Common scoring for classification, especially with imbalance
            )
        return best_log_reg

    def run_neural_network(self, X_train: pd.DataFrame, y_train: pd.Series,
                           X_val: pd.DataFrame, y_val: pd.Series, task_type: str = 'regression') -> tf.keras.models.Sequential:
        """
        Trains a Neural Network model for either regression or classification.

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series): Training target.
            X_val (pd.DataFrame): Validation features.
            y_val (pd.Series): Validation target.
            task_type (str): 'regression' or 'classification'.

        Returns:
            tf.keras.models.Sequential: The trained Keras model.
        """
        logger.info(f"\n--- Running Neural Network ({task_type.capitalize()}) ---")
        tf.random.set_seed(42)
        np.random.seed(42)

        # Default params, will be overridden by best_hyperparameters if tuning occurs
        nn_model_params = {
            'input_dim': X_train.shape[1],
            'hidden_layers': 2,
            'neurons': 64,
            'dropout_rate': 0.1,
            'learning_rate': 0.001,
            'task_type': task_type # Pass task type to build_nn_model
        }

        # Check if we have best params for 'neural_network_regression' or 'neural_network_classification'
        model_name_for_params = f"neural_network_{task_type}"
        best_params_loaded = self.tuner.get_best_params(model_name_for_params)

        if best_params_loaded:
            logger.info(f"Using loaded best parameters for {model_name_for_params} to train final model.")
            # Merge loaded params with fixed ones (like input_dim, task_type)
            nn_model_params.update(best_params_loaded)
            # Ensure task_type is correctly set for build_nn_model
            nn_model_params['task_type'] = task_type

        nn_model = build_nn_model(**nn_model_params)

        logger.info(f"Neural Network Model Summary ({task_type}):\n{nn_model.summary()}")

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=5, # Reduced patience for faster convergence
            restore_best_weights=True
        )

        history = nn_model.fit(X_train, y_train,
                               validation_data=(X_val, y_val),
                               epochs=50, # Reduced max epochs with early stopping
                               batch_size=32,
                               verbose=0, # Set to 1 for progress bar during training
                               callbacks=[early_stopping])

        logger.info(f"Neural Network {task_type.capitalize()} Model training complete.")
        return nn_model

    def run_neural_network_classification(self, X_train: pd.DataFrame, y_train: pd.Series, X_val: pd.DataFrame, y_val: pd.Series) -> tf.keras.models.Sequential:
        """Trains and returns a Neural Network Classification model."""
        logger.info("Training Neural Network Classification Model...")
        input_dim = X_train.shape[1]

        best_hyperparameters = None
        if self.tuner:
            best_hyperparameters = self.tuner.tune_neural_network_classification(X_train, y_train, X_val, y_val)
            logger.info(f"Neural Network best hyperparameters: {best_hyperparameters}")

        if best_hyperparameters:
            # Use the build_nn_model function with the tuned hyperparameters
            model = build_nn_model(
                input_dim=input_dim,
                hidden_layers=best_hyperparameters['hidden_layers'],
                neurons=best_hyperparameters['neurons'],
                dropout_rate=best_hyperparameters['dropout_rate'],
                learning_rate=best_hyperparameters['learning_rate'],
                task_type='classification' # Explicitly set task_type
            )
        else:
            # Default model for non-tuned scenario (if tuner is None or returns no params)
            # These defaults should align with reasonable starting points for build_nn_model
            model = build_nn_model(
                input_dim=input_dim,
                hidden_layers=2,
                neurons=64,
                dropout_rate=0.3,
                learning_rate=0.001,
                task_type='classification'
            )

        # Define epochs and early stopping based on whether tuning occurred
        # Note: epochs might be tuned in Keras Tuner, but here we just use early stopping to control
        epochs = 50 # Start with a moderate number, EarlyStopping will cut it short
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) # Reduced patience

        model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=32,
            validation_data=(X_val, y_val),
            callbacks=[early_stopping],
            verbose=0 # Set to 1 for progress bar during training
        )
        logger.info("Neural Network Classification Model training complete.")
        return model

Overwriting scripts/data_modelling.py


## Data Preprocessing


In [None]:
%%writefile scripts/data_preprocessor.py

import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import category_encoders as ce
from typing import Tuple, Set, Dict, List, Union

# Set up logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

class DataPreprocessor:
    """
    Handles various data preprocessing steps for the flight delay dataset.
    """
    def __init__(self):
        self.label_encoders = {} # To store LabelEncoders for inverse transform if needed
        self.one_hot_encoder = None # To store the OneHotEncoder transformer
        self.numerical_imputer = SimpleImputer(strategy='mean')
        self.woe_encoder = None # To store the WOE encoder
        logger.info("DataPreprocessor initialized.")

    def handle_missing_values(self, df: pd.DataFrame, delay_cols: List[str]) -> pd.DataFrame:
        """
        Handles missing values, specifically for delay reason columns.
        Fills NaN in delay reason columns with 0.
        Drops rows where 'ARR_DELAY' is NaN (as it's our target).
        """
        logger.info("Handling missing values...")

        # Fill NaN in delay reason columns with 0 (assuming NaN means no delay reason from that category)
        for col in delay_cols:
            if col in df.columns:
                df[col] = df[col].fillna(0)
                logger.debug(f"Filled NaN in '{col}' with 0.")

        # Drop rows where 'ARR_DELAY' is NaN, as it's the target variable
        initial_rows = df.shape[0]
        if 'ARR_DELAY' in df.columns:
            df.dropna(subset=['ARR_DELAY'], inplace=True)
            rows_dropped = initial_rows - df.shape[0]
            if rows_dropped > 0:
                logger.info(f"Dropped {rows_dropped} rows due to NaN values in 'ARR_DELAY'.")
        else:
            logger.warning("'ARR_DELAY' column not found, skipping NaN drop for target.")

        logger.info(f"DataFrame shape after handling missing values: {df.shape}")
        return df

    def remove_outliers_iqr(self, df: pd.DataFrame, column: str = 'ARR_DELAY', iqr_multiplier: float = 3.0) -> pd.DataFrame:
        """
        Removes outliers from a specified numerical column using the IQR method.
        """
        if column not in df.columns or not pd.api.types.is_numeric_dtype(df[column]):
            logger.warning(f"Column '{column}' not found or not numeric. Skipping outlier removal.")
            return df

        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - iqr_multiplier * IQR
        upper_bound = Q3 + iqr_multiplier * IQR

        initial_rows = df.shape[0]
        df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy()
        rows_removed = initial_rows - df_filtered.shape[0]

        logger.info(f"Removed {rows_removed} outliers from '{column}' using IQR (multiplier={iqr_multiplier}).")
        logger.info(f"DataFrame shape after outlier removal: {df_filtered.shape}")
        return df_filtered

    def create_elapsed_time_diff(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculates the difference between actual elapsed time and scheduled elapsed time.
        Handles missing values in time columns by dropping rows or imputing if necessary.
        """
        logger.info("Creating 'ELAPSED_TIME_DIFF' feature...")

        # Define time columns that must be present and numeric
        required_time_cols = ['ACTUAL_ELAPSED_TIME', 'CRS_ELAPSED_TIME']

        for col in required_time_cols:
            if col not in df.columns:
                logger.warning(f"Required time column '{col}' not found. Cannot create 'ELAPSED_TIME_DIFF'.")
                return df
            # Ensure they are numeric, coerce errors to NaN
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Drop rows where essential time components are NaN
        initial_rows = df.shape[0]
        df.dropna(subset=required_time_cols, inplace=True)
        rows_dropped = initial_rows - df.shape[0]
        if rows_dropped > 0:
            logger.warning(f"Dropped {rows_dropped} rows due to missing values in required time columns for 'ELAPSED_TIME_DIFF'.")

        if not df.empty:
            df['ELAPSED_TIME_DIFF'] = df['ACTUAL_ELAPSED_TIME'] - df['CRS_ELAPSED_TIME']
            logger.info("Created 'ELAPSED_TIME_DIFF' column.")
        else:
            logger.warning("DataFrame is empty after dropping rows for time features; 'ELAPSED_TIME_DIFF' not created.")

        return df

    def apply_cyclical_encoding(self, df: pd.DataFrame, time_columns: List[str]) -> pd.DataFrame:
        """
        Applies cyclical (sine and cosine) encoding to time-based features.
        Assumes time columns are in minutes (0-1439 for a day).
        """
        logger.info("Applying cyclical encoding to time columns...")
        for col in time_columns:
            if col in df.columns:
                # Convert to numeric, errors will be NaN
                df[col] = pd.to_numeric(df[col], errors='coerce')
                # Fill NaNs with mean before encoding to avoid NaNs in sin/cos
                if df[col].isnull().any():
                    col_mean = df[col].mean()
                    df[col].fillna(col_mean, inplace=True)
                    logger.warning(f"Filled NaN in '{col}' with mean {col_mean:.2f} before cyclical encoding.")

                max_val = 2359 if 'TIME' in col.upper() else 1439 # Max minutes in 24 hours (HHMM format)
                if 'TIME' in col.upper(): # Heuristic for HHMM format
                    # Convert HHMM to minutes
                    df[f'{col}_MINUTES'] = (df[col] // 100) * 60 + (df[col] % 100)
                    col_to_encode = f'{col}_MINUTES'
                    max_val_for_sin_cos = 1440 # 24 * 60 minutes
                else: # Assume already in minutes or similar scale (e.g. wheels_on/off might be unix timestamp or similar, adjust max_val if needed)
                    col_to_encode = col
                    max_val_for_sin_cos = df[col].max() if df[col].max() > 0 else 1 # Avoid division by zero

                if max_val_for_sin_cos > 0:
                    df[f'{col_to_encode}_SIN'] = np.sin(2 * np.pi * df[col_to_encode] / max_val_for_sin_cos)
                    df[f'{col_to_encode}_COS'] = np.cos(2 * np.pi * df[col_to_encode] / max_val_for_sin_cos)
                    logger.debug(f"Applied cyclical encoding to '{col_to_encode}'.")
                    # Optionally drop the original column to avoid multicollinearity
                    # df.drop(columns=[col_to_encode], inplace=True, errors='ignore')
                else:
                    logger.warning(f"Max value for '{col_to_encode}' is zero, skipping cyclical encoding.")
            else:
                logger.warning(f"Time column '{col}' not found. Skipping cyclical encoding.")
        return df

    def split_city_state(self, df: pd.DataFrame, city_state_columns: List[str]) -> pd.DataFrame:
        """
        Splits 'CITY, STATE' columns into 'CITY' and 'STATE' columns.
        """
        logger.info("Splitting city and state columns...")
        for col in city_state_columns:
            if col in df.columns and df[col].astype(str).str.contains(',').any():
                df[[f'{col}_CITY', f'{col}_STATE']] = df[col].astype(str).str.split(', ', expand=True)
                df.drop(columns=[col], inplace=True)
                logger.debug(f"Split '{col}' into '{col}_CITY' and '{col}_STATE'.")
            else:
                logger.warning(f"Column '{col}' not found or does not contain 'CITY, STATE' format. Skipping split.")
        return df

    def add_weekday_weekend_columns(self, df: pd.DataFrame, date_columns: List[str]) -> pd.DataFrame:
        """
        Adds 'DAY_OF_WEEK' and 'IS_WEEKEND' columns from date columns.
        """
        logger.info("Adding weekday and weekend columns...")
        for col in date_columns:
            if col in df.columns:
                try:
                    df[col] = pd.to_datetime(df[col])
                    df['DAY_OF_WEEK'] = df[col].dt.dayofweek # Monday=0, Sunday=6
                    df['IS_WEEKEND'] = df['DAY_OF_WEEK'].isin([5, 6]).astype(int) # Saturday=5, Sunday=6
                    logger.debug(f"Added 'DAY_OF_WEEK' and 'IS_WEEKEND' from '{col}'.")
                except Exception as e:
                    logger.error(f"Error converting '{col}' to datetime or extracting day features: {e}. Skipping.", exc_info=True)
            else:
                logger.warning(f"Date column '{col}' not found. Skipping weekday/weekend features.")
        return df

    def encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Applies WOE (Weight of Evidence) encoding to all categorical features based on target variable FLIGHT_STATUS.

        Args:
            df (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with WOE encoded categorical features.
        """
        df_copy = df.copy()
        logger.info("Applying WOE encoding to all categorical features based on FLIGHT_STATUS.")

        # Check if target variable exists
        target_variable = 'FLIGHT_STATUS'
        if target_variable not in df_copy.columns:
            logger.error(f"Target variable '{target_variable}' not found in DataFrame. Cannot apply WOE encoding.")
            return df_copy

        # Identify all categorical columns (excluding the target variable)
        categorical_columns = []

        # Get columns with object dtype (string/categorical)
        object_cols = df_copy.select_dtypes(include=['object']).columns.tolist()

        # Get columns with category dtype
        category_cols = df_copy.select_dtypes(include=['category']).columns.tolist()

        # Combine and remove target variable
        categorical_columns = list(set(object_cols + category_cols))
        if target_variable in categorical_columns:
            categorical_columns.remove(target_variable)

        # You can also explicitly specify columns you want to encode
        # Uncomment and modify this list if you want to be more specific:
        # categorical_columns = [
        #     'FL_DATE_day_name', 'AIRLINE', 'FL_NUMBER', 'ORIGIN', 'ORIGIN_CITY',
        #     'DEST', 'DEST_CITY', 'origin_state', 'dest_state'
        # ]
        # categorical_columns = [col for col in categorical_columns if col in df_copy.columns]

        if not categorical_columns:
            logger.warning("No categorical columns found for WOE encoding.")
            return df_copy

        logger.info(f"Found {len(categorical_columns)} categorical columns for WOE encoding: {categorical_columns}")

        # Apply WOE encoding to all categorical columns
        try:
            # Handle missing values in target variable
            if df_copy[target_variable].isnull().any():
                logger.warning(f"Target variable '{target_variable}' contains NaN values. This may affect WOE encoding.")
                # You might want to handle NaNs in target variable before encoding
                # df_copy = df_copy.dropna(subset=[target_variable])  # Option 1: Drop rows with NaN target
                # df_copy[target_variable] = df_copy[target_variable].fillna('Unknown')  # Option 2: Fill NaNs

            # Handle missing values in categorical columns
            # for col in categorical_columns:
            #     if df_copy[col].isnull().any():
            #         logger.info(f"Column '{col}' contains NaN values. Filling with 'Missing' before WOE encoding.")
            #         df_copy[col] = df_copy[col].fillna('Missing')

            # Initialize WOE encoder
            self.woe_encoder = ce.WOEEncoder(cols=categorical_columns, handle_unknown='value', handle_missing='value')

            # Apply WOE encoding
            df_encoded = self.woe_encoder.fit_transform(df_copy[categorical_columns], df_copy[target_variable])

            # Replace original categorical columns with WOE encoded versions
            # Add '_WOE' suffix to distinguish encoded columns
            woe_columns = {}
            for i, col in enumerate(categorical_columns):
                woe_col_name = f'{col}_WOE'
                df_copy[woe_col_name] = df_encoded.iloc[:, i]
                woe_columns[col] = woe_col_name

            logger.info(f"Successfully applied WOE encoding to {len(categorical_columns)} categorical columns.")
            logger.info(f"Created WOE encoded columns: {list(woe_columns.values())}")

            # Drop original categorical columns
            df_copy = df_copy.drop(columns=categorical_columns)
            logger.info(f"Dropped original categorical columns: {categorical_columns}")

        except Exception as e:
            logger.error(f"Error applying WOE encoding: {e}", exc_info=True)
            return df_copy

        return df_copy

    def identify_high_correlation_pairs(self, df: pd.DataFrame, threshold: float = 0.95) -> Tuple[Set[str], Dict[Tuple[str, str], float]]:
        """
        Identifies pairs of highly correlated features and suggests one to drop.
        Returns a set of columns to drop and a dictionary of correlated pairs.
        """
        logger.info(f"Identifying high correlation features (threshold={threshold})...")

        # Select only numeric columns for correlation calculation
        numeric_df = df.select_dtypes(include=np.number)

        if numeric_df.empty:
            logger.warning("No numeric columns found for correlation analysis.")
            return set(), {}

        corr_matrix = numeric_df.corr().abs()
        upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        to_drop = set()
        correlated_pairs = {}

        for i in range(len(upper_tri.columns)):
            for j in range(i + 1, len(upper_tri.columns)):
                col1 = upper_tri.columns[i]
                col2 = upper_tri.columns[j]
                correlation_value = upper_tri.iloc[i, j]

                if pd.notna(correlation_value) and correlation_value >= threshold:
                    correlated_pairs[(col1, col2)] = correlation_value
                    # Simple heuristic: remove the second column in the pair
                    to_drop.add(col2)
                    logger.debug(f"High correlation found: {col1} and {col2} (Corr: {correlation_value:.2f}). Suggesting to drop {col2}.")

        logger.info(f"Found {len(to_drop)} columns to drop due to high correlation.")
        return to_drop, correlated_pairs

    def exclude_columns(self, df: pd.DataFrame, columns_to_exclude: List[str]) -> pd.DataFrame:
        """
        Excludes specified columns from the DataFrame.
        """
        logger.info("Excluding specified columns...")
        existing_cols_to_drop = [col for col in columns_to_exclude if col in df.columns]
        if existing_cols_to_drop:
            df.drop(columns=existing_cols_to_drop, inplace=True)
            logger.info(f"Excluded columns: {existing_cols_to_drop}")
        else:
            logger.info("No specified columns found to exclude.")
        return df

    def create_classification_target(self, df: pd.DataFrame, delay_column: str, delay_threshold_minutes: int) -> pd.DataFrame:
        """
        Creates a binary classification target based on arrival delay.
        1 if ARR_DELAY > delay_threshold_minutes, 0 otherwise.
        """
        if delay_column not in df.columns:
            logger.error(f"Delay column '{delay_column}' not found for creating classification target.")
            return df

        logger.info(f"Creating classification target: FLIGHT_STATUS_CLASSIFICATION (1 if {delay_column} > {delay_threshold_minutes} mins, else 0).")
        df['FLIGHT_STATUS_CLASSIFICATION'] = (df[delay_column] > delay_threshold_minutes).astype(int)
        logger.info(f"Classification target value counts:\n{df['FLIGHT_STATUS_CLASSIFICATION'].value_counts().to_string()}")
        return df

Overwriting scripts/data_preprocessor.py


## data Evaluate

In [5]:
%%writefile scripts/data_evaluate.py
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report
import pickle
import logging
import tensorflow as tf
import wandb # NEW: Import wandb

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

class ModelEvaluator:
    """
    Handles evaluation and logging of machine learning model performance.
    """
    def __init__(self, output_dir="models", plots_dir="plots/model_evaluation", wandb_run=None): # NEW: Add wandb_run
        self.output_dir = output_dir
        self.plots_dir = plots_dir
        self.wandb_run = wandb_run # NEW: Store wandb_run
        os.makedirs(self.plots_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)
        self.results_df = pd.DataFrame(columns=['Model', 'Stage', 'Metric', 'Value'])
        logger.info(f"ModelEvaluator initialized. Models will be saved to '{self.output_dir}', plots to '{self.plots_dir}'.")


    def save_model(self, model, filename_prefix: str):
        """
        Saves a trained model to the specified output directory.
        Handles both scikit-learn models (pickle) and Keras models (HDF5).

        Args:
            model: The trained model object.
            filename_prefix (str): Prefix for the filename (e.g., 'logistic_regression_model').
        """
        if isinstance(model, tf.keras.Model):
            file_path = os.path.join(self.output_dir, f"{filename_prefix}.h5")
            model.save(file_path)
            logger.info(f"Keras model saved to {file_path}")
        else:
            file_path = os.path.join(self.output_dir, f"{filename_prefix}.pkl")
            with open(file_path, 'wb') as f:
                pickle.dump(model, f)
            logger.info(f"Scikit-learn model saved to {file_path}")

        # NEW: Log model artifact to WandB
        if self.wandb_run:
            artifact = wandb.Artifact(name=f"{filename_prefix}-model", type="model")
            artifact.add_file(file_path)
            self.wandb_run.log_artifact(artifact)
            logger.info(f"Model artifact '{filename_prefix}' logged to WandB.")


    def load_model(self, filename_prefix: str):
        """
        Loads a trained model from the specified output directory.
        Handles both scikit-learn models (pickle) and Keras models (HDF5).

        Args:
            filename_prefix (str): Prefix of the filename (e.g., 'logistic_regression_model').

        Returns:
            The loaded model object, or None if not found/error.
        """
        keras_path = os.path.join(self.output_dir, f"{filename_prefix}.h5")
        sklearn_path = os.path.join(self.output_dir, f"{filename_prefix}.pkl")

        if os.path.exists(keras_path):
            try:
                # Keras model loading requires custom objects if used
                # For this pipeline, assuming standard layers
                model = tf.keras.models.load_model(keras_path)
                logger.info(f"Keras model loaded from {keras_path}")
                return model
            except Exception as e:
                logger.error(f"Error loading Keras model from {keras_path}: {e}")
                return None
        elif os.path.exists(sklearn_path):
            try:
                with open(sklearn_path, 'rb') as f:
                    model = pickle.load(f)
                logger.info(f"Scikit-learn model loaded from {sklearn_path}")
                return model
            except Exception as e:
                logger.error(f"Error loading scikit-learn model from {sklearn_path}: {e}")
                return None
        else:
            logger.info(f"No model found for '{filename_prefix}' at {self.output_dir}")
            return None

    def evaluate_classification_model(self, model_name: str, y_true: np.ndarray, y_pred: np.ndarray, y_pred_proba: np.ndarray, stage: str = "Test"):
        """
        Evaluates a classification model and logs various metrics.

        Args:
            model_name (str): Name of the model being evaluated.
            y_true (np.ndarray): True labels.
            y_pred (np.ndarray): Predicted labels.
            y_pred_proba (np.ndarray): Predicted probabilities for the positive class.
            stage (str): The stage of evaluation (e.g., "Train", "Validation", "Test").
        """
        logger.info(f"--- {model_name} Evaluation ({stage} Set) ---")

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        roc_auc = 0.0
        try:
            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
            roc_auc = auc(fpr, tpr)
        except ValueError as e:
            logger.warning(f"Could not calculate ROC AUC: {e}. Ensure y_true contains at least two classes.")

        metrics = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'ROC AUC': roc_auc
        }

        for metric_name, value in metrics.items():
            logger.info(f"{metric_name}: {value:.4f}")
            # NEW: Log metrics to WandB
            if self.wandb_run:
                self.wandb_run.log({f"{model_name}/{stage}_{metric_name.lower().replace(' ', '_')}": value})

            # Append to internal results_df
            self.results_df.loc[len(self.results_df)] = [model_name, stage, metric_name, value]

        logger.info("\nClassification Report:\n" + classification_report(y_true, y_pred, zero_division=0))

    def plot_confusion_matrix(self, model_name: str, y_true: np.ndarray, y_pred: np.ndarray, stage: str = "Test"):
        """
        Plots and saves the confusion matrix.

        Args:
            model_name (str): Name of the model.
            y_true (np.ndarray): True labels.
            y_pred (np.ndarray): Predicted labels.
            stage (str): The stage of evaluation.
        """
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=['Predicted 0', 'Predicted 1'],
                    yticklabels=['Actual 0', 'Actual 1'])
        plt.title(f'Confusion Matrix for {model_name} ({stage} Set)')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plot_path = os.path.join(self.plots_dir, f"{model_name.lower().replace(' ', '_')}_confusion_matrix_{stage.lower()}.png")
        plt.savefig(plot_path)
        plt.close()
        logger.info(f"Saved confusion matrix plot to {plot_path}")

        # NEW: Log plot to WandB
        if self.wandb_run:
            self.wandb_run.log({f"{model_name}/{stage}_Confusion_Matrix": wandb.Image(plot_path)})
            logger.info(f"Confusion Matrix plot logged to WandB for {model_name}.")


    def plot_roc_curve(self, model_name: str, y_true: np.ndarray, y_pred_proba: np.ndarray, stage: str = "Test"):
        """
        Plots and saves the ROC curve.

        Args:
            model_name (str): Name of the model.
            y_true (np.ndarray): True labels.
            y_pred_proba (np.ndarray): Predicted probabilities for the positive class.
            stage (str): The stage of evaluation.
        """
        try:
            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
            roc_auc = auc(fpr, tpr)

            plt.figure(figsize=(8, 6))
            plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'Receiver Operating Characteristic (ROC) Curve for {model_name} ({stage} Set)')
            plt.legend(loc="lower right")
            plt.tight_layout()
            plot_path = os.path.join(self.plots_dir, f"{model_name.lower().replace(' ', '_')}_roc_curve_{stage.lower()}.png")
            plt.savefig(plot_path)
            plt.close()
            logger.info(f"Saved ROC curve plot to {plot_path}")

            # NEW: Log plot to WandB
            if self.wandb_run:
                self.wandb_run.log({f"{model_name}/{stage}_ROC_Curve": wandb.Image(plot_path)})
                logger.info(f"ROC Curve plot logged to WandB for {model_name}.")

        except ValueError as e:
            logger.warning(f"Could not plot ROC curve for {model_name}: {e}. Ensure y_true contains at least two classes.")

    def display_results_table(self):
        """Displays the collected evaluation results in a formatted table."""
        if not self.results_df.empty:
            logger.info("\n--- Overall Model Evaluation Results ---")
            # Pivot table for better readability
            pivot_df = self.results_df.pivot_table(
                index=['Model', 'Stage'],
                columns='Metric',
                values='Value',
                aggfunc='first'
            ).round(4)
            logger.info(f"\n{pivot_df.to_string()}")
        else:
            logger.info("\nNo evaluation results to display.")

Overwriting scripts/data_evaluate.py


## nn_utils

In [None]:
%%writefile scripts/nn_utils.py

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy
from tensorflow.keras.metrics import MeanAbsoluteError, BinaryAccuracy, AUC # Ensure AUC is imported for classification metrics

def build_nn_model(input_dim, hidden_layers=1, neurons=64, dropout_rate=0.0, learning_rate=0.001, task_type='regression'):
    """
    Builds a Keras Sequential model for regression or classification.
    """
    model = Sequential()
    model.add(Dense(neurons, activation='relu', input_shape=(input_dim,)))
    model.add(Dropout(dropout_rate))

    for _ in range(hidden_layers - 1): # Add additional hidden layers
        model.add(Dense(neurons, activation='relu'))
        model.add(Dropout(dropout_rate))

    if task_type == 'regression':
        model.add(Dense(1, activation='linear')) # Single output for regression
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss=MeanSquaredError(),
                      metrics=[MeanAbsoluteError()])
    elif task_type == 'classification':
        model.add(Dense(1, activation='sigmoid')) # Single output for binary classification
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss=BinaryCrossentropy(),
                      metrics=[BinaryAccuracy(), AUC()]) # Add AUC for classification
    else:
        raise ValueError("task_type must be 'regression' or 'classification'")

    return model

## Model Tuner

In [9]:
%%writefile scripts/model_tuning.py

import logging
import os
import json # To save best parameters
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, f1_score # Import f1_score for classification

# For Neural Network Tuning
import tensorflow as tf
from tensorflow import keras
import keras_tuner # Make sure this is imported for keras_tuner.HyperModel
from keras_tuner.tuners import RandomSearch, Hyperband

# Import build_nn_model from nn_utils (assuming it's been refactored there)
from scripts.nn_utils import build_nn_model

# Configure a logger for this module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

class ModelTuner:
    """
    Handles hyperparameter tuning and cross-validation for machine learning models.
    Saves and loads best parameters to/from a JSON file.
    """
    def __init__(self, output_dir="model_params"):
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.params_file = os.path.join(self.output_dir, "all_best_params.json")
        self.best_params_cache = self._load_all_best_params()
        logger.info(f"ModelTuner initialized. Best parameters will be saved to '{self.output_dir}'.")

    def _load_all_best_params(self):
        """Loads all best parameters from the single JSON file."""
        if os.path.exists(self.params_file):
            try:
                with open(self.params_file, 'r') as f:
                    return json.load(f)
            except json.JSONDecodeError as e:
                logger.error(f"Error decoding {self.params_file}: {e}. Returning empty cache.", exc_info=True)
                return {}
        return {}

    def _save_all_best_params(self):
        """Saves current best parameters cache to the single JSON file."""
        try:
            with open(self.params_file, 'w') as f:
                json.dump(self.best_params_cache, f, indent=4)
            logger.info(f"All best parameters saved to {self.params_file}")
        except Exception as e:
            logger.error(f"Error saving all best parameters to {self.params_file}: {e}", exc_info=True)

    def get_best_params(self, model_name: str) -> dict:
        """Retrieves best parameters for a specific model from the cache."""
        return self.best_params_cache.get(model_name, None)

    def tune_model(self, model, param_grid: dict, X: pd.DataFrame, y: pd.Series,
                   model_name: str, cv_splits: int = 3, search_method: str = 'grid', # Reduced cv_splits
                   n_iter: int = 3, scoring: str = 'neg_mean_absolute_error') -> tuple: # Reduced n_iter
        """
        Performs hyperparameter tuning and cross-validation for a given scikit-learn model.
        (Parameters eased out for faster convergence during HPO).

        Args:
            model: The scikit-learn estimator to tune.
            param_grid (dict): Dictionary with parameters names (str) as keys and lists of parameter
                                settings to try as values.
            X (pd.DataFrame): Training features.
            y (pd.Series): Training target.
            model_name (str): A descriptive name for the model (e.g., 'ridge_regression').
            cv_splits (int): Number of cross-validation splits. (Reduced for faster HPO)
            search_method (str): 'grid' for GridSearchCV or 'random' for RandomizedSearchCV.
            n_iter (int): Number of parameter settings that are sampled if search_method is 'random'. (Reduced for faster HPO)
            scoring (str): Scoring metric to optimize (e.g., 'neg_mean_absolute_error').

        Returns:
            tuple: A tuple containing (best_estimator, best_params, best_score).
        """
        logger.info(f"Starting hyperparameter tuning for {model_name} using {search_method} search...")

        best_params_loaded = self.get_best_params(model_name)
        if best_params_loaded:
            logger.info(f"Using loaded best parameters for {model_name}: {best_params_loaded}. Skipping tuning.")
            model.set_params(**best_params_loaded)
            return model, best_params_loaded, None

        cv_strategy = KFold(n_splits=cv_splits, shuffle=True, random_state=42)

        if search_method == 'grid':
            search = GridSearchCV(model, param_grid, cv=cv_strategy, scoring=scoring, verbose=0, n_jobs=-1) # Reduced verbose
        elif search_method == 'random':
            search = RandomizedSearchCV(model, param_distro=param_grid, n_iter=n_iter, cv=cv_strategy, scoring=scoring, verbose=0, n_jobs=-1, random_state=42) # Reduced verbose
        else:
            raise ValueError(f"Unknown search_method: {search_method}. Must be 'grid' or 'random'.")

        search.fit(X, y)

        best_estimator = search.best_estimator_
        best_params = search.best_params_
        best_score = search.best_score_

        logger.info(f"Best parameters for {model_name}: {best_params}")
        logger.info(f"Best CV score for {model_name} ({scoring}): {best_score:.4f}")

        self.best_params_cache[model_name] = best_params
        self._save_all_best_params()

        return best_estimator, best_params, best_score

    class NeuralNetworkHyperModel(keras_tuner.HyperModel):
        def __init__(self, input_dim: int, task_type: str = 'regression'):
            self.input_dim = input_dim
            self.task_type = task_type
            super().__init__()

        def build(self, hp):
            """
            Builds a Keras model based on hyperparameters chosen by the tuner.
            (Parameters eased out for faster convergence during HPO).
            """
            # No need to import build_nn_model here if it's imported at the module level.
            # However, if it's explicitly needed here due to circular import issues,
            # ensure it's from the correct place (nn_utils).
            # If `from scripts.nn_utils import build_nn_model` is at the top of this file, remove this local import.
            # If build_nn_model is still in data_modelling.py and there's a circular import, keep this.
            # Given the previous error, it was likely due to a circular import that
            # has now been resolved by suggesting to move build_nn_model to nn_utils.
            # So, assuming build_nn_model is now correctly imported from nn_utils at the top.
            # If not, add: from scripts.nn_utils import build_nn_model here

            # Eased out search space for faster tuning
            num_hidden_layers = hp.Int('num_hidden_layers', min_value=1, max_value=2, default=1) # Reduced max layers
            neurons = hp.Int('neurons', min_value=32, max_value=64, step=32, default=32) # Reduced range
            dropout_rate = hp.Float('dropout_rate', min_value=0.0, max_value=0.3, step=0.1, default=0.1) # Reduced max dropout
            learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4], default=1e-3) # Fewer choices

            model = build_nn_model(
                input_dim=self.input_dim,
                hidden_layers=num_hidden_layers,
                neurons=neurons,
                dropout_rate=dropout_rate,
                learning_rate=learning_rate,
                task_type=self.task_type
            )
            return model

    def tune_neural_network_classification(self, X_train: pd.DataFrame, y_train: pd.Series,
                                           X_val: pd.DataFrame, y_val: pd.Series) -> dict:
        """
        Performs hyperparameter tuning for the Neural Network Classification model
        using Keras Tuner. (Parameters eased out for faster convergence during HPO).

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series): Training target.
            X_val (pd.DataFrame): Validation features.
            y_val (pd.Series): Validation target.

        Returns:
            dict: The best hyperparameters found.
        """
        logger.info("\n--- Tuning Neural Network Classification Model ---")

        model_name = "neural_network_classification"
        best_params_loaded = self.get_best_params(model_name)

        if best_params_loaded:
            logger.info(f"Using loaded best parameters for Neural Network Classification: {best_params_loaded}. Skipping tuning.")
            return best_params_loaded

        tuner_dir = os.path.join(self.output_dir, "keras_tuner_nn_classification")
        os.makedirs(tuner_dir, exist_ok=True)

        hypermodel = self.NeuralNetworkHyperModel(input_dim=X_train.shape[1], task_type='classification')

        tuner = RandomSearch(
            hypermodel,
            objective='val_auc',
            max_trials=5,       # Reduced number of different models to try
            executions_per_trial=1,
            directory=tuner_dir,
            project_name='nn_classification_tuning_eased', # Changed project name to avoid conflicts if old runs exist
            overwrite=True
        )

        logger.info("Running Keras Tuner search...")
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=5, restore_best_weights=True # Reduced patience
        )

        tuner.search(
            X_train, y_train,
            epochs=20, # Reduced max epochs for each trial
            validation_data=(X_val, y_val),
            callbacks=[early_stopping],
            verbose=0
        )

        best_hp = tuner.get_best_hyperparameters(num_trials=1)[0].values

        mapped_params = {
            'hidden_layers': best_hp['num_hidden_layers'],
            'neurons': best_hp['neurons'],
            'dropout_rate': best_hp['dropout_rate'],
            'learning_rate': best_hp['learning_rate']
        }

        self.best_params_cache[model_name] = mapped_params
        self._save_all_best_params()

        logger.info(f"Best hyperparameters for Neural Network Classification: {mapped_params}")

        return mapped_params

Overwriting scripts/model_tuning.py


## model_explainability

In [6]:
%%writefile scripts/model_explainability.py
import pandas as pd
import numpy as np
import logging
import os
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import tensorflow as tf
import wandb # NEW: Import wandb

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

class ModelExplainer:
    """
    Handles model explainability for trained machine learning models.
    """
    def __init__(self, output_dir="plots/explainability", wandb_run=None): # NEW: Add wandb_run
        self.output_dir = output_dir
        self.wandb_run = wandb_run # NEW: Store wandb_run
        os.makedirs(self.output_dir, exist_ok=True)
        logger.info(f"ModelExplainer initialized. Explainability plots will be saved to '{self.output_dir}'.")

    def plot_feature_importance(self, model, feature_names: list, filename: str):
        """
        Plots feature importance for tree-based models (RandomForest).

        Args:
            model: Trained scikit-learn model (must have .feature_importances_ attribute).
            feature_names (list): List of feature names corresponding to model's input.
            filename (str): Name of the file to save the plot.
        """
        if not hasattr(model, 'feature_importances_'):
            logger.warning(f"Model of type {type(model).__name__} does not have 'feature_importances_'. Skipping feature importance plot.")
            return

        try:
            importances = model.feature_importances_
            feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
            feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

            plt.figure(figsize=(12, 8))
            sns.barplot(x='importance', y='feature', data=feature_importance_df.head(20)) # Top 20 features
            plt.title(f'Top 20 Feature Importance for {type(model).__name__}')
            plt.xlabel('Importance')
            plt.ylabel('Feature')
            plt.tight_layout()
            plot_path = os.path.join(self.output_dir, filename)
            plt.savefig(plot_path)
            plt.close()
            logger.info(f"Saved feature importance plot to {plot_path}")

            # NEW: Log plot to WandB
            if self.wandb_run:
                self.wandb_run.log({f"Explainability/{filename.replace('.png', '')}": wandb.Image(plot_path)})
                logger.info(f"Feature importance plot logged to WandB for {type(model).__name__}.")

        except Exception as e:
            logger.error(f"Error generating feature importance plot: {e}")

    def plot_coefficient_importance(self, model, feature_names: list, filename: str):
        """
        Plots coefficients for linear models (LogisticRegression, Ridge).

        Args:
            model: Trained scikit-learn linear model (must have .coef_ attribute).
            feature_names (list): List of feature names.
            filename (str): Name of the file to save the plot.
        """
        if not hasattr(model, 'coef_'):
            logger.warning(f"Model of type {type(model).__name__} does not have 'coef_'. Skipping coefficient importance plot.")
            return

        try:
            coefficients = model.coef_.flatten() if model.coef_.ndim > 1 else model.coef_

            coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
            coef_df['abs_coefficient'] = np.abs(coef_df['coefficient'])
            coef_df = coef_df.sort_values(by='abs_coefficient', ascending=False)

            plt.figure(figsize=(12, 8))
            sns.barplot(x='coefficient', y='feature', data=coef_df.head(20)) # Top 20 features by absolute coefficient
            plt.title(f'Top 20 Feature Coefficients for {type(model).__name__}')
            plt.xlabel('Coefficient Value')
            plt.ylabel('Feature')
            plt.tight_layout()
            plot_path = os.path.join(self.output_dir, filename)
            plt.savefig(plot_path)
            plt.close()
            logger.info(f"Saved coefficient importance plot to {plot_path}")

            # NEW: Log plot to WandB
            if self.wandb_run:
                self.wandb_run.log({f"Explainability/{filename.replace('.png', '')}": wandb.Image(plot_path)})
                logger.info(f"Coefficient importance plot logged to WandB for {type(model).__name__}.")

        except Exception as e:
            logger.error(f"Error generating coefficient importance plot: {e}")


    def plot_shap_summary(self, model, X: pd.DataFrame, filename: str, task_type: str = 'regression', num_features: int = 20):
        """
        Generates and saves a SHAP summary plot.

        Args:
            model: The trained model (scikit-learn or Keras).
            X (pd.DataFrame): The data used to generate SHAP values (e.g., X_test or a sample).
            filename (str): Name of the file to save the plot.
            task_type (str): 'regression' or 'classification'.
            num_features (int): Number of features to display in the summary plot.
        """
        try:
            if isinstance(model, (RandomForestRegressor, RandomForestClassifier)):
                explainer = shap.TreeExplainer(model)
            elif isinstance(model, (LogisticRegression, Ridge)):
                explainer = shap.LinearExplainer(model, X)
            elif isinstance(model, tf.keras.Model):
                explainer = shap.DeepExplainer(model, X.values)
            else:
                logger.warning(f"SHAP Explainer for model type {type(model).__name__} not explicitly supported/optimized. Using KernelExplainer (can be slow).")
                predict_fn = model.predict_proba if task_type == 'classification' and hasattr(model, 'predict_proba') else model.predict
                explainer = shap.KernelExplainer(predict_fn, shap.sample(X, 100, random_state=42))


            if isinstance(explainer, shap.DeepExplainer) and task_type == 'classification':
                shap_values = explainer.shap_values(X.values)
                if isinstance(shap_values, list) and len(shap_values) > 1:
                    shap_values = shap_values[1]
            elif isinstance(explainer, shap.LinearExplainer) and task_type == 'classification':
                shap_values = explainer.shap_values(X.values)
                if isinstance(shap_values, list) and len(shap_values) > 1:
                    shap_values = shap_values[1]
            elif isinstance(explainer, shap.KernelExplainer) and task_type == 'classification':
                shap_values = explainer.shap_values(X)
                if isinstance(shap_values, list) and len(shap_values) > 1:
                    shap_values = shap_values[1]
            else:
                shap_values = explainer.shap_values(X.values if isinstance(explainer, (shap.DeepExplainer, shap.LinearExplainer)) else X)

            plt.figure(figsize=(10, 7))
            shap.summary_plot(shap_values, X, plot_type="bar", show=False, max_display=num_features)
            plt.title(f'SHAP Summary Bar Plot for {type(model).__name__} ({task_type.capitalize()})')
            plt.tight_layout()
            plot_path = os.path.join(self.output_dir, filename)
            plt.savefig(plot_path)
            plt.close()
            logger.info(f"Saved SHAP summary bar plot to {plot_path}")

            # NEW: Log plot to WandB
            if self.wandb_run:
                self.wandb_run.log({f"Explainability/{filename.replace('.png', '')}_bar": wandb.Image(plot_path)})
                logger.info(f"SHAP summary bar plot logged to WandB for {type(model).__name__}.")


            plt.figure(figsize=(10, 7))
            shap.summary_plot(shap_values, X, show=False, max_display=num_features)
            plt.title(f'SHAP Summary Dot Plot for {type(model).__name__} ({task_type.capitalize()})')
            plt.tight_layout()
            dot_filename = filename.replace(".png", "_dot.png")
            dot_plot_path = os.path.join(self.output_dir, dot_filename)
            plt.savefig(dot_plot_path)
            plt.close()
            logger.info(f"Saved SHAP summary dot plot to {dot_plot_path}")

            # NEW: Log plot to WandB
            if self.wandb_run:
                self.wandb_run.log({f"Explainability/{dot_filename.replace('.png', '')}_dot": wandb.Image(dot_plot_path)})
                logger.info(f"SHAP summary dot plot logged to WandB for {type(model).__name__}.")

        except Exception as e:
            logger.error(f"Error generating SHAP summary plot for {type(model).__name__}: {e}", exc_info=True)


    def plot_shap_dependence(self, model, X: pd.DataFrame, feature: str, filename: str, task_type: str = 'regression', interaction_feature: str = None):
        """
        Generates and saves a SHAP dependence plot for a specific feature.

        Args:
            model: The trained model (scikit-learn or Keras).
            X (pd.DataFrame): The data used to generate SHAP values (e.g., X_test or a sample).
            feature (str): The name of the feature for which to plot dependence.
            filename (str): Name of the file to save the plot.
            task_type (str): 'regression' or 'classification'.
            interaction_feature (str, optional): A feature to color the dependence plot by, revealing interactions.
        """
        if feature not in X.columns:
            logger.warning(f"Feature '{feature}' not found in X. Skipping SHAP dependence plot.")
            return
        if interaction_feature and interaction_feature not in X.columns:
            logger.warning(f"Interaction feature '{interaction_feature}' not found in X. Skipping SHAP dependence plot with interaction.")
            interaction_feature = None

        try:
            if isinstance(model, (RandomForestRegressor, RandomForestClassifier)):
                explainer = shap.TreeExplainer(model)
            elif isinstance(model, (LogisticRegression, Ridge)):
                explainer = shap.LinearExplainer(model, X)
            elif isinstance(model, tf.keras.Model):
                explainer = shap.DeepExplainer(model, X.values)
            else:
                predict_fn = model.predict_proba if task_type == 'classification' and hasattr(model, 'predict_proba') else model.predict
                explainer = shap.KernelExplainer(predict_fn, shap.sample(X, 100, random_state=42))

            if isinstance(explainer, shap.DeepExplainer) and task_type == 'classification':
                shap_values = explainer.shap_values(X.values)
                if isinstance(shap_values, list) and len(shap_values) > 1:
                    shap_values = shap_values[1]
            elif isinstance(explainer, shap.LinearExplainer) and task_type == 'classification':
                shap_values = explainer.shap_values(X.values)
                if isinstance(shap_values, list) and len(shap_values) > 1:
                    shap_values = shap_values[1]
            elif isinstance(explainer, shap.KernelExplainer) and task_type == 'classification':
                shap_values = explainer.shap_values(X)
                if isinstance(shap_values, list) and len(shap_values) > 1:
                    shap_values = shap_values[1]
            else:
                shap_values = explainer.shap_values(X.values if isinstance(explainer, (shap.DeepExplainer, shap.LinearExplainer)) else X)

            if shap_values.ndim == 1:
                shap_values = shap_values.reshape(-1, 1)


            plt.figure(figsize=(10, 7))
            shap.dependence_plot(
                ind=feature,
                shap_values=shap_values,
                features=X,
                feature_names=X.columns.tolist(),
                interaction_index=interaction_feature,
                show=False,
                title=f'SHAP Dependence Plot for {feature} ({type(model).__name__})'
            )
            plt.tight_layout()
            plot_path = os.path.join(self.output_dir, filename)
            plt.savefig(plot_path)
            plt.close()
            logger.info(f"Saved SHAP dependence plot for '{feature}' to {plot_path}")

            # NEW: Log plot to WandB
            if self.wandb_run:
                self.wandb_run.log({f"Explainability/{filename.replace('.png', '')}": wandb.Image(plot_path)})
                logger.info(f"SHAP dependence plot logged to WandB for '{feature}' ({type(model).__name__}).")

        except Exception as e:
            logger.error(f"Error generating SHAP dependence plot for feature '{feature}': {e}", exc_info=True)

Overwriting scripts/model_explainability.py


## Main.py

In [16]:
import pandas as pd
import numpy as np
import os
import logging
import sys
import importlib
import wandb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.linear_model import LogisticRegression # Ensure this is imported for isinstance checks

# Reload custom modules to ensure the latest changes are picked up
# This is crucial in environments like Jupyter/Colab where modules might be cached
import scripts.data_loading
importlib.reload(scripts.data_loading)
import scripts.data_profiling
importlib.reload(scripts.data_profiling)
import scripts.data_visualizer
importlib.reload(scripts.data_visualizer)
import scripts.data_preprocessor
importlib.reload(scripts.data_preprocessor)
import scripts.data_modelling
importlib.reload(scripts.data_modelling)
import scripts.data_evaluate
importlib.reload(scripts.data_evaluate)
import scripts.model_tuning
importlib.reload(scripts.model_tuning)
import scripts.model_explainability
importlib.reload(scripts.model_explainability)


# Custom Scripts - ensure these are available in your 'scripts' directory
from scripts.data_loading import DataDownload
from scripts.data_profiling import DataProfiler
from scripts.data_visualizer import DataVisualizer
from scripts.data_preprocessor import DataPreprocessor
from scripts.data_modelling import DataModeling
from scripts.data_evaluate import ModelEvaluator
from scripts.model_tuning import ModelTuner
from scripts.model_explainability import ModelExplainer


# --- Configure Global Logging ---
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Clear existing handlers to prevent duplicate logs if script is run multiple times
if logger.handlers:
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

file_handler = logging.FileHandler('pipeline.log')
file_handler.setLevel(logging.INFO)

console_handler = logging.StreamHandler(sys.stdout) # Use sys.stdout for console output
console_handler.setLevel(logging.INFO)

file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)

console_formatter = logging.Formatter('%(levelname)s: %(message)s')
console_handler.setFormatter(console_formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)


class FlightDelayPipeline:
    """
    Orchestrates the entire machine learning pipeline for flight delay prediction,
    from data download to model explainability, with WandB integration.
    """
    def __init__(self, config: dict):
        self.config = config
        self.wandb_run = None # Will be initialized in _init_wandb

        self._setup_directories()

        # Initialize core components
        self.downloader = DataDownload(
            dataset_name=self.config["dataset_name"],
            download_path=self.raw_data_dir
        )
        self.preprocessor = DataPreprocessor()
        self.profiler = DataProfiler(output_dir=self.reports_dir)
        self.visualizer = DataVisualizer(output_dir=self.eda_plots_dir)
        self.tuner = ModelTuner(output_dir=self.model_params_dir)
        self.modeler = DataModeling(tuner=self.tuner)

        # Evaluator and Explainer depend on wandb_run, so initialize them later
        self.evaluator = None
        self.explainer = None

        logger.info("FlightDelayPipeline initialized with provided configuration.")

    def _setup_directories(self):
        """Defines and creates all necessary output directories."""
        self.raw_data_dir = "data/raw"
        self.processed_data_dir = "data/processed"
        self.reports_dir = "reports"
        self.eda_plots_dir = "plots/eda"
        self.model_eval_plots_dir = "plots/model_evaluation"
        self.models_dir = "models"
        self.model_params_dir = "model_params"
        self.explainability_plots_dir = "plots/explainability"

        for directory in [
            self.raw_data_dir, self.processed_data_dir, self.reports_dir,
            self.eda_plots_dir, self.model_eval_plots_dir, self.models_dir,
            self.model_params_dir, self.explainability_plots_dir
        ]:
            os.makedirs(directory, exist_ok=True)
        logger.info("All pipeline directories ensured to exist.")

    def _handle_colab_environment(self):
        """Adjusts the working directory and installs Kaggle if running in Colab."""
        if 'google.colab' in sys.modules:
            base_dir = '/content/drive/MyDrive/Colab Notebooks/flight_delay'
            os.makedirs(base_dir, exist_ok=True)
            os.chdir(base_dir)
            logger.info(f"Running in Google Colab. Changed current working directory to {os.getcwd()}")
            try:
                import kaggle
            except ImportError:
                logger.info("Kaggle library not found, installing...")
                os.system("pip install kaggle")
                logger.info("Kaggle library installed.")
        else:
            logger.info("Not running in Google Colab environment.")


    def _init_wandb(self):
        """Initializes Weights & Biases run and sets up dependent components."""
        try:
            self.wandb_run = wandb.init(
                project=self.config["project_name"],
                name=self.config["run_name"],
                config=self.config # Log pipeline configuration
            )
            logger.info(f"WandB run initialized: {self.wandb_run.url}")
            # Initialize evaluator and explainer now that wandb_run is available
            self.evaluator = ModelEvaluator(
                output_dir=self.models_dir,
                plots_dir=self.model_eval_plots_dir,
                wandb_run=self.wandb_run
            )
            self.explainer = ModelExplainer(
                output_dir=self.explainability_plots_dir,
                wandb_run=self.wandb_run
            )
            logger.info("ModelEvaluator and ModelExplainer initialized with WandB integration.")
        except Exception as e:
            logger.error(f"Failed to initialize WandB: {e}. Proceeding without WandB logging.", exc_info=True)
            self.wandb_run = None # Ensure it's None if init fails
            # Still initialize evaluators/explainers, but without wandb_run
            self.evaluator = ModelEvaluator(
                output_dir=self.models_dir,
                plots_dir=self.model_eval_plots_dir,
                wandb_run=None
            )
            self.explainer = ModelExplainer(
                output_dir=self.explainability_plots_dir,
                wandb_run=None
            )
            logger.warning("ModelEvaluator and ModelExplainer initialized without WandB integration due to error.")


    def run(self):
        """Executes the entire machine learning pipeline."""
        self._handle_colab_environment()
        self._init_wandb() # Initialize WandB at the start of the run
        logger.info("--- Starting ML Project Pipeline ---")

        try:
            df_raw = self._step_data_download()
            df_filtered = self._step_data_load_and_filter(df_raw)
            self._step_data_profiling(df_filtered)
            df_eda = self._step_data_preprocessing_eda(df_filtered)
            self._step_data_visualization_eda(df_eda)
            df_model = self._step_data_preprocessing_modeling(df_filtered)
            self._step_data_modeling_and_evaluation(df_model)

        except Exception as e:
            logger.critical(f"Pipeline run failed: {e}", exc_info=True)
            if self.wandb_run:
                self.wandb_run.log_code(".") # Log code on pipeline failure
                self.wandb_run.finish(exit_code=1)
            sys.exit(1) # Exit with an error code

        logger.info("\n--- ML Project Pipeline Complete ---")
        if self.wandb_run:
            self.wandb_run.log_code(".") # Log code on successful completion
            self.wandb_run.finish() # End the WandB run

    def _step_data_download(self) -> pd.DataFrame:
        """Step 1: Downloads data and loads the raw CSV."""
        logger.info("\n--- Step 1: Data Download ---")
        try:
            self.downloader.data_download()
            csv_file_path = os.path.join(self.raw_data_dir, self.config["data_sample_file"])
            df_raw = pd.read_csv(csv_file_path)
            logger.info(f"Raw dataset loaded. Shape: {df_raw.shape}")
            if self.wandb_run:
                artifact = wandb.Artifact(name="raw-flights-data", type="dataset")
                artifact.add_dir(self.raw_data_dir)
                self.wandb_run.log_artifact(artifact)
                logger.info("Raw data artifact logged to WandB.")
            return df_raw
        except FileNotFoundError:
            logger.critical(f"Error: The expected CSV file '{self.config['data_sample_file']}' was not found after download.", exc_info=True)
            raise
        except Exception as e:
            logger.critical(f"An error occurred during data download or initial raw loading: {e}", exc_info=True)
            raise

    def _step_data_load_and_filter(self, df_raw: pd.DataFrame) -> pd.DataFrame:
        """Step 2: Applies initial filtering for non-cancelled flights."""
        logger.info("\n--- Step 2: Data Loading & Initial Filtering ---")
        initial_shape = df_raw.shape
        df_filtered = df_raw[df_raw['CANCELLED'] == 0].copy()
        logger.info(f"Filtered for non-cancelled flights. Original shape: {initial_shape}, Filtered shape: {df_filtered.shape}")
        return df_filtered

    def _step_data_profiling(self, df: pd.DataFrame):
        """Step 3: Generates and logs a data profiling report."""
        logger.info("\n--- Step 3: Data Profiling (Pre-preprocessing) ---")
        report_name = "flight_data_profile_report_pre_processing.html"
        report_path = os.path.join(self.reports_dir, report_name)

        if os.path.exists(report_path):
            logger.info(f"Profiling report already exists at: {report_path}. Skipping generation, logging existing.")
        else:
            try:
                self.profiler.generate_profile_report(df, report_name=report_name)
                logger.info(f"Pre-processing data profiling complete. Report saved to: {report_path}")
            except Exception as e:
                logger.error(f"Error during pre-processing data profiling: {e}", exc_info=True)
                return # Do not log artifact if generation failed

        if self.wandb_run and os.path.exists(report_path):
            artifact = wandb.Artifact(name="pre_processing_data_profile", type="report")
            artifact.add_file(report_path)
            self.wandb_run.log_artifact(artifact)
            logger.info("Profiling report artifact logged to WandB.")

    def _step_data_preprocessing_eda(self, df_filtered: pd.DataFrame) -> pd.DataFrame:
        """Step 4: Preprocesses data specifically for EDA visualizations."""
        logger.info("\n--- Step 4: Data Preprocessing for Visualization (EDA) ---")
        df_eda = df_filtered.copy()
        try:
            # Convert to EDA specific target (e.g., ARR_DELAY > 10 for broad EDA)
            if 'ARR_DELAY' in df_eda.columns:
                df_eda['FLIGHT_STATUS_EDA'] = (df_eda['ARR_DELAY'] > 10).astype(int)
                logger.info("Added 'FLIGHT_STATUS_EDA' column for EDA.")

            # Add temporal features for EDA
            if 'FL_DATE' in df_eda.columns:
                df_eda['YEAR'] = pd.to_datetime(df_eda['FL_DATE']).dt.year
                df_eda['MONTH'] = pd.to_datetime(df_eda['FL_DATE']).dt.month
                logger.info("Added 'YEAR' and 'MONTH' columns for EDA time-series plots.")

            df_eda = self.preprocessor.create_elapsed_time_diff(df_eda) # Example of using a preprocessor method
            logger.info(f"EDA preprocessing complete. EDA DataFrame shape: {df_eda.shape}")
        except Exception as e:
            logger.error(f"Error during EDA data preprocessing: {e}", exc_info=True)
            df_eda = None # Indicate failure
        return df_eda

    def _step_data_visualization_eda(self, df_eda: pd.DataFrame):
        """Step 5: Generates and logs various EDA plots."""
        if df_eda is None:
            logger.warning("\n--- Data Visualization Skipped (EDA DataFrame is None) ---")
            return

        logger.info("\n--- Step 5: Data Visualization (EDA) ---")
        try:
            # Define plots and their filenames
            plots = [
                (self.visualizer.plot_column_distribution, [df_eda, 15, 4, "all_column_distributions_eda.png"], "EDA/column_distributions"),
                (self.visualizer.plot_airline_counts, [df_eda, "airline_flight_counts_eda.png"], "EDA/airline_flight_counts"),
                (self.visualizer.plot_destination_visits, [df_eda, self.config["eda_top_n_destinations"], "top_20_destination_visits_eda.png"], "EDA/top_destination_visits"),
                (self.visualizer.plot_average_arrival_delay_by_airline, [df_eda, self.config["eda_min_flight_count_airline_delay"], "avg_arrival_delay_by_airline_eda.png"], "EDA/avg_arrival_delay_by_airline"),
                (self.visualizer.plot_total_delays_by_year, [df_eda, "total_delays_by_year_eda.png"], "EDA/total_delays_by_year"),
                (self.visualizer.plot_monthly_delays_by_year, [df_eda, "monthly_delays_by_year_eda.png"], "EDA/monthly_delays_by_year"),
                (self.visualizer.plot_monthly_trend_with_highlight, [df_eda, 'ARR_DELAY', 'Monthly Total Delays Over Time', 'Total Delays (minutes)', "monthly_delay_trend_highlight_eda.png"], "EDA/monthly_delay_trend"),
                (self.visualizer.plot_delay_reason_analysis, [df_eda, "delay_reason_breakdown_eda.png"], "EDA/delay_reason_breakdown")
            ]

            for plot_func, args, wandb_key in plots:
                filename = args[-1] # Filename is always the last argument
                plot_path = os.path.join(self.eda_plots_dir, filename)
                try:
                    plot_func(*args)
                    logger.info(f"Generated {filename}")
                    if self.wandb_run:
                        self.wandb_run.log({wandb_key: wandb.Image(plot_path)})
                        logger.info(f"Logged {filename} to WandB.")
                except Exception as p_e:
                    logger.warning(f"Failed to generate or log {filename}: {p_e}", exc_info=True)

            logger.info("Data visualization complete. Plots saved to 'plots/eda/' directory and logged to WandB.")
        except Exception as e:
            logger.error(f"Error during data visualization: {e}", exc_info=True)

    def _step_data_preprocessing_modeling(self, df_filtered: pd.DataFrame) -> pd.DataFrame:
        """Step 6: Preprocesses data for machine learning modeling."""
        if df_filtered is None:
            logger.critical("\n--- Modeling Data Preprocessing Skipped (Data not loaded) ---")
            raise ValueError("Filtered DataFrame is None, cannot proceed with modeling preprocessing.")

        logger.info("\n--- Step 6: Data Preprocessing for Modeling ---")
        df_model = df_filtered.copy()

        try:
            # Convert to Classification Target
            if 'ARR_DELAY' in df_model.columns:
                df_model[self.config["target_column"]] = (df_model['ARR_DELAY'] > self.config["arr_delay_threshold_mins"]).astype(int)
                logger.info(f"Created '{self.config['target_column']}' binary target (1 if ARR_DELAY > {self.config['arr_delay_threshold_mins']}, 0 otherwise).")
                logger.info(f"{self.config['target_column']} value counts:\n{df_model[self.config['target_column']].value_counts().to_string()}")
            else:
                logger.critical("'ARR_DELAY' column not found for creating classification target. Exiting.")
                raise ValueError("'ARR_DELAY' column missing.")

            delay_cols = ['DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER',
                          'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY',
                          'DELAY_DUE_LATE_AIRCRAFT']
            df_model = self.preprocessor.handle_missing_values(df_model, delay_cols)
            logger.info("Handled missing values in delay columns.")

            df_model = self.preprocessor.create_elapsed_time_diff(df_model)
            logger.info("Created elapsed time difference feature.")

            time_columns = ['CRS_DEP_TIME', 'CRS_ARR_TIME', 'WHEELS_OFF', 'WHEELS_ON', 'DEP_TIME', 'ARR_TIME']
            df_model = self.preprocessor.apply_cyclical_encoding(df_model, time_columns)
            logger.info("Applied cyclical encoding to time columns.")

            city_state_columns = ['DEST_CITY', 'ORIGIN_CITY']
            df_model = self.preprocessor.split_city_state(df_model, city_state_columns)
            logger.info("Split city-state columns.")

            date_columns = ['FL_DATE']
            df_model = self.preprocessor.add_weekday_weekend_columns(df_model, date_columns)
            logger.info("Added weekday/weekend columns.")

            columns_to_exclude_model = [
                'AIRLINE_DOT', 'AIRLINE_CODE', 'DOT_CODE',
                'CANCELLED', 'DIVERTED', 'CANCELLATION_CODE',
                'FLIGHT_STATUS_EDA', # Drop EDA specific column
                'CRS_DEP_TIME', 'CRS_ARR_TIME', 'WHEELS_OFF', 'WHEELS_ON', 'DEP_TIME', 'ARR_TIME', # Original time columns
                'FL_DATE', 'ORIGIN', 'DEST', # Original location/date columns
                'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', # Original delay reason columns (after handling missing)
                'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT',
                'ARR_DELAY' # Original target continuous column
            ]
            df_model = self.preprocessor.exclude_columns(df_model, columns_to_exclude_model)
            logger.info(f"Excluded specified columns. Current shape: {df_model.shape}")

            df_model = self.preprocessor.encode_categorical_features(df_model)
            logger.info(f"Encoded categorical features. Current shape: {df_model.shape}")


            df_for_corr_check = df_model.drop(columns=[self.config["target_column"]], errors='ignore')
            columns_to_drop_high_corr, _ = self.preprocessor.identify_high_correlation_pairs(df_for_corr_check, threshold=self.config["high_correlation_threshold"])

            if columns_to_drop_high_corr:
                logger.info(f"Removing {len(columns_to_drop_high_corr)} columns due to high correlation (> {self.config['high_correlation_threshold']}): {columns_to_drop_high_corr}")
                df_model = df_model.drop(columns=list(columns_to_drop_high_corr), errors='ignore')
            else:
                logger.info("No columns identified for removal due to high correlation.")

            logger.info("Data preprocessing for modeling complete.")
            logger.info(f"Final Modeling DataFrame shape: {df_model.shape}")
            logger.info(f"Final Modeling DataFrame columns (first 5):\n{df_model.columns.tolist()[:5]}...")

            output_filepath_model = os.path.join(self.processed_data_dir, 'preprocessed_flight_data_for_modeling.csv')
            df_model.to_csv(output_filepath_model, index=False)
            logger.info(f"Preprocessed data for modeling saved to: {output_filepath_model}")
            if self.wandb_run:
                artifact = wandb.Artifact(name="preprocessed-modeling-data", type="processed_data")
                artifact.add_file(output_filepath_model)
                self.wandb_run.log_artifact(artifact)
                logger.info("Preprocessed data artifact logged to WandB.")

        except Exception as e:
            logger.critical(f"Critical error during data preprocessing for modeling: {e}", exc_info=True)
            df_model = None # Indicate failure
            raise # Re-raise for pipeline termination
        return df_model

    def _step_data_modeling_and_evaluation(self, df_model: pd.DataFrame):
        """Step 7: Handles data splitting, scaling, model training, and evaluation."""
        if df_model is None or df_model.empty:
            logger.critical("\n--- Data Modeling Skipped (Modeling DataFrame is None or empty) ---")
            raise ValueError("Modeling DataFrame is None or empty, cannot proceed with modeling.")

        logger.info("\n--- Step 7: Data Modeling and Evaluation ---")

        if self.config["target_column"] not in df_model.columns:
            logger.critical(f"Target column '{self.config['target_column']}' not found in the modeling DataFrame.")
            raise ValueError("Target column missing from modeling DataFrame.")

        X = df_model.drop(columns=[self.config["target_column"]])
        y = df_model[self.config["target_column"]]

        logger.info(f"Features (X) shape: {X.shape}, Target (y) shape: {y.shape}")
        logger.info(f"Target variable distribution:\n{y.value_counts().to_string()}")

        # Final NaN drop (for any remaining NaNs after encoding or other transforms)
        rows_before_final_nan_drop = X.shape[0]
        X = X.dropna(axis=0, how='any') # Drop rows with ANY NaN in features
        y = y.loc[X.index] # Align y with X
        if X.shape[0] < rows_before_final_nan_drop:
            rows_removed_nan = rows_before_final_nan_drop - X.shape[0]
            logger.warning(f"Removed {rows_removed_nan} rows due to NaN values in features after final preprocessing.")
            logger.info(f"Updated Features (X) shape: {X.shape}, Target (y) shape: {y.shape}")

        if X.empty or y.empty:
            logger.critical("Features or target DataFrame is empty after final NaN handling. Cannot proceed with modeling.")
            raise ValueError("Empty DataFrame after final NaN handling.")

        # Identify numerical columns for scaling (excluding binary and cyclical)
        numerical_features = X.select_dtypes(include=np.number).columns.tolist()
        cols_to_scale = [
            col for col in numerical_features
            if not (X[col].dropna().isin([0, 1]).all() or col.endswith('_SIN') or col.endswith('_COS'))
        ]
        logger.info(f"Identified {len(cols_to_scale)} numerical features for scaling.")

        scaler_name = 'feature_scaler_classification'
        scaler = self.evaluator.load_model(scaler_name)

        if scaler:
            X_scaled = X.copy()
            if cols_to_scale:
                X_scaled[cols_to_scale] = scaler.transform(X_scaled[cols_to_scale])
            logger.info("Loaded and applied existing StandardScaler to numerical features.")
        else:
            scaler = StandardScaler()
            X_scaled = X.copy()
            if cols_to_scale:
                X_scaled[cols_to_scale] = scaler.fit_transform(X_scaled[cols_to_scale])
            self.evaluator.save_model(scaler, scaler_name)
            logger.info("Trained and saved new StandardScaler for numerical features.")

        # Train-Validation-Test Split on SCALED data
        X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=self.config["test_size"], random_state=self.config["random_state"], stratify=y)
        # Handle cases where stratify might fail for very small classes (unlikely with 10k samples)
        if len(np.unique(y_train_val)) < 2 or len(np.unique(y_test)) < 2:
             logger.warning("Stratified split resulted in fewer than two classes in train/test set. Skipping further validation split if necessary.")
             X_train, X_val, y_train, y_val = X_train_val, None, y_train_val, None # No separate validation set
        else:
            # Ensure the validation split ratio is correctly applied to the train_val set
            # test_size_for_val = self.config["validation_split_ratio"] / (1 - self.config["test_size"])
            X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=self.config["validation_split_ratio"], random_state=self.config["random_state"], stratify=y_train_val)
            logger.info(f"Further split of Train-Val: Train={X_train.shape[0]} samples, Validation={X_val.shape[0]} samples")


        logger.info(f"Data split: Train-Val (for HPO/Training)={X_train_val.shape[0]} samples, Test (final evaluation)={X_test.shape[0]} samples")
        logger.info(f"Test Set Target Distribution:\n{y_test.value_counts().to_string()}")


        models_config = {
            "baseline_model": {
                "func": self.modeler.run_baseline_model_classification,
                "train_args": (y_train_val, y_test),
                "predict_data": y_test
            },
            "logistic_regression_model": {
                "func": self.modeler.run_logistic_regression,
                "train_args": (X_train_val, y_train_val),
                "predict_data": X_test
            },
            "neural_network_classification_model": {
                "func": self.modeler.run_neural_network_classification,
                "train_args": (X_train, y_train, X_val, y_val),
                "predict_data": X_test
            }
        }

        trained_models = {}
        for model_name, cfg in models_config.items():
            logger.info(f"\n--- Processing {model_name.replace('_', ' ').title()} ---")

            model = self.evaluator.load_model(model_name)

            if model is not None:
                logger.info(f"Loaded existing model: {model_name}")
            else:
                logger.info(f"Training new model: {model_name}")
                model = cfg["func"](*cfg["train_args"])
                self.evaluator.save_model(model, model_name)

            trained_models[model_name] = model

            # Make predictions
            y_pred_proba = None
            y_pred = None

            if model_name == "baseline_model":
                majority_class = y_train_val.mode()[0]
                y_pred_proba = np.full(len(cfg["predict_data"]), float(majority_class))
                y_pred = y_pred_proba.astype(int)
            elif isinstance(model, tf.keras.Model):
                y_pred_proba = model.predict(cfg["predict_data"], verbose=0).flatten()
                y_pred = (y_pred_proba > 0.5).astype(int)
            else: # Scikit-learn models
                if hasattr(model, 'predict_proba'):
                    y_pred_proba = model.predict_proba(cfg["predict_data"])[:, 1]
                else: # For models without predict_proba (e.g., some simple estimators)
                    logger.warning(f"Model {model_name} does not have predict_proba. Using predict for probabilities (may not be well-calibrated).")
                    y_pred_proba = model.predict(cfg["predict_data"]).astype(float)
                y_pred = model.predict(cfg["predict_data"])

            # Evaluate
            self.evaluator.evaluate_classification_model(
                model_name.replace('_', ' ').title(),
                y_test, y_pred, y_pred_proba, stage="Test"
            )

            # Plot ROC and Confusion Matrix (skip for baseline)
            if model_name != "baseline_model":
                self.evaluator.plot_roc_curve(model_name.replace('_', ' ').title(), y_test, y_pred_proba, stage="Test")
                self.evaluator.plot_confusion_matrix(model_name.replace('_', ' ').title(), y_test, y_pred, stage="Test")

            # Model Explainability (skip for baseline)
            if model_name != "baseline_model" and trained_models[model_name] is not None:
                self._step_model_explainability(model_name, trained_models[model_name], X_test)
            else:
                logger.info(f"Model explainability skipped for {model_name}.")

        self.evaluator.display_results_table()

    def _step_model_explainability(self, model_name: str, model, X_test: pd.DataFrame):
        """Step 8: Generates and logs SHAP and coefficient plots for trained models."""
        if self.explainer is None:
            logger.warning("Model explainer not initialized (likely due to WandB issue). Skipping explainability plots.")
            return
        if model is None:
            logger.warning(f"Model {model_name} is None. Cannot perform explainability.")
            return

        logger.info(f"\n--- Step 8: Model Explainability ({model_name.replace('_', ' ').title()}) ---")
        try:
            X_explain_sample = X_test.sample(n=min(self.config["shap_explain_sample_size"], X_test.shape[0]), random_state=self.config["random_state"])
            task_type = 'classification'

            # SHAP Summary Plot
            self.explainer.plot_shap_summary(
                model=model,
                X=X_explain_sample,
                filename=f"{model_name}_shap_summary_plot.png",
                task_type=task_type
            )
            logger.info(f"SHAP summary plots generated for {model_name}.")

            # SHAP Dependence Plots for a few features
            # Prioritize features based on anticipated importance or from initial SHAP summary
            potential_top_features = X_explain_sample.columns.tolist()
            features_for_dependence = []

            # Smart selection of features for dependence plots
            # You might want to get top N features dynamically after running SHAP summary once
            # For now, keeping a sensible list based on expected features from your pipeline
            if 'CRS_ELAPSED_TIME' in potential_top_features: features_for_dependence.append('CRS_ELAPSED_TIME')
            if 'DISTANCE' in potential_top_features: features_for_dependence.append('DISTANCE')
            if 'AIRLINE_WN' in potential_top_features: features_for_dependence.append('AIRLINE_WN') # Example OHE feature
            if 'ORIGIN_STATE_CA' in potential_top_features: features_for_dependence.append('ORIGIN_STATE_CA') # Example OHE feature
            if 'DAY_OF_WEEK_is_weekend' in potential_top_features: features_for_dependence.append('DAY_OF_WEEK_is_weekend')
            if 'CRS_DEP_TIME_SIN' in potential_top_features: features_for_dependence.append('CRS_DEP_TIME_SIN')
            if 'MONTH_8' in potential_top_features: features_for_dependence.append('MONTH_8') # Example month from OHE

            if features_for_dependence:
                logger.info(f"Generating SHAP dependence plots for: {features_for_dependence}")
                for feature in features_for_dependence:
                    if feature in X_explain_sample.columns: # Final check
                        self.explainer.plot_shap_dependence(
                            model=model,
                            X=X_explain_sample,
                            feature=feature,
                            filename=f"{model_name}_shap_dependence_plot_{feature}.png",
                            task_type=task_type
                        )
                    else:
                        logger.warning(f"Feature '{feature}' not found in the explainability sample. Skipping dependence plot.")
            else:
                logger.info(f"No predefined or suitable features found for SHAP dependence plots for {model_name}.")

            # Coefficient Importance for linear models
            if isinstance(model, LogisticRegression):
                self.explainer.plot_coefficient_importance(
                    model=model,
                    feature_names=X_explain_sample.columns.tolist(),
                    filename=f"{model_name}_coefficient_importance.png"
                )
                logger.info(f"Coefficient importance plot generated for {model_name}.")

        except Exception as e:
            logger.error(f"Error during model explainability for {model_name.replace('_', ' ').title()}: {e}", exc_info=True)


if __name__ == "__main__":
    # Define common configuration for WandB and pipeline
    # These parameters can be easily changed for new experiments
    pipeline_config = {
        "project_name": "flight-delay-prediction-final",
        "run_name": "full_pipeline_refactored_v1",
        "dataset_name": "patrickzel/flight-delay-and-cancellation-data-2019-2023-v2",
        "data_sample_file": "flights_sample_10k.csv",
        "target_column": "FLIGHT_STATUS",
        "arr_delay_threshold_mins": 15, # For binary classification target: delay > 15 mins
        "test_size": 0.15, # Fraction of data for final test set
        "validation_split_ratio": 0.15, # Fraction of (train+val) for validation set, e.g. 0.15 of train_val
        "random_state": 42,
        "high_correlation_threshold": 0.9, # For feature removal
        "eda_top_n_destinations": 20,
        "eda_min_flight_count_airline_delay": 500,
        "shap_explain_sample_size": 1000, # Number of samples for SHAP calculations
        "nn_epochs": 10, # Neural Network training epochs
        "nn_batch_size": 32 # Neural Network training batch size
    }

    # Create and run the pipeline
    pipeline = FlightDelayPipeline(pipeline_config)
    pipeline.run()

ModuleNotFoundError: No module named 'scripts'

In [13]:
%%python pipeline.py

Dataset URL: https://www.kaggle.com/datasets/patrickzel/flight-delay-and-cancellation-data-2019-2023-v2
License(s): other
flight-delay-and-cancellation-data-2019-2023-v2.zip: Skipping, found more recently modified local copy (use --force to force download)
[1;34mUpgrade to ydata-sdk[0m
Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.
Register at https://ydata.ai/register
Generating column distribution plots and saving to all_column_distributions_eda.png...
Generating airline counts plot and saving to airline_flight_counts_eda.png...
Generating top 20 destination visits plot and saving to top_20_destination_visits_eda.png...
Generating average arrival delay by airline plot and saving to avg_arrival_delay_by_airline_eda.png...

Additional Insights (Average Delay by Airline):
- Worst performing airline: Southwest Airlines Co. (30.6 min avg delay)
- Best performing

2025-07-15 16:19:28.308564: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-15 16:19:28.313056: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-15 16:19:28.325203: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752596368.345141   12097 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752596368.351091   12097 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752596368.366981   12097 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

CalledProcessError: Command 'b' \n'' returned non-zero exit status 1.

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/flight_delay/data/processed/preprocessed_flight_data_for_modeling.csv')


In [None]:
df.shape

(97148, 22)

In [None]:
df.columns

Index(['AIRLINE', 'FL_NUMBER', 'DEST', 'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN',
       'CRS_ELAPSED_TIME', 'DELAY_DUE_CARRIER', 'FLIGHT_STATUS',
       'CRS_DEP_TIME_MINUTES', 'CRS_DEP_TIME_MINUTES_SIN',
       'CRS_DEP_TIME_MINUTES_COS', 'CRS_ARR_TIME_MINUTES',
       'CRS_ARR_TIME_MINUTES_SIN', 'CRS_ARR_TIME_MINUTES_COS',
       'ARR_TIME_MINUTES', 'DEST_CITY_CITY', 'DEST_CITY_STATE',
       'ORIGIN_CITY_CITY', 'ORIGIN_CITY_STATE', 'DAY_OF_WEEK', 'IS_WEEKEND'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97148 entries, 0 to 97147
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   AIRLINE                   97148 non-null  object 
 1   FL_NUMBER                 97148 non-null  int64  
 2   DEST                      97148 non-null  object 
 3   DEP_DELAY                 97148 non-null  float64
 4   TAXI_OUT                  97148 non-null  float64
 5   TAXI_IN                   97148 non-null  float64
 6   CRS_ELAPSED_TIME          97148 non-null  float64
 7   DELAY_DUE_CARRIER         97148 non-null  float64
 8   FLIGHT_STATUS             97148 non-null  int64  
 9   CRS_DEP_TIME_MINUTES      97148 non-null  int64  
 10  CRS_DEP_TIME_MINUTES_SIN  97148 non-null  float64
 11  CRS_DEP_TIME_MINUTES_COS  97148 non-null  float64
 12  CRS_ARR_TIME_MINUTES      97148 non-null  int64  
 13  CRS_ARR_TIME_MINUTES_SIN  97148 non-null  float64
 14  CRS_AR

In [None]:
# Test script to verify category_encoders import
try:
    import category_encoders as ce
    print("✓ category_encoders imported successfully")
    print(f"Available encoders: {[attr for attr in dir(ce) if 'Encoder' in attr]}")

    # Test WOE Encoder specifically
    woe_encoder = ce.WOEEncoder()
    print("✓ WOEEncoder created successfully")

except ImportError as e:
    print(f"✗ Import error: {e}")
    print("Installing category_encoders...")
    import subprocess
    subprocess.check_call(['pip', 'install', 'category_encoders'])

    # Try importing again
    import category_encoders as ce
    print("✓ category_encoders installed and imported successfully")

except Exception as e:
    print(f"✗ Unexpected error: {e}")

# Test other required imports
required_imports = [
    ('pandas', 'pd'),
    ('numpy', 'np'),
    ('sklearn.preprocessing', 'LabelEncoder'),
    ('sklearn.preprocessing', 'OneHotEncoder'),
    ('sklearn.preprocessing', 'OrdinalEncoder'),
]

for module, item in required_imports:
    try:
        exec(f"from {module} import {item}")
        print(f"✓ {module}.{item} imported successfully")
    except ImportError as e:
        print(f"✗ Failed to import {module}.{item}: {e}")

✓ category_encoders imported successfully
Available encoders: ['BackwardDifferenceEncoder', 'BaseNEncoder', 'BinaryEncoder', 'CatBoostEncoder', 'CountEncoder', 'GLMMEncoder', 'GrayEncoder', 'HashingEncoder', 'HelmertEncoder', 'JamesSteinEncoder', 'LeaveOneOutEncoder', 'MEstimateEncoder', 'OneHotEncoder', 'OrdinalEncoder', 'PolynomialEncoder', 'QuantileEncoder', 'RankHotEncoder', 'SumEncoder', 'SummaryEncoder', 'TargetEncoder', 'WOEEncoder']
✓ WOEEncoder created successfully
✗ Failed to import pandas.pd: cannot import name 'pd' from 'pandas' (/usr/local/lib/python3.11/dist-packages/pandas/__init__.py)
✗ Failed to import numpy.np: cannot import name 'np' from 'numpy' (/usr/local/lib/python3.11/dist-packages/numpy/__init__.py)
✓ sklearn.preprocessing.LabelEncoder imported successfully
✓ sklearn.preprocessing.OneHotEncoder imported successfully
✓ sklearn.preprocessing.OrdinalEncoder imported successfully


In [None]:
pd.options.display.max_columns=100

## Testing


In [11]:
import pandas as pd
import numpy as np
import os
import logging
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import importlib

# Import your custom scripts
from scripts.data_loading import DataDownload
from scripts.data_preprocessor import DataPreprocessor
from scripts.model_explainability import ModelExplainer
from scripts.data_evaluate import ModelEvaluator # To load scaler and model

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# --- Configure Logging ---
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

# --- Define Paths ---
raw_data_dir = "data/raw"
processed_data_dir = "data/processed" # Ensure this exists if you need to save preprocessed data
models_dir = "models"
explainability_plots_dir = "plots/explainability"

nn_model_filename = "neural_network_classification_model"
nn_model_path = os.path.join(models_dir, nn_model_filename)
scaler_filename = "feature_scaler_classification" # Name of your saved scaler
scaler_path = os.path.join(models_dir, f"{scaler_filename}.pkl") # Assuming .pkl extension for scaler

# Ensure directories exist
os.makedirs(raw_data_dir, exist_ok=True)
os.makedirs(processed_data_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)
os.makedirs(explainability_plots_dir, exist_ok=True)



logger.info("--- Starting SHAP Explainability Test for Neural Network with Flights Data ---")

# Initialize necessary components from your pipeline
downloader = DataDownload(dataset_name="patrickzel/flight-delay-and-cancellation-data-2019-2023-v2",
                          download_path=raw_data_dir)
preprocessor = DataPreprocessor()
evaluator = ModelEvaluator(output_dir=models_dir, plots_dir="plots/model_evaluation") # Need evaluator to load model/scaler
explainer = ModelExplainer(output_dir=explainability_plots_dir)

# --- 1. Data Download & Initial Filtering (mimic pipeline.py) ---
logger.info("\n--- Step 1: Data Download ---")
try:
    downloader.data_download()
    csv_file_path = os.path.join(raw_data_dir, 'flights_sample_10k.csv')
    df_raw = pd.read_csv(csv_file_path)
    df_filtered_cancelled = df_raw[df_raw['CANCELLED'] == 0].copy()
    logger.info(f"Loaded and filtered flights data. Shape: {df_filtered_cancelled.shape}")
except Exception as e:
    logger.critical(f"Error during data loading: {e}", exc_info=True)
    exit(1)

# --- 2. Data Preprocessing for Modeling (mimic pipeline.py) ---
logger.info("\n--- Step 2: Data Preprocessing for Modeling ---")
df_model = df_filtered_cancelled.copy()

try:
    # --- Convert to Classification Target ---
    df_model['FLIGHT_STATUS'] = (df_model['ARR_DELAY'] > 15).astype(int)

    delay_cols = ['DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER',
                  'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY',
                  'DELAY_DUE_LATE_AIRCRAFT']
    df_model = preprocessor.handle_missing_values(df_model, delay_cols)

    df_model = preprocessor.create_elapsed_time_diff(df_model)

    time_columns = ['CRS_DEP_TIME', 'CRS_ARR_TIME', 'WHEELS_OFF', 'WHEELS_ON', 'DEP_TIME', 'ARR_TIME']
    df_model = preprocessor.apply_cyclical_encoding(df_model, time_columns)

    city_state_columns = ['DEST_CITY', 'ORIGIN_CITY']
    df_model = preprocessor.split_city_state(df_model, city_state_columns)

    date_columns = ['FL_DATE']
    df_model = preprocessor.add_weekday_weekend_columns(df_model, date_columns)

    columns_to_exclude_model = [
        'AIRLINE_DOT', 'AIRLINE_CODE', 'DOT_CODE',
        'CANCELLED', 'DIVERTED', 'CANCELLATION_CODE',
        'FLIGHT_STATUS_EDA', # Drop EDA specific column from pipeline's EDA step
        'CRS_DEP_TIME', 'CRS_ARR_TIME', 'WHEELS_OFF', 'WHEELS_ON', 'DEP_TIME', 'ARR_TIME',
        'FL_DATE', 'ORIGIN', 'DEST',
        'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS',
        'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT',
        'ARR_DELAY'
    ]
    df_model = preprocessor.exclude_columns(df_model, columns_to_exclude_model)

    df_model = preprocessor.encode_categorical_features(df_model)

    # High Correlation Feature Removal (re-evaluate this with your actual features if necessary)
    df_for_corr_check = df_model.drop(columns=['FLIGHT_STATUS'], errors='ignore')
    columns_to_drop_high_corr, _ = preprocessor.identify_high_correlation_pairs(df_for_corr_check, threshold=0.9)
    if columns_to_drop_high_corr:
        df_model = df_model.drop(columns=list(columns_to_drop_high_corr), errors='ignore')
        logger.info(f"Removed {len(columns_to_drop_high_corr)} columns due to high correlation during preprocessing for modeling.")


    logger.info(f"Preprocessing for modeling complete. Final DataFrame shape: {df_model.shape}")
    # Make sure all columns that were available during training are present,
    # and any new columns are handled (e.g., if one-hot encoding creates more)
    # It's crucial that X.columns match exactly what the model was trained on.

except Exception as e:
    logger.critical(f"Critical error during data preprocessing for modeling: {e}", exc_info=True)
    exit(1)

# --- 3. Prepare Data for Model (mimic pipeline.py train/test split and scaling) ---
logger.info("\n--- Step 3: Preparing Data for Model ---")
if 'FLIGHT_STATUS' not in df_model.columns:
    logger.critical("Target column 'FLIGHT_STATUS' not found. Cannot proceed.")
    exit(1)

X = df_model.drop(columns=['FLIGHT_STATUS'])
y = df_model['FLIGHT_STATUS']

# Final NaN drop from X (as done in pipeline)
X = X.dropna(axis=1, how='all')
rows_before_final_nan_drop = X.shape[0]
X = X.dropna()
y = y[X.index] # Align y with X
if X.shape[0] < rows_before_final_nan_drop:
    logger.warning(f"Removed {rows_before_final_nan_drop - X.shape[0]} rows due to NaN values in features during final data prep.")

# Train-Test Split (use the same random_state and split ratio as in pipeline.py)
# We only need X_test for explainability
_, X_test, _, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
logger.info(f"Test set shape before scaling: X_test {X_test.shape}, y_test {y_test.shape}")

# Load the trained scaler
scaler = evaluator.load_model(scaler_filename)
if scaler is None:
    logger.critical(f"Scaler '{scaler_filename}' not found at {scaler_path}. Please run pipeline.py first to train and save the scaler.")
    exit(1)

# Identify numerical columns that were scaled during pipeline run
numerical_features_to_scale = [col for col in X_test.select_dtypes(include=np.number).columns if not (X_test[col].dropna().isin([0, 1]).all() or col.endswith('_SIN') or col.endswith('_COS'))]

# Apply scaling to the test set
X_test_scaled = X_test.copy()
if numerical_features_to_scale:
    X_test_scaled[numerical_features_to_scale] = scaler.transform(X_test_scaled[numerical_features_to_scale])
    logger.info(f"Applied StandardScaler to {len(numerical_features_to_scale)} numerical features in X_test.")
else:
    logger.info("No numerical features to scale in X_test or already handled.")

# Crucially, ensure X_test_scaled is a DataFrame with original column names
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
logger.info(f"Scaled X_test_df shape: {X_test_scaled_df.shape}")


# --- 4. Load the Neural Network Model ---
logger.info("\n--- Step 4: Loading Neural Network Model ---")
model = evaluator.load_model(nn_model_filename)
if model is None:
    logger.critical(f"Neural Network model '{nn_model_filename}' not found at {nn_model_path}. Please run pipeline.py first to train and save the model.")
    exit(1)
logger.info("Neural Network model loaded successfully.")

# --- 5. Generate SHAP Explainability Plots ---
logger.info("\n--- Step 5: Generating SHAP Plots ---")
try:
    # Use a sample of the test data for SHAP to speed up calculation for DeepExplainer
    # DeepExplainer is recommended for Keras models and can handle more samples than KernelExplainer
    # without needing a separate background dataset for its initialization.
    # However, for very large test sets, a sample is still good.
    X_explain_sample = X_test_scaled_df.sample(n=min(1000, X_test_scaled_df.shape[0]), random_state=42)
    logger.info(f"Using {X_explain_sample.shape[0]} samples from scaled test data for SHAP explainability.")

    task_type = 'classification' # Flight delay is a binary classification

    # SHAP Summary Plot
    explainer.plot_shap_summary(
        model=model,
        X=X_explain_sample,
        filename=f"{nn_model_filename.replace('.h5', '')}_shap_summary_plot.png",
        task_type=task_type
    )

    # SHAP Dependence Plots for a few features
    # It's best to identify truly impactful features from the summary plot after its run.
    # For demonstration, let's pick a few features that are often relevant.
    features_for_dependence = []
    if 'CRS_ELAPSED_TIME' in X_explain_sample.columns: features_for_dependence.append('CRS_ELAPSED_TIME')
    if 'DISTANCE' in X_explain_sample.columns: features_for_dependence.append('DISTANCE')
    if 'AIRLINE_WN' in X_explain_sample.columns: features_for_dependence.append('AIRLINE_WN') # Example airline
    if 'ORIGIN_STATE_CA' in X_explain_sample.columns: features_for_dependence.append('ORIGIN_STATE_CA') # Example state
    if 'DAY_OF_WEEK_is_weekend' in X_explain_sample.columns: features_for_dependence.append('DAY_OF_WEEK_is_weekend')
    if 'CRS_DEP_TIME_SIN' in X_explain_sample.columns: features_for_dependence.append('CRS_DEP_TIME_SIN')


    if features_for_dependence:
        for feature in features_for_dependence:
            explainer.plot_shap_dependence(
                model=model,
                X=X_explain_sample,
                feature=feature,
                filename=f"{nn_model_filename.replace('.h5', '')}_shap_dependence_plot_{feature}.png",
                task_type=task_type
            )
    else:
        logger.warning("No default features identified for SHAP dependence plots. Consider adding relevant features.")


except Exception as e:
    logger.error(f"An error occurred during SHAP explainability: {e}", exc_info=True)

logger.info("--- SHAP Explainability Test Complete ---")

INFO: --- Starting SHAP Explainability Test for Neural Network with Flights Data ---
INFO:__main__:--- Starting SHAP Explainability Test for Neural Network with Flights Data ---
2025-07-13 12:46:20,183 - scripts.data_loading - INFO - DataDownload initialized. Running in Colab: False
INFO:scripts.data_loading:DataDownload initialized. Running in Colab: False
2025-07-13 12:46:20,195 - scripts.data_preprocessor - INFO - DataPreprocessor initialized.
INFO:scripts.data_preprocessor:DataPreprocessor initialized.
2025-07-13 12:46:20,197 - scripts.data_evaluate - INFO - ModelEvaluator initialized. Models will be saved to 'models'.
INFO:scripts.data_evaluate:ModelEvaluator initialized. Models will be saved to 'models'.
2025-07-13 12:46:20,217 - scripts.data_evaluate - INFO - Evaluation plots will be saved to 'plots/model_evaluation'.
INFO:scripts.data_evaluate:Evaluation plots will be saved to 'plots/model_evaluation'.
2025-07-13 12:46:20,252 - scripts.model_explainability - INFO - ModelExplain

<Figure size 1000x700 with 0 Axes>

<Figure size 1000x700 with 0 Axes>

<Figure size 1000x700 with 0 Axes>

In [25]:
import shap
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the neural network model
model = tf.keras.models.load_model('models/neural_network_classification_model.h5')

# Ensure X_test_scaled_df is a numpy array for SHAP
X_test_scaled = X_test_scaled_df.to_numpy()

# Select a background dataset for SHAP (e.g., a subset of test data)
# Using a small subset (e.g., 100 samples) for computational efficiency
background = X_test_scaled[np.random.choice(X_test_scaled.shape[0], 100, replace=False)]

# Initialize SHAP DeepExplainer
explainer = shap.DeepExplainer(model, background)

# Compute SHAP values for the test set
shap_values = explainer.shap_values(X_test_scaled)

# Since this is a classification model, shap_values is a list (one per class)
# For binary classification, we'll focus on the positive class (index 1)
# Adjust index based on your model's output (e.g., 0 for single-class output or multi-class)
shap_values_class = shap_values[1] if len(shap_values) > 1 else shap_values[0]

# Create a SHAP summary plot
feature_names = X_test_scaled_df.columns.tolist()
shap.summary_plot(shap_values_class, X_test_scaled, feature_names=feature_names, show=False)
plt.title("SHAP Summary Plot for Neural Network Model")
plt.tight_layout()
plt.savefig("shap_summary_plot.png")
plt.close()

# Create individual SHAP force plots for the first few test samples
for i in range(min(3, X_test_scaled.shape[0])):  # Plot for first 3 samples
    shap.force_plot(explainer.expected_value[1] if len(shap_values) > 1 else explainer.expected_value,
                    shap_values_class[i], X_test_scaled[i], feature_names=feature_names, matplotlib=True, show=False)
    plt.title(f"SHAP Force Plot for Test Sample {i+1}")
    plt.tight_layout()
    plt.savefig(f"shap_force_plot_sample_{i+1}.png")
    plt.close()

# Print feature importance based on mean absolute SHAP values
mean_shap_values = np.abs(shap_values_class).mean(axis=0)
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP_Value': mean_shap_values
}).sort_values(by='Mean_SHAP_Value', ascending=False)

print("\nFeature Importance based on Mean Absolute SHAP Values:")
print(feature_importance)

Expected: input_layer
Received: inputs=['Tensor(shape=(100, 19))']
Expected: input_layer
Received: inputs=['Tensor(shape=(200, 19))']
Expected: input_layer
Received: inputs=['Tensor(shape=(1461, 19))']


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

In [19]:
# X_test_scaled_df.shape, y_test.shape
X_test_scaled.columns


Index(['FL_NUMBER', 'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'CRS_ELAPSED_TIME',
       'CRS_DEP_TIME_MINUTES', 'CRS_DEP_TIME_MINUTES_SIN',
       'CRS_DEP_TIME_MINUTES_COS', 'CRS_ARR_TIME_MINUTES',
       'CRS_ARR_TIME_MINUTES_SIN', 'CRS_ARR_TIME_MINUTES_COS',
       'ARR_TIME_MINUTES', 'DAY_OF_WEEK', 'IS_WEEKEND', 'ORIGIN_CITY_CITY_WOE',
       'DEST_CITY_STATE_WOE', 'ORIGIN_CITY_STATE_WOE', 'DEST_CITY_CITY_WOE',
       'AIRLINE_WOE'],
      dtype='object')

In [21]:
y.head()

Unnamed: 0,FLIGHT_STATUS
0,0
1,0
2,0
3,0
4,0


In [7]:
import importlib
import scripts.data_evaluate # Make sure this matches your file path

# Reload the module to pick up the changes
importlib.reload(scripts.data_evaluate)

# Now, if you're importing ModelEvaluator directly, do it again
from scripts.data_evaluate import ModelEvaluator

# If you have an existing 'evaluator' object, you might need to re-instantiate it
# For example, if you had:
# evaluator = ModelEvaluator(output_dir="models", plots_dir="plots/model_evaluation")
# You would re-run that line after the reload.

In [None]:
import importlib
import scripts.data_evaluate

# Reload the module to pick up the changes
importlib.reload(scripts.data_evaluate)

# Now re-import or re-access DataModeling
from scripts.data_evaluate import ModelEvaluator

evaluator = ModelEvaluator()
evaluator

2025-07-12 11:47:05,053 - scripts.data_evaluate - INFO - ModelEvaluator initialized. Models will be saved to 'models'.
INFO:scripts.data_evaluate:ModelEvaluator initialized. Models will be saved to 'models'.
2025-07-12 11:47:05,059 - scripts.data_evaluate - INFO - Evaluation plots will be saved to 'plots/model_evaluation'.
INFO:scripts.data_evaluate:Evaluation plots will be saved to 'plots/model_evaluation'.


In [None]:
from scripts.data_preprocessor import DataPreprocessor
df_1 = preprocessor.encode_categorical_features(df)

df_1.head()

2025-07-12 06:21:14,575 - scripts.data_preprocessor - INFO - Applying WOE encoding to all categorical features based on FLIGHT_STATUS.
INFO:scripts.data_preprocessor:Applying WOE encoding to all categorical features based on FLIGHT_STATUS.
2025-07-12 06:21:14,581 - scripts.data_preprocessor - INFO - Found 6 categorical columns for WOE encoding: ['ORIGIN_CITY_STATE', 'DEST', 'DEST_CITY_CITY', 'DEST_CITY_STATE', 'AIRLINE', 'ORIGIN_CITY_CITY']
INFO:scripts.data_preprocessor:Found 6 categorical columns for WOE encoding: ['ORIGIN_CITY_STATE', 'DEST', 'DEST_CITY_CITY', 'DEST_CITY_STATE', 'AIRLINE', 'ORIGIN_CITY_CITY']
2025-07-12 06:21:14,606 - scripts.data_preprocessor - ERROR - Error applying WOE encoding: name 'ce' is not defined
Traceback (most recent call last):
  File "/content/drive/MyDrive/Colab Notebooks/flight_delay/scripts/data_preprocessor.py", line 244, in encode_categorical_features
    #         df_copy[col] = df_copy[col].fillna('Missing')
            ^^
NameError: name 'ce' i

Unnamed: 0,AIRLINE,FL_NUMBER,DEST,DEP_DELAY,TAXI_OUT,TAXI_IN,CRS_ELAPSED_TIME,DELAY_DUE_CARRIER,FLIGHT_STATUS,CRS_DEP_TIME_MINUTES,CRS_DEP_TIME_MINUTES_SIN,CRS_DEP_TIME_MINUTES_COS,CRS_ARR_TIME_MINUTES,CRS_ARR_TIME_MINUTES_SIN,CRS_ARR_TIME_MINUTES_COS,ARR_TIME_MINUTES,DEST_CITY_CITY,DEST_CITY_STATE,ORIGIN_CITY_CITY,ORIGIN_CITY_STATE,DAY_OF_WEEK,IS_WEEKEND
0,Allegiant Air,1668,SPI,-10.0,9.0,7.0,160.0,0.0,0,390,0.991445,-0.130526,490,0.843391,-0.5373,458.0,Springfield,IL,Punta Gorda,FL,4,0
1,PSA Airlines Inc.,5560,CLT,-7.0,16.0,11.0,79.0,0.0,0,385,0.994056,-0.108867,464,0.898794,-0.438371,456.0,Charlotte,NC,New Bern/Morehead/Beaufort,NC,1,0
2,Southwest Airlines Co.,1944,DEN,25.0,15.0,7.0,80.0,10.0,1,1035,-0.980785,-0.19509,1115,-0.988362,0.152123,1131.0,Denver,CO,Albuquerque,NM,2,0
3,Southwest Airlines Co.,3081,STL,0.0,12.0,6.0,105.0,0.0,0,335,0.994056,0.108867,380,0.996195,-0.087156,375.0,St. Louis,MO,Pittsburgh,PA,6,1
4,Delta Air Lines Inc.,674,SEA,-2.0,14.0,11.0,178.0,0.0,0,700,0.087156,-0.996195,878,-0.636078,-0.771625,856.0,Seattle,WA,Los Angeles,CA,6,1


## Gradio APP


In [8]:
import gradio as gr
import pandas as pd
import os
import sys # <--- ADDED THIS IMPORT
import logging

# Configure Logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Remove handlers if already present to avoid duplication in case app.py is run multiple times
if logger.handlers:
    for handler in list(logger.handlers):
        logger.removeHandler(handler)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_formatter = logging.Formatter('%(levelname)s: %(message)s')
console_handler.setFormatter(console_formatter)
logger.addHandler(console_handler)


# Define paths (ensure these match where your main.py saves the outputs)
base_dir = '.' # Assuming app.py runs from the project root or adjust accordingly
if 'google.colab' in sys.modules: # Check if running in Colab context for paths
    base_dir = '/content/drive/MyDrive/Colab Notebooks/flight_delay'

raw_data_dir = os.path.join(base_dir, "data/raw")
processed_data_dir = os.path.join(base_dir, "data/processed")
reports_dir = os.path.join(base_dir, "reports")
eda_plots_dir = os.path.join(base_dir, "plots/eda")
model_eval_plots_dir = os.path.join(base_dir, "plots/model_evaluation")
models_dir = os.path.join(base_dir, "models")
model_params_dir = os.path.join(base_dir, "model_params")


# --- Helper functions to load and display content ---

def get_basic_data_info():
    """Loads raw data and provides basic information and target variable definition."""
    try:
        csv_file_path = os.path.join(raw_data_dir, 'flights_sample_10k.csv')
        df_raw = pd.read_csv(csv_file_path)

        # Filter non-cancelled flights as in your main pipeline
        df_filtered_cancelled = df_raw[df_raw['CANCELLED'] == 0].copy()

        info_text = "### **1.2.1 Basic Data Info & Target Variable Definition**\n\n"
        info_text += f"**Raw Data Shape:** {df_raw.shape}\n\n"
        info_text += f"**Filtered (Non-Cancelled) Data Shape:** {df_filtered_cancelled.shape}\n\n"
        info_text += "**First 5 Rows of Filtered Data:**\n"
        info_text += f"```\n{df_filtered_cancelled.head().to_string()}\n```\n\n"
        info_text += "**Target Variable (`FLIGHT_STATUS`) Definition:**\n"
        info_text += "The target variable `FLIGHT_STATUS` is a binary classification target derived from `ARR_DELAY`.\n"
        info_text += "  - `FLIGHT_STATUS = 1` if `ARR_DELAY > 15` minutes (flight is considered 'delayed').\n"
        info_text += "  - `FLIGHT_STATUS = 0` if `ARR_DELAY <= 15` minutes (flight is considered 'on-time' or 'minor delay').\n\n"

        # Simulate target distribution if not explicitly saved
        if 'ARR_DELAY' in df_filtered_cancelled.columns:
            df_filtered_cancelled['FLIGHT_STATUS'] = (df_filtered_cancelled['ARR_DELAY'] > 15).astype(int)
            info_text += "**Simulated Target Distribution (from filtered raw data):**\n"
            info_text += f"```\n{df_filtered_cancelled['FLIGHT_STATUS'].value_counts().to_string()}\n```"
        else:
            info_text += "Note: 'ARR_DELAY' column not found to simulate target distribution in raw data view."

        return info_text
    except FileNotFoundError:
        return f"Error: Data file not found at {os.path.join(raw_data_dir, 'flights_sample_100k.csv')}. Please ensure the pipeline has run successfully."
    except Exception as e:
        return f"An error occurred: {e}"

def get_data_profiling_link():
    """Provides a link to the generated data profiling report."""
    report_path = os.path.join(reports_dir, "flight_data_profile_report_pre_processing.html")
    if os.path.exists(report_path):
        return f"### **1.3 Data Profiling Link**\n\n" \
               f"A comprehensive data profiling report (generated using `pandas-profiling`) is available:\n" \
               f"- [Open Data Profile Report]({report_path})\n\n" \
               f"**Note:** If running locally, click the link to open the HTML report in your browser. If in Colab, you might need to navigate to the file path in your Google Drive and open it from there."
    else:
        return f"### **1.3 Data Profiling Link**\n\n" \
               f"Data profiling report not found at: `{report_path}`. Please ensure the pipeline has generated it."

def get_eda_plots():
    """Returns a list of paths to EDA plots."""
    plot_files = []
    # List all plots generated in your pipeline's Step 5
    plots = [
        "all_column_distributions_eda.png",
        "airline_flight_counts_eda.png",
        "top_20_destination_visits_eda.png",
        "avg_arrival_delay_by_airline_eda.png",
        "total_delays_by_year_eda.png",
        "monthly_delays_by_year_eda.png",
        "monthly_delay_trend_highlight_eda.png",
        "delay_reason_breakdown_eda.png"
    ]

    found_plots_info = "### **1.2.2 EDA Plots & Inferences**\n\n"
    found_plots_info += "Explore the visualizations below to understand key trends and patterns in the flight delay data.\n\n"

    for plot in plots:
        path = os.path.join(eda_plots_dir, plot)
        if os.path.exists(path):
            plot_files.append(path)
            found_plots_info += f"- **{plot.replace('.png', '').replace('_', ' ').title()}**\n"
            # Add a brief inference for each plot based on common expectations from flight data EDA
            if "column_distributions" in plot:
                found_plots_info += "  *Inference:* Provides an overview of feature distributions, helping to identify skewed data, outliers, or categorical balance issues.\n"
            elif "airline_flight_counts" in plot:
                found_plots_info += "  *Inference:* Shows which airlines operate the most flights, indicating their market share and potential impact on overall delays.\n"
            elif "destination_visits" in plot:
                found_plots_info += "  *Inference:* Highlights the busiest destination airports, which can be hotspots for delays due to high traffic.\n"
            elif "arrival_delay_by_airline" in plot:
                found_plots_info += "  *Inference:* Compares average arrival delays across different airlines, revealing which airlines tend to be more punctual or delayed.\n"
            elif "total_delays_by_year" in plot:
                found_plots_info += "  *Inference:* Illustrates the yearly trend of total flight delays, showing if delay incidents are increasing or decreasing over time.\n"
            elif "monthly_delays_by_year" in plot:
                found_plots_info += "  *Inference:* Breaks down delays by month and year, revealing seasonal patterns and yearly variations.\n"
            elif "monthly_delay_trend_highlight" in plot:
                found_plots_info += "  *Inference:* Focuses on specific months or periods with high delay occurrences, indicating peak travel times or anomaly periods.\n"
            elif "delay_reason_breakdown" in plot:
                found_plots_info += "  *Inference:* Identifies the primary causes of delays (e.g., carrier, weather, NAS), which is crucial for targeted interventions.\n"
            found_plots_info += "\n"
        else:
            logger.warning(f"EDA plot not found: {path}")

    if not plot_files:
        return "No EDA plots found. Please ensure the pipeline has generated them.", []

    return found_plots_info, plot_files


def get_preprocessing_summary():
    """Summarizes the data cleaning and preprocessing steps."""
    summary_text = "### **1.4 Data Cleaning and Preprocessing Summary**\n\n"
    summary_text += "The raw flight data undergoes several crucial preprocessing steps to prepare it for machine learning:\n\n"
    summary_text += "1.  **Initial Filtering:** Cancelled flights (`CANCELLED == 1`) are removed as our target is arrival delay.\n"
    summary_text += "2.  **Target Variable Creation:** A new binary target `FLIGHT_STATUS` is created from `ARR_DELAY` (1 if arrival delay > 15 mins, 0 otherwise).\n"
    summary_text += "3.  **Missing Value Imputation:** Missing values in delay reason columns (`DELAY_DUE_CARRIER`, etc.) are filled, likely with zeros, indicating no delay attributed to that reason.\n"
    summary_text += "4.  **Elapsed Time Difference:** A feature `ELAPSED_TIME_DIFF` is calculated from scheduled and actual times, providing insights into flight duration deviations.\n"
    summary_text += "5.  **Cyclical Encoding:** Time-based features like `CRS_DEP_TIME`, `CRS_ARR_TIME` are transformed using sine and cosine functions to capture their cyclical nature.\n"
    summary_text += "6.  **City/State Split:** Combined city, state information in `DEST_CITY` and `ORIGIN_CITY` is separated into distinct `_CITY` and `_STATE` columns for better granularity.\n"
    summary_text += "7.  **Weekday/Weekend Features:** `FL_DATE` is used to create binary features indicating whether a flight occurs on a `WEEKDAY` or `WEEKEND`.\n"
    summary_text += "8.  **Categorical Encoding:** Categorical features (e.g., `AIRLINE`, `ORIGIN`, `DEST`, `DEST_STATE`) are converted into numerical representations, typically using one-hot encoding or similar methods, suitable for machine learning models.\n"
    summary_text += "9.  **High Correlation Feature Removal:** Features with a high correlation (e.g., >0.9) are identified and one of the pair is removed to prevent multicollinearity and improve model stability.\n"
    summary_text += "10. **Feature Exclusion:** Various columns deemed irrelevant or redundant for modeling (e.g., original identifiers, cancellation details, raw time columns, original delay values) are dropped.\n\n"
    summary_text += "The preprocessed data is then saved as `preprocessed_flight_data_for_modeling.csv`."
    return summary_text

def get_modeling_and_evaluation_results():
    """Loads and displays model evaluation results and plots."""
    results_md = "### **2. Data Modeling**\n\n"
    results_md += "Our pipeline trains and evaluates several classification models for predicting flight delays:\n"
    results_md += "- **Baseline Model:** Predicts the majority class (no delay).\n"
    results_md += "- **Logistic Regression:** A simple linear model.\n"
    results_md += "- **Neural Network:** A more complex deep learning model.\n\n"

    results_md += "### **3. Data Evaluation and KPIs (Classification)**\n\n"
    results_md += "Models are evaluated using metrics relevant for classification tasks, including:\n"
    results_md += "- **Accuracy:** Overall correctness.\n"
    results_md += "- **Precision:** Proportion of true positive predictions that were actually positive.\n"
    results_md += "- **Recall:** Proportion of actual positives that were correctly identified.\n"
    results_md += "- **F1-Score:** Harmonic mean of precision and recall.\n"
    results_md += "- **ROC AUC:** Area Under the Receiver Operating Characteristic Curve, indicating the model's ability to distinguish between classes.\n\n"

    metrics_table_path = os.path.join(model_eval_plots_dir, "classification_model_metrics_table.csv")
    metrics_display = ""
    if os.path.exists(metrics_table_path):
        try:
            df_metrics = pd.read_csv(metrics_table_path)
            metrics_display = "**Overall Model Performance (Test Set):**\n"
            metrics_display += df_metrics.to_markdown(index=False) + "\n\n"
        except Exception as e:
            metrics_display = f"Could not load metrics table: {e}\n\n"
    else:
        metrics_display = f"Metrics table not found at: `{metrics_table_path}`. Please ensure the pipeline has generated it.\n\n"

    results_md += metrics_display

    plot_files = []
    # List all evaluation plots generated in your pipeline's Step 7
    eval_plots = [
      "Logistic_Regression_Test_ROC_Curve.png",
      "Logistic_Regression_Test_Confusion_Matrix.png",
      "Neural_Network_Classification_Test_ROC_Curve.png",
      "Neural_Network_Classification_Test_Confusion_Matrix.png"
      ]

    found_plots_info = "### **Model Evaluation Plots (Test Set):**\n\n"

    for plot in eval_plots:
        path = os.path.join(model_eval_plots_dir, plot)
        if os.path.exists(path):
            plot_files.append(path)
            found_plots_info += f"- **{plot.replace('.png', '').replace('_', ' ').title()}**\n"
            if "ROC_Curve" in plot:
                found_plots_info += "  *Inference:* Visualizes the trade-off between the true positive rate and false positive rate. A curve closer to the top-left corner indicates better performance.\n"
            elif "Confusion_Matrix" in plot:
                found_plots_info += "  *Inference:* Shows the counts of true positives, true negatives, false positives, and false negatives, providing a clear picture of classification errors.\n"
            found_plots_info += "\n"
        else:
            logger.warning(f"Model evaluation plot not found: {path}")

    if not plot_files:
        found_plots_info = "No model evaluation plots found. Please ensure the pipeline has generated them."

    return results_md, found_plots_info, plot_files

def get_shap_plots():
    """Returns a list of paths to SHAP explanation plots."""
    shap_plots_dir = os.path.join(base_dir, "plots/model_explainability") # Ensure this path is defined globally or passed
    os.makedirs(shap_plots_dir, exist_ok=True) # Ensure dir exists for the app too

    plot_files = []
    shap_plots = [
        "logistic_regression_shap_summary.png",
        "neural_network_shap_summary.png",
        # Add other SHAP plots if you generate them (e.g., dependence plots)
    ]

    found_plots_info = "### **4.1 SHAP Feature Importance Plots**\n\n"
    found_plots_info += "These plots show the contribution of each feature to the model's predictions.\n\n"

    for plot in shap_plots:
        path = os.path.join(shap_plots_dir, plot)
        if os.path.exists(path):
            plot_files.append(path)
            found_plots_info += f"- **{plot.replace('.png', '').replace('_', ' ').title()}**\n"
            found_plots_info += "  *Inference:* Each point represents an instance in the dataset. The position on the x-axis shows the SHAP value, indicating the feature's impact on the prediction. Color often indicates the feature's actual value (e.g., high or low).\n\n"
        else:
            logger.warning(f"SHAP plot not found: {path}")

    if not plot_files:
        return "No SHAP plots found. Please ensure the pipeline has generated them in `plots/model_explainability`.", []

    return found_plots_info, plot_files

def get_model_explainability_info():
    """Placeholder for Model Explainability."""
    explain_md = "### **4. Model Explainability**\n\n"
    explain_md += "Understanding why a model makes certain predictions is crucial, especially in high-stakes domains like aviation.\n"
    explain_md += "This section will be dedicated to explaining model predictions using techniques such as:\n"
    explain_md += "- **SHAP (SHapley Additive exPlanations):** To explain individual predictions and global feature importance by showing the contribution of each feature to the prediction.\n"
    explain_md += "- **LIME (Local Interpretable Model-agnostic Explanations):** To explain the predictions of any machine learning model by approximating it with a local, interpretable model.\n\n"
    explain_md += "**Current Status:** This functionality is still under development. Once implemented, you will be able to select a model and input features to see how different factors influence the prediction of a flight delay.\n\n"
    explain_md += "*(Future enhancement: Interactive SHAP/LIME plots, feature importance rankings, and example predictions with explanations.)*"
    return explain_md


# --- Gradio Interface ---

with gr.Blocks(theme=gr.themes.Soft(), title="Flight Delay Prediction ML Pipeline Dashboard") as demo:
    gr.Markdown(
        """
        # ✈️ Flight Delay Prediction ML Pipeline Dashboard
        Welcome to the interactive dashboard for the Flight Delay Prediction Machine Learning project!
        This application guides you through the end-to-end ML pipeline, from data understanding and preprocessing
        to model training, evaluation, and future explainability.
        """
    )

    with gr.Tab("1. Data Overview & EDA"):
        gr.Markdown("## Data Understanding and Exploratory Data Analysis (EDA)")
        with gr.Accordion("1.1 Use Case & Pipeline Structure", open=False):
            gr.Markdown(
                """
                ### **1.1 Use Case Definition and Process Pipeline Structure**
                **Use Case:** Predict whether a flight will be significantly delayed (arrival delay > 15 minutes) or on-time/minor delay. This is a **binary classification** problem.

                **Overall Pipeline Structure:**
                1.  **Data Download:** Retrieves raw flight data from Kaggle.
                2.  **Data Loading & Initial Filtering:** Loads data and removes cancelled flights.
                3.  **Data Profiling (Pre-preprocessing):** Generates an initial report on data quality and characteristics.
                4.  **Data Preprocessing for Visualization (EDA):** Prepares data for exploratory plots (e.g., creating temporary target for EDA).
                5.  **Data Visualization (EDA):** Generates insightful plots to understand data patterns.
                6.  **Data Preprocessing for Modeling:** Cleans, transforms, and engineers features for machine learning models.
                7.  **Data Modeling and Evaluation:** Trains and evaluates various classification models.
                8.  **Model Explainability (Future):** Provides insights into model predictions.
                """
            )

        with gr.Tab("Basic Data Info"):
            gr.Markdown("---")
            gr.Markdown("### 1.2.1 Basic Info about Data and Target Variable Definition")
            basic_info_output = gr.Markdown(get_basic_data_info())

        with gr.Tab("EDA Plots"):
            gr.Markdown("---")
            eda_info_output = gr.Markdown("Loading EDA plot information...")
            eda_plot_gallery = gr.Gallery(
                label="Exploratory Data Analysis Plots",
                columns=[4], rows=[2], object_fit="contain", height="auto"
            )
            demo.load(get_eda_plots, inputs=None, outputs=[eda_info_output, eda_plot_gallery])

        with gr.Tab("Data Preprocessing Summary"):
            gr.Markdown("---")
            gr.Markdown("### 1.3 Data Cleaning and Preprocessing for Modeling")
            preprocessing_summary_output = gr.Markdown(get_preprocessing_summary())

        with gr.Tab("Data Profiling Report"):
            gr.Markdown("---")
            gr.Markdown("### 1.4 Data Profiling Link")
            profiling_link_output = gr.Markdown(get_data_profiling_link())

    with gr.Tab("2. Model Training & Evaluation"):
        gr.Markdown("## Machine Learning Model Training and Performance Evaluation")
        gr.Markdown("---")
        model_eval_summary_output = gr.Markdown("Loading model evaluation results...")
        model_eval_plot_gallery = gr.Gallery(
            label="Model Evaluation Plots (Test Set)",
            columns=[4], rows=[2], object_fit="contain", height="auto"
        )
        demo.load(get_modeling_and_evaluation_results, inputs=None, outputs=[model_eval_summary_output, model_eval_plot_gallery])

    with gr.Tab("3. Model Explainability"):
        gr.Markdown("## Understanding Model Decisions")
        gr.Markdown("---")
        # explainability_output = gr.Markdown(get_model_explainability_info()) # REMOVE THIS LINE

        shap_info_output = gr.Markdown("Loading SHAP plot information...")
        shap_plot_gallery = gr.Gallery(
            label="SHAP Explanations",
            columns=[2], rows=[1], object_fit="contain", height="auto"
        )
        demo.load(get_shap_plots, inputs=None, outputs=[shap_info_output, shap_plot_gallery])
        # Optionally, keep the general info above the plots if desired
        gr.Markdown(get_model_explainability_info())


if __name__ == "__main__":
    # Ensure all directories exist before trying to load/save files
    os.makedirs(raw_data_dir, exist_ok=True)
    os.makedirs(processed_data_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    os.makedirs(eda_plots_dir, exist_ok=True)
    os.makedirs(model_eval_plots_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(model_params_dir, exist_ok=True)

    # You would typically run your main pipeline first to generate outputs
    # For demonstration, we assume your main.py has already run and saved artifacts.
    logger.info("Starting Gradio App. Ensure your ML pipeline (main.py) has been executed to generate necessary reports and plots.")
    demo.launch(share=True) # Set share=True to get a public link for Colab

INFO: Starting Gradio App. Ensure your ML pipeline (main.py) has been executed to generate necessary reports and plots.
INFO:__main__:Starting Gradio App. Ensure your ML pipeline (main.py) has been executed to generate necessary reports and plots.


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dfa178a44a3fc3e08b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
