In [34]:
hopsworks_api_key = None  
Arlanda = "ARN" 

In [35]:
import hopsworks
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import joblib
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score

In [3]:
def hopsworks_connection():
    project = hopsworks.login(api_key_value=hopsworks_api_key, host="eu-west.cloud.hopsworks.ai")
    fs = project.get_feature_store()
    mr = project.get_model_registry()
    return project, fs, mr

In [4]:
def download_model(mr):
    model = mr.get_model(name='flight_delay_predictor', version=1)
    saved_model_dir = model.download()

    model_pipeline = joblib.load(f"{saved_model_dir}/model.pkl")

    with open(f"{saved_model_dir}/metadata.json", 'r') as f:
        metadata = json.load(f)

    print(f"Training Accuracy: {metadata['accuracy']:.3f}\n")
    print(f"ROC-AUC score: {metadata['roc_auc']:.3f}\n")

    return model_pipeline, metadata

In [19]:
def fetch_batch_data(fs, today, end_date):
    flights_fg = fs.get_feature_group('flight_schedules', version=1)
    temporal_fg = fs.get_feature_group('temporal_features', version=1)
    weather_fg = fs.get_feature_group('weather_features', version=1)

    today_str = today.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d")

    df_flights = flights_fg.filter(
        (flights_fg.scheduled_time >= today_str) & (flights_fg.scheduled_time < end_str)
    ).read()

    df_temporal = temporal_fg.filter(
        (temporal_fg.date >= today_str) & (temporal_fg.date < end_str)
    ).read()

    df_weather = weather_fg.filter(
        (weather_fg.timestamp >= today_str) & (weather_fg.timestamp < end_str)
    ).read()

    return df_flights, df_temporal, df_weather

In [8]:
def engineer_features(df_flights, df_temporal, df_weather):
    df_flights['date'] = pd.to_datetime(df_flights['scheduled_time']).dt.date.astype(str)
    
    batch_data = df_flights.merge(
        df_temporal, left_on='date', right_on='date', how='left', suffixes=('', '_temporal')
    )

    batch_data['scheduled_hour'] = pd.to_datetime(batch_data['scheduled_time']).dt.floor('H')
    df_weather['weather_hour'] = pd.to_datetime(df_weather['timestamp']).dt.floor('H')
    
    batch_data = batch_data.merge(
        df_weather, 
        left_on=['arn_airport_role', 'scheduled_hour'], 
        right_on=['airport_code', 'weather_hour'], 
        how='left', 
        suffixes=('', '_weather')
    )

    batch_data['hour'] = pd.to_datetime(batch_data['scheduled_time']).dt.hour
    batch_data['day_of_week'] = pd.to_datetime(batch_data['scheduled_time']).dt.dayofweek
    batch_data['month'] = pd.to_datetime(batch_data['scheduled_time']).dt.month
    
    batch_data['time_of_day'] = pd.cut(
        batch_data['hour'], 
        bins=[0, 6, 12, 18, 24], 
        labels=['night', 'morning', 'afternoon', 'evening'],
        include_lowest=True
    )

    weather_weights = {'clear': 0, 'fog': 2, 'rain': 1, 'rain_windy': 3, 'snow': 4, 'windy': 2}
    batch_data['weather_impact'] = batch_data['weather_condition'].map(weather_weights).fillna(0)
    
    batch_data['high_wind'] = (batch_data['wind_speed'] > 15).astype(int)
    batch_data['low_visibility'] = (batch_data['visibility'] < 5).astype(int)
    batch_data['peak_international'] = (
        batch_data['is_peak_travel'] & (batch_data['route_type'] == 'international')
    ).astype(int)

    for col in ['is_weekend', 'is_holiday', 'is_school_break', 'is_peak_travel', 
                'is_sportlov', 'is_summer_break', 'is_christmas_break']:
        if col in batch_data.columns:
            batch_data[col] = batch_data[col].fillna(False).astype(int)
    
    print(f"Merged data shape: {batch_data.shape}\n")
    return batch_data

In [22]:
def make_predictions(model_pipeline, metadata, batch_data):
    categorical_features = metadata['categorical_features']
    numerical_features = metadata['numerical_features']
    all_features = categorical_features + numerical_features

    X_batch = batch_data[all_features].copy()

    for col in numerical_features:
        if col in X_batch.columns:
            X_batch[col] = X_batch[col].fillna(X_batch[col].median())

    for col in categorical_features:
        if col in X_batch.columns:
            X_batch[col] = X_batch[col].fillna(
                X_batch[col].mode()[0] if len(X_batch[col].mode()) > 0 else "UNKNOWN"
            )

    batch_data['delay_probability'] = model_pipeline.predict_proba(X_batch)[:,1]
    batch_data['predicted_delayed'] = model_pipeline.predict(X_batch)

    print(f"Flights predicted as delayed: {batch_data['predicted_delayed'].sum()} / {len(batch_data)}")
    print(f"Average delay probability: {batch_data['delay_probability'].mean():.2%}\n")
    
    # Preview high-risk flights
    high_risk = batch_data.nlargest(10, 'delay_probability')[[
        'flight_number', 'scheduled_time', 'route', 'weather_condition', 
        'delay_probability', 'predicted_delayed'
    ]]
    print("Top 10 flights at risk of delay:")
    print(high_risk)
    print()
    
    return batch_data

In [31]:
def save_predictions(fs, batch_data, today):
    base_columns = ['flight_id', 'flight_number', 'scheduled_time', 'route', 
                    'flight_direction', 'route_type', 'delay_probability', 'predicted_delayed']
    
    if 'weather_condition' in batch_data.columns and batch_data['weather_condition'].notna().any():
        base_columns.append('weather_condition')
    
    monitoring_data = batch_data[base_columns].copy()
    
    if 'weather_condition' in monitoring_data.columns:
        monitoring_data['weather_condition'] = monitoring_data['weather_condition'].fillna('unknown')
    
    monitoring_data['prediction_date'] = today.strftime('%Y-%m-%d')
    monitoring_data['days_before_flight'] = (
        pd.to_datetime(monitoring_data['scheduled_time']).dt.date - today.date()
    ).apply(lambda x: x.days)
    
    monitoring_data['scheduled_time'] = pd.to_datetime(monitoring_data['scheduled_time'])
    
    monitor_fg = fs.get_or_create_feature_group(
        name='flight_delay_predictions',
        description='Flight delay prediction monitoring for Arlanda Airport',
        version=1,
        primary_key=['flight_id', 'prediction_date'],
        event_time='scheduled_time'
    )
    
    # Insert predictions
    monitor_fg.insert(monitoring_data, wait=True)
    
    return monitor_fg

In [11]:
def generate_forecast_dashboard(batch_data, today):
    daily_forecast = batch_data.groupby(
        pd.to_datetime(batch_data['scheduled_time']).dt.date
    ).agg({
        'flight_id': 'count',
        'predicted_delayed': 'sum',
        'delay_probability': 'mean'
    }).reset_index()
    
    daily_forecast.columns = ['date', 'total_flights', 'predicted_delays', 'avg_delay_prob']
    daily_forecast['delay_rate'] = daily_forecast['predicted_delays'] / daily_forecast['total_flights']
    
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))
    
    ax1 = axes[0]
    ax1.bar(daily_forecast['date'], daily_forecast['predicted_delays'], 
            alpha=0.7, color='orange', label='Predicted Delays')
    ax1.plot(daily_forecast['date'], daily_forecast['total_flights'], 
             marker='o', color='blue', linewidth=2, label='Total Flights')
    ax1.set_xlabel('Date', fontsize=12)
    ax1.set_ylabel('Number of Flights', fontsize=12)
    ax1.set_title('Flight Delay Forecast - Arlanda Airport (Next 7 Days)', fontsize=14, fontweight='bold')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45)
    
    ax2 = axes[1]
    ax2.plot(daily_forecast['date'], daily_forecast['avg_delay_prob'] * 100, 
             marker='s', color='red', linewidth=2.5, markersize=8)
    ax2.fill_between(daily_forecast['date'], 0, daily_forecast['avg_delay_prob'] * 100, 
                     alpha=0.3, color='red')
    ax2.set_xlabel('Date', fontsize=12)
    ax2.set_ylabel('Average Delay Probability (%)', fontsize=12)
    ax2.set_title('Daily Average Delay Risk', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)
    plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)
    
    plt.tight_layout()
    forecast_path = './flight_delay_forecast.png'
    plt.savefig(forecast_path, dpi=300, bbox_inches='tight')
    plt.close()
    
    return forecast_path


In [12]:
def generate_hindcast_dashboard(fs, monitor_fg, flights_fg, today):
    """Generate hindcast comparing predictions with actual outcomes."""    
    # Fetch historical predictions (from past 30 days)
    past_date = (today - timedelta(days=30)).strftime('%Y-%m-%d')
    historical_predictions = monitor_fg.filter(
        monitor_fg.prediction_date >= past_date
    ).read()
    
    if len(historical_predictions) == 0:
        print("No historical predictions yet. Hindcast will be available after a few days.\n")
        return None
    
    actual_outcomes = flights_fg.filter(
        flights_fg.scheduled_time >= past_date
    ).read()[['flight_id', 'is_delayed', 'delay_minutes']]
    
    hindcast_df = historical_predictions.merge(
        actual_outcomes, on='flight_id', how='inner'
    )
    
    if len(hindcast_df) == 0:
        print("No matched predictions and outcomes yet.\n")
        return None
    
    print(f"Hindcast data: {len(hindcast_df)} flights with both predictions and outcomes")
    
    accuracy = accuracy_score(hindcast_df['is_delayed'], hindcast_df['predicted_delayed'])
    precision = precision_score(hindcast_df['is_delayed'], hindcast_df['predicted_delayed'])
    recall = recall_score(hindcast_df['is_delayed'], hindcast_df['predicted_delayed'])
    
    # Create confusion matrix
    cm = confusion_matrix(hindcast_df['is_delayed'], hindcast_df['predicted_delayed'])
    
    # Plot hindcast dashboard
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    
    # Plot 1: Confusion Matrix
    ax1 = axes[0]
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
                xticklabels=['On-Time', 'Delayed'],
                yticklabels=['On-Time', 'Delayed'])
    ax1.set_ylabel('Actual', fontsize=12)
    ax1.set_xlabel('Predicted', fontsize=12)
    ax1.set_title('Confusion Matrix - Model Performance', fontsize=14, fontweight='bold')
    
    # Plot 2: Metrics
    ax2 = axes[1]
    metrics = ['Accuracy', 'Precision', 'Recall']
    values = [accuracy, precision, recall]
    colors = ['#2ecc71', '#3498db', '#e74c3c']
    
    bars = ax2.barh(metrics, values, color=colors, alpha=0.7)
    ax2.set_xlim([0, 1])
    ax2.set_xlabel('Score', fontsize=12)
    ax2.set_title('Model Performance Metrics', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3, axis='x')
    
    # Add value labels
    for i, (bar, value) in enumerate(zip(bars, values)):
        ax2.text(value + 0.02, i, f'{value:.2%}', va='center', fontsize=11, fontweight='bold')
    
    plt.tight_layout()
    hindcast_path = './flight_delay_hindcast.png'
    plt.savefig(hindcast_path, dpi=300, bbox_inches='tight')
    plt.close()
    
    print(f"\nModel Performance Summary:")
    print(f"   Accuracy:  {accuracy:.2%}")
    print(f"   Precision: {precision:.2%}")
    print(f"   Recall:    {recall:.2%}\n")
    
    return hindcast_path


In [13]:
def upload_dashboards(project, forecast_path, hindcast_path, today):
    dataset_api = project.get_dataset_api()
    str_today = today.strftime("%Y-%m-%d")
    
    if dataset_api.exists("Resources/flight_delays") == False:
        dataset_api.mkdir("Resources/flight_delays")
    
    dataset_api.upload(
        forecast_path, 
        f"Resources/flight_delays/forecast_{str_today}.png", 
        overwrite=True
    )
    
    if hindcast_path:
        dataset_api.upload(
            hindcast_path, 
            f"Resources/flight_delays/hindcast_{str_today}.png", 
            overwrite=True
        )
    
    print(f"Dashboards uploaded successfully!")

In [32]:
def main():
    # Define prediction window
    today = datetime.now()
    forecast_days = 7
    end_date = today + timedelta(days=forecast_days)
    
    # 1. Connect to Hopsworks
    project, fs, mr = hopsworks_connection()
    
    # 2. Download model
    model_pipeline, metadata = download_model(mr)
    
    # 3. Fetch batch data
    df_flights, df_temporal, df_weather = fetch_batch_data(fs, today, end_date)
    
    # 4. Engineer features
    batch_data = engineer_features(df_flights, df_temporal, df_weather)
    
    # 5. Make predictions
    batch_data = make_predictions(model_pipeline, metadata, batch_data)
    
    # 6. Save predictions
    monitor_fg = save_predictions(fs, batch_data, today)
    
    # 7. Generate forecast dashboard
    forecast_path = generate_forecast_dashboard(batch_data, today)
    
    # 8. Generate hindcast dashboard
    flights_fg = fs.get_feature_group('flight_schedules', version=1)
    hindcast_path = generate_hindcast_dashboard(fs, monitor_fg, flights_fg, today)
    
    # 9. Upload dashboards
    upload_dashboards(project, forecast_path, hindcast_path, today)

In [33]:
main()

2026-01-06 17:15:48,305 INFO: Closing external client and cleaning up certificates.
2026-01-06 17:15:48,309 INFO: Connection closed.
2026-01-06 17:15:48,312 INFO: Initializing external client
2026-01-06 17:15:48,313 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-06 17:15:49,228 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3207


Downloading: 100.000%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 578249/578249 elapsed<00:00 remaining<00:00


Downloading model artifact (0 dirs, 1 files)... 

Downloading: 100.000%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 916/916 elapsed<00:00 remaining<00:00


Training Accuracy: 0.593ct (0 dirs, 2 files)... DONE

ROC-AUC score: 0.573

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.68s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.23s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.27s) 
Merged data shape: (3568, 52)

Flights predicted as delayed: 1871 / 3568
Average delay probability: 50.97%

Top 10 flights at risk of delay:
     flight_number       scheduled_time  ... delay_probability predicted_delayed
497          SK535  2026-01-06 07:55:00  ...          0.842883                 1
105          FR881  2026-01-06 08:25:00  ...          0.836727                 1
659          JU382  2026-01-07 18:50:00  ...          0.833674                 1
1807         JU380  2026-01-06 08:10:00  ...          0.832592                 1
3194        BLX165  2026-01-06 06:35:00  ...          0.824489                 1
358         FR4616  2026-01-06 07:00:00



Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/3207/fs/3151/fg/3347


Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████████████████| Rows 3568/3568 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: flight_delay_predictions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3207/jobs/named/flight_delay_predictions_1_offline_fg_materialization/executions
2026-01-06 17:16:15,455 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-06 17:16:21,679 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-06 17:18:51,110 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2026-01-06 17:18:51,213 INFO: Waiting for log aggregation to finish.
2026-01-06 17:18:59,600 INFO: Execution finished successfully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.46s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.34s) 
Hindcast data: 3568 flights with both predictions and outcomes

Model Performance Summary:
   Accuracy:

Uploading /Users/unilangsachin/Desktop/ID2223-Scalable_ML/Flight-Delay-Tracker/./flight_delay_forecast.png: 100.000%|█████████████| 283667/283667 elapsed<00:03 remaining<00:00
Uploading /Users/unilangsachin/Desktop/ID2223-Scalable_ML/Flight-Delay-Tracker/./flight_delay_hindcast.png: 100.000%|█████████████| 156742/156742 elapsed<00:02 remaining<00:00

Dashboards uploaded successfully!



