In [9]:
import hopsworks
import pandas as pd
from config import hopsworks_api_key
from datetime import datetime, timedelta
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import joblib
import json
import os

In [10]:
project = hopsworks.login(api_key_value=hopsworks_api_key, host="eu-west.cloud.hopsworks.ai")
fs = project.get_feature_store()

2026-01-03 20:36:07,948 INFO: Closing external client and cleaning up certificates.
2026-01-03 20:36:07,966 INFO: Connection closed.
2026-01-03 20:36:07,968 INFO: Initializing external client
2026-01-03 20:36:07,969 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-03 20:36:09,125 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3207


In [11]:
flights_fg = fs.get_feature_group('flight_schedules', version=1)
temporal_fg = fs.get_feature_group('temporal_features', version=1)
weather_fg = fs.get_feature_group('weather_features', version=1)

df_flights = flights_fg.read()
df_temporal = temporal_fg.read()
df_weather = weather_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.58s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.18s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.10s) 


In [12]:
df_flights['date'] = pd.to_datetime(df_flights['scheduled_time']).dt.date.astype(str)

df = df_flights.merge(
    df_temporal, left_on='date', right_on='date', how='left', suffixes=('', '_temporal')
)

df['scheduled_hour'] = pd.to_datetime(df['scheduled_time']).dt.floor('H')
df_weather['weather_hour'] = pd.to_datetime(df_weather['timestamp']).dt.floor('H')

df = df.merge(
    df_weather, left_on=['arn_airport_role', 'scheduled_hour'], right_on=['airport_code', 'weather_hour'], 
    how='left', suffixes=('', '_weather')
)

In [13]:
np.random.seed(42)
delay_prob = np.full(len(df), 0.15)

if 'weather_condition' in df.columns:
    weather_mult = {'clear': 0.5, 'fog': 2.5, 'rain': 1.5, 'rain_windy': 3.0, 'snow': 4.0, 'windy': 2.0}
    for condition, mult in weather_mult.items():
        mask = df['weather_condition'] == condition
        delay_prob[mask] *= mult

if 'wind_speed' in df.columns:
    delay_prob[df['wind_speed'] > 15] *= 2.0

if 'visibility' in df.columns:
    delay_prob[df['visibility'] < 5] *= 2.5

if 'temperature' in df.columns:
    delay_prob[df['temperature'] < 0] *= 1.8

if 'route_type' in df.columns:
    delay_prob[df['route_type'] == 'international'] *= 1.4

if 'is_peak_travel' in df.columns:
    delay_prob[df['is_peak_travel'] == True] *= 1.6

if 'scheduled_time' in df.columns:
    hour = pd.to_datetime(df['scheduled_time']).dt.hour
    rush = ((hour >= 6) & (hour <= 9)) | ((hour >= 17) & (hour <= 20))
    delay_prob[rush] *= 1.3

if 'is_weekend' in df.columns:
    delay_prob[df['is_weekend'] == True] *= 0.7

delay_prob = np.minimum(delay_prob, 0.95)

is_delayed = np.random.random(len(df)) < delay_prob
delay_minutes = np.where(
    is_delayed,
    np.random.exponential(scale=30, size=len(df)) + 15,
    np.random.normal(loc=0, scale=5, size=len(df))
)
delay_minutes = np.maximum(delay_minutes, 0)

df['is_delayed_synthetic'] = is_delayed
df['delay_minutes_synthetic'] = delay_minutes


In [15]:
df['hour'] = pd.to_datetime(df['scheduled_time']).dt.hour
df['day_of_week'] = pd.to_datetime(df['scheduled_time']).dt.dayofweek
df['month'] = pd.to_datetime(df['scheduled_time']).dt.month

df['time_of_day'] = pd.cut(df['hour'], 
                            bins=[0, 6, 12, 18, 24], 
                            labels=['night', 'morning', 'afternoon', 'evening'],
                            include_lowest=True)

weather_weights = {'clear': 0, 'fog': 2, 'rain': 1, 'rain_windy': 3, 'snow': 4, 'windy': 2}
df['weather_impact'] = df['weather_condition'].map(weather_weights).fillna(0)

df['high_wind'] = (df['wind_speed'] > 15).astype(int)
df['low_visibility'] = (df['visibility'] < 5).astype(int)
df['peak_international'] = (df['is_peak_travel'] & (df['route_type'] == 'international')).astype(int)

categorical_features = [
    'airline_code', 'flight_direction', 'origin_airport', 'destination_airport',
    'route_type', 'terminal', 'di_indicator', 'time_of_day', 'season',
    'flight_status', 'weather_condition'
]

numerical_features = [
    'hour', 'day_of_week', 'month', 'year', 'day',
    'is_weekend', 'is_holiday', 'is_school_break', 'is_peak_travel',
    'is_sportlov', 'is_summer_break', 'is_christmas_break',
    'temperature', 'wind_speed', 'humidity', 'pressure',
    'visibility', 'cloud_cover', 'precipitation',
    'weather_impact', 'high_wind', 'low_visibility', 'peak_international'
]

categorical_features = [f for f in categorical_features if f in df.columns]
numerical_features = [f for f in numerical_features if f in df.columns]

# Convert boolean to int
for col in ['is_weekend', 'is_holiday', 'is_school_break', 'is_peak_travel', 
            'is_sportlov', 'is_summer_break', 'is_christmas_break']:
    if col in df.columns:
        df[col] = df[col].fillna(False).astype(int)

In [16]:
all_features = categorical_features + numerical_features
X = df[all_features].copy()
y = df['is_delayed_synthetic']

for col in numerical_features:
    if col in X.columns:
        X[col] = X[col].fillna(X[col].median())

for col in categorical_features:
    if col in X.columns:
        X[col] = X[col].fillna(X[col].mode()[0] if len(X[col].mode()) > 0 else 'UNKNOWN')




In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Build pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        scale_pos_weight=(~y_train).sum() / y_train.sum()
    ))
])

import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)

model_pipeline.fit(X_train, y_train)

# Evaluate
y_pred = model_pipeline.predict(X_test)
y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy:.3f}")
print(f"ROC-AUC: {roc_auc:.3f}\n")
print(classification_report(y_test, y_pred, target_names=['On-Time', 'Delayed']))

# Optimize decision threshold
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
best_threshold_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_idx]

y_pred_optimized = (y_pred_proba >= best_threshold).astype(int)

accuracy_opt = accuracy_score(y_test, y_pred_optimized)

print(f"\nOptimized Threshold: {best_threshold:.3f}")
print(f"Optimized Accuracy: {accuracy_opt:.3f}")
print(f"ROC-AUC: {roc_auc:.3f}\n")
print(classification_report(y_test, y_pred_optimized, target_names=['On-Time', 'Delayed']))

# Feature importance
feature_names = (numerical_features + 
                list(model_pipeline.named_steps['preprocessor']
                     .named_transformers_['cat']
                     .get_feature_names_out(categorical_features)))

importances = model_pipeline.named_steps['classifier'].feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)

print(f"\nFive Most Important Features:")
print(feature_importance_df.head(5).to_string(index=False))

# Save model
model_dir = "flight_delay_model"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(model_pipeline, f"{model_dir}/model.pkl")

metadata = {
    'categorical_features': categorical_features,
    'numerical_features': numerical_features,
    'model_type': 'XGBoost',
    'training_date': datetime.now().isoformat(),
    'accuracy': float(accuracy),
    'roc_auc': float(roc_auc),
    'training_samples': len(X_train),
    'test_samples': len(X_test)
}

with open(f"{model_dir}/metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"\nModel saved to {model_dir}/")

Accuracy: 0.593
ROC-AUC: 0.573

              precision    recall  f1-score   support

     On-Time       0.82      0.62      0.71      7803
     Delayed       0.25      0.49      0.33      2010

    accuracy                           0.59      9813
   macro avg       0.54      0.55      0.52      9813
weighted avg       0.71      0.59      0.63      9813


Optimized Threshold: 0.398
Optimized Accuracy: 0.383
ROC-AUC: 0.573

              precision    recall  f1-score   support

     On-Time       0.85      0.27      0.41      7803
     Delayed       0.22      0.82      0.35      2010

    accuracy                           0.38      9813
   macro avg       0.54      0.54      0.38      9813
weighted avg       0.72      0.38      0.40      9813


Five Most Important Features:
                 feature  importance
      peak_international    0.032850
route_type_international    0.026864
       route_type_nordic    0.015860
          is_peak_travel    0.010170
     route_type_domestic    

In [22]:
mr = project.get_model_registry()

flight_delay_model = mr.python.create_model(
    name="flight_delay_predictor",
    description="XGBoost model predicting flight delays at Arlanda Airport",
    metrics={
        "accuracy": float(accuracy),
        "accuracy_optimized": float(accuracy_opt),
        "roc_auc": float(roc_auc),
        "best_threshold": float(best_threshold)
    }
)

flight_delay_model.save(model_dir)

Uploading model files (0 dirs, 0 files):  17%|████████████████▌                                                                                  | 1/6 [00:00<00:04,  1.13it/s]
Uploading /Users/unilangsachin/Desktop/ID2223-Scalable_ML/Flight-Delay-Tracker/flight_delay_model/metadata.json: 0.000%|                      | 0/916 elapsed<00:00 remaining<?[A
Uploading /Users/unilangsachin/Desktop/ID2223-Scalable_ML/Flight-Delay-Tracker/flight_delay_model/metadata.json: 100.000%|██████████████| 916/916 elapsed<00:02 remaining<00:00[A
Uploading model files (0 dirs, 1 files):  17%|████████████████▌                                                                                  | 1/6 [00:03<00:04,  1.13it/s]
Uploading /Users/unilangsachin/Desktop/ID2223-Scalable_ML/Flight-Delay-Tracker/flight_delay_model/model.pkl: 0.000%|                       | 0/578249 elapsed<00:00 remaining<?[A
Uploading /Users/unilangsachin/Desktop/ID2223-Scalable_ML/Flight-Delay-Tracker/flight_delay_model/model.pkl: 10

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3207/models/flight_delay_predictor/1





Model(name: 'flight_delay_predictor', version: 1)