In [2]:
!pip install xgboost --break-system-packages

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.2


In [21]:
import pandas as pd
import numpy as np
import glob
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 1: Load Data
all_files = glob.glob("data/*.csv")
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
print(df.columns)

Index(['FlightDate', 'Reporting_Airline', 'OriginAirportSeqID', 'OriginState',
       'DepDelayMinutes', 'Cancelled', 'CancellationCode', 'Diverted',
       'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay'],
      dtype='object')


In [18]:
# Step 2: Preprocessing
# Fill missing delay columns with 0
delay_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
df[delay_columns] = df[delay_columns].fillna(0)

# Convert FlightDate to datetime and extract features
df['FlightDate'] = pd.to_datetime(df['FlightDate'])
df['Day'] = df['FlightDate'].dt.day
df['Month'] = df['FlightDate'].dt.month
df['Year'] = df['FlightDate'].dt.year

# One-hot encoding for categorical features
df = pd.get_dummies(df, columns=['Reporting_Airline', 'OriginState'])

# Define the target variable and features
X = df.drop(columns=['DepDelayMinutes', 'FlightDate', 'Cancelled', 'Diverted', 'CancellationCode'])
y = df['DepDelayMinutes']

KeyError: "None of [Index(['Reporting_Airline', 'OriginState'], dtype='object')] are in the [columns]"

In [19]:
# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Step 4: Model Training
# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# XGBoost Regressor
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

ValueError: Input y contains NaN.

In [None]:
# Step 5: Evaluation
# Calculate MAE and RMSE for both models
rf_mae = mean_absolute_error(y_test, rf_preds)
rf_rmse = mean_squared_error(y_test, rf_preds, squared=False)

xgb_mae = mean_absolute_error(y_test, xgb_preds)
xgb_rmse = mean_squared_error(y_test, xgb_preds, squared=False)

print(f"Random Forest MAE: {rf_mae}, RMSE: {rf_rmse}")
print(f"XGBoost MAE: {xgb_mae}, RMSE: {xgb_rmse}")

# Feature Importance for XGBoost
importances = xgb_model.feature_importances_
indices = np.argsort(importances)[-10:]  # top 10 features
plt.figure(figsize=(10, 6))
plt.barh(range(len(indices)), importances[indices], align="center")
plt.yticks(range(len(indices)), [X.columns[i] for i in indices])
plt.xlabel("Feature Importance")
plt.title("Top 10 Important Features for Delays Prediction")
plt.show()

In [None]:
# Step 6: Visualization
# Scatter plot for WeatherDelay vs DepDelayMinutes
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['WeatherDelay'], y=df['DepDelayMinutes'])
plt.xlabel('Weather Delay (minutes)')
plt.ylabel('Departure Delay (minutes)')
plt.title('Weather Delay vs. Departure Delay')
plt.show()