In [11]:
# Add these imports at the VERY TOP
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Load and preprocess data
df = pd.read_csv("opensky_departures.csv", 
                 names=[
                     'date_airline',
                     'flight_num',
                     'departure_time', 
                     'arrival_time',
                     'duration_aircraft'
                 ],
                 sep=r'\s{2,}',  # Split on 2+ whitespaces
                 engine='python')

# Clean whitespace and handle splits
df = df.astype(str)
df['duration_aircraft'] = df['duration_aircraft'].str.strip()

# Robust split using regex to capture duration and aircraft
split_df = df['duration_aircraft'].str.extract(r'^(\d+)\s+(.*)$', expand=True)
split_df.columns = ['duration', 'aircraft']

# Merge back into original DataFrame
df = pd.concat([df, split_df], axis=1)

# Split date_airline column
df[['date', 'airline']] = df['date_airline'].str.extract(r'^(\d{2}-\d{2}-\d{4})\s+(.*)$')

# Convert to proper datetime format
df['departure_time'] = pd.to_datetime(
    df['date'] + ' ' + df['departure_time'],
    format='%d-%m-%Y %H:%M'
)

df['arrival_time'] = pd.to_datetime(
    df['date'] + ' ' + df['arrival_time'],
    format='%d-%m-%Y %H:%M'
)

# Clean and transform data
df = df[['date', 'airline', 'flight_num', 
         'departure_time', 'arrival_time', 
         'duration', 'aircraft']]

# Convert duration to numeric and clean
df['duration'] = pd.to_numeric(df['duration'], errors='coerce')
df = df.dropna(subset=['duration'])

# Create features
df['hour'] = df['departure_time'].dt.hour
df['day_of_week'] = df['departure_time'].dt.dayofweek
df['month'] = df['departure_time'].dt.month

# Calculate delay and score
df['scheduled_duration'] = 250  # LAX-ORD typical flight time
df['delay'] = df['duration'] - df['scheduled_duration']
df['score'] = (1 - (df['delay'].clip(lower=0) / 120)) * 100
df.loc[df['delay'] < 0, 'score'] = 100

# Encode airline
le = LabelEncoder()
df['airline_encoded'] = le.fit_transform(df['airline'])

In [14]:
# Proceed with XGBoost training
features = ['airline_encoded', 'hour', 'day_of_week', 'month', 'duration']
X = df[features]
y = df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [2]:
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8
)

model.fit(X_train, y_train)

NameError: name 'xgb' is not defined