In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report

from xgboost import XGBClassifier

import joblib


In [5]:
df = pd.read_csv("Machine-Learning.csv", low_memory=False)

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

df = df.drop_duplicates()

print(df.shape)
df.head()


(102956, 28)


Unnamed: 0,incident_date,incident_year,incident_month,incident_day,operator,aircraft,aircraft_type,aircraft_make,aircraft_model,aircraft_mass,...,timeofday,precipitation,height,distance,species_name,flight_impact,aircraft_damage,engine_ingested,visibility,season
0,2000-01-01,2000,1,1,UNITED AIRLINES,B-737-300,A,148,24.0,4.0,...,,,,,UNKNOWN MEDIUM BIRD,,No,0,Clear,Winter
1,2000-01-01,2000,1,1,AMERICAN AIRLINES,B-727-200,A,148,11.0,4.0,...,DAWN,NONE,0.0,0.0,UNKNOWN SMALL BIRD,NONE,No,0,Clear,Winter
2,2000-01-01,2000,1,1,CONTINENTAL AIRLINES,B-757-200,A,148,26.0,4.0,...,DAY,FOG,0.0,0.0,UNKNOWN MEDIUM BIRD,,No,0,Poor,Winter
3,2000-01-01,2000,1,1,US CUSTOMS AND BORDER PROTECTION,C-550,A,226,37.0,3.0,...,DAY,,1000.0,,UNKNOWN LARGE BIRD,PRECAUTIONARY LANDING,Yes,0,Clear,Winter
4,2000-01-01,2000,1,1,UNITED AIRLINES,B-727-200,A,148,11.0,4.0,...,,,,,UNKNOWN MEDIUM BIRD,,No,0,Clear,Winter


In [6]:

df["visibility"] = df["visibility"].astype(str)
df["airport"] = df["airport"].astype(str)
df["timeofday"] = df["timeofday"].astype(str)


In [7]:

df["aircraft_damage"] = df["aircraft_damage"].map({"Yes": 1, "No": 0})

df = df.dropna(subset=["aircraft_damage"])

features = [
    "incident_year",
    "incident_month",
    "timeofday",
    "airport",
    "visibility"
]

num_features = ["incident_year", "incident_month"]
cat_features = ["timeofday", "airport", "visibility"]

X = df[features]
y = df["aircraft_damage"]


In [8]:
preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num_features),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_features)
])


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [10]:
xgb_birdstrike = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42
)

pipe_birdstrike = Pipeline([
    ("preprocessor", preprocessor),
    ("model", xgb_birdstrike)
])

param_grid_birdstrike = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [4, 6],
    "model__learning_rate": [0.05, 0.1],
    "model__subsample": [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0]
}

grid_birdstrike = GridSearchCV(
    pipe_birdstrike,
    param_grid_birdstrike,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2
)

grid_birdstrike.fit(X_train, y_train)

best_birdstrike_model = grid_birdstrike.best_estimator_


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=0.8; total time=   0.9s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=1.0; total time=   0.9s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=0.8; total time=   1.0s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=0.8; total time=   1.1s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=1.0; total time=   0.9s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=1.0; total time=   0.9s
[CV] END model__colsample_bytree=

In [11]:
y_prob = best_birdstrike_model.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


ROC-AUC: 0.6878794993185425


In [12]:
from sklearn.metrics import roc_auc_score, log_loss, root_mean_squared_error

# Predict probabilities
y_test_prob = best_birdstrike_model.predict_proba(X_test)[:, 1]

# Metrics
roc_auc = roc_auc_score(y_test, y_test_prob)
logloss = log_loss(y_test, y_test_prob)
rmse = root_mean_squared_error(y_test, y_test_prob)

print("ðŸ“Š Birdstrike Probability Model Evaluation")
print(f"ROC-AUC : {roc_auc:.4f}")
print(f"LogLoss : {logloss:.4f}")
print(f"RMSE    : {rmse:.4f}")


ðŸ“Š Birdstrike Probability Model Evaluation
ROC-AUC : 0.6879
LogLoss : 0.2918
RMSE    : 0.2850


In [13]:
strike_df = df[df["aircraft_damage"] == 1]


In [14]:
from sklearn.preprocessing import LabelEncoder

# Drop missing values
species_df = strike_df.dropna(subset=features + ["species_name"]).copy()
species_df["species_name"] = species_df["species_name"].astype(str)

# Keep species with at least 20 occurrences
MIN_COUNT = 20
species_counts = species_df["species_name"].value_counts()

species_df["species_name_grouped"] = species_df["species_name"].apply(
    lambda x: x if species_counts[x] >= MIN_COUNT else "Other"
)


In [15]:
species_encoder = LabelEncoder()

X_species = species_df[features]
y_species = species_encoder.fit_transform(species_df["species_name_grouped"])


In [16]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_species,
    y_species,
    test_size=0.2,
    random_state=42,
    stratify=y_species
)


In [17]:
xgb_species = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=42,
    num_class=len(np.unique(y_species))
)

pipe_species = Pipeline([
    ("preprocessor", preprocessor),
    ("model", xgb_species)
])

param_grid_species = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [4, 6],
    "model__learning_rate": [0.05, 0.1]
}

grid_species = GridSearchCV(
    pipe_species,
    param_grid_species,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

grid_species.fit(X_train_s, y_train_s)

best_species_model = grid_species.best_estimator_


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200; total time=   6.8s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200; total time=   7.0s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200; total time=   7.1s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300; total time=   9.8s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300; total time=  10.2s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300; total time=  10.3s
[CV] END model__learning_rate=0.05, model__max_depth=6, model__n_estimators=200; total time=  10.7s
[CV] END model__learning_rate=0.05, model__max_depth=6, model__n_estimators=200; total time=  10.8s
[CV] END model__learning_rate=0.1, model__max_depth=4, model__n_estimators=200; total time=   6.5s
[CV] END model__learning_rate=0.1, model_

In [18]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Predictions
y_pred_species = best_species_model.predict(X_test_s)

print("ðŸ“Š Bird Species Model Evaluation")
print("Accuracy:", accuracy_score(y_test_s, y_pred_species))
print("F1 Score (weighted):", f1_score(y_test_s, y_pred_species, average="weighted"))

print("\nClassification Report:")
print(classification_report(
    y_test_s,
    y_pred_species,
    target_names=species_encoder.classes_
))


ðŸ“Š Bird Species Model Evaluation
Accuracy: 0.29835390946502055
F1 Score (weighted): 0.22130180561549656

Classification Report:
                             precision    recall  f1-score   support

              AMERICAN COOT       0.14      0.10      0.12        10
             AMERICAN ROBIN       0.00      0.00      0.00        12
            AMERICAN WIGEON       0.00      0.00      0.00         5
                 BALD EAGLE       0.50      0.07      0.12        14
                   BARN OWL       0.00      0.00      0.00         6
              BLACK VULTURE       0.00      0.00      0.00        19
                  BLACKBIRD       0.00      0.00      0.00         6
           BLUE-WINGED TEAL       0.00      0.00      0.00         4
               CANADA GOOSE       0.18      0.04      0.07        94
               CATTLE EGRET       0.00      0.00      0.00         6
                     COYOTE       0.00      0.00      0.00         7
   DOUBLE-CRESTED CORMORANT       0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
phase_df = strike_df.dropna(subset=features + ["flight_phase"]).copy()
phase_df["flight_phase"] = phase_df["flight_phase"].astype(str)

from sklearn.preprocessing import LabelEncoder
phase_encoder = LabelEncoder()

X_phase = phase_df[features]
y_phase = phase_encoder.fit_transform(phase_df["flight_phase"])



In [20]:



phase_df = strike_df.dropna(subset=features + ["flight_phase"]).copy()


phase_df["flight_phase"] = phase_df["flight_phase"].astype(str)


phase_encoder = LabelEncoder()

X_phase = phase_df[features]
y_phase = phase_encoder.fit_transform(phase_df["flight_phase"])


X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_phase,
    y_phase,
    test_size=0.2,
    random_state=42,
    stratify=y_phase
)


xgb_phase = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=42,
    num_class=len(np.unique(y_phase))
)


pipe_phase = Pipeline([
    ("preprocessor", preprocessor),
    ("model", xgb_phase)
])


param_grid_phase = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [4, 6],
    "model__learning_rate": [0.05, 0.1]
}

grid_phase = GridSearchCV(
    pipe_phase,
    param_grid_phase,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)


grid_phase.fit(X_train_p, y_train_p)


best_phase_model = grid_phase.best_estimator_


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200; total time=   1.5s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200; total time=   1.5s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200; total time=   1.6s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300; total time=   2.3s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300; total time=   2.3s
[CV] END model__learning_rate=0.05, model__max_depth=6, model__n_estimators=200; total time=   2.3s
[CV] END model__learning_rate=0.05, model__max_depth=6, model__n_estimators=200; total time=   2.4s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300; total time=   2.4s
[CV] END model__learning_rate=0.1, model__max_depth=4, model__n_estimators=200; total time=   1.4s
[CV] END model__learning_rate=0.05, model

In [21]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Predictions
y_pred_phase = best_phase_model.predict(X_test_p)

print("ðŸ“Š Flight Phase Model Evaluation")
print("Accuracy:", accuracy_score(y_test_p, y_pred_phase))
print("F1 Score (weighted):", f1_score(y_test_p, y_pred_phase, average="weighted"))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_p, y_pred_phase))


ðŸ“Š Flight Phase Model Evaluation
Accuracy: 0.4961439588688946
F1 Score (weighted): 0.4038531627214481

Confusion Matrix:
[[586   0  34   0   1   4   0   2   0   5   0   9   0]
 [  1   0   0   0   0   0   0   0   0   0   0   0   0]
 [348   0  46   0   0   0   0   1   0   1   0  15   0]
 [  2   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 72   0   3   0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0 120   0   0   0  26   0   0   0]
 [  5   0   0   0   0   0   0   0   0   0   0   0   0]
 [169   0   9   0   0   1   0   1   0   0   0   1   0]
 [  1   0   1   0   0   0   0   0   0   0   0   0   0]
 [ 11   0   2   0   0  38   0   1   0 199   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0   0   0   0]
 [190   0  18   0   1   1   0   0   0   2   0  13   0]
 [  3   0   0   0   0   0   0   0   0   0   0   0   0]]


In [22]:
pd.Series(phase_encoder.classes_)


0         APPROACH
1          ARRIVAL
2            CLIMB
3        DEPARTURE
4          DESCENT
5         EN ROUTE
6          LANDING
7     LANDING ROLL
8            LOCAL
9              NAN
10          PARKED
11     TAKEOFF RUN
12            TAXI
dtype: object

In [23]:
joblib.dump(best_birdstrike_model, "xgb_birdstrike_model.pkl")
joblib.dump(best_species_model, "xgb_species_model.pkl")
joblib.dump(best_phase_model, "xgb_phase_model.pkl")


['xgb_phase_model.pkl']

In [24]:
def create_user_input(date, timeofday, airport, visibility):
    date = pd.to_datetime(date)
    return pd.DataFrame([{
        "incident_year": date.year,
        "incident_month": date.month,
        "timeofday": timeofday,
        "airport": airport,
        "visibility": visibility
    }])


In [25]:
def compare_airports(date, timeofday, dep_airport, arr_airport, visibility):
    dep_input = create_user_input(date, timeofday, dep_airport, visibility)
    arr_input = create_user_input(date, timeofday, arr_airport, visibility)

    dep_prob = best_birdstrike_model.predict_proba(dep_input)[:, 1][0]
    arr_prob = best_birdstrike_model.predict_proba(arr_input)[:, 1][0]

    return {
        "departure_airport_risk": round(dep_prob, 3),
        "arrival_airport_risk": round(arr_prob, 3),
        "higher_risk_airport": dep_airport if dep_prob > arr_prob else arr_airport
    }


In [26]:
def birdstrike_prediction_system(
    date,
    timeofday,
    departure_airport,
    arrival_airport,
    visibility
):
   
    user_input = create_user_input(
        date, timeofday, departure_airport, visibility
    )

   
    strike_prob = best_birdstrike_model.predict_proba(user_input)[:, 1][0]

    
    species_encoded = best_species_model.predict(user_input)[0]
    species = species_encoder.inverse_transform([species_encoded])[0]

   
    phase_encoded = best_phase_model.predict(user_input)[0]
    phase = phase_encoder.inverse_transform([phase_encoded])[0]

    
    airport_risk = compare_airports(
        date, timeofday, departure_airport, arrival_airport, visibility
    )

    return {
        "birdstrike_probability": round(float(strike_prob), 3),
        "most_likely_bird_species": species,
        "most_likely_flight_phase": phase,
        "airport_risk_analysis": airport_risk
    }



In [27]:
birdstrike_prediction_system(
    date="2010-07-15",
    timeofday="Day",
    departure_airport="JFK Intl",
    arrival_airport="LAX Intl",
    visibility="Clear"
)


{'birdstrike_probability': 0.071,
 'most_likely_bird_species': 'Other',
 'most_likely_flight_phase': 'APPROACH',
 'airport_risk_analysis': {'departure_airport_risk': np.float32(0.071),
  'arrival_airport_risk': np.float32(0.071),
  'higher_risk_airport': 'LAX Intl'}}

In [28]:
birdstrike_prediction_system(
    date="2012-01-15",
    timeofday="Night",
    departure_airport="DEN Intl",
    arrival_airport="PHX Intl",
    visibility="Clear"
)


{'birdstrike_probability': 0.157,
 'most_likely_bird_species': 'Other',
 'most_likely_flight_phase': 'APPROACH',
 'airport_risk_analysis': {'departure_airport_risk': np.float32(0.157),
  'arrival_airport_risk': np.float32(0.157),
  'higher_risk_airport': 'PHX Intl'}}

In [29]:
import joblib

joblib.dump(species_encoder, "models/species_encoder.pkl")
joblib.dump(phase_encoder, "models/phase_encoder.pkl")

['models/phase_encoder.pkl']