# **Machine Learning Training**


We recommend using Google Colab for this section because it provides free access to additional RAM and GPUs, making it ideal for handling memory-intensive machine learning tasks without the need for powerful local hardware.

#### Import necessary libraries

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from google.colab import drive

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, RobustScaler


#### Mount Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# force_remount=True asks Colab to mount again even if that path is already mounted
# (useful when re-running this cell in the same session).
drive.mount('/content/drive', force_remount=True)

#### Read file and divide it into a train and test dataset

`file_path` below should point to the CSV file containing the DataFrame that has already been preprocessed for machine learning; it is loaded into `df`.

In [None]:
# Placeholder location of the ML-ready CSV -- replace with the path to your own file
file_path = 'yourcsv.csv'

# Load the complete dataset into memory
df = pd.read_csv(file_path)

# Separate the binary departure-delay target (y) from the feature columns (X)
y = df['departure_delay_binary_FA']
X = df.drop(columns=[y.name])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both subsets, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#### Drop features

In [5]:

# Columns removed from the feature matrices before modelling, grouped by the reason
# they are excluded. Every entry needs its own trailing comma: two adjacent string
# literals without a comma are silently concatenated into one (wrong) column name.
drop_features = [
    # --- Arrival-related features ---
    'arrival_delay_binary_FA',
    'flight_rules_arrival',
    'visibility_meters_arrival',
    'clouds_layer_1_type_arrival',
    'clouds_layer_1_altitude_category_arrival',
    'clouds_layer_2_type_arrival',
    'clouds_layer_2_altitude_category_arrival',
    'clouds_layer_3_type_arrival',
    'clouds_layer_3_altitude_category_arrival',
    'clouds_layer_4_type_arrival',
    'clouds_layer_4_altitude_category_arrival',
    'clouds_layer_5_type_arrival',
    'clouds_layer_5_altitude_category_arrival',
    'clouds_layer_6_type_arrival',
    'clouds_layer_6_altitude_category_arrival',
    'destination_region',
    'destination.code_icao',
    'relative_humidity_arrival',
    'dewpoint_arrival',
    'temperature_arrival',
    'wind_speed_arrival',
    'altimeter_hpa_arrival',
    'pressure_altitude_arrival',
    'density_altitude_arrival',
    'wind_gust_arrival',
    'remarks_info.sea_level_pressure_arrival',
    'wind_variable_change_arrival',
    'wx_code_blowing_dust_arrival',
    'wx_code_blowing_snow_arrival',
    'wx_code_fog_arrival',
    'wx_code_funnel_cloud_arrival',
    'wx_code_hail_arrival',
    'wx_code_haze_arrival',
    'wx_code_heavy_rain_arrival',
    'wx_code_heavy_snow_arrival',
    'wx_code_light_fog_arrival',
    'wx_code_light_hail_arrival',
    'wx_code_light_rain_arrival',
    'wx_code_light_snow_arrival',
    'wx_code_rain_arrival',
    'wx_code_smoke_arrival',
    'wx_code_snow_arrival',
    'wx_code_thunderstorm_arrival',
    'wx_code_vicinity_fog_arrival',
    'wx_code_vicinity_showers_arrival',
    'destination_sub_region',

    # --- Unprocessed departure METAR ---
    'METAR_departure',

    # --- Redundant: already covered by another departure feature ---
    'density_altitude_departure',   # covered by 'pressure_altitude_departure'
    'visibility_meters_departure',  # covered by 'flight_rules_departure'
    'origin_region',                # covered by 'origin_sub_region'
    # the individual weather codes below are covered by 'wx_sum_departure'
    'wx_code_blowing_dust_departure',
    'wx_code_blowing_snow_departure',
    'wx_code_fog_departure',
    'wx_code_funnel_cloud_departure',
    'wx_code_hail_departure',
    'wx_code_haze_departure',
    'wx_code_heavy_rain_departure',
    'wx_code_heavy_snow_departure',
    'wx_code_light_fog_departure',
    'wx_code_light_hail_departure',
    'wx_code_light_rain_departure',
    'wx_code_light_snow_departure',
    'wx_code_rain_departure',
    'wx_code_smoke_departure',
    'wx_code_snow_departure',
    'wx_code_thunderstorm_departure',
    'wx_code_vicinity_fog_departure',
    'wx_code_vicinity_showers_departure',
    'wx_binary_departure',          # covered by 'wx_sum_departure'
    'flight_rules_departure',       # covered by 'LIFR_binary_departure'
    'flight_type',                  # covered by 'filed_ete'

    # --- Did not lead to better model performance ---
    'manufacturer',
    'clouds_layer_2_type_departure',
    'clouds_layer_2_altitude_category_departure',
    'clouds_layer_3_type_departure',
    'clouds_layer_3_altitude_category_departure',
    'clouds_layer_4_type_departure',
    'clouds_layer_4_altitude_category_departure',
    'clouds_layer_5_type_departure',
    'clouds_layer_5_altitude_category_departure',
    'clouds_layer_6_type_departure',
    'clouds_layer_6_altitude_category_departure',
    'remarks_info.sea_level_pressure_departure',
]

# Validate the names against the full, unmodified feature frame X so that a typo
# still fails loudly (as a KeyError, like DataFrame.drop would) with a readable message.
unknown_features = sorted(set(drop_features) - set(X.columns))
if unknown_features:
    raise KeyError(f"drop_features contains columns not present in the data: {unknown_features}")

# Reassign instead of dropping in place: X_train / X_test come out of train_test_split,
# and in-place edits on them can raise SettingWithCopy warnings. errors='ignore' lets this
# cell be re-run after the columns are already gone; typos were caught by the check above.
X_train = X_train.drop(columns=drop_features, errors='ignore')
X_test = X_test.drop(columns=drop_features, errors='ignore')

#### Define the data types of the features before feeding them into the pipeline

In [None]:
# Columns treated as categories (imputed with a constant and one-hot encoded by the pipeline)
categorical_features = [
    'operator_icao',
    'route_code',
    'aircraft_type',
    'origin.code_icao',
    'departure_time_of_day',
    'departure_month',
    'departure_weekday',
    'week_no',
    'origin_sub_region',
    'clouds_layer_1_type_departure',
    'clouds_layer_1_altitude_category_departure',
]

# Continuous columns (imputed, power-transformed and scaled by the pipeline)
numeric_features = [
    'filed_ete',
    'relative_humidity_departure',
    'dewpoint_departure',
    'temperature_departure',
    'pressure_altitude_departure',
    'wind_speed_departure',
    'wind_gust_departure',
    'altimeter_hpa_departure',
]

# Columns meant to reach the model untouched: the wind-variability flag, one column per
# pressure-tendency pattern, the weather-code sum and the LIFR flag.
passthrough_features = (
    ['wind_variable_change_departure']
    + [
        f'pressure_tendency_{tendency}_departure'
        for tendency in (
            'decreasing_or_steady_then_increasing',
            'decreasing_steadily_or_unsteadily',
            'decreasing_then_increasing',
            'decreasing_then_steady',
            'increasing_steadily_or_unsteadily',
            'increasing_then_decreasing',
            'increasing_then_steady',
            'steady',
            'steady_or_increasing_then_decreasing',
        )
    ]
    + ['wx_sum_departure', 'LIFR_binary_departure']
)

#### Define ML pipeline

In [None]:
# Numeric branch: impute -> reduce skew -> scale
numeric_transformer = Pipeline([
    # Fill each missing value from the 5 nearest neighbouring rows
    ("knn_imputer", KNNImputer(n_neighbors=5).set_output(transform="pandas")),

    # Yeo-Johnson power transform to correct skewed distributions
    ('power_transform', PowerTransformer(method='yeo-johnson')),

    # Scale with median / interquartile range so outliers have less influence
    ('robust_scaler', RobustScaler()),
])

# Categorical branch: impute -> one-hot encode
categorical_transformer = Pipeline([
    # Replace missing categories with the constant label 'Not Available'
    (
        "cat_imputer",
        SimpleImputer(strategy='constant', fill_value='Not Available').set_output(transform="pandas"),
    ),

    # Dense one-hot encoding; categories not seen during fit are ignored at transform time
    (
        "onehot",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore").set_output(transform="pandas"),
    ),
])

# Route each group of columns through its branch.
# remainder='passthrough' is NOT the default (the default is 'drop'): it keeps every
# column that is not listed in numeric_features / categorical_features and forwards
# it to the next step unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
).set_output(transform="pandas")  # return a pandas DataFrame instead of a NumPy array

#### Random Forest Classifier

In [6]:
# Hyperparameters selected for the Random Forest
best_params = {
    'n_estimators': 65,      # number of trees in the forest
    'max_features': 'log2',  # consider log2(n_features) candidate features per split
    'bootstrap': True,       # grow each tree on a sample drawn with replacement
}

# Seeded Random Forest (reproducible) configured with the hyperparameters above
rf = RandomForestClassifier(random_state=42, **best_params)

# Full training pipeline: preprocessing -> class balancing -> classifier
pipeline_rf_rus = ImbPipeline([
    # 1) imputation / encoding / scaling defined in the preprocessing cell
    ("pre_process", preprocessor),
    # 2) randomly undersample the majority class to balance the training data
    ("rus", RandomUnderSampler(random_state=42)),
    # 3) the configured Random Forest
    ("model", rf),
])

# Train every step of the pipeline on the training split
pipeline_rf_rus.fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = pipeline_rf_rus.predict(X_test)

# Class labels as learned by the fitted classifier (kept for evaluation / interpretation)
class_labels = pipeline_rf_rus.named_steps['model'].classes_

#### Print the results

In [None]:
# Human-readable captions for the confusion matrix:
# rows are the actual outcomes, columns are the model's predictions.
# NOTE(review): assumes the sorted class order is (not delayed, delayed) -- confirm against the target encoding.
class_labels = ['not delayed', 'delayed']
pred_labels = ['pred not delayed', 'pred delayed']

# Per-class precision, recall, F1-score and support on the test set
print(classification_report(y_test, y_pred))

# Confusion matrix wrapped in a labelled DataFrame; as the cell's last expression
# it is rendered by the notebook.
pd.DataFrame(
    data=confusion_matrix(y_test, y_pred),
    index=class_labels,
    columns=pred_labels,
)


#### Export the model

In [None]:
# Persist the whole fitted pipeline object (preprocessing, undersampler and classifier)
# to 'pipeline_rf_rus_model.pkl' so it can be reloaded later with joblib.load.
# NOTE(review): the path is relative to the current working directory, not the mounted
# Drive -- confirm this is where the file should live, since Colab runtime storage is temporary.
joblib.dump(pipeline_rf_rus, 'pipeline_rf_rus_model.pkl')

#### Export the results

The code uses the trained model (pipeline_rf_rus) to predict class probabilities for a test set (X_test), extracts the probability for the positive class, and creates a DataFrame (df_final) that includes the true labels, predicted labels, and predicted probabilities. It then merges additional data from another DataFrame (df), renames some columns for clarity, and saves the final DataFrame to a CSV file.

In [None]:
# Class probabilities for every test row: a 2D array with one column per class
probabilities = pipeline_rf_rus.predict_proba(X_test)

# Probability of the positive class, taken from the second column (index 1)
# NOTE(review): assumes a binary target whose positive class sorts last -- confirm.
positive_class_prob = probabilities[:, 1]

# Build the results table as a NEW frame. `pd.DataFrame(X_test)` does not make an
# independent copy, so adding columns to it could leak into X_test and break a
# re-run of predict_proba above; `assign` always returns a fresh DataFrame.
# y_test is aligned on the index; the two NumPy arrays are assigned positionally.
df_final = X_test.assign(
    true_label=y_test,
    y_pred=y_pred,
    predicted_prob_class_1=positive_class_prob,
)

# Re-attach reference columns that were removed from the features before training.
# Only columns not already present are pulled from the original frame `df`, matched
# on the row index. Previously only 'METAR_departure' was merged back, which made the
# rename of 'destination.code_icao' below a silent no-op.
reference_columns = ['METAR_departure', 'destination.code_icao']
missing_reference_columns = [col for col in reference_columns if col not in df_final.columns]
df_final = df_final.join(df.loc[df_final.index, missing_reference_columns])

# Mark the re-attached columns as reference-only with a '_ref' suffix
df_final = df_final.rename(columns={col: f'{col}_ref' for col in reference_columns})

# Save the table, keeping the original row index as the first CSV column.
# NOTE(review): '/df_final.csv' is the filesystem root of the runtime, not the mounted
# Drive -- confirm the destination; the file is lost when the Colab runtime is recycled.
df_final.to_csv('/df_final.csv', index=True)
