In [8]:
import pandas as pd

# Load the dataset to examine its structure
data_path = '../preprocessing/data_preprocessing.csv'
df = pd.read_csv(data_path)

# Display the first few rows of the dataset and its basic information
df.head(), df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81708 entries, 0 to 81707
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FlightDepDateTime      81708 non-null  object 
 1   Weather_Intensity      9399 non-null   object 
 2   Weather_Obscuration    5265 non-null   object 
 3   Weather_Precipitation  4087 non-null   object 
 4   Wind_Direction         80665 non-null  float64
 5   Wind_Gusts             15655 non-null  float64
 6   Wind_Speed             81701 non-null  float64
 7   Visibility             81700 non-null  float64
 8   isDelayed              81708 non-null  bool   
dtypes: bool(1), float64(4), object(4)
memory usage: 5.1+ MB


(     FlightDepDateTime Weather_Intensity Weather_Obscuration  \
 0  2020-01-01 04:57:00               NaN                 NaN   
 1  2020-01-01 04:57:00               NaN                 NaN   
 2  2020-01-01 04:57:00               NaN                 NaN   
 3  2020-01-01 04:57:00               NaN                 NaN   
 4  2020-01-01 04:59:00               NaN                 NaN   
 
   Weather_Precipitation  Wind_Direction  Wind_Gusts  Wind_Speed  Visibility  \
 0                   NaN           240.0         NaN        12.0        10.0   
 1                   NaN           240.0         NaN        12.0        10.0   
 2                   NaN           240.0         NaN        12.0        10.0   
 3                   NaN           240.0         NaN        12.0        10.0   
 4                   NaN           240.0         NaN        12.0        10.0   
 
    isDelayed  
 0      False  
 1      False  
 2      False  
 3      False  
 4      False  ,
 None)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Splitting "FlightDepDateTime" into individual components: Year, Month, Day, Hour, Minute
df['FlightDepDateTime'] = pd.to_datetime(df['FlightDepDateTime'], errors='coerce')

# Extracting individual components
df['Year'] = df['FlightDepDateTime'].dt.year
df['Month'] = df['FlightDepDateTime'].dt.month
df['Day'] = df['FlightDepDateTime'].dt.day
df['Hour'] = df['FlightDepDateTime'].dt.hour
df['Minute'] = df['FlightDepDateTime'].dt.minute

# Dropping the original "FlightDepDateTime" column as it's now split into multiple features
df_final = df.drop(columns=['FlightDepDateTime'])

# Handle missing values and preprocess data
# Separate features and target
X = df_final.drop(columns=['isDelayed'])  # Exclude datetime column
y = df_final['isDelayed'].astype(int)  # Encode target variable as 0 and 1

# Encode categorical features
cat_cols = X.select_dtypes(include=['object']).columns
X[cat_cols] = X[cat_cols].fillna('None')  # Fill missing categorical data with 'None'
encoder = LabelEncoder()
for col in cat_cols:
    X[col] = encoder.fit_transform(X[col])

# Display the updated DataFrame structure
df_final.head()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=140, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90     13353
           1       0.73      0.05      0.09      2989

    accuracy                           0.82     16342
   macro avg       0.78      0.52      0.50     16342
weighted avg       0.81      0.82      0.75     16342



# Incorporating Uncertainty

In [10]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Assume 'X' and 'y' are defined as in the previous setup

# Set up parameters for the Monte Carlo simulation
n_simulations = 100  # Number of Monte Carlo simulations
weather_features = ['Wind_Direction', 'Wind_Gusts', 'Wind_Speed', 'Visibility']  # Features to simulate noise

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the original deterministic model for comparison
det_model = RandomForestClassifier(n_estimators=60, max_depth=7, random_state=42)
det_model.fit(X_train, y_train)
det_predictions = det_model.predict(X_test)
print("Deterministic Model Performance:")
print(classification_report(y_test, det_predictions))

# Monte Carlo Simulation for probabilistic modeling
prob_predictions = np.zeros((n_simulations, X_test.shape[0]))

for i in range(n_simulations):
    # Add Gaussian noise to weather features in the training set
    X_train_sim = X_train.copy()
    for feature in weather_features:
        noise = np.random.normal(loc=0, scale=X_train[feature].std() * 0.1, size=X_train.shape[0])
        X_train_sim[feature] += noise
    
    # Train model on the noisy data
    model = RandomForestClassifier(n_estimators=60, max_depth=7, random_state=42)
    model.fit(X_train_sim, y_train)
    
    # Store predictions for this simulation
    prob_predictions[i] = model.predict(X_test)

# Compute probabilistic predictions as the mean prediction across simulations
prob_pred_mean = prob_predictions.mean(axis=0) > 0.5  # Threshold at 0.5 for binary classification

# Convert probabilistic predictions to binary values and evaluate
print("\nProbabilistic Model Performance:")
print(classification_report(y_test, prob_pred_mean.astype(int)))

# Compare deterministic and probabilistic predictions in terms of accuracy
det_accuracy = accuracy_score(y_test, det_predictions)
prob_accuracy = accuracy_score(y_test, prob_pred_mean.astype(int))

print(f"\nDeterministic Model Accuracy: {det_accuracy:.2f}")
print(f"Probabilistic Model Accuracy: {prob_accuracy:.2f}")

Deterministic Model Performance:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     20058
           1       0.73      0.05      0.09      4455

    accuracy                           0.82     24513
   macro avg       0.78      0.52      0.50     24513
weighted avg       0.81      0.82      0.75     24513


Probabilistic Model Performance:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90     20058
           1       0.72      0.05      0.09      4455

    accuracy                           0.82     24513
   macro avg       0.77      0.52      0.50     24513
weighted avg       0.81      0.82      0.76     24513


Deterministic Model Accuracy: 0.82
Probabilistic Model Accuracy: 0.82
