In [27]:
import pandas as pd
import os

# Extracted Files Folder
extract_path = "/content/drive/MyDrive/extracted"

# Print Extracted Files
extracted_files = os.listdir(extract_path)
print("Extracted files:")
print(extracted_files)

if extracted_files:
    first_file_path = os.path.join(extract_path, extracted_files[0])
    if first_file_path.endswith('.csv'):
        use_cols = ['DayofMonth', 'DayOfWeek', 'ArrDelay', 'DepDelay', 'Distance', 'Month']
        data = pd.read_csv(first_file_path, usecols=use_cols)

        # Ensure 'DepDelay' is numeric, if there are non-numeric values, mark them as NAN
        data['DepDelay'] = pd.to_numeric(data['DepDelay'], errors='coerce')
        data['ArrDelay'] = pd.to_numeric(data['ArrDelay'], errors='coerce')

        # Remove rows where DepDelay is NaN
        data_clean = data.dropna(subset=['DepDelay', 'ArrDelay'])

        # Equal data from each month
        num_samples_per_month = 490 // 7
        sampled_data = data_clean.groupby('Month').apply(lambda x: x.sample(num_samples_per_month, random_state=42)).reset_index(drop=True)

        # Print Samples
        print("Divided Data:")
        print(sampled_data.head())

    else:
        print(f"Error. {first_file_path}")
else:
    print("No file found.")


Extracted files:
['Combined_Flights_2022.csv']
Divided Data:
   DepDelay  Distance  Month  DayofMonth  DayOfWeek  ArrDelay
0      -2.0     187.0      1          19          3     -20.0
1      -7.0    2475.0      1          24          1     -19.0
2      -1.0     427.0      1          29          6     -25.0
3      -3.0     762.0      1          24          1      -9.0
4      35.0    1635.0      1           7          5      13.0


  sampled_data = data_clean.groupby('Month').apply(lambda x: x.sample(num_samples_per_month, random_state=42)).reset_index(drop=True)


In [35]:
import random

weather_conditions = {
    1: ["Snowy", "Windy", "Cloudy"],
    2: ["Snowy", "Windy", "Cloudy"],
    3: ["Windy", "Rainy", "Cloudy"],
    4: ["Rainy", "Cloudy", "Sunny", "Windy"],
    5: ["Mostly Sunny", "Windy", "Sunny"],
    6: ["Sunny", "Partly Cloudy"],
    7: ["Sunny", "Mostly Sunny"]
}

sampled_data['WeatherCondition'] = sampled_data['Month'].apply(
    lambda x: random.choice(weather_conditions[x])
)

print(sampled_data.head())

output_file_path = "/content/drive/MyDrive/sampled_data_with_weatherCond.csv"

sampled_data.to_csv(output_file_path, index=False)

print(f"The data has been saved to {output_file_path}.")

   DepDelay  Distance  Month  DayofMonth  DayOfWeek  ArrDelay WeatherCondition
0      -2.0     187.0      1          19          3     -20.0            Windy
1      -7.0    2475.0      1          24          1     -19.0            Windy
2      -1.0     427.0      1          29          6     -25.0           Cloudy
3      -3.0     762.0      1          24          1      -9.0           Cloudy
4      35.0    1635.0      1           7          5      13.0           Cloudy
The data has been saved to /content/drive/MyDrive/sampled_data_with_weatherCond.csv.


In [29]:
# Separate the features (X) and target (y)
X = sampled_data.drop('ArrDelay', axis=1)  # Features
y = sampled_data['ArrDelay']  # Target

# One-hot encode WeatherCondition
X = pd.get_dummies(X, columns=['WeatherCondition'])

# Split the data
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert the datasets to DataFrame format
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_val_df = pd.DataFrame(X_val, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Add the target labels
train_set = X_train_df.copy()
train_set['ArrDelay'] = y_train.reset_index(drop=True)

val_set = X_val_df.copy()
val_set['ArrDelay'] = y_val.reset_index(drop=True)

test_set = X_test_df.copy()
test_set['ArrDelay'] = y_test.reset_index(drop=True)

# Display the shapes of the datasets
print(f"Training set shape: {train_set.shape}")
print(f"Validation set shape: {val_set.shape}")
print(f"Test set shape: {test_set.shape}")

Training set shape: (294, 13)
Validation set shape: (98, 13)
Test set shape: (98, 13)


In [30]:
# Display the first few rows of the datasets
print("\nTraining set preview:")
print(X_train.head())

print("\nValidation set preview:")
print(X_val.head())

print("\nTest set preview:")
print(X_test.head())


Training set preview:
     DepDelay  Distance  Month  DayofMonth  DayOfWeek  \
74        0.0    2422.0      2          23          3   
399     -11.0     562.0      6          23          4   
266      15.0     358.0      4           7          4   
238      -1.0     867.0      4          15          5   
119      -5.0     325.0      2          16          3   

     WeatherCondition_Cloudy  WeatherCondition_Mostly Sunny  \
74                      True                          False   
399                    False                          False   
266                    False                          False   
238                    False                          False   
119                    False                          False   

     WeatherCondition_Partly Cloudy  WeatherCondition_Rainy  \
74                            False                   False   
399                           False                   False   
266                           False                    True   
238

In [31]:
# Linear Regression
# Evalution Methods = MSE, R² & MAE

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

model = LinearRegression()

model.fit(X_train, y_train)

# Prediction to Validation set
y_val_pred = model.predict(X_val)

# Validation set performance evaluation
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)
mae_val = mean_absolute_error(y_val, y_val_pred)
print(f"Validation Set MSE: {mse_val}")
print(f"Validation Set R²: {r2_val}")
print(f"Validation Set MAE: {mae_val}")

# Test set performance evaluation
y_test_pred = model.predict(X_test)

mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"Test Set MSE: {mse_test}")
print(f"Test Set R²: {r2_test}")
print(f"Test Set MAE: {mae_test}")

Validation Set MSE: 294.6992767925365
Validation Set R²: 0.8789972853161622
Validation Set MAE: 11.263831861036904
Test Set MSE: 294.5633201810526
Test Set R²: 0.8047045296194177
Test Set MAE: 10.876927048724466


In [32]:
# Random Forest
# Evalution Methods = MSE, R² & MAE

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Initialize the Random Forest model
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on training data
rf_reg.fit(X_train, y_train)

# Prediction on the validation set
y_val_pred = rf_reg.predict(X_val)

# Validation set performance evaluation
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)
mae_val = mean_absolute_error(y_val, y_val_pred)
print(f"Validation Set MSE: {mse_val}")
print(f"Validation Set R²: {r2_val}")
print(f"Validation Set MAE: {mae_val}")

# Test set performance evaluation
y_test_pred = rf_reg.predict(X_test)

mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"Test Set MSE: {mse_test}")
print(f"Test Set R²: {r2_test}")
print(f"Test Set MAE: {mae_test}")


Validation Set MSE: 326.81277346938776
Validation Set R²: 0.865811571668738
Validation Set MAE: 11.916530612244898
Test Set MSE: 354.7895071428571
Test Set R²: 0.7647745698922357
Test Set MAE: 12.279285714285715
