In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib


In [None]:
# Load the merged dataset. The path is relative, so it is resolved against
# the kernel's current working directory.
master_df = pd.read_csv("data/merged-final-data/Final_Merged_July_21_June_23.csv")

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an
# earlier to_csv call -- confirm). A non-inplace drop with errors='ignore'
# keeps this cell safe to re-run: the original inplace drop raised KeyError
# the second time because the column was already gone.
master_df = master_df.drop(columns='Unnamed: 0', errors='ignore')
master_df.columns

In [None]:
# Sorted array of every distinct value found in the ORIGIN or DEST columns
# (presumably airport codes -- confirm).
endpoint_values = master_df[['ORIGIN', 'DEST']].to_numpy()
air_code = np.unique(endpoint_values)

In [None]:
# Flight-level columns used as model inputs.
flight_features = [
    'OP_UNIQUE_CARRIER',
    'ORIGIN',
    'DEST',
    'DEP_TIME_HOUR',
    'ARR_TIME_HOUR',
    'DISTANCE',
]

# Weather variables available for both endpoints of a flight. The departure
# and arrival column names differ only in their suffix, so both lists are
# derived from this single ordered list.
_weather_variables = [
    'wind_gusts_10m',
    'snow_depth',
    'precipitation',
    'wind_speed_100m',
    'wind_direction_100m',
    'rain',
    'shortwave_radiation',
    'relative_humidity_2m',
    'cloud_cover',
    'temperature_2m',
]
dep_weather_features = [f'{name}_DEP' for name in _weather_variables]
arr_weather_features = [f'{name}_ARR' for name in _weather_variables]

# Delay columns the models are trained to predict.
target_features = ['ARR_DELAY', 'DEP_DELAY']

In [None]:
# Keep only the model inputs and the two delay targets, in that order.
selected_columns = [
    *flight_features,
    *dep_weather_features,
    *arr_weather_features,
    *target_features,
]
master_df = master_df[selected_columns]

In [None]:
# Number of missing values in each column.
missing_mask = master_df.isna()
null_count = missing_mask.sum()
null_count

In [None]:
# Replace every missing value with 0. master_df holds both the input columns
# and the delay targets at this point, so missing delays become 0 as well.
master_df = master_df.fillna(0)

In [None]:
def frequency_encode(df, feature):
    """Replace a categorical column with the occurrence count of each value.

    Args:
        df: DataFrame that contains the column ``feature``.
        feature: Label of the column to encode.

    Returns:
        A copy of ``df`` in which ``feature`` holds, for each row, how many
        rows of ``df`` share that row's original value. Missing values are
        not counted by ``value_counts`` and therefore map to NaN.
    """
    frequency = df[feature].value_counts()
    # Encode a copy rather than the argument: the original version modified
    # the caller's frame in place, so any other reference to that frame was
    # silently changed too.
    encoded = df.copy()
    encoded[feature] = df[feature].map(frequency)
    return encoded

# Columns whose raw category values are replaced by occurrence counts.
categorical_features = ['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST']

# Encode one column per pass; frequency_encode returns the frame, so
# master_df is rebound to the result each time.
for feature in categorical_features:
    master_df = frequency_encode(master_df, feature)

In [None]:
# Display the frame after frequency encoding.
master_df

In [None]:



# Outlier filtering with an Isolation Forest. contamination=0.01 sets the
# expected share of outliers to 1%. The forest is given every column of
# master_df, which at this point includes the delay targets.
iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)

# fit_predict fits on master_df and labels those same rows in one call:
# 1 marks an inlier, -1 an outlier.
outliers = iso_forest.fit_predict(master_df)

# Retain the rows labelled as inliers.
master_df = master_df.loc[outliers == 1]


In [None]:
# Report how many rows the outlier filter discarded. `outliers` holds the
# per-row Isolation Forest labels (-1 = discarded). len(master_df) is the
# number of rows that remain, which the original message reported as the
# number removed.
n_removed = int((outliers == -1).sum())
print(f"values removed {n_removed} (rows remaining {len(master_df)})")

In [None]:
# Display the frame after outlier filtering.
master_df

Model 1 - XGB Regressor

In [None]:
# Start this experiment from a fresh copy so that columns added or rewritten
# on df do not alter master_df.
df = master_df.copy()

In [None]:


# Target: departure delay plus arrival delay for each flight.
df['TOTAL_DELAY'] = df['DEP_DELAY'] + df['ARR_DELAY']

# Model inputs: flight attributes plus departure and arrival weather.
all_features = flight_features + dep_weather_features + arr_weather_features

# All input columns are scaled; the filter only guards against the target
# ever being added to the feature list.
features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Split BEFORE fitting the scaler. The original fitted MinMaxScaler on the
# whole frame, so the min/max statistics were computed with the test rows
# included (preprocessing leakage).
X = df[all_features]
y = df['TOTAL_DELAY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take ownership of the split frames so the column assignments below do not
# write into slices of df.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaling on the training rows only and reuse it for the test rows,
# then apply the same element-wise sine transform to both.
scaler = MinMaxScaler()
X_train[features_for_scaling] = np.sin(scaler.fit_transform(X_train[features_for_scaling]))
X_test[features_for_scaling] = np.sin(scaler.transform(X_test[features_for_scaling]))

# Train the XGBoost regressor on the training split.
model_xgb_final = XGBRegressor(objective='reg:squarederror')
model_xgb_final.fit(X_train, y_train)

# Evaluate on the held-back 20% split.
y_pred = model_xgb_final.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(y_test, y_pred)

print(f"R^2 Score: {r_squared}")

In [None]:
# Score the XGBoost model on the separate prediction file.
predict_df = pd.read_csv("data/predict-data-3-months/predict-data.csv")
predict_df = predict_df.drop(columns='Unnamed: 0')

# NOTE(review): the category frequencies are computed from the prediction
# file itself, not from the data the model was trained on -- confirm that
# this is intended.
for feature in categorical_features:
    predict_df = frequency_encode(predict_df, feature)

# Ground truth, built the same way as the training target.
predict_df_actual = predict_df['DEP_DELAY'] + predict_df['ARR_DELAY']

# Same input columns, in the same order, as used for training. The explicit
# copy avoids assigning into a slice of the loaded frame.
all_features = flight_features + dep_weather_features + arr_weather_features
predict_df = predict_df[all_features].copy()

features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Apply the scaler that was fitted when the model was trained: transform only.
# The original called fit_transform here, which re-fitted the scaler on the
# prediction data and so fed the model features scaled differently from the
# ones it was trained on.
predict_df[features_for_scaling] = np.sin(scaler.transform(predict_df[features_for_scaling]))
print(predict_df_actual.shape)
y_pred = model_xgb_final.predict(predict_df)
print(y_pred.shape)
mse = mean_squared_error(predict_df_actual, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(predict_df_actual, y_pred)

print(f"R^2 Score: {r_squared}")

In [None]:
# Reset df to an unmodified copy of master_df for the next experiment.
df = master_df.copy()

In [None]:

# Target: departure delay plus arrival delay for each flight.
df['TOTAL_DELAY'] = df['DEP_DELAY'] + df['ARR_DELAY']

# Comparison run: flight attributes only, no weather columns.
all_features = flight_features
# All input columns are scaled; the filter only guards against the target
# ever being added to the feature list.
features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Split BEFORE fitting the scaler so the test rows do not contribute to the
# min/max statistics (the original fitted on the whole frame).
X = df[all_features]
y = df['TOTAL_DELAY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take ownership of the split frames before assigning columns.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaling on the training rows only, reuse it for the test rows, then
# apply the same element-wise sine transform to both.
scaler = MinMaxScaler()
X_train[features_for_scaling] = np.sin(scaler.fit_transform(X_train[features_for_scaling]))
X_test[features_for_scaling] = np.sin(scaler.transform(X_test[features_for_scaling]))

# Train the XGBoost regressor on the training split.
model = XGBRegressor(objective='reg:squarederror')
model.fit(X_train, y_train)

# Evaluate on the held-back 20% split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)

r_squared = r2_score(y_test, y_pred)

print(f"R^2 Score: {r_squared}")

Model 2 - Linear Regression

In [None]:
# Reset df to an unmodified copy of master_df for the linear-regression
# experiment, and display it.
df = master_df.copy()
df

In [None]:


# Target: departure delay plus arrival delay for each flight.
df['TOTAL_DELAY'] = df['DEP_DELAY'] + df['ARR_DELAY']

# Model inputs: flight attributes plus departure and arrival weather.
all_features = flight_features + dep_weather_features + arr_weather_features

# All input columns are scaled; the filter only guards against the target
# ever being added to the feature list.
features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Split BEFORE fitting the scaler. The original fitted MinMaxScaler on the
# whole frame, so the min/max statistics were computed with the test rows
# included (preprocessing leakage).
X = df[all_features]
y = df['TOTAL_DELAY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take ownership of the split frames before assigning columns.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaling on the training rows only, reuse it for the test rows, then
# apply the same element-wise sine transform to both.
scaler = MinMaxScaler()
X_train[features_for_scaling] = np.sin(scaler.fit_transform(X_train[features_for_scaling]))
X_test[features_for_scaling] = np.sin(scaler.transform(X_test[features_for_scaling]))

# Train the linear-regression model (the earlier comment said "Random
# Forest", which did not match the estimator used).
model_lr_final = LinearRegression()
model_lr_final.fit(X_train, y_train)

# Evaluate on the held-back 20% split.
y_pred = model_lr_final.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(y_test, y_pred)

print(f"R^2 Score: {r_squared}")

In [None]:
# Score the linear-regression model on the separate prediction file.
predict_df = pd.read_csv("data/predict-data-3-months/predict-data.csv")
predict_df = predict_df.drop(columns='Unnamed: 0')

# NOTE(review): the category frequencies are computed from the prediction
# file itself, not from the data the model was trained on -- confirm that
# this is intended.
for feature in categorical_features:
    predict_df = frequency_encode(predict_df, feature)

# Ground truth, built the same way as the training target.
predict_df_actual = predict_df['DEP_DELAY'] + predict_df['ARR_DELAY']

# Same input columns, in the same order, as used for training. The explicit
# copy avoids assigning into a slice of the loaded frame.
all_features = flight_features + dep_weather_features + arr_weather_features
predict_df = predict_df[all_features].copy()

features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Apply the scaler that was fitted when the model was trained: transform only.
# The original called fit_transform here, which re-fitted the scaler on the
# prediction data and so fed the model features scaled differently from the
# ones it was trained on.
predict_df[features_for_scaling] = np.sin(scaler.transform(predict_df[features_for_scaling]))
print(predict_df_actual.shape)
y_pred = model_lr_final.predict(predict_df)
print(y_pred.shape)
mse = mean_squared_error(predict_df_actual, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(predict_df_actual, y_pred)

print(f"R^2 Score: {r_squared}")

In [None]:
# Reset df to an unmodified copy of master_df for the next experiment.
df = master_df.copy()

In [None]:


# Target: departure delay plus arrival delay for each flight.
df['TOTAL_DELAY'] = df['DEP_DELAY'] + df['ARR_DELAY']

# Comparison run: flight attributes only, no weather columns.
all_features = flight_features
# All input columns are scaled; the filter only guards against the target
# ever being added to the feature list.
features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Split BEFORE fitting the scaler so the test rows do not contribute to the
# min/max statistics (the original fitted on the whole frame).
X = df[all_features]
y = df['TOTAL_DELAY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take ownership of the split frames before assigning columns.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaling on the training rows only, reuse it for the test rows, then
# apply the same element-wise sine transform to both.
scaler = MinMaxScaler()
X_train[features_for_scaling] = np.sin(scaler.fit_transform(X_train[features_for_scaling]))
X_test[features_for_scaling] = np.sin(scaler.transform(X_test[features_for_scaling]))

# Train the linear-regression model (the earlier comment said "Random
# Forest", which did not match the estimator used).
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-back 20% split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(y_test, y_pred)

print(f"R^2 Score: {r_squared}")

Model 3 - LightGBM

In [None]:
# Reset df to an unmodified copy of master_df for the LightGBM experiment.
df = master_df.copy()

In [None]:
# Target: departure delay plus arrival delay for each flight.
df['TOTAL_DELAY'] = df['DEP_DELAY'] + df['ARR_DELAY']

# Model inputs: flight attributes plus departure and arrival weather.
all_features = flight_features + dep_weather_features + arr_weather_features

# All input columns are scaled; the filter only guards against the target
# ever being added to the feature list.
features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Split BEFORE fitting the scaler. The original fitted MinMaxScaler on the
# whole frame, so the min/max statistics were computed with the test rows
# included (preprocessing leakage).
X = df[all_features]
y = df['TOTAL_DELAY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take ownership of the split frames before assigning columns.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaling on the training rows only and reuse it for the test rows.
# Unlike the XGBoost and linear-regression cells, no sine transform is
# applied here.
scaler = MinMaxScaler()
X_train[features_for_scaling] = scaler.fit_transform(X_train[features_for_scaling])
X_test[features_for_scaling] = scaler.transform(X_test[features_for_scaling])

# Train the LightGBM regressor on the training split.
model_lgbm_final = LGBMRegressor()
model_lgbm_final.fit(X_train, y_train)

# Evaluate on the held-back 20% split.
y_pred = model_lgbm_final.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(y_test, y_pred)

print(f"R^2 Score: {r_squared}")


In [None]:
# Score the LightGBM model on the separate prediction file.
predict_df = pd.read_csv("data/predict-data-3-months/predict-data.csv")
predict_df = predict_df.drop(columns='Unnamed: 0')

# NOTE(review): the category frequencies are computed from the prediction
# file itself, not from the data the model was trained on -- confirm that
# this is intended.
for feature in categorical_features:
    predict_df = frequency_encode(predict_df, feature)

# Ground truth, built the same way as the training target.
predict_df_actual = predict_df['DEP_DELAY'] + predict_df['ARR_DELAY']

# Same input columns, in the same order, as used for training. The explicit
# copy avoids assigning into a slice of the loaded frame.
all_features = flight_features + dep_weather_features + arr_weather_features
predict_df = predict_df[all_features].copy()

features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Apply the scaler that was fitted when the model was trained: transform only.
# The original called fit_transform here, which re-fitted the scaler on the
# prediction data and so fed the model features scaled differently from the
# ones it was trained on.
predict_df[features_for_scaling] = scaler.transform(predict_df[features_for_scaling])
print(predict_df_actual.shape)
y_pred = model_lgbm_final.predict(predict_df)
print(y_pred.shape)
mse = mean_squared_error(predict_df_actual, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(predict_df_actual, y_pred)

print(f"R^2 Score: {r_squared}")

In [None]:
# Reset df to an unmodified copy of master_df for the next experiment.
df = master_df.copy()

In [None]:
# Target: departure delay plus arrival delay for each flight.
df['TOTAL_DELAY'] = df['DEP_DELAY'] + df['ARR_DELAY']

# Comparison run: flight attributes only, no weather columns.
all_features = flight_features
# All input columns are scaled; the filter only guards against the target
# ever being added to the feature list.
features_for_scaling = [f for f in all_features if f not in ['TOTAL_DELAY']]

# Split BEFORE fitting the scaler so the test rows do not contribute to the
# min/max statistics (the original fitted on the whole frame).
X = df[all_features]
y = df['TOTAL_DELAY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take ownership of the split frames before assigning columns.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaling on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
X_train[features_for_scaling] = scaler.fit_transform(X_train[features_for_scaling])
X_test[features_for_scaling] = scaler.transform(X_test[features_for_scaling])

# Train the LightGBM regressor on the training split.
model = LGBMRegressor()
model.fit(X_train, y_train)

# Evaluate on the held-back 20% split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: ", rmse)
r_squared = r2_score(y_test, y_pred)

print(f"R^2 Score: {r_squared}")


In [None]:
# Persist the LightGBM model trained on flight + weather features.
# Assumes the Models/ directory already exists.
# NOTE(review): only the estimator is written; the MinMaxScaler and the
# frequency encoding applied before training are not saved by this cell --
# confirm they are persisted elsewhere if the model is loaded for inference.
joblib.dump(model_lgbm_final, 'Models/lgbm.pkl')

In [None]:
# Persist the XGBoost model trained on flight + weather features.
# Assumes the Models/ directory already exists.
# NOTE(review): only the estimator is written; the scaling, sine transform
# and frequency encoding applied before training are not saved by this cell
# -- confirm they are persisted elsewhere if the model is loaded for inference.
joblib.dump(model_xgb_final, 'Models/xgb.pkl')

In [None]:
# Persist the linear-regression model trained on flight + weather features.
# Assumes the Models/ directory already exists.
# NOTE(review): only the estimator is written; the scaling, sine transform
# and frequency encoding applied before training are not saved by this cell
# -- confirm they are persisted elsewhere if the model is loaded for inference.
joblib.dump(model_lr_final, 'Models/lr.pkl')