In [1]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import joblib
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
df=pd.read_csv('2023_data.csv')

In [4]:
df_newyork=df[(df['ORIGIN_CITY']=='New York, NY')|(df['DEST_CITY']=='New York, NY')].copy()

In [5]:
df_newyork.loc[df["DEP_DELAY"] >= 15, "DELAY"] = 1
df_newyork.loc[df["DEP_DELAY"] < 15, "DELAY"] = 0 

In [6]:
from sklearn.utils import resample

df_delayed = df_newyork[df_newyork['DELAY'] == 1]
df_not_delayed = df_newyork[df_newyork['DELAY'] == 0]


# Count the number of instances in the minority class
n_delayed = len(df_delayed)

# Downsample the majority class
df_not_delayed_downsampled = resample(df_not_delayed,
                                      replace=False,    # sample without replacement
                                      n_samples=n_delayed,  # to match minority class
                                      random_state=123)  # reproducible results

# Combine the downsampled majority class with the minority class
df_balanced = pd.concat([df_not_delayed_downsampled, df_delayed])

# Shuffle the dataset
df_newyork = df_balanced.sample(frac=1, random_state=123).reset_index(drop=True)


In [25]:
df_newyork['DELAY'].value_counts()

0.0    79337
1.0    79337
Name: DELAY, dtype: int64

In [7]:
dummies = pd.get_dummies(df_newyork['AIRLINE_CODE'], prefix='AIRLINE')
df_newyork = pd.concat([df_newyork, dummies], axis=1)

In [26]:
df_newyork['TIME_MINUTES'] = df_newyork['CRS_DEP_TIME'].apply(lambda x: (x // 100) * 60 + (x % 100))
df_newyork.sort_values(by=['FL_DATE', 'TIME_MINUTES'], inplace=True)
def count_flights(group):
    counts = group['TIME_MINUTES'].apply(lambda x: ((group['TIME_MINUTES'] < x) & 
                                                    (group['TIME_MINUTES'] >= x - 30)).sum())
    return counts
df_newyork['FLIGHTS_BEFORE'] = df_newyork.groupby('FL_DATE').apply(count_flights).reset_index(level=0, drop=True)

In [27]:
df_newyork['FL_DATE'] = pd.to_datetime(df_newyork['FL_DATE'])
df_newyork['day_of_week'] = df_newyork['FL_DATE'].dt.dayofweek
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dummies = pd.get_dummies(df_newyork['day_of_week'])
dummies.columns = days
df_newyork = pd.concat([df_newyork, dummies], axis=1)

In [28]:
import pandas as pd

# Assuming df_newyork is your original dataframe

def categorize_departure_time(dep_time):
    dep_time = int(dep_time)
    if dep_time < 600:  # Before 6:00 AM
        return 'Early Morning'
    elif 600 <= dep_time < 1200:  # Before 12:00 PM
        return 'Morning'
    elif 1200 <= dep_time < 1800:  # Before 6:00 PM
        return 'Afternoon'
    else:
        return 'Evening'

segment_days = ['Early Morning', 'Morning', 'Afternoon', 'Evening']
df_newyork['dep_time_segment'] = df_newyork['CRS_DEP_TIME'].apply(categorize_departure_time)

# Create a new column with binary encoding
for segment in segment_days:
    df_newyork[segment] = (df_newyork['dep_time_segment'] == segment).astype(int)

# Drop the original 'dep_time_segment' column
df_newyork = df_newyork.drop('dep_time_segment', axis=1)

# Print the resulting DataFrame
df_newyork


Unnamed: 0,FL_DATE,AIRLINE_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,Morning,Afternoon,Evening,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,2023-01-01,B6,104,SJU,"San Juan, PR",JFK,"New York, NY",354,351.0,-3.0,...,0,0,0,0,0,0,0,0,0,1
1,2023-01-01,DL,756,ATL,"Atlanta, GA",JFK,"New York, NY",500,538.0,38.0,...,0,0,0,0,0,0,0,0,0,1
2,2023-01-01,NK,316,MCO,"Orlando, FL",LGA,"New York, NY",515,706.0,111.0,...,0,0,0,0,0,0,0,0,0,1
3,2023-01-01,9E,5075,PWM,"Portland, ME",JFK,"New York, NY",520,618.0,58.0,...,0,0,0,0,0,0,0,0,0,1
4,2023-01-01,B6,517,BOS,"Boston, MA",JFK,"New York, NY",545,543.0,-2.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158669,2023-08-31,B6,354,ONT,"Ontario, CA",JFK,"New York, NY",2352,11.0,19.0,...,0,0,1,0,0,0,1,0,0,0
158670,2023-08-31,B6,2348,LAS,"Las Vegas, NV",JFK,"New York, NY",2355,25.0,30.0,...,0,0,1,0,0,0,1,0,0,0
158671,2023-08-31,B6,80,RNO,"Reno, NV",JFK,"New York, NY",2359,38.0,39.0,...,0,0,1,0,0,0,1,0,0,0
158672,2023-08-31,B6,136,PHX,"Phoenix, AZ",JFK,"New York, NY",2359,223.0,144.0,...,0,0,1,0,0,0,1,0,0,0


In [29]:
df_newyork=df_newyork.reset_index(drop=True)
df_newyork

Unnamed: 0,FL_DATE,AIRLINE_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,Morning,Afternoon,Evening,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,2023-01-01,B6,104,SJU,"San Juan, PR",JFK,"New York, NY",354,351.0,-3.0,...,0,0,0,0,0,0,0,0,0,1
1,2023-01-01,DL,756,ATL,"Atlanta, GA",JFK,"New York, NY",500,538.0,38.0,...,0,0,0,0,0,0,0,0,0,1
2,2023-01-01,NK,316,MCO,"Orlando, FL",LGA,"New York, NY",515,706.0,111.0,...,0,0,0,0,0,0,0,0,0,1
3,2023-01-01,9E,5075,PWM,"Portland, ME",JFK,"New York, NY",520,618.0,58.0,...,0,0,0,0,0,0,0,0,0,1
4,2023-01-01,B6,517,BOS,"Boston, MA",JFK,"New York, NY",545,543.0,-2.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158669,2023-08-31,B6,354,ONT,"Ontario, CA",JFK,"New York, NY",2352,11.0,19.0,...,0,0,1,0,0,0,1,0,0,0
158670,2023-08-31,B6,2348,LAS,"Las Vegas, NV",JFK,"New York, NY",2355,25.0,30.0,...,0,0,1,0,0,0,1,0,0,0
158671,2023-08-31,B6,80,RNO,"Reno, NV",JFK,"New York, NY",2359,38.0,39.0,...,0,0,1,0,0,0,1,0,0,0
158672,2023-08-31,B6,136,PHX,"Phoenix, AZ",JFK,"New York, NY",2359,223.0,144.0,...,0,0,1,0,0,0,1,0,0,0


In [30]:
df_newyork.DELAY.value_counts()

0.0    79337
1.0    79337
Name: DELAY, dtype: int64

In [13]:
df_newyork1=df_newyork.copy()
df_newyork1

Unnamed: 0,FL_DATE,AIRLINE_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Early Morning,Morning,Afternoon,Evening
0,2023-01-01,B6,104,SJU,"San Juan, PR",JFK,"New York, NY",354,351.0,-3.0,...,0,0,0,0,0,1,1,0,0,0
1,2023-01-01,DL,756,ATL,"Atlanta, GA",JFK,"New York, NY",500,538.0,38.0,...,0,0,0,0,0,1,1,0,0,0
2,2023-01-01,NK,316,MCO,"Orlando, FL",LGA,"New York, NY",515,706.0,111.0,...,0,0,0,0,0,1,1,0,0,0
3,2023-01-01,9E,5075,PWM,"Portland, ME",JFK,"New York, NY",520,618.0,58.0,...,0,0,0,0,0,1,1,0,0,0
4,2023-01-01,B6,517,BOS,"Boston, MA",JFK,"New York, NY",545,543.0,-2.0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158669,2023-08-31,B6,354,ONT,"Ontario, CA",JFK,"New York, NY",2352,11.0,19.0,...,0,0,1,0,0,0,0,0,0,1
158670,2023-08-31,B6,2348,LAS,"Las Vegas, NV",JFK,"New York, NY",2355,25.0,30.0,...,0,0,1,0,0,0,0,0,0,1
158671,2023-08-31,B6,80,RNO,"Reno, NV",JFK,"New York, NY",2359,38.0,39.0,...,0,0,1,0,0,0,0,0,0,1
158672,2023-08-31,B6,136,PHX,"Phoenix, AZ",JFK,"New York, NY",2359,223.0,144.0,...,0,0,1,0,0,0,0,0,0,1


In [14]:
#df_newyork = df_newyork1.sample(frac=0.01, random_state=0)
columns_to_drop_na = ['ORI_PRCP', 'ORI_AWND', 'ORI_SNOW', 'ORI_TMAX', 'ORI_TMIN', 'ORI_WSF2',
                      'DEST_PRCP', 'DEST_AWND', 'DEST_SNOW', 'DEST_TMAX', 'DEST_TMIN', 'DEST_WSF2',
                      'FLIGHTS_BEFORE','AIRLINE_9E', 'AIRLINE_AA', 'AIRLINE_AS', 'AIRLINE_B6', 'AIRLINE_DL', 'AIRLINE_F9', 
                      'AIRLINE_HA', 'AIRLINE_NK', 'AIRLINE_OO', 'AIRLINE_UA', 'AIRLINE_WN', 'AIRLINE_YX',
                      'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
                      'Early Morning', 'Morning', 'Afternoon', 'Evening'
]
df_newyork1.dropna(subset=columns_to_drop_na,inplace=True)
df_newyork1.dropna(subset='DELAY',inplace=True)
X = df_newyork1[columns_to_drop_na]
y = df_newyork1['DELAY']

In [15]:
len(df_newyork1)
df_newyork1

Unnamed: 0,FL_DATE,AIRLINE_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Early Morning,Morning,Afternoon,Evening
0,2023-01-01,B6,104,SJU,"San Juan, PR",JFK,"New York, NY",354,351.0,-3.0,...,0,0,0,0,0,1,1,0,0,0
1,2023-01-01,DL,756,ATL,"Atlanta, GA",JFK,"New York, NY",500,538.0,38.0,...,0,0,0,0,0,1,1,0,0,0
2,2023-01-01,NK,316,MCO,"Orlando, FL",LGA,"New York, NY",515,706.0,111.0,...,0,0,0,0,0,1,1,0,0,0
3,2023-01-01,9E,5075,PWM,"Portland, ME",JFK,"New York, NY",520,618.0,58.0,...,0,0,0,0,0,1,1,0,0,0
4,2023-01-01,B6,517,BOS,"Boston, MA",JFK,"New York, NY",545,543.0,-2.0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158666,2023-08-31,B6,403,JFK,"New York, NY",SJU,"San Juan, PR",2259,11.0,72.0,...,0,0,1,0,0,0,0,0,0,1
158668,2023-08-31,B6,66,ABQ,"Albuquerque, NM",JFK,"New York, NY",2340,27.0,47.0,...,0,0,1,0,0,0,0,0,0,1
158670,2023-08-31,B6,2348,LAS,"Las Vegas, NV",JFK,"New York, NY",2355,25.0,30.0,...,0,0,1,0,0,0,0,0,0,1
158671,2023-08-31,B6,80,RNO,"Reno, NV",JFK,"New York, NY",2359,38.0,39.0,...,0,0,1,0,0,0,0,0,0,1


In [16]:
# columns_to_drop_na = ['ORI_PRCP', 'ORI_AWND', 'ORI_SNOW', 'ORI_TMAX', 'ORI_TMIN', 'ORI_WSF2',
#              'DEST_PRCP', 'DEST_AWND', 'DEST_SNOW', 'DEST_TMAX', 'DEST_TMIN', 'DEST_WSF2','DELAY']

# df_newyork2 = df_newyork.dropna(subset=columns_to_drop_na)

# X2 = df_newyork2[['ORI_PRCP', 'ORI_AWND', 'ORI_SNOW', 'ORI_TMAX', 'ORI_TMIN', 'ORI_WSF2',
#             'DEST_PRCP', 'DEST_AWND', 'DEST_SNOW', 'DEST_TMAX', 'DEST_TMIN', 'DEST_WSF2']]
# y2 = df_newyork2['DELAY']

In [17]:
# data=df_newyork.loc[:,['ORI_PRCP', 'ORI_AWND', 'ORI_SNOW', 'ORI_TMAX', 'ORI_TMIN', 'ORI_WSF2',
#             'DEST_PRCP', 'DEST_AWND', 'DEST_SNOW', 'DEST_TMAX', 'DEST_TMIN', 'DEST_WSF2','DELAY']
#            ]

In [18]:
# data.dropna(inplace=True)

In [19]:
# len(df_newyork2)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [21]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier(n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6829


In [23]:
param_grid = {
    'n_estimators': [20, 50, 100, 200, 500],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 20, 50],
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.4f}")

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best parameters: {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 500}
Accuracy on the test set: 0.6959


In [24]:
param_grid = {'C': [0.0001, 0.001, 0.1,],
              'gamma': [0.0001, 0.001, 0.01, 0.1],
              'kernel': ['linear', 'rbf']}

svc = SVC()

grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs = -1, verbose = 2)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
print(f"Test Accuracy: {accuracy:.4f}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

In [None]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
param_grid = {'n_neighbors': [40, 50, 60],
              'weights': ['uniform', 'distance'],
              'p': [1, 2],
              'leaf_size': [10, 20, 30],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
from xgboost import XGBClassifier
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],
}

xgb_classifier = XGBClassifier()

grid_search = GridSearchCV(xgb_classifier, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
print(f"Test Accuracy: {accuracy:.4f}")