In [78]:
import pandas as pd
import numpy as np
import math

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_union

In [87]:
df = pd.read_csv('../raw_data/train_100k_subset.csv')
df.head()

df_X = df.drop(columns = ['ArrDelayMinutes','Cancelled','Diverted'])
df_y = df[['ArrDelayMinutes','Cancelled','Diverted']]
df_X = df_X.loc[~df_X['Year'].isin([2018])]
df_X.shape

(80661, 11)

In [88]:
def preprocess_X(df_X):
# Encoding year
    df_year = df_X['Year']
    year_encoded = pd.get_dummies(df_year, prefix='y')
    df_X = df_X.merge(year_encoded, left_index=True, right_index=True)
    df_X.drop(columns = 'Year', inplace = True)
    
# Scaling distances

    dist_min = 16 # grabbed from BQ
    dist_max = 5812 # grabbed from BQ
    
    df_X['dist_scaled'] = (df_X['Distance'] - dist_min) / (dist_max - dist_min)
    df_X.drop(columns = 'Distance', inplace = True)

# Formatting and Scaling time

    dow = df_X['DayOfWeek']
    sin_dow = np.sin(2 * math.pi / 7 * dow)
    cos_dow = np.cos(2 * math.pi / 7 * dow)

    dom = df_X['DayofMonth']
    sin_dom = np.sin(2 * math.pi / 31 * dom)
    cos_dom = np.cos(2 * math.pi / 31 * dom)

    month = df_X['Month']
    sin_month = np.sin(2 * math.pi / 12 * month)
    cos_month = np.cos(2 * math.pi / 12 * month)

    qua = df_X['Quarter']
    sin_qua = np.sin(2 * math.pi / 4 * qua)
    cos_qua = np.cos(2 * math.pi / 4 * qua)

    dep = df_X['CRSDepTime']
    sin_dep = np.sin(2 * math.pi / 2400 * dep)
    cos_dep = np.cos(2 * math.pi / 2400 * dep)

    arr = df_X['CRSArrTime']
    sin_arr = np.sin(2 * math.pi / 2400 * arr)
    cos_arr = np.cos(2 * math.pi / 2400 * arr)

    #return np.stack([sin_dow,cos_dow, sin_dom, cos_dom, sin_month, cos_month, sin_qua, cos_qua])
    result = pd.DataFrame([sin_dow,cos_dow,sin_dom, cos_dom, sin_month, cos_month, sin_qua, cos_qua, sin_dep,
                      cos_dep, sin_arr, cos_arr]).T
    result.columns = ['sin_dow','cos_dow','sin_dom', 'cos_dom', 'sin_month', 'cos_month', 'sin_qua', 'cos_qua', 'sin_dep',
                      'cos_dep', 'sin_arr', 'cos_arr']
    df_time = pd.DataFrame(result, columns=result.columns)

    df_X = df_X.drop(columns = ['DayOfWeek', 'DayofMonth', 'Month', 'Quarter', 'CRSDepTime', 'CRSArrTime'])

# Creating a joined df

    df_X = pd.merge(df_X, df_time, left_index=True, right_index=True, how = 'outer')

# Reorder the df columns and ensure all columns exist (e.g if one year missing)
    df_cols = pd.DataFrame(columns = ['y_2018', 'y_2019','y_2020', 'y_2021', 'y_2022', 'dist_scaled', 'sin_dow', 'cos_dow',
       'sin_dom', 'cos_dom', 'sin_month', 'cos_month', 'sin_qua', 'cos_qua',
       'sin_dep', 'cos_dep', 'sin_arr', 'cos_arr','Marketing_Airline_Network', 'Origin', 'Dest'])
    output = pd.concat([df_cols,df_X]).fillna(0)


    print("✅ preprocess_X() done")
    return output

In [89]:
def preprocess_y(y, is_binary=True):
    y = y.copy()
    y["DelayGroup"] = None

    if is_binary:
        y.loc[y["ArrDelayMinutes"] == 0, "DelayGroup"] = 0
        y.loc[(y["ArrDelayMinutes"] > 0) & (y["ArrDelayMinutes"] <= 30), "DelayGroup"] = 0
        y.loc[y["ArrDelayMinutes"] > 30, "DelayGroup"] = 1
        y.loc[y["Cancelled"], "DelayGroup"] = 1
        y.loc[y["Diverted"], "DelayGroup"] = 1
        output = y[['DelayGroup']]


    if not is_binary:
        y.loc[y["ArrDelayMinutes"] == 0, "DelayGroup"] = "OnTime_Early"
        y.loc[(y["ArrDelayMinutes"] > 0) & (y["ArrDelayMinutes"] <= 30), "DelayGroup"] = "Small_Delay"
        y.loc[y["ArrDelayMinutes"] > 30, "DelayGroup"] = "Large_Delay"

        y.loc[y["Cancelled"], "DelayGroup"] = "NoArrival"
        y.loc[y["Diverted"], "DelayGroup"] = "NoArrival"

        y_array = y['DelayGroup']
        label_encoder = LabelEncoder()
        encoded_target = label_encoder.fit_transform(y_array)

        output = pd.DataFrame(encoded_target, columns=['y'])

    if is_binary:
        print("✅ BINARY preprocess_y() done")
    if not is_binary:
        print("✅ STANDARD preprocess_y() done")
    return output

In [90]:
X_proc = preprocess_X(df_X)
X_proc

✅ preprocess_X() done


Unnamed: 0,y_2018,y_2019,y_2020,y_2021,y_2022,dist_scaled,sin_dow,cos_dow,sin_dom,cos_dom,...,cos_month,sin_qua,cos_qua,sin_dep,cos_dep,sin_arr,cos_arr,Marketing_Airline_Network,Origin,Dest
1,0,0,0,0,1,0.080573,-2.449294e-16,1.000000,0.101168,-0.994869,...,-8.660254e-01,1.224647e-16,-1.000000e+00,0.669131,-0.743145,0.458650,-0.888617,UA,FSD,DEN
3,0,1,0,0,0,0.089372,-4.338837e-01,-0.900969,-0.394356,0.918958,...,-5.000000e-01,-1.000000e+00,-1.836970e-16,-0.991445,0.130526,-0.824126,0.566406,WN,RDU,MCO
6,0,1,0,0,0,0.109558,-4.338837e-01,-0.900969,0.299363,-0.954139,...,8.660254e-01,-2.449294e-16,1.000000e+00,0.999229,-0.039260,0.665230,-0.746638,DL,OMA,DTW
7,0,0,1,0,0,0.087992,-7.818315e-01,0.623490,0.998717,-0.050649,...,5.000000e-01,1.000000e+00,6.123234e-17,0.996917,-0.078459,0.825606,-0.564247,DL,PIT,ATL
8,0,1,0,0,0,0.019151,4.338837e-01,-0.900969,-0.485302,-0.874347,...,-1.836970e-16,-1.000000e+00,-1.836970e-16,0.387516,-0.921863,0.117537,-0.993068,AA,SWF,PHL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99993,0,0,1,0,0,0.099206,-9.749279e-01,-0.222521,-0.394356,0.918958,...,-8.660254e-01,1.224647e-16,-1.000000e+00,-0.358368,-0.933580,-0.566406,-0.824126,WN,ATL,MDW
99994,0,0,1,0,0,0.196687,9.749279e-01,-0.222521,-0.897805,-0.440394,...,-8.660254e-01,-1.000000e+00,-1.836970e-16,-0.972370,-0.233445,-0.780430,0.625243,DL,SLC,STL
99995,0,1,0,0,0,0.202381,7.818315e-01,0.623490,0.897805,-0.440394,...,-1.000000e+00,1.224647e-16,-1.000000e+00,0.928810,-0.370557,0.418660,-0.908143,WN,MCO,SJU
99996,0,0,1,0,0,0.097999,-4.338837e-01,-0.900969,0.394356,0.918958,...,-8.660254e-01,-1.000000e+00,-1.836970e-16,-0.743145,-0.669131,-0.878817,-0.477159,DL,SLC,SJC


In [96]:
y_proc = preprocess_y(df_y)
y_proc.loc[y_proc['DelayGroup'] == None]

✅ BINARY preprocess_y() done


Unnamed: 0,DelayGroup


In [95]:
data_proc = pd.merge(X_proc, y_proc, left_index = True, right_index = True)
data_proc.loc[data_proc.isna().any(axis=1)]

Unnamed: 0,y_2018,y_2019,y_2020,y_2021,y_2022,dist_scaled,sin_dow,cos_dow,sin_dom,cos_dom,...,sin_qua,cos_qua,sin_dep,cos_dep,sin_arr,cos_arr,Marketing_Airline_Network,Origin,Dest,DelayGroup


In [None]:
output.columns

Index(['y_2018', 'y_2019', 'y_2020', 'y_2021', 'y_2022', 'dist_scaled',
       'sin_dow', 'cos_dow', 'sin_dom', 'cos_dom', 'sin_month', 'cos_month',
       'sin_qua', 'cos_qua', 'sin_dep', 'cos_dep', 'sin_arr', 'cos_arr',
       'Marketing_Airline_Network', 'Origin', 'Dest'],
      dtype='object')

In [45]:
output.shape

(100000, 21)

In [46]:
output.head()

Unnamed: 0,y_2018,y_2019,y_2020,y_2021,y_2022,dist_scaled,sin_dow,cos_dow,sin_dom,cos_dom,...,cos_month,sin_qua,cos_qua,sin_dep,cos_dep,sin_arr,cos_arr,Marketing_Airline_Network,Origin,Dest
0,1,0,0,0,0,0.064872,0.7818315,0.62349,-0.201299,0.97953,...,-0.866025,-1.0,-1.83697e-16,-0.62932,0.777146,-0.382683,0.92388,WN,STL,ICT
1,0,0,0,0,1,0.080573,-2.449294e-16,1.0,0.101168,-0.994869,...,-0.866025,1.224647e-16,-1.0,0.669131,-0.743145,0.45865,-0.888617,UA,FSD,DEN
2,1,0,0,0,0,0.05383,0.4338837,-0.900969,-0.968077,-0.250653,...,-0.5,-1.0,-1.83697e-16,-0.296542,-0.95502,-0.583541,-0.812084,AA,CLT,JAX
3,0,1,0,0,0,0.089372,-0.4338837,-0.900969,-0.394356,0.918958,...,-0.5,-1.0,-1.83697e-16,-0.991445,0.130526,-0.824126,0.566406,WN,RDU,MCO
4,1,0,0,0,0,0.063837,-0.4338837,-0.900969,-0.790776,-0.612106,...,1.0,-2.449294e-16,1.0,-0.85264,0.522499,-0.649448,0.760406,WN,SJC,LAS


In [31]:
# expected cols out 
12 + 4 + 1 + 1 + 1 + 1 + 1 #sins + year + dist + origin + dest + airline + y

21

In [33]:
# expected cols when origin dest airline are also encoded 
21 + 388 + 388 + 11 - 3 # -3 represents removing original origin, dest, airline cols 

805

In [35]:
# expected size of X_train
805 -1 # removing y

804

In [102]:
df1 = pd.DataFrame({'a':[1,1,3,4,None]})
df1

Unnamed: 0,a
0,1.0
1,1.0
2,3.0
3,4.0
4,


In [103]:
df1.loc[df1['a'].isna()] = 9
df1

Unnamed: 0,a
0,1.0
1,1.0
2,3.0
3,4.0
4,9.0


In [64]:
df1.drop_duplicates()

Unnamed: 0,a
0,1
2,3
3,4
