In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the raw flight records and derive calendar features from the flight date.
# NOTE(review): the path is absolute and machine-specific; a configurable
# data directory would make the notebook portable.
df = (
    pd.read_csv('/Users/ravi/Desktop/flight/data/raw/jan2025.csv')
    # Parse the flight date first so the `.dt` accessor is available below.
    .assign(FL_DATE=lambda flights: pd.to_datetime(flights["FL_DATE"]))
    .assign(
        # pandas `dayofweek` counts Monday as 0 and Sunday as 6.
        DAY_OF_WEEK=lambda flights: flights["FL_DATE"].dt.dayofweek,
        DAY_OF_MONTH=lambda flights: flights["FL_DATE"].dt.day,
        MONTH=lambda flights: flights["FL_DATE"].dt.month,
        YEAR=lambda flights: flights["FL_DATE"].dt.year,
    )
)


# Airport code -> (latitude, longitude).
# The weather-fetch loop unpacks each value as `lat, long` and skips any
# origin airport that is missing from this table.
airport_coords = {
    # Large hubs and other major airports
    "ATL": (33.6407, -84.4277),
    "DFW": (32.8998, -97.0403),
    "DEN": (39.8561, -104.6737),
    "ORD": (41.9742, -87.9073),
    "LAX": (33.9416, -118.4085),
    "JFK": (40.6413, -73.7781),
    "SFO": (37.6213, -122.3790),
    "SEA": (47.4502, -122.3088),
    "MCO": (28.4312, -81.3081),
    "LAS": (36.0840, -115.1537),
    "MIA": (25.7959, -80.2871),
    "PHX": (33.4342, -112.0116),
    "IAH": (29.9902, -95.3368),
    "EWR": (40.6895, -74.1745),
    "BOS": (42.3656, -71.0096),
    "MSP": (44.8848, -93.2223),
    "DTW": (42.2162, -83.3554),
    "PHL": (39.8744, -75.2424),
    "LGA": (40.7769, -73.8740),
    "BWI": (39.1754, -76.6684),
    "DCA": (38.8512, -77.0402),
    "SAN": (32.7338, -117.1933),
    "TPA": (27.9755, -82.5332),
    "PDX": (45.5898, -122.5951),
    "STL": (38.7500, -90.3700),
    "HOU": (29.6454, -95.2789),
    "AUS": (30.1975, -97.6664),
    "BNA": (36.1263, -86.6774),
    "RDU": (35.8801, -78.7880),
    "CLE": (41.4117, -81.8498),
    "CMH": (39.9980, -82.8919),
    "IND": (39.7173, -86.2944),
    "SLC": (40.7899, -111.9791),
    "MCI": (39.2976, -94.7139),
    "PIT": (40.4915, -80.2329),
    "CVG": (39.0488, -84.6678),
    "SMF": (38.6951, -121.5908),
    "OAK": (37.7126, -122.2197),
    "SJC": (37.3639, -121.9289),
    "RSW": (26.5362, -81.7552),
    "JAX": (30.4941, -81.6879),
    "MKE": (42.9472, -87.8966),
    "BUF": (42.9405, -78.7322),
    "ALB": (42.7483, -73.8017),
    "SNA": (33.6757, -117.8682),
    "BUR": (34.2007, -118.3587),
    "ONT": (34.0560, -117.6012),
    "LGB": (33.8177, -118.1516),
    "CLT": (35.2144, -80.9473),
    "FLL": (26.0742, -80.1506),
    "MDW": (41.7868, -87.7522),
    "DAL": (32.8471, -96.8517),
    "HNL": (21.3187, -157.9225),
    "ANC": (61.1744, -149.9964),
    "SJU": (18.4394, -66.0018),

    # Secondary but commonly used
    "ELP": (31.8072, -106.3781),
    "TUS": (32.1161, -110.9410),
    "ABQ": (35.0494, -106.6170),
    "BOI": (43.5644, -116.2228),
    "GEG": (47.6199, -117.5338),
    "DSM": (41.5340, -93.6631),
    "OMA": (41.3032, -95.8941),
    "ICT": (37.6499, -97.4331),
    "OKC": (35.3931, -97.6007),
    "TUL": (36.1984, -95.8881),
    "LIT": (34.7294, -92.2243),
    "MEM": (35.0425, -89.9767),
    "JAN": (32.3112, -90.0759),
    "MSY": (29.9934, -90.2580),
    "BHM": (33.5629, -86.7535),
    "HSV": (34.6404, -86.7731),
    "CHA": (35.0353, -85.2038),
}



# Base model features; the one-hot carrier columns are appended in a later cell.
ft_columns = [
    # Route
    "ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID", "DISTANCE",
    # Calendar parts derived from FL_DATE
    "DAY_OF_WEEK", "DAY_OF_MONTH", "MONTH", "YEAR",
]

import requests

def get_weather(lat, long, date):
    """Fetch daily historical weather for one location and one day.

    Queries the Open-Meteo archive endpoint and reshapes the ``daily`` part
    of the JSON answer into a DataFrame.

    Parameters
    ----------
    lat, long : float
        Latitude and longitude sent to the API.
    date : str or datetime-like
        Day to fetch; anything ``pd.Timestamp`` accepts. It is sent as both
        ``start_date`` and ``end_date`` so only that day is requested.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``temp_max``, ``temp_min``, ``precipitation``,
        ``wind_max`` and ``snowfall``, one row per day in the response.
        Temperatures are requested in Fahrenheit.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. rejected parameters).
    KeyError
        If the response has no ``daily`` section or lacks a requested field.
    """
    url = "https://archive-api.open-meteo.com/v1/archive"

    # The archive endpoint needs an explicit date range in YYYY-MM-DD form.
    day = pd.Timestamp(date).strftime("%Y-%m-%d")

    parameters = {
        "latitude": lat,
        "longitude": long,
        "start_date": day,
        "end_date": day,
        "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum,windspeed_10m_max,snowfall_sum",
        "timezone": "auto",
        "temperature_unit": "fahrenheit",
    }

    # A timeout keeps one stalled request from hanging the whole fetch loop.
    api_response = requests.get(url, params=parameters, timeout=30)
    # Fail loudly on an error status instead of hitting a KeyError on 'daily' below.
    api_response.raise_for_status()
    daily = api_response.json()["daily"]

    forecast_df = pd.DataFrame({
        'date': daily['time'],
        'temp_max': daily['temperature_2m_max'],
        'temp_min': daily['temperature_2m_min'],
        'precipitation': daily['precipitation_sum'],
        'wind_max': daily['windspeed_10m_max'],
        'snowfall': daily['snowfall_sum']
    })

    return forecast_df


In [None]:
# Day key in the YYYY-MM-DD form that is handed to get_weather.
df['date_str'] = df['FL_DATE'].dt.strftime('%Y-%m-%d')

# Only the first 20 unique (origin, day) pairs are fetched while the function
# is being tested (for simplicity).
combos = df[['ORIGIN', 'date_str']].drop_duplicates().head(20)

weather_data = []  # one weather frame per airport/day that could be fetched
fallback = []      # messages for origins that have no entry in airport_coords

for combo in combos.itertuples(index=False):
    airport, date = combo.ORIGIN, combo.date_str

    if airport not in airport_coords:
        fallback.append(f'airport: {airport}, date: {date}, not found (error)')
        continue

    lat, long = airport_coords[airport]
    # Tag every row with its airport; without it the saved CSV cannot be
    # joined back onto the flights table.
    weather_data.append(get_weather(lat, long, date).assign(airport=airport))

# Surface skipped airports instead of silently collecting them.
if fallback:
    print(f'{len(fallback)} airport/date pairs skipped:')
    print('\n'.join(fallback))

if weather_data:
    weather_df = pd.concat(weather_data, ignore_index=True)
    weather_df.to_csv('weather_data.csv', index=False)
    print('success')
else:
    # pd.concat raises on an empty list, so report the situation instead.
    print('no weather data fetched; weather_data.csv was not written')







success


In [None]:
# Target column the classifier predicts.
focus_column = "CANCELLED"

# One-hot encode the marketing carrier; the new columns are named "carrier_<code>".
df_encoded = pd.get_dummies(df, columns=['MKT_UNIQUE_CARRIER'], prefix="carrier")

# Match on the exact prefix so unrelated columns that merely contain the
# substring "carrier" are not picked up as features.
carrier_columns = [col for col in df_encoded.columns if col.startswith("carrier_")]

# Rebuild the feature list instead of extending it in place: re-running this
# cell would otherwise append the carrier columns a second time.
base_columns = [
    col for col in ft_columns
    if col != 'MKT_UNIQUE_CARRIER' and col not in carrier_columns
]
ft_columns = base_columns + carrier_columns

print(ft_columns)

['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'DISTANCE', 'DAY_OF_WEEK', 'DAY_OF_MONTH', 'MONTH', 'YEAR', 'carrier_AA', 'carrier_AS', 'carrier_B6', 'carrier_DL', 'carrier_F9', 'carrier_G4', 'carrier_HA', 'carrier_NK', 'carrier_UA', 'carrier_WN']


In [None]:
# Target vector and feature matrix taken from the encoded frame.
y = df_encoded[focus_column]
x = df_encoded[ft_columns]

# Report gaps before any filling happens, so the printed counts describe the raw data.
print(f"\nMissing values in X: {x.isnull().sum().sum()}")
print(f"Missing values in y: {y.isnull().sum()}")

# Any missing feature value becomes 0; the target is left untouched.
x = x.fillna(value=0)





Missing values in X: 0
Missing values in y: 0


In [None]:



# 80/20 split with a fixed seed; stratifying on y keeps the share of
# cancelled flights the same in both parts.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# Sanity check: part sizes and the cancellation rate of each part.
print(f"Training set: {len(x_train)}")
print(f"Testing set: {len(x_test)}")
print(f"\nTrain cancellation rate: {y_train.mean()*100:.2f}%")
print(f"Test cancellation rate: {y_test.mean()*100:.2f}%")







Training set: 479210
Testing set: 119803

Train cancellation rate: 3.13%
Test cancellation rate: 3.13%


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest settings collected in one place. class_weight="balanced" is used
# because only about 3% of the training rows are cancellations.
rf_params = dict(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
    n_jobs=1,
)

model = RandomForestClassifier(**rf_params)

# fit() returns the estimator, so the cell displays the fitted model.
model.fit(x_train, y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(x_test)

# Column 1 of predict_proba; presumably the probability of the cancelled
# class, given that it is the second entry of model.classes_ (verify).
y_pred_proba = model.predict_proba(x_test)[:, 1]

# Side-by-side preview of the first ten test rows.
preview = slice(0, 10)
results = pd.DataFrame(
    {
        'actual': y_test.values[preview],
        'predicted': y_pred[preview],
        'probability': y_pred_proba[preview],
    }
)

print(results)





   actual  predicted  probability
0     0.0        0.0          0.0
1     0.0        0.0          0.0
2     0.0        0.0          0.0
3     0.0        0.0          0.0
4     0.0        0.0          0.0
5     0.0        0.0          0.0
6     0.0        0.0          0.0
7     0.0        0.0          0.0
8     0.0        0.0          0.0
9     0.0        0.0          0.0
