In [56]:
import numpy as np
import pandas as pd

from dsc2024 import datasets
from dsc2024 import handling

# Data Handling

In [57]:
# Parameters
real_test = True  # True: predict on the unlabelled test set to build a submission
sampling = None  # None to use everything

In [58]:
%%time

if real_test:
    sampling = None
df = datasets.get_train_dataset(sampling=sampling)

dataset row:149631 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051300 10012KT 9999 FEW021 SCT050 28/24 Q1014=
                              ^
dataset row:149639 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051300 10012KT 9999 FEW021 SCT050 28/24 Q1014=
                              ^
dataset row:149657 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051400 09013KT 9999 VCSH FEW021 SCT050 29/24 Q1013=
                              ^
dataset row:149673 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051400 09013KT 9999 VCSH FEW021 SCT050 29/24 Q1013=
                              ^
dataset row:149678 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051400 09

In [59]:
# Inspect the loaded training frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211679 entries, 504a62621cd231d6ab67e674ce538cd3 to c962a2267ae4fe0afa4c3542ebdbd403
Data columns (total 63 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   hora_ref                       211679 non-null  datetime64[ns]
 1   origem                         211679 non-null  object        
 2   destino                        211679 non-null  object        
 3   url_img_satelite               208741 non-null  object        
 4   prev_troca_cabeceira           211679 non-null  int64         
 5   troca_cabeceira_hora_anterior  211679 non-null  int64         
 6   espera                         211679 non-null  float64       
 7   metar_station_id               210043 non-null  object        
 8   metar_latitude                 210043 non-null  float64       
 9   metar_longitude                210043 non-null  float64       
 10  metar_elevation 

In [60]:
# drop the columns that contain only null values
df = df.dropna(axis="columns", how="all")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211679 entries, 504a62621cd231d6ab67e674ce538cd3 to c962a2267ae4fe0afa4c3542ebdbd403
Data columns (total 57 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   hora_ref                       211679 non-null  datetime64[ns]
 1   origem                         211679 non-null  object        
 2   destino                        211679 non-null  object        
 3   url_img_satelite               208741 non-null  object        
 4   prev_troca_cabeceira           211679 non-null  int64         
 5   troca_cabeceira_hora_anterior  211679 non-null  int64         
 6   espera                         211679 non-null  float64       
 7   metar_station_id               210043 non-null  object        
 8   metar_latitude                 210043 non-null  float64       
 9   metar_longitude                210043 non-null  float64       
 10  metar_elevation 

In [61]:
# Columns kept out of the baseline: raw datetimes and the satellite image URL
# (they are left for later feature engineering).
datetime_columns = ["hora_ref", "metaf_date_time", "metar_date_time", "url_img_satelite"]
df = df.drop(columns=datetime_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211679 entries, 504a62621cd231d6ab67e674ce538cd3 to c962a2267ae4fe0afa4c3542ebdbd403
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   origem                         211679 non-null  object 
 1   destino                        211679 non-null  object 
 2   prev_troca_cabeceira           211679 non-null  int64  
 3   troca_cabeceira_hora_anterior  211679 non-null  int64  
 4   espera                         211679 non-null  float64
 5   metar_station_id               210043 non-null  object 
 6   metar_latitude                 210043 non-null  float64
 7   metar_longitude                210043 non-null  float64
 8   metar_elevation                210043 non-null  float64
 9   metar_wind_direction           198901 non-null  float64
 10  metar_wind_speed               210039 non-null  float64
 11  metar_wind_gust                1969 non

In [62]:
# Target: `espera` cast to an integer label; features: every other column.
y = df["espera"].astype(int)
X = df.drop(columns=["espera"])

In [63]:
# Every object-dtype column is treated as a categorical feature.
cat_features = list(X.select_dtypes(include="object").columns)
cat_features

['origem',
 'destino',
 'metar_station_id',
 'metar_current_wx1',
 'metar_current_wx2',
 'metar_current_wx3',
 'metar_skyc1',
 'metar_skyc2',
 'metar_skyc3',
 'metar_skyc4',
 'metar_remarks',
 'metaf_station_id',
 'metaf_current_wx1',
 'metaf_current_wx2',
 'metaf_skyc1',
 'metaf_skyc2',
 'metaf_remarks']

# Data Imputation

In [64]:
# Missing categorical values become a category of their own ("UNKNOWN").
# Numerical NaNs are left untouched here; per the original note they are handled by
# catboost's default min-value strategy.
cat_features_fill_map = dict.fromkeys(cat_features, "UNKNOWN")
X = X.fillna(cat_features_fill_map)

In [65]:
# Check the imputation: categorical (object) columns should now report no nulls,
# while numerical columns still contain NaNs.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211679 entries, 504a62621cd231d6ab67e674ce538cd3 to c962a2267ae4fe0afa4c3542ebdbd403
Data columns (total 52 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   origem                         211679 non-null  object 
 1   destino                        211679 non-null  object 
 2   prev_troca_cabeceira           211679 non-null  int64  
 3   troca_cabeceira_hora_anterior  211679 non-null  int64  
 4   metar_station_id               211679 non-null  object 
 5   metar_latitude                 210043 non-null  float64
 6   metar_longitude                210043 non-null  float64
 7   metar_elevation                210043 non-null  float64
 8   metar_wind_direction           198901 non-null  float64
 9   metar_wind_speed               210039 non-null  float64
 10  metar_wind_gust                1969 non-null    float64
 11  metar_visibility               210025 n

# Data Split Train/Test

In [66]:
from sklearn.model_selection import train_test_split

random_state = 42

# 20% of the rows are held out as the test set ...
X_base_train, X_test, y_base_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=random_state
)
# ... and 1/8 of the remaining 80% (i.e. 10% of all rows) becomes the eval set passed to model.fit.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_base_train, y_base_train, test_size=1 / 8, random_state=random_state
)

In [67]:
# real test -> to submit to kaggle: replace X_test with the unlabelled samples and re-split train/eval
# WARNING: in this mode the metrics report and confusion matrix are not reliable, since the model is
# trained with all the labelled data. So they are not shown.
if real_test:
    # TODO: move this data handling to be by default at datasets.get_public_dataset()

    # data handling
    df_real_test = datasets.get_test_dataset()

    # Align the test features with the training feature matrix: same columns, same order.
    # Pruning the test frame by its *own* all-null columns could yield a different feature
    # set than the one the model is trained on, so the training columns are the reference.
    missing_columns = X.columns.difference(df_real_test.columns)
    if len(missing_columns) > 0:
        # reindex creates these as all-NaN columns; make that visible instead of silent
        print(f"WARNING: test dataset lacks training columns: {list(missing_columns)}")
    X_test = df_real_test.reindex(columns=X.columns).fillna(cat_features_fill_map)

    # split: no labelled test set is needed, so only train/eval are produced
    X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.20, random_state=random_state)

dataset row:149631 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051300 10012KT 9999 FEW021 SCT050 28/24 Q1014=
                              ^
dataset row:149639 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051300 10012KT 9999 FEW021 SCT050 28/24 Q1014=
                              ^
dataset row:149657 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051400 09013KT 9999 VCSH FEW021 SCT050 29/24 Q1013=
                              ^
dataset row:149673 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051400 09013KT 9999 VCSH FEW021 SCT050 29/24 Q1013=
                              ^
dataset row:149678 Line 1: expected one of:

    - [\d] from METAR::datetime
    - "Z" from METAR::datetime

     1 | METAR COR SBSV 051400 09

# Add graph Features

In [68]:
from dsc2024 import features
# Order matters: X_train is first replaced by its graph-augmented version, and that
# augmented frame is then passed as the second argument when augmenting X_test and X_eval.
# NOTE(review): presumably the graph features of test/eval rows are looked up from the
# training frame, so routes absent from X_train may end up as NaN - confirm in dsc2024.features.
X_train = features.generate_graph_features(X_train)
X_test = features.graph_features_testdata(X_test, X_train)
X_eval = features.graph_features_testdata(X_eval, X_train)

In [99]:
# compute class weights
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
# 'balanced' weights each class inversely to its frequency in y_train
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Rescale so the first class has weight 1 and the other weights are relative to it
class_0 = class_weights[0]
class_weights = class_weights / class_0
# Display the weights. Do not rebind `class_weights` to `np.array` itself: that would
# discard the computed values and hand a function to the classifier.
class_weights

array([ 1.        , 56.09474039])

In [100]:
# Hand-picked class weights; running this cell overrides any previously bound `class_weights`.
class_weights = np.array((1.7, 98.3), dtype=float)

# Training model

In [89]:
from catboost import CatBoostClassifier, Pool

# metric = "TotalF1:average=Macro;use_weights=False"
eval_metric = "Logloss"  # metric used for early stopping
metric = "F1"  # additionally tracked during training

train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)

model_params = {
    "iterations": 3000,  # maximum number of trees (you can adjust this)
    "early_stopping_rounds": 100,  # number of iterations to wait for improvement
    "eval_metric": eval_metric,
    "cat_features": cat_features,
    "custom_loss": [metric],
    "class_weights": class_weights,  # class weights, because the dataset is unbalanced
    "verbose": 50,  # print information every n iterations
    "random_seed": random_state,
}
model = CatBoostClassifier(**model_params)

# X_eval / y_eval are only used to monitor the eval metric and trigger early stopping
model.fit(train_data, eval_set=(X_eval, y_eval), plot=True)
y_pred = model.predict(X_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.069802
0:	learn: 0.6780549	test: 0.6788148	best: 0.6788148 (0)	total: 239ms	remaining: 11m 55s
50:	learn: 0.5391770	test: 0.5562941	best: 0.5562941 (50)	total: 10.6s	remaining: 10m 10s
100:	learn: 0.5145905	test: 0.5420291	best: 0.5420291 (100)	total: 20.7s	remaining: 9m 53s
150:	learn: 0.4969328	test: 0.5363054	best: 0.5361962 (148)	total: 30s	remaining: 9m 26s
200:	learn: 0.4710421	test: 0.5305872	best: 0.5305830 (199)	total: 40.7s	remaining: 9m 26s
250:	learn: 0.4435668	test: 0.5293223	best: 0.5285834 (246)	total: 52.2s	remaining: 9m 32s
300:	learn: 0.4208063	test: 0.5270654	best: 0.5267822 (299)	total: 1m 3s	remaining: 9m 28s
350:	learn: 0.4014452	test: 0.5247523	best: 0.5239483 (333)	total: 1m 14s	remaining: 9m 23s
400:	learn: 0.3841359	test: 0.5257560	best: 0.5239483 (333)	total: 1m 25s	remaining: 9m 15s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5239483078
bestIteration = 333

Shrink model to first 334 iterations.


# Model evaluation

In [90]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score

# The report is skipped in real-test mode.
if not real_test:
    f1_score_class1 = f1_score(y_test, y_pred, average=None)  # average=None: one F1 value per class
    report = classification_report(y_test, y_pred)
    print(f"Classification Report: F1[class=1] = {round(f1_score_class1[1], 3)}", report, sep="\n")

    # Print confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:", cm, sep="\n")
    disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap=plt.cm.Blues)
    plt.show()

# Save prediction


In [91]:
# One `espera` prediction per test row, keyed by the index of X_test, written as CSV.
submission_path = datasets.datasets_dir / "catboost_submit.csv"
y_pred_series = pd.Series(data=y_pred, index=X_test.index, name="espera")
y_pred_series.to_csv(submission_path)

In [92]:
# share of each class in the target: P(espera=1) = 1.74%, extremely unbalanced
class_counts = y.value_counts()
class_counts / y.count()

espera
0    0.982587
1    0.017413
Name: count, dtype: float64

In [93]:
# Number of non-null predictions that were written to the submission file.
y_pred_series.count()

90720

In [94]:
# Show the class weights currently bound in the kernel.
# NOTE(review): the recorded output matches neither the normalized computed weights nor the
# hand-picked ones, so the kernel state may be stale - confirm which weights the model was trained with.
class_weights

array([ 0.50891349, 28.5473702 ])

In [95]:
# Inspect the test feature matrix (columns, non-null counts, dtypes) after the graph features were added.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90720 entries, 45e7978b9d88f934cc06c11b6f0edba7 to b9978f57bd67e255fb55f3299e39490e
Data columns (total 57 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   origem                         90720 non-null  object 
 1   destino                        90720 non-null  object 
 2   prev_troca_cabeceira           90720 non-null  int64  
 3   troca_cabeceira_hora_anterior  90720 non-null  int64  
 4   metar_station_id               90720 non-null  object 
 5   metar_latitude                 90715 non-null  float64
 6   metar_longitude                90715 non-null  float64
 7   metar_elevation                90715 non-null  float64
 8   metar_wind_direction           85634 non-null  float64
 9   metar_wind_speed               90715 non-null  float64
 10  metar_wind_gust                451 non-null    float64
 11  metar_visibility               90715 non-null  float6

In [96]:
# Preview the first rows of the test feature matrix.
X_test.head()

Unnamed: 0_level_0,origem,destino,prev_troca_cabeceira,troca_cabeceira_hora_anterior,metar_station_id,metar_latitude,metar_longitude,metar_elevation,metar_wind_direction,metar_wind_speed,...,metaf_altimeter,metaf_current_wx1_symbol,metaf_current_wx2_symbol,metaf_current_wx3_symbol,metaf_remarks,betwenness,flow_betweenness_topo,edge_connectivity,deg_diff,gmatrix
flightid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45e7978b9d88f934cc06c11b6f0edba7,SBGL,SBKP,0,0,SBKP,-23.0,-47.12,661.0,360.0,12.0,...,29.795753,0.0,0.0,0.0,,0.007576,0.016667,10.0,1.0,0.185737
16ed22b3755aa9196d16fdd2a173c98f,SBGR,SBCF,0,0,SBCF,-19.63,-43.97,828.0,180.0,5.0,...,,,,,UNKNOWN,0.007576,0.016667,11.0,0.0,0.130339
b548d2c700496e2536d78caf626aee17,SBSP,SBSV,0,1,SBSV,-12.9,-38.32,6.0,40.0,7.0,...,,,,,UNKNOWN,0.007576,0.016667,11.0,0.0,0.079475
e4cc2545104bcfe978912d39f0960f4e,SBGR,SBBR,0,0,SBBR,-15.87,-47.92,1061.0,300.0,3.0,...,,,,,UNKNOWN,0.007576,0.016667,11.0,0.0,0.10231
ace87fdae884359186e9851c38b146fb,SBPA,SBSP,0,1,SBSP,-23.62,-46.63,803.0,340.0,13.0,...,29.766223,0.0,0.0,0.0,,0.007576,0.016667,11.0,0.0,0.261769


In [97]:
# Test rows whose `gmatrix` graph feature is missing (NaN).
X_test.loc[X_test["gmatrix"].isna()]

Unnamed: 0_level_0,origem,destino,prev_troca_cabeceira,troca_cabeceira_hora_anterior,metar_station_id,metar_latitude,metar_longitude,metar_elevation,metar_wind_direction,metar_wind_speed,...,metaf_altimeter,metaf_current_wx1_symbol,metaf_current_wx2_symbol,metaf_current_wx3_symbol,metaf_remarks,betwenness,flow_betweenness_topo,edge_connectivity,deg_diff,gmatrix
flightid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
749830b734dec95732ed37133427d40e,SBRJ,SBGL,1,1,SBGL,-22.82,-43.25,6.0,290.0,3.0,...,30.031993,0.0,0.0,0.0,,,,,,
