In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [3]:
# Load the raw training data from the CSV stored under the relative data/ folder.
train_df = pd.read_csv('data/Train.csv')

In [4]:
# Check the dtypes pandas inferred on load (the STD/STA timestamps arrive as object).
train_df.dtypes

ID         object
DATOP      object
FLTID      object
DEPSTN     object
ARRSTN     object
STD        object
STA        object
STATUS     object
AC         object
target    float64
dtype: object

In [5]:
# Preview the first rows of the raw frame.
train_df.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0


In [6]:
def parse_datetime_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts the STD and STA text columns into datetime columns.

    Every "." in the raw text is swapped for ":" before parsing, so a value
    such as "2016-01-03 12.55.00" is read as "2016-01-03 12:55:00". Values
    that cannot be parsed become NaT instead of raising.

    Returns a new DataFrame; the input frame is left unchanged.
    """
    parsed = df.copy()
    for name in ("STD", "STA"):
        # Literal (non-regex) replacement of the dot separators.
        normalised = parsed[name].str.replace(".", ":", regex=False)
        parsed[name] = pd.to_datetime(normalised, errors="coerce")
    return parsed

In [7]:
def add_flight_length_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a coarse flight-length category based on the scheduled duration.

    The duration is STA - STD in minutes. When STA is earlier than STD the
    arrival is treated as falling on the following day (one day is added to
    STA). This is the same rule add_flight_duration_bins applies, so both
    derived features describe the same duration for every row.

    Categories (minutes):
    short   : 0 to 90 (both ends included)
    medium  : above 90, up to 240
    long    : above 240

    Rows whose STD or STA is missing, or whose duration is still negative
    after the one-day adjustment, get a missing category (<NA>).

    Returns a new DataFrame with a "flight_length_cat" string column; the
    input frame is left unchanged.
    """
    df = df.copy()

    # Handle overnight flights: if STA < STD, add 1 day to STA
    sta_adjusted = df["STA"].where(
        df["STA"] >= df["STD"],
        df["STA"] + pd.Timedelta(days=1)
    )

    duration_min = (sta_adjusted - df["STD"]).dt.total_seconds() / 60

    df["flight_length_cat"] = pd.cut(
        duration_min,
        bins=[0, 90, 240, float("inf")],
        labels=["short", "medium", "long"],
        include_lowest=True  # keep zero-minute durations in "short"
    ).astype("string")

    return df
        

In [8]:
def add_flight_duration_bins(
    df: pd.DataFrame,
    bin_size: int = 60
) -> pd.DataFrame:
    """
    Adds a "flight_duration_bin" label such as "120-179" to each row.

    The duration is STA - STD in minutes; when STA is earlier than STD the
    arrival is assumed to be on the next day. Each duration is placed in a
    bin of width ``bin_size`` minutes, labelled "<start>-<end>" with
    end = start + bin_size - 1.

    Rows whose STD or STA is missing (NaT) get a missing label instead of
    raising during the integer conversion.

    Parameters:
        df: frame with datetime columns "STD" and "STA".
        bin_size: bin width in minutes; must be positive.

    Returns:
        A new DataFrame with the extra column; the input is left unchanged.

    Raises:
        ValueError: if ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(
            f"bin_size must be a positive number of minutes, got {bin_size!r}"
        )

    df = df.copy()

    # Handle overnight flights: if STA < STD, add 1 day to STA
    sta_adjusted = df["STA"].where(
        df["STA"] >= df["STD"],
        df["STA"] + pd.Timedelta(days=1)
    )

    duration_min = (sta_adjusted - df["STD"]).dt.total_seconds() / 60

    # Build duration bins
    start = (duration_min // bin_size) * bin_size
    end = start + bin_size - 1

    # NaN cannot be cast to int, so fill a placeholder for the cast and then
    # blank out the labels of rows whose duration is unknown.
    has_duration = duration_min.notna()
    labels = (
        start.fillna(0).astype(int).astype(str)
        + "-"
        + end.fillna(0).astype(int).astype(str)
    )
    df["flight_duration_bin"] = labels.where(has_duration)

    return df

In [9]:
def add_time_of_day_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Labels each row with the part of the day of its scheduled departure (STD).

    Hour ranges (inclusive):
    night      : 00-05
    morning    : 06-11
    midday     : 12-14
    afternoon  : 15-18
    evening    : 19-23

    Rows with a missing STD get a missing label. Returns a new DataFrame with
    a "time_of_day_cat" string column; the input frame is left unchanged.
    """
    # Right-closed intervals: (-1, 5], (5, 11], (11, 14], (14, 18], (18, 23]
    hour_edges = [-1, 5, 11, 14, 18, 23]
    part_names = ["night", "morning", "midday", "afternoon", "evening"]

    result = df.copy()
    departure_hour = result["STD"].dt.hour
    buckets = pd.cut(departure_hour, bins=hour_edges, labels=part_names)
    result["time_of_day_cat"] = buckets.astype("string")
    return result

In [10]:
# Feature-engineering pipeline: every stage works on its own copy, so the raw
# train_df is left untouched and this cell can be re-run safely.
train_fe = (
    train_df
    .pipe(parse_datetime_columns)
    .pipe(add_flight_length_categories)
    .pipe(add_flight_duration_bins, bin_size=60)
    .pipe(add_time_of_day_categories)
)

train_fe.head(25)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,flight_length_cat,flight_duration_bin,time_of_day_cat
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12:55:00,ATA,TU 32AIMN,260.0,medium,120-179,morning
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16:55:00,ATA,TU 31BIMO,20.0,medium,60-119,afternoon
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06:45:00,ATA,TU 32AIMN,0.0,medium,120-179,night
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17:00:00,ATA,TU 736IOK,0.0,medium,120-179,midday
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15:50:00,ATA,TU 320IMU,22.0,short,60-119,midday
5,train_id_5,2016-01-17,TU 0283,TLS,TUN,2016-01-17 16:20:00,2016-01-17 18:15:00,ATA,TU 736IOP,53.0,medium,60-119,afternoon
6,train_id_6,2016-01-18,TU 0514,TUN,BCN,2016-01-18 07:15:00,2016-01-18 09:00:00,ATA,TU 32AIMH,10.0,medium,60-119,morning
7,train_id_7,2016-01-18,TU 0716,TUN,ORY,2016-01-18 07:35:00,2016-01-18 09:55:00,ATA,TU 32AIMI,15.0,medium,120-179,morning
8,train_id_8,2016-01-18,TU 0752,TUN,FCO,2016-01-18 07:40:00,2016-01-18 09:00:00,ATA,TU 32AIMC,16.0,short,60-119,morning
9,train_id_9,2016-01-18,TU 0996,TUN,NCE,2016-01-18 07:45:00,2016-01-18 09:15:00,ATA,TU 31AIMK,21.0,short,60-119,morning


In [11]:
# The bin labels are strings, so a plain .max() compares them character by
# character (e.g. "9720-9779" sorts above "10020-10079"). Rank the bins by
# their numeric lower bound instead and show the label of the largest one.
bin_lower_bound = (
    train_fe['flight_duration_bin']
    .str.extract(r'^(-?\d+)-', expand=False)
    .astype(float)
)
train_fe.loc[bin_lower_bound.idxmax(), 'flight_duration_bin']

'9720-9779'

In [12]:
# Row counts per flight-length category (value_counts leaves out missing values by default).
train_fe['flight_length_cat'].value_counts()

flight_length_cat
medium    75153
short     23905
long       8775
Name: count, dtype: Int64

In [17]:
# Target values of the rows whose departure and arrival station codes are identical.
train_fe.loc[train_fe['DEPSTN'].eq(train_fe['ARRSTN']), 'target']

298        35.0
418         3.0
461        17.0
1107       10.0
1294      135.0
          ...  
107821      0.0
107825      0.0
107827      0.0
107828      0.0
107830      0.0
Name: target, Length: 5523, dtype: float64

In [None]:
# Only the last expression of a cell is rendered automatically, so the mean
# target per STATUS must be displayed explicitly or it is silently discarded.
display(train_fe.groupby('STATUS')['target'].mean())
train_fe['STATUS'].value_counts()

STATUS
ATA    93679
SCH    13242
DEP      467
RTR      294
DEL      151
Name: count, dtype: int64