In [9]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [12]:
def parse_datetime_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``STD`` and ``STA`` parsed as datetimes.

    String columns have every literal "." replaced by ":" before parsing
    (so a time written as ``12.55.00`` becomes ``12:55:00``). Values that
    cannot be parsed become ``NaT`` because parsing uses ``errors="coerce"``.

    Columns that already hold datetime64 values are passed through as they
    are, so the function can be re-applied to its own output without the
    ``.str`` accessor raising ``AttributeError``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``STD`` and ``STA``.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()
    for col in ("STD", "STA"):
        values = df[col]
        if not pd.api.types.is_datetime64_any_dtype(values):
            # NOTE: the replacement hits *every* dot in the string, not only
            # the ones inside the time-of-day part.
            values = values.str.replace(".", ":", regex=False)
        df[col] = pd.to_datetime(values, errors="coerce")
    return df

In [13]:
def add_flight_length_categories(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``flight_length_cat`` column.

    The scheduled duration in minutes (``STA - STD``) is bucketed as:

    * ``short``  : 0 to 90 minutes (both ends included)
    * ``medium`` : more than 90, up to 240 minutes
    * ``long``   : more than 240 minutes

    When ``STA`` is earlier than ``STD`` the arrival is treated as falling on
    the next day, the same overnight rule ``add_flight_duration_bins`` uses,
    so both features are derived from the same duration. Rows where either
    timestamp is missing get ``<NA>``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain datetime columns ``STD`` and ``STA``.

    Returns
    -------
    pd.DataFrame
        A new frame with ``flight_length_cat`` stored as pandas ``string``.
    """
    df = df.copy()

    # Overnight flights: if STA < STD, add 1 day to STA so the duration is
    # not negative (a negative duration would fall outside every bin).
    sta_adjusted = df["STA"].where(
        df["STA"] >= df["STD"],
        df["STA"] + pd.Timedelta(days=1),
    )

    duration_min = (sta_adjusted - df["STD"]).dt.total_seconds() / 60

    df["flight_length_cat"] = pd.cut(
        duration_min,
        bins=[0, 90, 240, float("inf")],
        labels=["short", "medium", "long"],
        include_lowest=True,  # keep zero-minute durations in "short"
    ).astype("string")

    return df
        

In [14]:
def add_flight_duration_bins(
    df: pd.DataFrame,
    bin_size: int = 60
) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``flight_duration_bin`` label column.

    The scheduled duration in minutes is floored to a multiple of
    ``bin_size`` and rendered as ``"<start>-<end>"`` where
    ``end = start + bin_size - 1`` (e.g. ``"60-119"`` for ``bin_size=60``).

    Rows where ``STD`` or ``STA`` is missing get a missing label instead of
    making the integer cast fail for the whole frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain datetime columns ``STD`` and ``STA``.
    bin_size : int, default 60
        Width of each bin in minutes; must be positive.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    ValueError
        If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be a positive number of minutes")

    df = df.copy()

    # Handle overnight flights: if STA < STD, add 1 day to STA
    sta_adjusted = df["STA"].where(
        df["STA"] >= df["STD"],
        df["STA"] + pd.Timedelta(days=1)
    )

    duration_min = (sta_adjusted - df["STD"]).dt.total_seconds() / 60
    has_duration = duration_min.notna()

    # Build duration bins
    start = (duration_min // bin_size) * bin_size
    end = start + bin_size - 1

    # NaN cannot be cast to int, so use a placeholder for the cast and blank
    # those rows out again afterwards.
    labels = (
        start.fillna(0).astype(int).astype(str)
        + "-"
        + end.fillna(0).astype(int).astype(str)
    )
    df["flight_duration_bin"] = labels.where(has_duration)

    return df

In [15]:
def add_time_of_day_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Label each row with the part of the day its scheduled departure (STD)
    falls in, stored in a new ``time_of_day_cat`` string column.

    Hour ranges (inclusive):
    night      : 00-05
    morning    : 06-11
    midday     : 12-14
    afternoon  : 15-18
    evening    : 19-23

    The input frame is not modified; a copy with the extra column is returned.
    """
    # Each label is keyed to the last hour it covers; pd.cut's right-closed
    # intervals then give (-1, 5], (5, 11], (11, 14], (14, 18], (18, 23].
    last_hour_of = {
        "night": 5,
        "morning": 11,
        "midday": 14,
        "afternoon": 18,
        "evening": 23,
    }
    edges = [-1, *last_hour_of.values()]
    names = list(last_hour_of)

    result = df.copy()
    departure_hour = result["STD"].dt.hour
    buckets = pd.cut(departure_hour, bins=edges, labels=names)
    result["time_of_day_cat"] = buckets.astype("string")
    return result

In [None]:
# Load the training table. Note that this is the same path the notebook
# writes its result back to in its final cell.
train = pd.read_csv(filepath_or_buffer='../Train_modified.csv')

In [23]:
# Feature pipeline: parse STD/STA, then derive the flight-length category,
# the 60-minute duration bin and the time-of-day category.
train_modified = (
    train
    .pipe(parse_datetime_columns)
    .pipe(add_flight_length_categories)
    .pipe(add_flight_duration_bins, bin_size=60)
    .pipe(add_time_of_day_categories)
)

# Drop the "Unnamed: ..." columns (presumably index columns written by earlier
# to_csv calls - confirm). errors="ignore" keeps this cell re-runnable: the
# final cell rewrites the input file with index=False, so on the next run
# these columns are no longer present and a plain drop would raise KeyError.
train_mod = train_modified.drop(
    columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'],
    errors="ignore",
)

In [25]:
# Persist the engineered table without the row index. This overwrites the
# same file the notebook loaded `train` from.
train_mod.to_csv(path_or_buf='../Train_modified.csv', index=False)