In [None]:
import polars as pl
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
# Data directory, relative to the notebook's working directory so the notebook
# is not tied to one user's home directory. This matches the relative
# "../data/..." and "../output/..." paths used elsewhere in this notebook.
data_path = Path("../data")
train_file = data_path / "train.parquet"

In [None]:
# Lazily scan the training parquet file; the queries below execute when .collect() is called.
df_train = pl.scan_parquet(train_file)

In [None]:
# Disabled one-off export: writes every column name of df_train to
# ../output/columns_list.txt, one name per line. Uncomment to regenerate the list.
# with open("../output/columns_list.txt", "w") as f:
#     f.writelines(col+"\n" for col in df_train.columns)

In [None]:
# nationality distribution
q1 = (
    df_train
    .group_by("nationality")
    .agg(pl.mean("totalPrice").name.suffix("_sum"))
    .collect(engine="streaming")
)

In [None]:
q1.plot.bar(x="nationality",y="totalPrice_sum").properties(width=800,height=600)

In [None]:
# selected tickets price vs unselected
q2 = (
    df_train
    .group_by("selected")
    .agg(pl.mean("totalPrice").name.suffix("_sum"))
    .collect(engine="streaming")
)

q2

In [None]:
q2.plot.bar(x="selected", y="totalPrice_sum")

In [None]:
# Total number of rows in the training data, evaluated with the streaming engine.
(
    df_train
    .select(pl.len())
    .collect(engine="streaming")
)

In [None]:
# column count: resolve the lazy frame's schema and count its entries
len(df_train.collect_schema())

In [None]:
# Sum of the `selected` column per customer profile: one row per profileId.

q3 = (
    df_train
    .group_by("profileId")
    .agg(
        pl.col("selected").sum()
    )
    .collect(engine="streaming")
)

In [None]:
# Display the per-profile totals computed above.
q3

In [None]:
# Column-wise minimum of q3. A minimum of at least 1 in `selected` would mean
# every profileId has at least one selected flight (the profileId minimum is
# incidental). NOTE(review): confirm against the cell output; none is saved here.

q3.min()

In [None]:
# Bar chart of `selected` totals for a seeded (reproducible) sample of 1000 profiles.
(
    q3
    .sample(n=1000, seed=42)
    .plot.bar(x="profileId", y="selected")
    .properties(width=800, height=600)
)

In [None]:
# Sum of the `selected` column per company: one row per companyID.

q4 = (
    df_train
    .group_by("companyID")
    .agg(
        pl.col("selected").sum()
    )
    .collect(engine="streaming")
)

In [None]:
# Display the per-company totals computed above.
q4

In [None]:
# Column-wise minimum of q4. A minimum of at least 1 in `selected` would mean
# every companyID has at least one selected flight (the companyID minimum is
# incidental). NOTE(review): confirm against the cell output; none is saved here.
q4.min()

In [None]:
# Bar chart of `selected` totals per company.
(
    q4
    .plot.bar(x="companyID", y="selected")
    .properties(width=800, height=600)
)

# Sample data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
# Load the train_split parquet file eagerly into a pandas DataFrame.
df = pd.read_parquet(path="../data/train_split.parquet")

In [4]:
# Print a concise summary of the frame: index range, column count, dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11974 entries, 0 to 14999
Columns: 126 entries, Id to selected
dtypes: Int64(2), bool(4), datetime64[ns](1), float64(41), int64(5), object(73)
memory usage: 11.3+ MB


In [10]:
# Display options. A value of 0 asks pandas to size its output from the
# terminal, which a notebook kernel cannot detect, so wide frames still end up
# elided. None is the documented "unlimited" value and shows every column.
pd.set_option("display.max_columns", None)
# Keep rows bounded (pandas' default) so displaying a large frame never dumps
# all of it into the notebook.
pd.set_option("display.max_rows", 60)

In [11]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,legs0_segments2_arrivalTo_airport_iata,legs0_segments2_baggageAllowance_quantity,legs0_segments2_baggageAllowance_weightMeasurementType,...,legs1_segments2_aircraft_code,legs1_segments2_arrivalTo_airport_city_iata,legs1_segments2_arrivalTo_airport_iata,legs1_segments2_baggageAllowance_quantity,legs1_segments2_baggageAllowance_weightMeasurementType,legs1_segments2_cabinClass,legs1_segments2_departureFrom_airport_iata,legs1_segments2_duration,legs1_segments2_flightNumber,legs1_segments2_marketingCarrier_code,legs1_segments2_operatingCarrier_code,legs1_segments2_seatsAvailable,legs1_segments3_aircraft_code,legs1_segments3_arrivalTo_airport_city_iata,legs1_segments3_arrivalTo_airport_iata,legs1_segments3_baggageAllowance_quantity,legs1_segments3_baggageAllowance_weightMeasurementType,legs1_segments3_cabinClass,legs1_segments3_departureFrom_airport_iata,legs1_segments3_duration,legs1_segments3
_flightNumber,legs1_segments3_marketingCarrier_code,legs1_segments3_operatingCarrier_code,legs1_segments3_seatsAvailable,miniRules0_monetaryAmount,miniRules0_percentage,miniRules0_statusInfos,miniRules1_monetaryAmount,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_isAccessTP,pricingInfo_passengerCount,profileId,ranker_id,requestDate,searchRoute,sex,taxes,totalPrice,selected
0,0,True,57323,,S7/SU/UT,36,False,False,2024-06-15T16:20:00,2024-06-15T15:40:00,02:40:00,YK2,KJA,KJA,1.0,0.0,1.0,TLK,02:40:00,216,KV,KV,9.0,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1,2087645,98ce0dabf6964640b63079fbafd42cbe,2024-05-17 03:03:08,TLKKJA/KJATLK,True,370.0,16884.0,1
1,1,True,57323,123.0,S7/SU/UT,36,True,False,2024-06-15T14:50:00,2024-06-15T09:25:00,07:25:00,E70,OVB,OVB,1.0,0.0,1.0,TLK,02:50:00,5358,S7,S7,4.0,E70,KJA,KJA,1.0,0.0,1.0,OVB,01:20:00,5311.0,S7,S7,4.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,2300.0,,1.0,3500.0,,1.0,1.0,1,2087645,98ce0dabf6964640b63079fbafd42cbe,2024-05-17 03:03:08,TLKKJA/KJATLK,True,2240.0,51125.0,0
3,3,True,57323,123.0,S7/SU/UT,36,True,False,2024-06-15T14:50:00,2024-06-15T09:25:00,07:25:00,E70,OVB,OVB,1.0,0.0,1.0,TLK,02:50:00,5358,S7,S7,4.0,E70,KJA,KJA,1.0,0.0,1.0,OVB,01:20:00,5311.0,S7,S7,4.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,1.0,0.0,,1.0,1.0,1,2087645,98ce0dabf6964640b63079fbafd42cbe,2024-05-17 03:03:08,TLKKJA/KJATLK,True,2240.0,81880.0,0
6,6,True,57323,,S7/SU/UT,36,False,False,2024-06-15T14:50:00,2024-06-15T09:25:00,07:25:00,E70,OVB,OVB,1.0,0.0,1.0,TLK,02:50:00,5358,S7,S7,4.0,E70,KJA,KJA,1.0,0.0,1.0,OVB,01:20:00,5311.0,S7,S7,4.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,2300.0,,1.0,3500.0,,1.0,1.0,1,2087645,98ce0dabf6964640b63079fbafd42cbe,2024-05-17 03:03:08,TLKKJA/KJATLK,True,2240.0,53695.0,0
7,7,True,57323,123.0,S7/SU/UT,36,True,False,2024-06-15T14:50:00,2024-06-15T09:25:00,07:25:00,E70,OVB,OVB,1.0,0.0,1.0,TLK,02:50:00,5358,S7,S7,4.0,E70,KJA,KJA,1.0,0.0,1.0,OVB,01:20:00,5311.0,S7,S7,4.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,1.0,0.0,,1.0,1.0,1,2087645,98ce0dabf6964640b63079fbafd42cbe,2024-05-17 03:03:08,TLKKJA/KJATLK,True,2240.0,81880.0,0
