In [3]:
import pandas as pd
from pathlib import Path
import os

In [4]:
# The dataset lives in a "data" folder that sits next to the notebook's
# working directory (i.e. one level up from the current working directory).
data_directory = Path.cwd().parent / "data"
raw_survey_csv = data_directory / "airline_passenger_satisfaction.csv"

# Load the raw survey export and preview the first rows.
df = pd.read_csv(raw_survey_csv)
df.head()

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied


In [45]:
# Edge case: flights whose departure OR arrival delay is above 180
# (presumably minutes — TODO confirm the unit against the dataset docs).
delay_threshold = 180
late_departure = df['Departure Delay'].gt(delay_threshold)
late_arrival = df['Arrival Delay'].gt(delay_threshold)
high_delay = df[late_departure | late_arrival]
print(high_delay.shape)

(1457, 24)


In [51]:
# Edge case: unusually short (< 100) or unusually long (> 4000) flights.
flight_distance = df['Flight Distance']
too_short = flight_distance.lt(100)
too_long = flight_distance.gt(4000)
extreme_distance = df[too_short | too_long]
print(extreme_distance.shape)

(1093, 24)


In [47]:
# Survey rating columns inspected when looking for uniformly low scores.
satisfaction_cols = [
    'Departure and Arrival Time Convenience',
    'Ease of Online Booking',
    'Check-in Service',
    'Online Boarding',
    'Gate Location',
    'On-board Service',
    'Seat Comfort',
    'Leg Room Service',
    'Cleanliness',
    'Food and Drink',
    'In-flight Service',
    'In-flight Wifi Service',
    'In-flight Entertainment',
    'Baggage Handling',
]

# Edge case: passengers who scored 2 or lower on every one of the columns above.
all_ratings_low = (df[satisfaction_cols] <= 2).all(axis=1)
very_insatisfied = df[all_ratings_low]
print(very_insatisfied.shape)

(49, 24)


In [52]:
# Edge case: passengers at either end of the age range (under 18 or over 80).
# Data-quality note: respondents younger than 18 are not expected to be filling
# in a satisfaction survey, so these rows point to a problem in the data.
passenger_age = df['Age']
is_minor = passenger_age.lt(18)
is_over_80 = passenger_age.gt(80)
extreme_age = df[is_minor | is_over_80]
print(extreme_age.shape)

(9872, 24)


In [50]:
# Stack the four edge-case subsets into one frame. A row selected by more than
# one rule appears several times after the concat, so only its first
# occurrence is kept.
edge_case_subsets = [high_delay, extreme_distance, very_insatisfied, extreme_age]
stacked_edge_cases = pd.concat(edge_case_subsets)
repeated_row = stacked_edge_cases.duplicated()
edge_cases = stacked_edge_cases[~repeated_row]
print(edge_cases.shape)

(17219, 24)


In [None]:
from sklearn.model_selection import train_test_split

# === STRATIFIED SAMPLING FOR REPRESENTATIVE GOLDEN SET ===

# Columns whose joint distribution the regular sample should preserve.
# 'New Class' is a helper column created below that folds 'Economy Plus'
# into 'Economy'.
strat_cols = ['Satisfaction', 'Customer Type', 'New Class', 'Type of Travel', 'Gender']

# Work only on rows that are not already edge cases, so no row can be picked
# twice, and attach the merged-class helper column.
df_remaining = df.drop(edge_cases.index).assign(
    **{'New Class': lambda frame: frame['Class'].replace({'Economy Plus': 'Economy'})}
)

# Collapse the stratification columns into one composite key that is passed
# to train_test_split as the `stratify` labels.
df_remaining = df_remaining.assign(
    strata=lambda frame: frame[strat_cols].astype(str).agg('-'.join, axis=1)
)

# The regular (non-edge-case) part of the golden set is sized at 40% of the
# FULL dataset's row count; the edge cases are appended on top of that below,
# so the final golden set is larger than 40%.
n_golden_regular = int(len(df) * 0.4)
n_left_over = len(df_remaining) - n_golden_regular
golden_regular, rest = train_test_split(
    df_remaining,
    test_size=n_left_over,
    stratify=df_remaining['strata'],
    random_state=42,
)

# The helper columns were only needed for the split; strip them from both outputs.
helper_cols = ['strata', 'New Class']
golden_regular = golden_regular.drop(columns=helper_cols)
rest = rest.drop(columns=helper_cols)

# Final golden set = stratified regular sample + every edge case.
golden_set = pd.concat([golden_regular, edge_cases]).drop_duplicates()

# Persist both partitions next to the source data (row index is not written).
golden_set.to_csv(data_directory / "golden_set.csv", index=False)
rest.to_csv(data_directory / "current_set.csv", index=False)

print("Golden set size:", len(golden_set))
print("Rest of data size:", len(rest))

Golden set size: 69171
Rest of data size: 60709
