In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the prepared dataset from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='prepared_and_cleaned_dataset.csv')

# First look: (rows, columns), then summary statistics as the cell's displayed value
print("Dataset shape:", df.shape)
df.describe()

Dataset shape: (89174, 28)


Unnamed: 0,customer_id,age,flight_distance,inflight_wifi_service,departure_arrival_time_convenience,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,...,on_board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,time_recovered_in_minutes,satisfaction_bool
count,89174.0,89174.0,89174.0,86386.0,84528.0,85239.0,89173.0,89131.0,87003.0,89174.0,...,89171.0,88750.0,89174.0,89174.0,89171.0,89164.0,89174.0,89174.0,89174.0,89174.0
mean,65438.112735,39.437067,1192.092718,2.829463,3.230965,2.889182,2.975519,3.219542,3.34685,3.455705,...,3.399513,3.368845,3.639346,3.316796,3.661594,3.29919,4.273185,3.880324,0.392861,0.44596
std,37305.528816,15.123586,998.728582,1.262363,1.385342,1.299373,1.279054,1.325801,1.263006,1.316374,...,1.28488,1.300672,1.182902,1.262221,1.170821,1.309901,8.511753,7.176368,6.526847,0.497074
min,1.0,7.0,31.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,-30.0,0.0
25%,33233.25,27.0,413.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0,0.0,0.0
50%,65586.5,40.0,846.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,...,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0
75%,97314.5,51.0,1744.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,...,4.0,4.0,5.0,4.0,5.0,4.0,4.0,5.0,1.0,1.0
max,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,60.0,30.0,60.0,1.0


In [35]:
# Show the dtype of every column as loaded (read_csv was called without explicit dtypes)
df.dtypes

customer_id                             int64
customer_value                         object
gender                                 object
customer_type                          object
age                                     int64
type_of_travel                         object
class                                  object
flight_distance                         int64
inflight_wifi_service                 float64
departure_arrival_time_convenience    float64
ease_of_online_booking                float64
gate_location                         float64
food_and_drink                        float64
online_boarding                       float64
seat_comfort                          float64
inflight_entertainment                float64
on_board_service                      float64
leg_room_service                      float64
baggage_handling                        int64
checkin_service                       float64
inflight_service                      float64
cleanliness                       

In [36]:
# Turn the 0/1 satisfaction flag into a real boolean column; all other columns keep their dtype
df = df.astype({'satisfaction_bool': 'bool'})

# Preview the first five rows to confirm the cast
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenience,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,time_recovered_in_minutes,time_recovery_status,satisfaction,satisfaction_bool
0,70172,mid,M,loyal,13,personal,eco plus,460,3.0,4.0,...,4,4.0,5.0,5.0,25,18,7,recovered,neutral or dissatisfied,False
1,5047,high,M,disloyal,25,business,business,235,3.0,2.0,...,3,1.0,4.0,1.0,1,6,-5,worsened,neutral or dissatisfied,False
2,110028,high,F,loyal,26,business,business,1142,2.0,2.0,...,4,4.0,4.0,5.0,0,0,0,same,satisfied,True
3,24026,high,F,loyal,25,business,business,562,2.0,5.0,...,3,1.0,4.0,2.0,11,9,2,recovered,neutral or dissatisfied,False
4,119299,high,M,loyal,61,business,business,214,3.0,3.0,...,4,3.0,3.0,3.0,0,0,0,same,satisfied,True


In [37]:
# Rating columns, listed in the order they should appear at the end of the frame
rating_columns = [
    'seat_comfort',
    'leg_room_service',
    'food_and_drink',
    'inflight_entertainment',
    'inflight_wifi_service',
    'cleanliness',
    'on_board_service',
    'inflight_service',
    'checkin_service',
    'online_boarding',
    'ease_of_online_booking',
    'baggage_handling',
    'gate_location',
    'departure_arrival_time_convenience',
]

# Non-rating columns keep their current relative order and come first,
# followed by the rating columns in the order given above
cols = df.drop(columns=rating_columns).columns.tolist()
df = df.loc[:, cols + rating_columns]

In [38]:
# Preview the first five rows with the rating columns moved to the end
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes,...,inflight_wifi_service,cleanliness,on_board_service,inflight_service,checkin_service,online_boarding,ease_of_online_booking,baggage_handling,gate_location,departure_arrival_time_convenience
0,70172,mid,M,loyal,13,personal,eco plus,460,25,18,...,3.0,5.0,4.0,5.0,4.0,3.0,3.0,4,1.0,4.0
1,5047,high,M,disloyal,25,business,business,235,1,6,...,3.0,1.0,1.0,4.0,1.0,3.0,3.0,3,3.0,2.0
2,110028,high,F,loyal,26,business,business,1142,0,0,...,2.0,5.0,4.0,4.0,4.0,5.0,2.0,4,2.0,2.0
3,24026,high,F,loyal,25,business,business,562,11,9,...,2.0,2.0,2.0,4.0,1.0,2.0,5.0,3,5.0,5.0
4,119299,high,M,loyal,61,business,business,214,0,0,...,3.0,3.0,3.0,3.0,3.0,5.0,3.0,4,3.0,3.0


In [39]:
# List column dtypes again; the listing also shows the new column order
# (non-rating columns first, rating columns last)
df.dtypes

customer_id                             int64
customer_value                         object
gender                                 object
customer_type                          object
age                                     int64
type_of_travel                         object
class                                  object
flight_distance                         int64
departure_delay_in_minutes              int64
arrival_delay_in_minutes                int64
time_recovered_in_minutes               int64
time_recovery_status                   object
satisfaction                           object
satisfaction_bool                        bool
seat_comfort                          float64
leg_room_service                      float64
food_and_drink                        float64
inflight_entertainment                float64
inflight_wifi_service                 float64
cleanliness                           float64
on_board_service                      float64
inflight_service                  

In [40]:
# Split the rating columns into a "product" group and a "service" group,
# then total each group row by row.
product_columns = [
    'seat_comfort',
    'leg_room_service',
    'food_and_drink',
    'inflight_entertainment',
    'inflight_wifi_service',
    'cleanliness',
]

service_columns = [
    'on_board_service',
    'inflight_service',
    'checkin_service',
    'online_boarding',
    'ease_of_online_booking',
    'baggage_handling',
    'gate_location',
    'departure_arrival_time_convenience',
]

# skipna=True ignores missing ratings, so a row with unanswered items
# receives the total of its answered items only.
df = df.assign(
    products_rating_score=df[product_columns].sum(axis=1, skipna=True),
    services_rating_score=df[service_columns].sum(axis=1, skipna=True),
)

In [41]:
# Preview the first five rows including the two new score columns
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes,...,on_board_service,inflight_service,checkin_service,online_boarding,ease_of_online_booking,baggage_handling,gate_location,departure_arrival_time_convenience,products_rating_score,services_rating_score
0,70172,mid,M,loyal,13,personal,eco plus,460,25,18,...,4.0,5.0,4.0,3.0,3.0,4,1.0,4.0,26.0,28.0
1,5047,high,M,disloyal,25,business,business,235,1,6,...,1.0,4.0,1.0,3.0,3.0,3,3.0,2.0,12.0,20.0
2,110028,high,F,loyal,26,business,business,1142,0,0,...,4.0,4.0,4.0,5.0,2.0,4,2.0,2.0,25.0,27.0
3,24026,high,F,loyal,25,business,business,562,11,9,...,2.0,4.0,1.0,2.0,5.0,3,5.0,5.0,15.0,27.0
4,119299,high,M,loyal,61,business,business,214,0,0,...,3.0,3.0,3.0,5.0,3.0,4,3.0,3.0,22.0,27.0


In [43]:
# Relocate products_rating_score so it sits directly after 'cleanliness',
# the last column of the product group.
cols = list(df.columns)
clean_idx = cols.index('cleanliness')
prod_idx = cols.index('products_rating_score')

# Take the score column out of its current slot and re-insert it behind 'cleanliness'
moved_column = cols.pop(prod_idx)
cols.insert(clean_idx + 1, moved_column)
df = df.loc[:, cols]

In [44]:
# Preview the first five rows to confirm the new position of products_rating_score
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes,...,products_rating_score,on_board_service,inflight_service,checkin_service,online_boarding,ease_of_online_booking,baggage_handling,gate_location,departure_arrival_time_convenience,services_rating_score
0,70172,mid,M,loyal,13,personal,eco plus,460,25,18,...,26.0,4.0,5.0,4.0,3.0,3.0,4,1.0,4.0,28.0
1,5047,high,M,disloyal,25,business,business,235,1,6,...,12.0,1.0,4.0,1.0,3.0,3.0,3,3.0,2.0,20.0
2,110028,high,F,loyal,26,business,business,1142,0,0,...,25.0,4.0,4.0,4.0,5.0,2.0,4,2.0,2.0,27.0
3,24026,high,F,loyal,25,business,business,562,11,9,...,15.0,2.0,4.0,1.0,2.0,5.0,3,5.0,5.0,27.0
4,119299,high,M,loyal,61,business,business,214,0,0,...,22.0,3.0,3.0,3.0,5.0,3.0,4,3.0,3.0,27.0


In [50]:
# Add the products rating as a percentage of the best score the row could have reached.
# products_rating_score is a skipna sum, so unanswered items add nothing to it;
# dividing by a fixed 30 would understate every row that has a missing product rating.
# The denominator therefore counts only the answered items, each worth up to 5 points.
max_rating = 5  # top of the rating scale (the previous constant 30 was 6 columns x 5)
answered_products = df[product_columns].notna().sum(axis=1)
df['products_rating_percentage'] = (
    df['products_rating_score'] / (answered_products * max_rating) * 100
).round(2)  # a row with no answered product rating yields NaN (0 / 0)

# Place the percentage directly after the score it is derived from.
# The target column is removed from the list first so the insert position
# does not depend on where the new column currently sits.
cols = [c for c in df.columns if c != 'products_rating_percentage']
score_idx = cols.index('products_rating_score')
cols.insert(score_idx + 1, 'products_rating_percentage')
df = df[cols]

In [51]:
# Preview the first five rows including products_rating_percentage
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes,...,products_rating_percentage,on_board_service,inflight_service,checkin_service,online_boarding,ease_of_online_booking,baggage_handling,gate_location,departure_arrival_time_convenience,services_rating_score
0,70172,mid,M,loyal,13,personal,eco plus,460,25,18,...,86.67,4.0,5.0,4.0,3.0,3.0,4,1.0,4.0,28.0
1,5047,high,M,disloyal,25,business,business,235,1,6,...,40.0,1.0,4.0,1.0,3.0,3.0,3,3.0,2.0,20.0
2,110028,high,F,loyal,26,business,business,1142,0,0,...,83.33,4.0,4.0,4.0,5.0,2.0,4,2.0,2.0,27.0
3,24026,high,F,loyal,25,business,business,562,11,9,...,50.0,2.0,4.0,1.0,2.0,5.0,3,5.0,5.0,27.0
4,119299,high,M,loyal,61,business,business,214,0,0,...,73.33,3.0,3.0,3.0,5.0,3.0,4,3.0,3.0,27.0


In [52]:
# Add the services rating as a percentage of the best score the row could have reached.
# services_rating_score is a skipna sum, so unanswered items add nothing to it;
# dividing by a fixed 40 would understate every row that has a missing service rating.
# The denominator therefore counts only the answered items, each worth up to 5 points.
max_rating = 5  # top of the rating scale (the previous constant 40 was 8 columns x 5)
answered_services = df[service_columns].notna().sum(axis=1)
df['services_rating_percentage'] = (
    df['services_rating_score'] / (answered_services * max_rating) * 100
).round(2)  # a row with no answered service rating yields NaN (0 / 0)

# Place the percentage directly after the score it is derived from.
# The target column is removed from the list first so the insert position
# does not depend on where the new column currently sits.
cols = [c for c in df.columns if c != 'services_rating_percentage']
score_idx = cols.index('services_rating_score')
cols.insert(score_idx + 1, 'services_rating_percentage')
df = df[cols]

In [53]:
# Preview the first five rows including services_rating_percentage
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes,...,on_board_service,inflight_service,checkin_service,online_boarding,ease_of_online_booking,baggage_handling,gate_location,departure_arrival_time_convenience,services_rating_score,services_rating_percentage
0,70172,mid,M,loyal,13,personal,eco plus,460,25,18,...,4.0,5.0,4.0,3.0,3.0,4,1.0,4.0,28.0,70.0
1,5047,high,M,disloyal,25,business,business,235,1,6,...,1.0,4.0,1.0,3.0,3.0,3,3.0,2.0,20.0,50.0
2,110028,high,F,loyal,26,business,business,1142,0,0,...,4.0,4.0,4.0,5.0,2.0,4,2.0,2.0,27.0,67.5
3,24026,high,F,loyal,25,business,business,562,11,9,...,2.0,4.0,1.0,2.0,5.0,3,5.0,5.0,27.0,67.5
4,119299,high,M,loyal,61,business,business,214,0,0,...,3.0,3.0,3.0,5.0,3.0,4,3.0,3.0,27.0,67.5


In [54]:
# Persist the enriched frame as a new CSV; index=False leaves the row index out of the file
df.to_csv(path_or_buf='prepared_and_cleaned_dataset_2.csv', index=False)

In [55]:
# Final dtype listing of the frame that was just exported, including the added
# score and percentage columns
df.dtypes

customer_id                             int64
customer_value                         object
gender                                 object
customer_type                          object
age                                     int64
type_of_travel                         object
class                                  object
flight_distance                         int64
departure_delay_in_minutes              int64
arrival_delay_in_minutes                int64
time_recovered_in_minutes               int64
time_recovery_status                   object
satisfaction                           object
satisfaction_bool                        bool
seat_comfort                          float64
leg_room_service                      float64
food_and_drink                        float64
inflight_entertainment                float64
inflight_wifi_service                 float64
cleanliness                           float64
products_rating_score                 float64
products_rating_percentage        