In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the refined dataset from disk; every later cell works on `df`.
df = pd.read_csv(filepath_or_buffer='refined_dataset.csv')

# Quick sanity check: report (rows, columns), then preview the first five rows.
print("Dataset shape:", df.shape)
df.head(n=5)

Dataset shape: (89174, 26)


Unnamed: 0,customer_id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenience,ease_of_online_booking,...,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,time_recovered_in_minutes,time_recovery_status,satisfaction
0,70172,M,loyal,13,personal,eco plus,460,3.0,4.0,3.0,...,3.0,4,4.0,5.0,5.0,25,18,7,recovered,neutral or dissatisfied
1,5047,M,disloyal,25,business,business,235,3.0,2.0,3.0,...,5.0,3,1.0,4.0,1.0,1,6,-5,worsened,neutral or dissatisfied
2,110028,F,loyal,26,business,business,1142,2.0,2.0,2.0,...,3.0,4,4.0,4.0,5.0,0,0,0,same,satisfied
3,24026,F,loyal,25,business,business,562,2.0,5.0,5.0,...,5.0,3,1.0,4.0,2.0,11,9,2,recovered,neutral or dissatisfied
4,119299,M,loyal,61,business,business,214,3.0,3.0,3.0,...,4.0,4,3.0,3.0,3.0,0,0,0,same,satisfied


In [15]:
# Show the distinct labels present in the `satisfaction` column.
print(df.satisfaction.unique())

['neutral or dissatisfied' 'satisfied']


In [20]:
# Encode satisfaction as an integer flag:
#   'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
# Series.map yields NaN for any label that is not a key of the lookup.
df['satisfaction_bool'] = df['satisfaction'].map(
    dict(zip(('neutral or dissatisfied', 'satisfied'), (0, 1)))
)

In [21]:
# Preview the first five rows to check the `satisfaction_bool` column.
df.head(n=5)

Unnamed: 0,customer_id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenience,ease_of_online_booking,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,time_recovered_in_minutes,time_recovery_status,satisfaction,satisfaction_bool
0,70172,M,loyal,13,personal,eco plus,460,3.0,4.0,3.0,...,4,4.0,5.0,5.0,25,18,7,recovered,neutral or dissatisfied,0
1,5047,M,disloyal,25,business,business,235,3.0,2.0,3.0,...,3,1.0,4.0,1.0,1,6,-5,worsened,neutral or dissatisfied,0
2,110028,F,loyal,26,business,business,1142,2.0,2.0,2.0,...,4,4.0,4.0,5.0,0,0,0,same,satisfied,1
3,24026,F,loyal,25,business,business,562,2.0,5.0,5.0,...,3,1.0,4.0,2.0,11,9,2,recovered,neutral or dissatisfied,0
4,119299,M,loyal,61,business,business,214,3.0,3.0,3.0,...,4,3.0,3.0,3.0,0,0,0,same,satisfied,1


In [25]:
## Customer segmentation: derive a three-tier `customer_value` ('low' / 'mid' / 'high')
## from customer_type, type_of_travel and class.

# Points awarded per attribute value. A value missing from its lookup maps to NaN,
# which then carries through to the total score and the final tier.
customer_type_score = {'loyal': 1, 'disloyal': 0}
travel_type_score = {'business': 1, 'personal': 0}
class_score = {'eco': 0, 'eco plus': 1, 'business': 2}

# Total score is the sum of the three per-attribute points (0 through 4).
df['segment_score'] = (
    df['customer_type'].map(customer_type_score)
    .add(df['type_of_travel'].map(travel_type_score))
    .add(df['class'].map(class_score))
)

# Bucket the total score into a tier.
score_to_segment = {
    0: 'low',
    1: 'low',
    2: 'mid',
    3: 'high',
    4: 'high',
}
df['customer_value'] = df['segment_score'].map(score_to_segment)

# Reposition `customer_value` so it sits immediately after `customer_id`.
cols = list(df.columns)
id_idx = cols.index('customer_id')
cust_idx = cols.index('customer_value')
cols.insert(id_idx + 1, cols.pop(cust_idx))
df = df[cols]

In [26]:
# Preview the first five rows: `customer_value` should follow `customer_id`.
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenience,...,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,time_recovered_in_minutes,time_recovery_status,satisfaction,satisfaction_bool,segment_score
0,70172,mid,M,loyal,13,personal,eco plus,460,3.0,4.0,...,4.0,5.0,5.0,25,18,7,recovered,neutral or dissatisfied,0,2
1,5047,high,M,disloyal,25,business,business,235,3.0,2.0,...,1.0,4.0,1.0,1,6,-5,worsened,neutral or dissatisfied,0,3
2,110028,high,F,loyal,26,business,business,1142,2.0,2.0,...,4.0,4.0,5.0,0,0,0,same,satisfied,1,4
3,24026,high,F,loyal,25,business,business,562,2.0,5.0,...,1.0,4.0,2.0,11,9,2,recovered,neutral or dissatisfied,0,4
4,119299,high,M,loyal,61,business,business,214,3.0,3.0,...,3.0,3.0,3.0,0,0,0,same,satisfied,1,4


In [27]:
# `segment_score` was only an intermediate for deriving `customer_value`; remove it.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running the
# cell again is a no-op instead of raising KeyError.
df = df.drop(columns='segment_score', errors='ignore')

In [28]:
# Preview the first five rows to confirm `segment_score` is no longer present.
df.head(n=5)

Unnamed: 0,customer_id,customer_value,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenience,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,time_recovered_in_minutes,time_recovery_status,satisfaction,satisfaction_bool
0,70172,mid,M,loyal,13,personal,eco plus,460,3.0,4.0,...,4,4.0,5.0,5.0,25,18,7,recovered,neutral or dissatisfied,0
1,5047,high,M,disloyal,25,business,business,235,3.0,2.0,...,3,1.0,4.0,1.0,1,6,-5,worsened,neutral or dissatisfied,0
2,110028,high,F,loyal,26,business,business,1142,2.0,2.0,...,4,4.0,4.0,5.0,0,0,0,same,satisfied,1
3,24026,high,F,loyal,25,business,business,562,2.0,5.0,...,3,1.0,4.0,2.0,11,9,2,recovered,neutral or dissatisfied,0
4,119299,high,M,loyal,61,business,business,214,3.0,3.0,...,4,3.0,3.0,3.0,0,0,0,same,satisfied,1


In [30]:
# Verify the segmentation logic: list each distinct combination of the three
# input attributes together with the `customer_value` tier assigned to it.
verification_table = df[['customer_type', 'type_of_travel', 'class', 'customer_value']]
verification_table = verification_table.drop_duplicates()

# Sort by the inputs and renumber the rows from 0 so the table is easy to scan.
verification_table = verification_table.sort_values(
    by=['customer_type', 'type_of_travel', 'class']
)
verification_table = verification_table.reset_index(drop=True)

verification_table

Unnamed: 0,customer_type,type_of_travel,class,customer_value
0,disloyal,business,business,high
1,disloyal,business,eco,low
2,disloyal,business,eco plus,mid
3,disloyal,personal,business,mid
4,disloyal,personal,eco,low
5,disloyal,personal,eco plus,low
6,loyal,business,business,high
7,loyal,business,eco,mid
8,loyal,business,eco plus,high
9,loyal,personal,business,high


In [31]:
# Write the prepared frame to CSV without the row index.
# This cell needs `df` from the cells above; run the notebook top to bottom first.
df.to_csv(path_or_buf='prepared_and_cleaned_dataset.csv', index=False)

NameError: name 'df' is not defined