In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Load the data
# Both splits are read with pandas' default CSV settings.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Quick overview: (rows, columns) followed by the per-column dtypes,
# then the first rows rendered as the cell's rich output.
print(train_df.shape, train_df.dtypes, sep="\n")
train_df.head()

(103904, 25)
Unnamed: 0                             int64
id                                     int64
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [5]:
# Step 2: Drop unused columns
# A single shared list keeps both splits in sync. The test-set call used to
# spell the first name 'Unnamed:0' (no space), and because errors='ignore'
# suppresses unknown labels the column was silently left in test_df.
unused_columns = ['Unnamed: 0', 'id']

# errors='ignore' is kept on purpose so the cell can be re-run after the
# columns are already gone. Reassigning (instead of inplace=True) leaves the
# frames produced by earlier cells untouched.
train_df = train_df.drop(columns=unused_columns, errors='ignore')
test_df = test_df.drop(columns=unused_columns, errors='ignore')

In [7]:
# Step 3: Handle missing values

# The target must never be imputed: filling a missing label with the most
# frequent class would fabricate ground truth. At this point 'satisfaction'
# is still a string (object) column, so it has to be excluded *before* the
# dtype-based selection; otherwise it lands in categorical_cols.
target_col = 'satisfaction'
feature_view = train_df.drop(columns=target_col, errors='ignore')

# Separate numeric and categorical feature columns
numeric_cols = feature_view.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = feature_view.select_dtypes(include='object').columns.tolist()

# Numeric imputer (median). Fitted on the training split only and then
# re-used on the test split so no test statistics leak into preprocessing.
num_imputer = SimpleImputer(strategy='median')
train_df[numeric_cols] = num_imputer.fit_transform(train_df[numeric_cols])
test_df[numeric_cols] = num_imputer.transform(test_df[numeric_cols])

# Categorical imputer (most frequent), same fit-on-train / apply-to-test rule.
cat_imputer = SimpleImputer(strategy='most_frequent')
train_df[categorical_cols] = cat_imputer.fit_transform(train_df[categorical_cols])
test_df[categorical_cols] = cat_imputer.transform(test_df[categorical_cols])



In [9]:
# Step 4: Encode the target
# One label -> integer table shared by both splits. Series.map turns any
# value that is not a key of the table into NaN.
satisfaction_codes = {
    'neutral or dissatisfied': 0,
    'satisfied': 1
}

train_df['satisfaction'] = train_df['satisfaction'].map(satisfaction_codes)
test_df['satisfaction'] = test_df['satisfaction'].map(satisfaction_codes)

In [11]:
#Step 5: Encode Categorical Features

# Object-typed columns are looked up again here, after the target has been
# mapped to integers, so only the remaining string columns are one-hot encoded.
categorical_cols = train_df.select_dtypes(include='object').columns

# One-hot encode each split with the same column list (train first, then test).
train_df, test_df = (
    pd.get_dummies(split, columns=categorical_cols)
    for split in (train_df, test_df)
)

# Make the test columns follow the training columns: join='left' keeps the
# training column set, and columns absent from the test split are filled with 0.
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

In [13]:
#Step 6: Scale Numeric Features

# Collect the int64/float64 columns again; the encoded target is numeric by
# now, so it is removed from the selection explicitly.
numeric_cols = (
    train_df
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('satisfaction', errors='ignore')
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(train_df[numeric_cols])
train_df[numeric_cols] = scaler.transform(train_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

In [15]:
# Separate the feature matrix (everything except 'satisfaction') from the
# target column, for the training split and then the test split.
X_train, y_train = train_df.drop(columns='satisfaction'), train_df['satisfaction']

X_test, y_test = test_df.drop(columns='satisfaction'), test_df['satisfaction']

In [17]:
#Save the Cleaned Dataset

# Output file -> object to write; entries are written in this order and
# without the index column.
cleaned_outputs = {
    "X_train_cleaned.csv": X_train,
    "y_train_cleaned.csv": y_train,
    "X_test_cleaned.csv": X_test,
    "y_test_cleaned.csv": y_test,
}
for csv_name, table in cleaned_outputs.items():
    table.to_csv(csv_name, index=False)

In [21]:
import os

import joblib

# Save the fitted scaler for future use in the frontend.
# joblib.dump does not create missing parent directories, so on a fresh
# checkout without a "models" folder the dump would raise FileNotFoundError.
# exist_ok=True makes the cell safe to re-run.
os.makedirs("models", exist_ok=True)
joblib.dump(scaler, "models/standard_scaler.pkl")

['models/standard_scaler.pkl']

In [25]:
# Store the column names held in numeric_cols as a plain Python list.
numeric_col_names = numeric_cols.tolist()
joblib.dump(numeric_col_names, "models/numeric_cols.pkl")

['models/numeric_cols.pkl']

In [27]:
import pandas as pd
import joblib

# Read the cleaned training features back from the CSV written earlier
# (this rebinds X_train to the reloaded frame).
X_train = pd.read_csv("X_train_cleaned.csv")

# Persist the column names, in file order, as they stand after encoding.
encoder_columns = list(X_train.columns)
joblib.dump(encoder_columns, "models/encoder_columns.pkl")

print("encoder_columns.pkl saved successfully.")

encoder_columns.pkl saved successfully.
