In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [19]:
# Load the data
data = pd.read_csv('customer_booking.csv', encoding='latin1')
data.head(5) 

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [20]:
# Separate features and target
X = data.drop('booking_complete', axis=1)
y = data['booking_complete']

In [21]:
# First, split off the test set
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split the remaining data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

In [22]:
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 30000
Validation set size: 10000
Test set size: 10000


In [35]:
# Define numeric and categorical columns
numeric_features = ['num_passengers', 'purchase_lead', 'length_of_stay', 'flight_hour', 'flight_duration']
categorical_features = ['sales_channel', 'trip_type', 'wants_extra_baggage', 'wants_preferred_seat', 'wants_in_flight_meals', 'flight_day',]

# route and booking origin issue why ?? 
# understand the error 

In [36]:
# Create preprocessing steps
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessor and random forest classifier
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])



In [37]:
# Fit the model on the training data
model.fit(X_train, y_train)


In [38]:
# Make predictions on the validation set
y_val_pred = model.predict(X_val)

In [39]:
# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.2f}")

print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred))



Validation Accuracy: 0.85

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.99      0.92      8500
           1       0.42      0.04      0.08      1500

    accuracy                           0.85     10000
   macro avg       0.64      0.52      0.50     10000
weighted avg       0.79      0.85      0.79     10000



In [40]:
# Make predictions on the test set
y_test_pred = model.predict(X_test)

In [41]:
# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy:.2f}")

print("\nTest Classification Report:")
print(classification_report(y_test, y_test_pred))


Test Accuracy: 0.85

Test Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      8520
           1       0.40      0.05      0.09      1480

    accuracy                           0.85     10000
   macro avg       0.63      0.52      0.50     10000
weighted avg       0.79      0.85      0.79     10000



In [44]:
# Feature importance
# Is this correct ?? 
feature_importance = model.named_steps['classifier'].feature_importances_
feature_names = (numeric_features + 
                 model.named_steps['preprocessor']
                     .named_transformers_['cat']
                     .get_feature_names_out(categorical_features).tolist())

feature_importance_dict = dict(zip(feature_names, feature_importance))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

print("\nTop 10 Most Important Features:")
for feature, importance in sorted_features[:10]:
    print(f"{feature}: {importance:.4f}")


Top 10 Most Important Features:
purchase_lead: 0.2835
flight_hour: 0.1952
length_of_stay: 0.1666
flight_duration: 0.1236
num_passengers: 0.0581
wants_in_flight_meals_1: 0.0252
flight_day_Mon: 0.0185
flight_day_Wed: 0.0181
flight_day_Tue: 0.0180
flight_day_Thu: 0.0174
